Source code for lmflow.utils.constants

#!/usr/bin/env python
# coding=utf-8
"""
Commonly used constants.
"""

TEXT_ONLY_DATASET_DESCRIPTION = (
"""
"text_only": a dataset with only raw text instances, with the following format:

    {
        "type": "text_only",
        "instances": [
            { "text": "TEXT_1" },
            { "text": "TEXT_2" },
            ...
        ]
    }
"""
).lstrip("\n")
TEXT_TO_SCORED_TEXTLIST_DATASET_DESCRIPTION = (
"""
This kind of dataset is commonly used in reward model training/prediction, as well as RL training.

    {
        "type": "text_to_scored_textlist",
        "instances": [
            {
                "input": "what's your name?",
                "output": [
                    {"score": 1.0, "text": "My name is John"},
                    {"score": -0.8, "text": "I'm John"}
                ]
            },
            {
                "input": "Who are you?",
                "output": [
                    {"score": 1.5, "text": "My name is Amy"},
                    {"score": 1.0, "text": "I'm Amy"}
                ]
            }
        ]
    }
"""
).lstrip("\n")
PAIRED_TEXT_TO_TEXT_DATASET_DESCRIPTION = (
"""
This kind of dataset is commonly used in reward model training, as well as RL training.

    {
        "type": "paired_text_to_text",
        "instances": [
            {
                "prompt": "Who are you?",
                "chosen": "My name is Amy.",
                "rejected": "I'm Amy",
                "margin": 0.6
            },
            {
                "prompt": "what's your name?",
                "chosen": "My name is John.",
                "rejected": "I'm John",
                "margin": 0.5
            }
        ]
    }
"""
).lstrip("\n")
TEXT_ONLY_DATASET_DETAILS = (
"""
For example,

```python
from lmflow.datasets import Dataset

data_dict = {
    "type": "text_only",
    "instances": [
        { "text": "Human: Hello. Bot: Hi!" },
        { "text": "Human: How are you today? Bot: Fine, thank you!" },
    ]
}
dataset = Dataset.create_from_dict(data_dict)
```

You may also save the corresponding format to json:

```python
import json
from lmflow.args import DatasetArguments
from lmflow.datasets import Dataset

data_dict = {
    "type": "text_only",
    "instances": [
        { "text": "Human: Hello. Bot: Hi!" },
        { "text": "Human: How are you today? Bot: Fine, thank you!" },
    ]
}
with open("data.json", "w") as fout:
    json.dump(data_dict, fout)

data_args = DatasetArguments(dataset_path="data.json")
dataset = Dataset(data_args)
new_data_dict = dataset.to_dict()
# `new_data_dict` should have the same content as `data_dict`
```
"""
).lstrip("\n")
TEXT2TEXT_DATASET_DESCRIPTION = (
"""
"text2text": a dataset with input & output instances, with the following format:

    {
        "type": "text2text",
        "instances": [
            { "input": "INPUT_1", "output": "OUTPUT_1" },
            { "input": "INPUT_2", "output": "OUTPUT_2" },
            ...
        ]
    }
"""
).lstrip("\n")
CONVERSATION_DATASET_DESCRIPTION = (
"""
"conversation": a dataset with conversation instances, with the following format
(`conversation_id`, `system` and `tools` are optional):

    {
        "type": "conversation",
        "instances": [
            {
                "conversation_id": "CONVERSATION_ID",
                "system": "SYSTEM_PROMPT",
                "tools": ["TOOL_DESCRIPTION_1", "TOOL_DESCRIPTION_2", "TOOL_DESCRIPTION_X"],
                "messages": [
                    { "role": "user", "content": "USER_INPUT_1" },
                    { "role": "assistant", "content": "ASSISTANT_RESPONSE_1" },
                    { "role": "user", "content": "USER_INPUT_2" },
                    { "role": "assistant", "content": "ASSISTANT_RESPONSE_2" }
                ]
            },
            {
                "conversation_id": "CONVERSATION_ID",
                "system": "SYSTEM_PROMPT",
                "tools": ["TOOL_DESCRIPTION_1"],
                "messages": [
                    { "role": "user", "content": "USER_INPUT_1" },
                    { "role": "assistant", "content": "ASSISTANT_RESPONSE_1" }
                ]
            }
        ]
    }
"""
).lstrip("\n")
PAIRED_CONVERSATION_DATASET_DESCRIPTION = (
"""
"paired_conversation": a dataset with paired conversation instances, with the following format:

    {
        "type": "paired_conversation",
        "instances": [
            {
                "chosen": {
                    "conversation_id": "CONVERSATION_ID",
                    "system": "SYSTEM_PROMPT",
                    "tools": ["TOOL_DESCRIPTION_1", "TOOL_DESCRIPTION_2", "TOOL_DESCRIPTION_3"],
                    "messages": [
                        { "role": "user", "content": "USER_INPUT_1" },
                        { "role": "assistant", "content": "ASSISTANT_RESPONSE_1_GOOD" },
                        { "role": "user", "content": "USER_INPUT_2" },
                        { "role": "assistant", "content": "ASSISTANT_RESPONSE_2_GOOD" }
                    ]
                },
                "rejected": {
                    "conversation_id": "CONVERSATION_ID",
                    "system": "SYSTEM_PROMPT",
                    "tools": ["TOOL_DESCRIPTION_1", "TOOL_DESCRIPTION_2", "TOOL_DESCRIPTION_3"],
                    "messages": [
                        { "role": "user", "content": "USER_INPUT_1" },
                        { "role": "assistant", "content": "ASSISTANT_RESPONSE_1_BAD" },
                        { "role": "user", "content": "USER_INPUT_2" },
                        { "role": "assistant", "content": "ASSISTANT_RESPONSE_2_BAD" }
                    ]
                }
            }
        ]
    }
"""
).lstrip("\n")
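# Illustrative sketch (not part of the library): building a minimal "conversation"
# data dict like the one described above. The helper name is hypothetical; loading
# it via `Dataset.create_from_dict` mirrors the `text_only` / `text2text` examples
# in this file, and it is an assumption here that the same call accepts the
# conversation type as well.
def _example_conversation_data_dict():
    """Return a minimal conversation-type data dict, for illustration only."""
    return {
        "type": "conversation",
        "instances": [
            {
                "conversation_id": "example_conversation_1",
                "system": "You are a helpful assistant.",
                "messages": [
                    {"role": "user", "content": "Hello."},
                    {"role": "assistant", "content": "Hi! How can I help you?"},
                ],
            }
        ],
    }
# Usage (assumption, mirroring the examples in this file):
#     from lmflow.datasets import Dataset
#     dataset = Dataset.create_from_dict(_example_conversation_data_dict())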
TEXT_TO_TEXTLIST_DATASET_DESCRIPTION = (
"""
This kind of dataset is commonly used in reward model inference.

    {
        "type": "text_to_textlist",
        "instances": [
            {
                "input": "what's your name?",
                "output": [
                    "My name is John",
                    "I'm John"
                ]
            },
            {
                "input": "Who are you?",
                "output": [
                    "My name is Amy",
                    "I'm Amy"
                ]
            }
        ]
    }
"""
).lstrip("\n")
TEXT2TEXT_DATASET_DETAILS = (
"""
For example,

```python
from lmflow.datasets import Dataset

data_dict = {
    "type": "text2text",
    "instances": [
        {
            "input": "Human: Hello.",
            "output": "Bot: Hi!",
        },
        {
            "input": "Human: How are you today?",
            "output": "Bot: Fine, thank you! And you?",
        }
    ]
}
dataset = Dataset.create_from_dict(data_dict)
```

You may also save the corresponding format to json:

```python
import json
from lmflow.args import DatasetArguments
from lmflow.datasets import Dataset

data_dict = {
    "type": "text2text",
    "instances": [
        {
            "input": "Human: Hello.",
            "output": "Bot: Hi!",
        },
        {
            "input": "Human: How are you today?",
            "output": "Bot: Fine, thank you! And you?",
        }
    ]
}
with open("data.json", "w") as fout:
    json.dump(data_dict, fout)

data_args = DatasetArguments(dataset_path="data.json")
dataset = Dataset(data_args)
new_data_dict = dataset.to_dict()
# `new_data_dict` should have the same content as `data_dict`
```
"""
).lstrip("\n")
FLOAT_ONLY_DATASET_DESCRIPTION = (
"""
"float_only": a dataset with only float instances, with the following format:

    {
        "type": "float_only",
        "instances": [
            { "value": "FLOAT_1" },
            { "value": "FLOAT_2" },
            ...
        ]
    }
"""
).lstrip("\n")
TEXT_ONLY_DATASET_LONG_DESCRITION = (
    TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
)

TEXT2TEXT_DATASET_LONG_DESCRITION = (
    TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
)
DATASET_DESCRIPTION_MAP = {
    "text_only": TEXT_ONLY_DATASET_DESCRIPTION,
    "text2text": TEXT2TEXT_DATASET_DESCRIPTION,
    "float_only": FLOAT_ONLY_DATASET_DESCRIPTION,
}
INSTANCE_FIELDS_MAP = {
    "text_only": ["text"],
    "text2text": ["input", "output"],
    "conversation": ["messages"],  # system, tools and conversation_id are optional
    "paired_conversation": ["chosen", "rejected"],
    "paired_text_to_text": ["prompt", "chosen", "rejected"],
    "float_only": ["value"],
    "image_text": ["images", "text"],
    "text_to_textlist": ["input", "output"],
    "text_to_scored_textlist": ["input", "output"],
}
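# Illustrative sketch (hypothetical helper, not LMFlow API): INSTANCE_FIELDS_MAP
# is the natural lookup table for checking that every instance of a dataset dict
# carries the fields its type requires; the error wording below is an assumption.
def _example_check_instance_fields(data_dict: dict) -> None:
    """Raise ValueError if any instance lacks a required field for its type."""
    dataset_type = data_dict.get("type")
    if dataset_type not in INSTANCE_FIELDS_MAP:
        raise ValueError(
            f"unsupported dataset type: {dataset_type!r};"
            f" supported types: {sorted(INSTANCE_FIELDS_MAP)}"
        )
    required_fields = INSTANCE_FIELDS_MAP[dataset_type]
    for index, instance in enumerate(data_dict.get("instances", [])):
        missing = [field for field in required_fields if field not in instance]
        if missing:
            raise ValueError(
                f"instance {index} of a {dataset_type!r} dataset is missing"
                f" required field(s): {missing}"
            )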
CONVERSATION_ROLE_NAMES = {
    "system": "system",
    "user": "user",
    "assistant": "assistant",
    "function": "function",
    "observation": "observation",
}
# LLAVA constants
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15
LOGDIR = "."
# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
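# Illustrative sketch (an assumption following the usual LLaVA convention, not a
# specific LMFlow function): a prompt containing DEFAULT_IMAGE_TOKEN is tokenized
# chunk by chunk and IMAGE_TOKEN_INDEX is spliced in at each image position, so
# the model can later swap those placeholders for visual features. IGNORE_INDEX
# is what such positions would typically be set to in the labels so the loss
# skips them. `tokenize` stands in for any callable mapping text to token ids.
from typing import Callable, List

def _example_insert_image_tokens(
    prompt: str,
    tokenize: Callable[[str], List[int]],
) -> List[int]:
    """Replace each DEFAULT_IMAGE_TOKEN occurrence with IMAGE_TOKEN_INDEX."""
    input_ids: List[int] = []
    chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
    for i, chunk in enumerate(chunks):
        input_ids.extend(tokenize(chunk))
        if i < len(chunks) - 1:  # an image sat between this chunk and the next
            input_ids.append(IMAGE_TOKEN_INDEX)
    return input_ids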
# LoRA
# NOTE: Be careful when passing lora_target_modules through the arg parser: there
# the value should look like '--lora_target_modules q_proj, v_proj \'; when
# specifying it here, it should be in list format.
LMFLOW_LORA_TARGET_MODULES_MAPPING = {
    'qwen2': ["q_proj", "v_proj"],
    'internlm2': ["wqkv"],
}
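# Illustrative sketch (hypothetical helper, not LMFlow API): the mapping above
# appears to be keyed by model type strings such as 'qwen2', so a caller could
# resolve LoRA target modules like this, falling back to a user-supplied default
# when the architecture is not listed.
def _example_resolve_lora_target_modules(model_type, default=None):
    """Return the LoRA target modules registered for `model_type`, or `default`."""
    modules = LMFLOW_LORA_TARGET_MODULES_MAPPING.get(model_type)
    if modules is None:
        if default is None:
            raise ValueError(
                f"no LoRA target modules registered for model type {model_type!r}"
            )
        return list(default)
    return list(modules)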
# vllm inference
MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
RETURN_CODE_ERROR_BUFFER = [134]
# Return code 134:
# > Fatal Python error: _enter_buffered_busy: could not acquire lock for
# > <_io.BufferedWriter name='<stdout>'> at interpreter shutdown, possibly due to
# > daemon threads
# By our observation, the above error is caused by the kill signal arriving while
# the subprocess still has unfinished stdout/stderr writes.
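# Illustrative sketch (hypothetical helper, an assumption about intended usage
# rather than LMFlow's actual control flow): one way a supervising parent process
# could combine the two constants above, treating a return code listed in
# RETURN_CODE_ERROR_BUFFER as benign shutdown noise as long as the finish flag
# appeared in the captured output.
def _example_subprocess_succeeded(return_code: int, captured_output: str) -> bool:
    """Decide whether a memory-safe vLLM inference subprocess finished its work."""
    if return_code == 0:
        return True
    # A known "noisy shutdown" return code is not a real failure if the child
    # already printed the finish flag.
    if return_code in RETURN_CODE_ERROR_BUFFER:
        return MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG in captured_output
    return False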
MEMORY_SAFE_VLLM_INFERENCE_ENV_VAR_TO_REMOVE = [
    "OMP_NUM_THREADS",
    "LOCAL_RANK",
    "RANK",
    "GROUP_RANK",
    "ROLE_RANK",
    "ROLE_NAME",
    "LOCAL_WORLD_SIZE",
    "WORLD_SIZE",
    "GROUP_WORLD_SIZE",
    "ROLE_WORLD_SIZE",
    "MASTER_ADDR",
    "MASTER_PORT",
    "TORCHELASTIC_RESTART_COUNT",
    "TORCHELASTIC_MAX_RESTARTS",
    "TORCHELASTIC_RUN_ID",
    "TORCHELASTIC_USE_AGENT_STORE",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING",
    "TORCHELASTIC_ERROR_FILE",
]
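# Illustrative sketch (hypothetical helper, not LMFlow API): the list above names
# torchrun/torchelastic-related variables that a distributed launcher injects; one
# plausible use is to strip them from a copy of the environment before spawning
# the standalone vLLM inference subprocess, so the child does not believe it is
# part of the parent's distributed job. The DPOv2 list further below would be used
# the same way.
import os

def _example_env_without(var_names):
    """Return a copy of os.environ with the given variables removed."""
    env = dict(os.environ)
    for name in var_names:
        env.pop(name, None)  # ignore variables that are not set
    return env

# Usage (assumption; "inference.py" is a placeholder script name):
#     import subprocess
#     subprocess.run(
#         ["python", "inference.py"],
#         env=_example_env_without(MEMORY_SAFE_VLLM_INFERENCE_ENV_VAR_TO_REMOVE),
#     )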
# dpov2 align
MEMORY_SAFE_DPOV2_ALIGN_ENV_VAR_TO_REMOVE = [
    "OMP_NUM_THREADS",
    "LOCAL_RANK",
    "RANK",
    "GROUP_RANK",
    "ROLE_RANK",
    "ROLE_NAME",
    "LOCAL_WORLD_SIZE",
    "WORLD_SIZE",
    "GROUP_WORLD_SIZE",
    "ROLE_WORLD_SIZE",
    "MASTER_ADDR",
    "MASTER_PORT",
    "TORCHELASTIC_RESTART_COUNT",
    "TORCHELASTIC_MAX_RESTARTS",
    "TORCHELASTIC_RUN_ID",
    "TORCHELASTIC_USE_AGENT_STORE",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING",
    "TORCHELASTIC_ERROR_FILE",
]