lmflow.pipeline.vllm_inferencer#
Attributes#
Classes#

- VLLMInferencer
- MemorySafeVLLMInferencer: Run VLLM inference in a subprocess for memory safety.
Module Contents#
- class lmflow.pipeline.vllm_inferencer.VLLMInferencer(model_args: lmflow.args.ModelArguments, data_args: lmflow.args.DatasetArguments, inferencer_args: lmflow.args.InferencerArguments)[source]#
Bases: lmflow.pipeline.base_pipeline.BasePipeline

- _parse_args_to_sampling_params(inference_args: lmflow.args.InferencerArguments) → dict[source]#
- inference(model: lmflow.models.hf_decoder_model.HFDecoderModel, dataset: lmflow.datasets.Dataset, release_gpu: bool = False, inference_args: lmflow.args.InferencerArguments | None = None) → lmflow.utils.protocol.DataProto[source]#
- save_inference_results(outputs: lmflow.utils.protocol.DataProto, inference_results_path: str)[source]#
- load_inference_results(inference_results_path: str) → lmflow.utils.protocol.DataProto[source]#
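The `_parse_args_to_sampling_params` helper suggests that inferencer arguments are flattened into a plain dict of sampling parameters before being handed to vLLM. A minimal sketch of that pattern, assuming hypothetical field names (`temperature`, `top_p`, `max_new_tokens` are illustrative, not lmflow's actual `InferencerArguments` attributes):

```python
# Hypothetical sketch of mapping an arguments dataclass to a
# sampling-parameter dict, in the spirit of _parse_args_to_sampling_params.
# Field names here are assumptions for illustration only.
from dataclasses import dataclass, asdict


@dataclass
class ToyInferencerArguments:
    temperature: float = 1.0
    top_p: float = 1.0
    max_new_tokens: int = 128
    random_seed: int = 42  # not a sampling knob; filtered out below


# Only these keys are meaningful to the (assumed) sampling backend.
SAMPLING_KEYS = {"temperature", "top_p", "max_new_tokens"}


def parse_args_to_sampling_params(args: ToyInferencerArguments) -> dict:
    """Keep only the fields a sampling backend understands."""
    return {k: v for k, v in asdict(args).items() if k in SAMPLING_KEYS}


params = parse_args_to_sampling_params(ToyInferencerArguments(temperature=0.7))
print(params)  # {'temperature': 0.7, 'top_p': 1.0, 'max_new_tokens': 128}
```

The filtering step matters because argument objects typically carry many fields (paths, seeds, device flags) that a sampling backend would reject as unknown keyword arguments.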
- class lmflow.pipeline.vllm_inferencer.MemorySafeVLLMInferencer(model_args: lmflow.args.ModelArguments, data_args: lmflow.args.DatasetArguments, inferencer_args: lmflow.args.InferencerArguments)[source]#
Bases: VLLMInferencer

Run VLLM inference in a subprocess for memory safety.
This is a workaround: vLLM cannot reliably release GPU memory when run in-process. See: https://github.com/vllm-project/vllm/issues/1908
- inference() → lmflow.utils.protocol.DataProto[source]#
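The subprocess workaround can be sketched generically: run the memory-hungry step in a child Python process, so the OS reclaims all of its memory when the process exits, and hand results back through a file. The following self-contained sketch illustrates the mechanism only; the pickle-file handoff and the toy workload are assumptions, not lmflow's actual implementation, and no vLLM or GPU is involved:

```python
# Sketch of the "run in a subprocess for memory safety" pattern: the child
# process does the work and pickles its results to a file; when it exits,
# the OS reclaims all of its memory, which an in-process engine cannot
# guarantee. The file-based pickle handoff is an illustrative assumption.
import pickle
import subprocess
import sys
import tempfile

CHILD_SCRIPT = """
import pickle, sys
# Stand-in for the expensive inference step.
results = [prompt.upper() for prompt in ["hello", "world"]]
with open(sys.argv[1], "wb") as f:
    pickle.dump(results, f)
"""


def memory_safe_run() -> list:
    with tempfile.NamedTemporaryFile(suffix=".pkl") as tmp:
        # All memory the child allocated is released on process exit.
        subprocess.run([sys.executable, "-c", CHILD_SCRIPT, tmp.name], check=True)
        with open(tmp.name, "rb") as f:
            return pickle.load(f)


print(memory_safe_run())  # ['HELLO', 'WORLD']
```

The `check=True` flag makes the parent fail loudly if the child crashes, rather than silently loading a stale or missing results file.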