# Make sure vllm and transformers are installed:
# pip install vllm transformers torch huggingface_hub
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest  # Needed for LoRA
from transformers import AutoTokenizer
import torch
import os

# --- Configuration ---
BASE_MODEL_ID = "Qwen/Qwen1.5-0.5B-Chat"
ADAPTER_LOCAL_PATH = "/workspace/train3/qwen_qsh_finetuned_adapter"  # Your LoRA adapter path

# Check that the adapter path exists
if not os.path.exists(ADAPTER_LOCAL_PATH) or not os.path.exists(os.path.join(ADAPTER_LOCAL_PATH, "adapter_config.json")):
    print(f"ERROR: Adapter path {ADAPTER_LOCAL_PATH} or its adapter_config.json does not exist!")
    print("Please ensure the path is correct and contains the LoRA adapter files.")
    # exit()  # You might want to exit if the path is critical and missing

model_configs = [
    {
        "id": "Qwen/Qwen1.5-0.5B-Chat (Base)",  # Clarified ID
        "path": BASE_MODEL_ID,
        "is_adapter_case": False,
    },
    {
        "id": "Qwen/Qwen1.5-0.5B-Chat + QinShiHuang LoRA",  # Descriptive ID
        "is_adapter_case": True,
        "base_model_path": BASE_MODEL_ID,
        "adapter_local_path": ADAPTER_LOCAL_PATH,
        "lora_name": "qsh_lora",  # Arbitrary name for the vLLM LoRARequest
        "lora_id": 1,             # Unique integer ID for this LoRA
        "lora_rank": 8            # From your training script (r=8)
    }
]

all_messages_dicts = [
    [{"role": "user", "content": "你是谁?"}],        # "Who are you?"
    [{"role": "user", "content": "你几岁了?"}],      # "How old are you?"
    [{"role": "user", "content": "介绍你自己?"}],    # "Introduce yourself?"
    [{"role": "user", "content": "云是如何形成的"}],  # "How are clouds formed?"
    [{"role": "user", "content": "什么是云"}],        # "What is a cloud?"
    [{"role": "user", "content": "你出生在哪里"}],    # "Where were you born?"
]

# vLLM sampling parameters
# Increase max_tokens if answers are truncated
sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=300)

# --- Main Loop ---
for config in model_configs:
    model_id_str = config["id"]
    is_adapter_case = config["is_adapter_case"]
    print(f"\n--- Processing model configuration: {model_id_str} ---")

    tokenizer = None
    llm = None
    try:
        # 1. Load Tokenizer
        # For LoRA, we use the base model's tokenizer.
        # The adapter directory also contains a tokenizer.json, but it is generally safer
        # to rely on the base model's original tokenizer unless explicitly stated otherwise.
        tokenizer_path = config["base_model_path"] if is_adapter_case else config["path"]
        print(f"Loading tokenizer from: {tokenizer_path}")
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            print(f"Set tokenizer.pad_token to tokenizer.eos_token: {tokenizer.eos_token}")

        # 2. Load LLM Model
        if is_adapter_case:
            base_model_for_lora = config["base_model_path"]
            adapter_path_for_lora = config["adapter_local_path"]
            lora_rank_for_lora = config["lora_rank"]
            print(f"Loading base model {base_model_for_lora} with LoRA support for adapter at {adapter_path_for_lora}")

            if not os.path.exists(adapter_path_for_lora) or not os.path.exists(os.path.join(adapter_path_for_lora, "adapter_config.json")):
                print(f"ERROR: LoRA adapter path {adapter_path_for_lora} or its adapter_config.json is missing. Skipping this configuration.")
                continue

            llm = LLM(
                model=base_model_for_lora,
                trust_remote_code=True,
                dtype="half",                        # For Tesla T4 compatibility
                enable_lora=True,
                max_loras=1,                         # Max number of LoRAs that can be active (we use 1 here)
                max_lora_rank=lora_rank_for_lora,    # Max rank of LoRAs (8 in this case)
                # vLLM usually infers the LoRA target modules from adapter_config.json.
                # If you face issues, you might add:
                # lora_extra_vocab_size=0,  # Adjust if the LoRA added new tokens
                # max_num_seqs=256          # Adjust based on batch size and memory
            )
            # The LoRA weights themselves are specified per-request using LoRARequest
            print("Base model for LoRA loaded. Adapter will be specified in the generate() call.")
        else:
            # Loading a full model (no LoRA)
            full_model_path = config["path"]
            print(f"Loading full model from: {full_model_path}")
            llm = LLM(
                model=full_model_path,
                trust_remote_code=True,
                dtype="half"  # For Tesla T4 compatibility
            )
        print(f"--- Model for '{model_id_str}' loaded successfully. ---")

        # 3. Prepare Prompts
        prompts_for_current_model = []
        original_questions_for_prompts = []
        for messages_dict in all_messages_dicts:
            try:
                # Ensure the conversation format matches what the tokenizer expects.
                # For Qwen, it is a list of dicts: [{"role": "user", "content": "..."}]
                prompt_str = tokenizer.apply_chat_template(
                    messages_dict,
                    tokenize=False,
                    add_generation_prompt=True  # Crucial for chat models to generate a response
                )
                prompts_for_current_model.append(prompt_str)
                original_questions_for_prompts.append(messages_dict[0]["content"])
            except Exception as e:
                print(f"Error applying chat template for model '{model_id_str}' with messages {messages_dict}: {e}")
                print("Ensure your `messages_dict` is a list of dictionaries, e.g., [{'role': 'user', 'content': '...'}]")

        if not prompts_for_current_model:
            print(f"No valid prompts generated for model '{model_id_str}'. Skipping generation.")
            continue

        # 4. Generate Responses
        print(f"Generating {len(prompts_for_current_model)} responses with '{model_id_str}'...")
        if is_adapter_case:
            # For LoRA, create a LoRARequest object.
            # This tells vLLM to use the specified adapter for these requests.
            lora_request = LoRARequest(
                config["lora_name"],           # Adapter name (an identifier you choose)
                config["lora_id"],             # Unique int ID for this LoRA
                config["adapter_local_path"]   # Path to the adapter files
            )
            outputs = llm.generate(
                prompts_for_current_model,
                sampling_params,
                lora_request=lora_request  # Pass the LoRA request here
            )
        else:
            outputs = llm.generate(prompts_for_current_model, sampling_params)

        # 5. Print Results
        for i, output in enumerate(outputs):
            generated_text = output.outputs[0].text
            original_question = original_questions_for_prompts[i]
            print(f"\nModel: {model_id_str}")
            # print(f"  Raw Prompt Sent: {output.prompt}")  # For debugging prompts
            print(f"  Q ({original_question}):")
            print(f"  A: {generated_text.strip()}")

    except Exception as e:
        print(f"ERROR processing model configuration '{model_id_str}': {e}")
        import traceback
        traceback.print_exc()  # Print full traceback for debugging
        print("Skipping this model configuration due to error.")
    finally:
        # Cleanup to free GPU memory
        if llm is not None:
            # For vLLM, explicit deletion of the LLM object should trigger resource cleanup.
            # If you are using an older vLLM or see memory issues, look into specific
            # engine-shutdown calls if available, but `del` is the standard approach.
            del llm
            llm = None
        if tokenizer is not None:
            del tokenizer
            tokenizer = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print(f"--- Finished processing and cleaned up for: {model_id_str} ---")

print("\nAll model configurations processed.")
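
# --- Optional sanity check (a minimal sketch, not part of the comparison loop above) ---
# The script hardcodes "lora_rank": 8 to match the training run. If you are unsure of the
# rank, the adapter's own adapter_config.json (written by PEFT at training time) records
# the rank ("r"), lora_alpha, and target modules; max_lora_rank passed to vLLM must be at
# least that "r" value. This block only reads and prints that file.
#
# import json
#
# adapter_cfg_file = os.path.join(ADAPTER_LOCAL_PATH, "adapter_config.json")
# if os.path.exists(adapter_cfg_file):
#     with open(adapter_cfg_file, "r", encoding="utf-8") as f:
#         adapter_cfg = json.load(f)
#     print(f"Adapter rank (r): {adapter_cfg.get('r')}")
#     print(f"LoRA alpha: {adapter_cfg.get('lora_alpha')}")
#     print(f"Target modules: {adapter_cfg.get('target_modules')}")
#     print(f"Base model recorded by PEFT: {adapter_cfg.get('base_model_name_or_path')}")
# else:
#     print(f"adapter_config.json not found at {adapter_cfg_file}; skipping sanity check.")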