LoRA fine-tuning
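
The script below expects qin_shi_huang_dataset.jsonl, a JSONL file where each line holds one "messages" conversation. As a reference, here is a minimal sketch that writes a couple of illustrative records in that format (the questions and answers are placeholders; substitute your own role-play data):

import json

# Illustrative records only; replace them with your own Qin Shi Huang role-play data.
sample_records = [
    {"messages": [
        {"role": "user", "content": "你是谁?"},
        {"role": "assistant", "content": "朕乃秦始皇。"},
    ]},
    {"messages": [
        {"role": "user", "content": "你出生在哪里"},
        {"role": "assistant", "content": "朕生于赵国邯郸。"},
    ]},
]

with open("qin_shi_huang_dataset.jsonl", "w", encoding="utf-8") as f:
    for record in sample_records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

With the dataset file in place, the fine-tuning script itself: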

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    # BitsAndBytesConfig # For QLoRA
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 
# 1. Define model and dataset paths
model_id = "Qwen/Qwen1.5-0.5B-Chat" # or Qwen/Qwen1.5-0.5B if you prefer to start from the base (non-chat) model
dataset_file = "qin_shi_huang_dataset.jsonl" # the dataset file you created
output_dir = "./qwen_qsh_finetuned_adapter" # where the LoRA adapter will be saved
 
# 2. Load the dataset
# The Hugging Face datasets library can load a jsonl file directly, as long as the format matches what it expects,
# or you can parse the jsonl yourself and build a Dataset object.
# Here we assume each jsonl line is {"messages": [{"role":"user", "content":"..."}, {"role":"assistant", "content":"..."}]}.
# That has to be converted into the format the model is trained on, usually a single concatenated string,
# e.g. "<s>[INST] User Question [/INST] Assistant Answer </s>" for Llama-style models.
# Qwen1.5 uses a different chat template, so check the official docs or the tokenizer's chat_template.

# Simplified alternative: convert the data beforehand into a jsonl with a "text" field,
# e.g. {"text": "<|im_start|>user\n你是谁?<|im_end|>\n<|im_start|>assistant\n朕乃秦始皇。<|im_end|>"},
# in which case you apply the chat template in your own preprocessing step.
# dataset = load_dataset("json", data_files=dataset_file, split="train")
 
# --- More standard chat-format preprocessing ---
raw_dataset = load_dataset("json", data_files=dataset_file, split="train")
 
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token # use the EOS token for padding
 
def apply_chat_template(examples):
    # The Qwen1.5 template looks like <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user...
    # For this specific role-play setup you can omit the system prompt or customize it,
    # or simply rely on tokenizer.apply_chat_template.
    processed_texts = []
    for conversation in examples["messages"]:
        # conversation must be a list of dicts:
        # [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
        try:
            # Note: the Qwen1.5 tokenizer may not ship a default chat_template,
            # or the prompt may need to be built manually. Check the Qwen1.5 docs for the exact template.
            # The generic apply_chat_template call below may need adjusting.
            formatted_text = tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False # training samples should not end with a generation prompt
            )
            processed_texts.append(formatted_text + tokenizer.eos_token) # append the EOS token
        except Exception as e:
            print(f"Error processing conversation: {conversation} with error: {e}")
            processed_texts.append("") # 或者跳过
 
    return {"text": processed_texts}
 
dataset = raw_dataset.map(apply_chat_template, batched=True, remove_columns=raw_dataset.column_names)
 
# Filter out the empty strings left by failed conversions
dataset = dataset.filter(lambda example: example['text'] != "")
 
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512) # adjust max_length as needed
 
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print(tokenized_dataset)
# --- End of dataset preprocessing ---
 
 
# 3. QLoRA configuration (optional; recommended when GPU memory is limited)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="bfloat16" # torch.bfloat16
# )
 
# 4. Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config, # enable the QLoRA config
    device_map={"":0}, # put the whole model on GPU 0; use device_map="auto" for automatic placement
    trust_remote_code=True
)
model = prepare_model_for_kbit_training(model) # prepare the model for k-bit training (only strictly needed when a quantization_config is applied)
 
# 5. LoRA configuration
# In Qwen1.5, the modules that usually take LoRA are 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj';
# check the model structure or the official recommendations to confirm.
lora_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # 根据Qwen1.5的层名调整
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
 
 
# 6. Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1, # adjust to fit your GPU memory
    gradient_accumulation_steps=1,  # effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    learning_rate=5e-5,
    num_train_epochs=30, # deliberately high for a tiny dataset; a few epochs may already be enough, so lower this if it overfits
    logging_steps=10,
    save_steps=50, # save the adapter every 50 steps
    # fp16=True, # enable if your GPU supports it and you are not loading in 4-bit
    # bf16=True, # recommended on GPUs that support it (e.g. A100, H100)
    # optim="paged_adamw_8bit", # use together with QLoRA
    # report_to="tensorboard", # optional, for visualizing the training run
)
 
# 7. Data collator
# For causal-LM text generation, DataCollatorForLanguageModeling with mlm=False handles padding and label masking
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
 
# 8. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
 
# 9. Start training
print("Starting fine-tuning...")
trainer.train()
 
# 10. Save the LoRA adapter
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir) # also save the tokenizer config
print(f"Fine-tuning finished; the LoRA adapter is saved at {output_dir}")

vLLM testing

# Make sure vllm and transformers are installed
# pip install vllm transformers torch huggingface_hub
 
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest # Needed for LoRA
from transformers import AutoTokenizer
import torch
import os
 
# --- Configuration ---
BASE_MODEL_ID = "Qwen/Qwen1.5-0.5B-Chat"
ADAPTER_LOCAL_PATH = "/workspace/train3/qwen_qsh_finetuned_adapter" # Your LoRA adapter path
 
# Check if the adapter path exists
if not os.path.exists(ADAPTER_LOCAL_PATH) or not os.path.exists(os.path.join(ADAPTER_LOCAL_PATH, "adapter_config.json")):
    print(f"ERROR: Adapter path {ADAPTER_LOCAL_PATH} or its adapter_config.json does not exist!")
    print("Please ensure the path is correct and contains the LoRA adapter files.")
    # exit() # You might want to exit if the path is critical and missing
 
model_configs = [
    {
        "id": "Qwen/Qwen1.5-0.5B-Chat (Base)", # Clarified ID
        "path": BASE_MODEL_ID,
        "is_adapter_case": False,
    },
    {
        "id": "Qwen/Qwen1.5-0.5B-Chat + QinShiHuang LoRA", # Descriptive ID
        "is_adapter_case": True,
        "base_model_path": BASE_MODEL_ID,
        "adapter_local_path": ADAPTER_LOCAL_PATH,
        "lora_name": "qsh_lora", # Arbitrary name for vLLM LoRARequest
        "lora_id": 1,            # Unique integer ID for this LoRA
        "lora_rank": 8           # From your training script (r=8)
    }
]
 
all_messages_dicts = [
    [{"role": "user", "content": "你是谁?"}],
    [{"role": "user", "content": "你几岁了?"}],
    [{"role": "user", "content": "介绍你自己?"}],
    [{"role": "user", "content": "云是如何形成的"}],
    [{"role": "user", "content": "什么是云"}],
    [{"role": "user", "content": "你出生在哪里"}],
]
 
# vLLM Sampling Parameters
# Increase max_tokens if answers are truncated
sampling_params = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=300)
 
# --- Main Loop ---
for config in model_configs:
    model_id_str = config["id"]
    is_adapter_case = config["is_adapter_case"]
    
    print(f"\n--- Processing model configuration: {model_id_str} ---")
 
    tokenizer = None
    llm = None
 
    try:
        # 1. Load Tokenizer
        # For LoRA, we use the base model's tokenizer.
        # The adapter directory also contains a tokenizer.json, but it's generally safer
        # to rely on the base model's original tokenizer unless explicitly stated otherwise.
        tokenizer_path = config["base_model_path"] if is_adapter_case else config["path"]
        print(f"Loading tokenizer from: {tokenizer_path}")
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            print(f"Set tokenizer.pad_token to tokenizer.eos_token: {tokenizer.eos_token}")
 
 
        # 2. Load LLM Model
        if is_adapter_case:
            base_model_for_lora = config["base_model_path"]
            adapter_path_for_lora = config["adapter_local_path"]
            lora_rank_for_lora = config["lora_rank"]
            print(f"Loading base model {base_model_for_lora} with LoRA support for adapter at {adapter_path_for_lora}")
            
            if not os.path.exists(adapter_path_for_lora) or not os.path.exists(os.path.join(adapter_path_for_lora, "adapter_config.json")):
                print(f"ERROR: LoRA adapter path {adapter_path_for_lora} or its adapter_config.json is missing. Skipping this configuration.")
                continue
 
            llm = LLM(
                model=base_model_for_lora,
                trust_remote_code=True,
                dtype="half",  # For Tesla T4 compatibility
                enable_lora=True,
                max_loras=1,  # Max number of LoRAs that can be active (we use 1 here)
                max_lora_rank=lora_rank_for_lora, # Max rank of LoRAs (8 in your case)
                # You can also specify `lora_target_modules` if needed, but vLLM often infers this
                # from adapter_config.json. If you face issues, you might add:
                # lora_extra_vocab_size=0, # Default, adjust if LoRA added tokens
                # max_num_seqs=256 # Default, adjust based on batch size and memory
            )
            # The LoRA weights themselves are specified per-request using LoRARequest
            print(f"Base model for LoRA loaded. Adapter will be specified in generate() call.")
        else:
            # Loading a full model (not for LoRA)
            full_model_path = config["path"]
            print(f"Loading full model from: {full_model_path}")
            llm = LLM(
                model=full_model_path,
                trust_remote_code=True,
                dtype="half"  # For Tesla T4 compatibility
            )
        
        print(f"--- Model for '{model_id_str}' loaded successfully. ---")
 
        # 3. Prepare Prompts
        prompts_for_current_model = []
        original_questions_for_prompts = []
 
        for messages_dict in all_messages_dicts:
            try:
                # Ensure the conversation format matches what the tokenizer expects
                # For Qwen, it's usually a list of dicts: [{"role": "user", "content": "..."}]
                prompt_str = tokenizer.apply_chat_template(
                    messages_dict,
                    tokenize=False,
                    add_generation_prompt=True # Crucial for chat models to generate a response
                )
                prompts_for_current_model.append(prompt_str)
                original_questions_for_prompts.append(messages_dict[0]['content'])
            except Exception as e:
                print(f"Error applying chat template for model '{model_id_str}' with messages {messages_dict}: {e}")
                print("Ensure your `messages_dict` is a list of dictionaries, e.g., [{'role': 'user', 'content': '...'}]")
 
        if not prompts_for_current_model:
            print(f"No valid prompts generated for model '{model_id_str}'. Skipping generation.")
            continue
 
        # 4. Generate Responses
        print(f"Generating {len(prompts_for_current_model)} responses with '{model_id_str}'...")
        
        if is_adapter_case:
            # For LoRA, create a LoRARequest object
            # This tells vLLM to use the specified adapter for these requests.
            lora_request = LoRARequest(
                lora_name=config["lora_name"],      # Must match a name if pre-loaded, or just an identifier
                lora_int_id=config["lora_id"],      # Unique int ID for this LoRA
                lora_local_path=config["adapter_local_path"] # Path to the adapter files
            )
            outputs = llm.generate(
                prompts_for_current_model,
                sampling_params,
                lora_request=lora_request # Pass the LoRA request here
            )
        else:
            outputs = llm.generate(prompts_for_current_model, sampling_params)
            
        # 5. Print Results
        for i, output in enumerate(outputs):
            generated_text = output.outputs[0].text
            original_question = original_questions_for_prompts[i]
            
            print(f"\nModel: {model_id_str}")
            # print(f"  Raw Prompt Sent: {output.prompt}") # For debugging prompts
            print(f"  Q ({original_question}):")
            print(f"  A: {generated_text.strip()}")
 
    except Exception as e:
        print(f"ERROR processing model configuration '{model_id_str}': {e}")
        import traceback
        traceback.print_exc() # Print full traceback for debugging
        print(f"Skipping this model configuration due to error.")
        
    finally:
        # Cleanup to free GPU memory
        if llm is not None:
            # For vLLM, explicit deletion of the LLM object should trigger resource cleanup.
            # If you are using an older vLLM or see memory issues, you might need to look into
            # specific vLLM API calls for engine shutdown if available, but `del` is standard.
            del llm 
            llm = None
        if tokenizer is not None:
            del tokenizer
            tokenizer = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print(f"--- Finished processing and cleaned up for: {model_id_str} ---")
 
print("\nAll model configurations processed.")

vLLM server deployment

The command below assumes BASE_MODEL is Qwen/Qwen1.5-0.5B-Chat, LORA_MODULE_NAME is the name clients will request as the model (qsh_lora below), and ADAPTER_PATH points to the directory containing the saved adapter (e.g. /workspace/train3/qwen_qsh_finetuned_adapter):

python -m vllm.entrypoints.openai.api_server --model "${BASE_MODEL}" --trust-remote-code --dtype half --enable-lora --lora-modules "${LORA_MODULE_NAME}=${ADAPTER_PATH}" --max-lora-rank 8 --port 8000 --host 0.0.0.0

OpenAI API usage

from openai import OpenAI
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

completion = client.chat.completions.create(
#   model="Qwen/Qwen1.5-0.5B-Chat",
  model="qsh_lora",
  messages=[
    {"role": "user", "content": "你是谁!"}
  ]
)

print(completion.choices[0].message)
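
To confirm that both the base model and the LoRA module are being served, you can list the model IDs the server exposes (a quick check reusing the client above):

for m in client.models.list():
    print(m.id)  # should include the base model and "qsh_lora"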