from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, PeftModel

# Path to the raw lyrics file (left blank in the original; fill in before running).
raw_data_path = ""

with open(raw_data_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()


def process_line(line):
    # Split the stripped line on "/" and drop the final segment, re-joining the rest;
    # if there is only one segment, keep the whole stripped line.
    segments = line.strip().split("/")
    return "/".join(segments[:-1]) if len(segments) > 1 else line.strip()


processed_samples = [process_line(line) for line in raw_lines if line.strip()]
dataset = Dataset.from_dict({"text": processed_samples})
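
# Optional sanity check (an added suggestion, not in the original): look at one
# processed sample to confirm the trailing "/" segment was dropped as intended.
print(dataset[0]["text"])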

# Base model name or path (left blank in the original; fill in before running).
model_name = ""
tokenizer = AutoTokenizer.from_pretrained(model_name)
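
# Assumption (not in the original): many causal-LM tokenizers ship without a pad
# token, and the padding="max_length" call below fails in that case. Reusing the
# EOS token as padding is a common workaround.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token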

model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA configuration: rank-8 adapters on the attention query/value projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
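
# Optional sanity check (an added suggestion, not in the original): report how many
# parameters the LoRA wrapper leaves trainable versus the frozen base model.
model.print_trainable_parameters()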


def tokenize_function(examples):
    # Instruction prefix (Chinese), roughly: "Generate a song lyric from the following
    # keywords; the lyric contains several sentences separated by '/'. Let's think step
    # by step (the reasoning is wrapped between <think> and </think>):"
    prompt = "根据以下关键词生成一首歌词,歌词中包含多个句子,句子与句子之间使用/隔开,让我们一步一步的思考(思考过程包含在<think>和</think>之间):"
    modified_texts = [prompt + text for text in examples["text"]]
    tokenized = tokenizer(modified_texts, truncation=True, padding="max_length", max_length=256)
    # For causal-LM fine-tuning, the labels are a copy of the input ids.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


tokenized_dataset = dataset.map(tokenize_function, batched=True)
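
# Note (an added sketch, not the author's setup): because "labels" above is a straight
# copy of the padded input_ids, the loss is also computed on pad positions. An
# alternative is to let a collator build the labels and mask padding with -100,
# dropping the manual copy and passing data_collator=data_collator to the Trainer below.
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)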

training_args = TrainingArguments(
    output_dir="./lora",
    num_train_epochs=8,
    per_device_train_batch_size=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10000,
    save_steps=15000,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

generation_config = {
    "max_new_tokens": 1024,
    "temperature": 1.0,
    "top_p": 0.9,
    "top_k": 40,
    "repetition_penalty": 1.2,
    "do_sample": True,
    # The original used "encoder_no_repeat_ngram_size", which generate() only supports
    # for encoder-decoder models; for a decoder-only causal LM the closest option is
    # no_repeat_ngram_size, which blocks repeated n-grams in the generated text.
    "no_repeat_ngram_size": 4,
}

# Quick generation check with the fine-tuned model.
if True:
    # Same instruction prefix as in training, followed by the keywords
    # "温柔,轮廓,洒脱" (roughly: "gentle, silhouette, free and easy").
    prompt = "根据以下关键词生成一首歌词,歌词中包含多个句子,句子与句子之间使用/隔开,让我们一步一步的思考(思考过程包含在<think>和</think>之间):温柔,轮廓,洒脱:"
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(input_ids, **generation_config)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print(decoded)

# Adapter output path (left blank in the original; fill in before running).
model.save_pretrained("")
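
# Sketch (not part of the original script; adapter_dir is a hypothetical placeholder
# for wherever the adapter was saved above): re-attach the trained LoRA adapter to a
# freshly loaded base model with the PeftModel class imported at the top.
adapter_dir = "./lora-adapter"  # hypothetical path; point this at the real save location
reloaded_base = AutoModelForCausalLM.from_pretrained(model_name)
reloaded_model = PeftModel.from_pretrained(reloaded_base, adapter_dir)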