|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import torch
|
| import random
|
| import json
|
| from torch.utils.data import IterableDataset, DataLoader
|
| from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
|
| from datasets import load_dataset
|
|
|
|
|
|
|
|
|
# Path to the locally initialized model checkpoint to fine-tune.
MODEL_ID = "./models/ternary_3b_init"



# Hugging Face dataset id streamed as the general-domain corpus.
GENERAL_DATA_LINK = "monology/pile-uncopyrighted"



# JSONL file with client-specific {"question": ..., "answer": ...} records.
CLIENT_DATA_FILE = "cultural_finetune.jsonl"



# Probability that any yielded sample comes from the client file
# (the rest come from the streamed general dataset).
MIX_RATIO = 0.3

# Sequences per optimizer step.
BATCH_SIZE = 4

LEARNING_RATE = 1e-5

# Every sample is truncated/padded to this many tokens.
MAX_LENGTH = 512
|
|
|
|
|
|
|
|
|
class CMSDataMixer(IterableDataset):
    """Infinite iterable dataset that interleaves two text sources.

    With probability ``MIX_RATIO`` a sample is drawn (with replacement) from
    the client JSONL file, formatted as "Question: ...\\nAnswer: ...";
    otherwise the next record is taken from the streamed general dataset.
    Each yielded item is tokenized to a fixed length of ``MAX_LENGTH``.

    Yields:
        dict with 1-D tensors ``input_ids``, ``attention_mask`` and
        ``labels``; padding positions in ``labels`` are set to -100 so the
        cross-entropy loss ignores them.
    """

    def __init__(self, tokenizer, client_file, pile_link):
        self.tokenizer = tokenizer

        print(f">>> Подключаюсь к общему датасету: {pile_link}")
        # Streaming mode: records are fetched lazily, nothing is downloaded upfront.
        self.pile_stream = load_dataset(pile_link, split="train", streaming=True)

        print(f">>> Загружаю культурный код клиента из: {client_file}")
        # Skip blank lines so a trailing newline in the JSONL file does not
        # crash json.loads.
        with open(client_file, 'r', encoding='utf-8') as f:
            self.cultural_data = [json.loads(line) for line in f if line.strip()]

        print(f">>> Миксер готов: {MIX_RATIO*100}% данных будет из {client_file}")

    def __iter__(self):
        pile_iterator = iter(self.pile_stream)

        while True:
            if random.random() < MIX_RATIO:
                # Client sample: expects 'question' and 'answer' keys per record.
                sample = random.choice(self.cultural_data)
                text = f"Question: {sample['question']}\nAnswer: {sample['answer']}"
            else:
                try:
                    sample = next(pile_iterator)
                    text = sample['text']
                except StopIteration:
                    # Stream exhausted — restart it and try again.
                    pile_iterator = iter(self.pile_stream)
                    continue

            tokens = self.tokenizer(
                text,
                truncation=True,
                max_length=MAX_LENGTH,
                padding="max_length",
                return_tensors="pt"
            )

            input_ids = tokens["input_ids"].squeeze(0)
            attention_mask = tokens["attention_mask"].squeeze(0)

            # FIX: the original used input_ids directly as labels, so the loss
            # was also computed on padding tokens. -100 is the ignore_index of
            # the causal-LM cross-entropy loss.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            yield {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            }
|
|
|
|
|
|
|
|
|
def run_training():
    """Fine-tune the causal LM on the mixed (general + client) data stream.

    Loads tokenizer and model from ``MODEL_ID``, builds the streaming mixed
    dataset, and runs a plain AdamW training loop, logging loss every 50
    steps and checkpointing every 500 steps. Runs indefinitely (the dataset
    is an infinite iterator) until interrupted.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f">>> Использую устройство: {device}")

    # BUG FIX: the original referenced undefined MODEL_NAME (NameError);
    # the constant defined at module level is MODEL_ID.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if tokenizer.pad_token is None:
        # Causal-LM tokenizers often ship without a pad token; reuse EOS.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16
    ).to(device)

    train_dataset = CMSDataMixer(tokenizer, CLIENT_DATA_FILE, GENERAL_DATA_LINK)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

    # Use torch.optim.AdamW: transformers.AdamW is deprecated and removed in
    # recent transformers releases; the torch implementation is the drop-in
    # replacement.
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    print(">>> Запуск Fine-tuning...")
    model.train()

    for step, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 50 == 0:
            print(f"Шаг {step} | Текущая ошибка (Loss): {loss.item():.4f}")

        if step % 500 == 0 and step > 0:
            model.save_pretrained(f"./checkpoint_step_{step}")
            print(f">>> Чекпоинт сохранен на шаге {step}")


if __name__ == "__main__":
    run_training()