|
|
|
|
|
"""quic_start.ipynb |
|
|
|
|
|
Automatically generated by Colab. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/1fJ_-FvN0auPakPWWqX6j6_H6i4k5OCY_ |
|
|
""" |
|
|
|
|
|
import os |
|
|
os.environ["CUDA_VISIBLE_DEVICES"] = '3' |
|
|
|
|
|
"""# Установка и импорт""" |
|
|
|
|
|
!python3.10 -m pip install transformers datasets accelerate peft bitsandbytes sentencepiece --quiet |
|
|
|
|
|
import json |
|
|
import os |
|
|
from datasets import Dataset, load_from_disk |
|
|
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
|
|
from peft import LoraConfig, get_peft_model, PeftModel |
|
|
import torch |
|
|
|
|
|
print("Torch version:", torch.__version__) |
|
|
print("Cuda available:", torch.cuda.is_available()) |
|
|
|
|
|
"""# Загрузка данных""" |
|
|
|
|
|
train_path = "train.jsonl" |
|
|
val_path = None |
|
|
|
|
|
def load_jsonl(path):
    """Read a JSONL file, skipping blank and malformed lines."""
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Skip malformed lines explicitly instead of a bare except
                # that would also swallow unrelated errors.
                continue
    return records
|
|
|
|
|
train_data_raw = load_jsonl(train_path) |
|
|
|
|
|
|
|
|
len(train_data_raw), train_data_raw[:2] |
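
# The fields "prompt" and "response" are exactly what make_dataset() expects
# further down. A minimal sketch of producing such a file, using a
# hypothetical throwaway path:
sample_path = "sample_train.jsonl"
with open(sample_path, "w", encoding="utf-8") as f:
    f.write(json.dumps({"prompt": "What is gradient descent?",
                        "response": "An iterative method that follows the negative gradient."},
                       ensure_ascii=False) + "\n")
print(load_jsonl(sample_path))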
|
|
|
|
|
"""# Создание датасета и токенизация |
|
|
|
|
|
| Model | HF name to load | Params | License | Strengths | Weaknesses | Languages |
| -------------------------- | ------------------------------------------ | --------- | ---------- | ------------------------------------------------------- | -------------------------------- | ------------------ |
| **Mistral-7B-Instruct** | `mistralai/Mistral-7B-Instruct-v0.1` | 7.3B | Apache 2.0 | Excellent quality, fast inference, strong reasoning | mediocre multilingual | EN + basic multi |
| **Mistral-7B** | `mistralai/Mistral-7B-v0.1` | 7.3B | Apache 2.0 | Good pretrain baseline | weaker than the Instruct variant in dialogue | EN |
| **Mixtral 8x7B Instruct** | `mistralai/Mixtral-8x7B-Instruct-v0.1` | MoE | Apache 2.0 | Very strong reasoning/code | harder to deploy | EN + multi |
| **LLaMA-2-7B-Chat** | `meta-llama/Llama-2-7b-chat-hf` | 7B | Custom | Good balance of quality and convenience | behind Mistral | EN |
| **LLaMA-2-7B** | `meta-llama/Llama-2-7b-hf` | 7B | Custom | Good pretrain | weak dialogue without tuning | EN |
| **Falcon-7B-Instruct** | `tiiuae/falcon-7b-instruct` | 7B | Apache 2.0 | Strong English dialogue | weaker reasoning than Mistral | EN |
| **Falcon-7B** | `tiiuae/falcon-7b` | 7B | Apache 2.0 | Good generator | weaker than the Instruct variant | EN |
| **MPT-7B-Instruct** | `mosaicml/mpt-7b-instruct` | 7B | Apache 2.0 | optimized for production | behind Mistral | EN |
| **MPT-7B** | `mosaicml/mpt-7b` | 7B | Apache 2.0 | good speed | average quality | EN |
| **Baichuan2-7B-Chat** | `baichuan-inc/Baichuan2-7B-Chat` | 7B | Permissive | strong CN+EN dialogue | weaker EN reasoning | CN, EN |
| **Baichuan2-7B-Base** | `baichuan-inc/Baichuan2-7B-Base` | 7B | Permissive | large CN corpus | weaker EN | CN, EN |
| **Qwen-7B-Chat** | `Qwen/Qwen-7B-Chat` | 7B | Apache 2.0 | strong CN/EN, capable chat | need to pick the right version | CN, EN |
| **Qwen-7B** | `Qwen/Qwen-7B` | 7B | Apache 2.0 | good at code | needs tuning for dialogue | CN, EN |
| **InternLM-7B-Chat** | `internlm/internlm-chat-7b` | 7B | Permissive | strong CN dialogue | average EN | CN, EN |
| **InternLM-7B** | `internlm/internlm-7b` | 7B | Permissive | base CN model | weaker than the chat variant | CN |
| **Pythia-6.9B** | `EleutherAI/pythia-6.9b` | 6.9B | Apache 2.0 | great for research | not optimized for dialogue | EN |
| **StableLM-3B-Instruct** | `stabilityai/stablelm-3b-4e1t-instruct` | 3B | Apache 2.0 | light and fast | lower quality | EN |
| **StableLM-Base-Alpha 3B** | `stabilityai/stablelm-base-alpha-3b` | 3B | Apache 2.0 | small, convenient for LoRA | weaker than instruct | EN |
| **StableCode 3B** | `stabilityai/stablecode-instruct-alpha-3b` | 3B | Apache 2.0 | good for code | not for general dialogue | EN |
|
|
|
|
|
--- |
|
|
|
|
|
```python
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Assumes a models.csv with `name` and `hf_name` columns (see the usage
# sketch after this block).
df = pd.read_csv("models.csv")

def load_model_by_name(name, load_4bit=True):
    row = df[df['name'] == name].iloc[0]
    MODEL = row['hf_name']
    print("Loading:", MODEL)
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, trust_remote_code=True)

    if load_4bit:
        # Newer transformers versions take quantization settings through
        # BitsAndBytesConfig; the bare load_in_4bit kwarg is deprecated.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            trust_remote_code=True,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
    return tokenizer, model
```
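
A hypothetical usage example (it assumes `models.csv` exists with `name` and
`hf_name` columns; neither ships with this notebook):

```python
tokenizer, model = load_model_by_name("Mistral-7B-Instruct", load_4bit=True)
```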
|
|
""" |
|
|
|
|
|
MODEL = "Qwen/Qwen2.5-0.5B" |
|
|
MAX_LEN = 1024 |
|
|
SEP = "\n\n### Ответ:\n\n" |
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True) |
|
|
|
|
|
# Add a pad token if the tokenizer lacks one (the model's embeddings are
# resized to match after loading, below).
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
|
|
|
|
|
def make_dataset(records):
    # Concatenate prompt and response into a single causal-LM training string.
    texts = [r["prompt"] + SEP + r["response"] for r in records]
    ds = Dataset.from_dict({"text": texts})

    def tokenize(batch):
        out = tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
        )
        # Labels mirror input_ids; DataCollatorForLanguageModeling(mlm=False)
        # rebuilds them at batching time and masks pad positions with -100.
        out["labels"] = out["input_ids"].copy()
        return out

    ds = ds.map(tokenize, batched=True, remove_columns=["text"])
    return ds
|
|
|
|
|
train_ds = make_dataset(train_data_raw) |
|
|
val_ds = None |
|
|
|
|
|
train_ds |
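
# Optional sanity check (not in the original notebook): decode the start of the
# first example to confirm the prompt/SEP/response layout survived tokenization.
print(tokenizer.decode(train_ds[0]["input_ids"][:64]))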
|
|
|
|
|
"""# Загрузка модели и настройка LoRA""" |
|
|
|
|
|
USE_8BIT = False

print("Loading the model...")

if USE_8BIT:
    # Newer transformers versions take quantization settings through
    # BitsAndBytesConfig; the bare load_in_8bit kwarg is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
        torch_dtype=torch.float16,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL)
|
|
|
|
|
model.resize_token_embeddings(len(tokenizer))  # account for the pad token added above
|
|
|
|
|
lora_config = LoraConfig( |
|
|
r=8, |
|
|
lora_alpha=32, |
|
|
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], |
|
|
lora_dropout=0.05, |
|
|
bias="none", |
|
|
task_type="CAUSAL_LM", |
|
|
) |
|
|
|
|
|
model = get_peft_model(model, lora_config) |
|
|
|
|
|
print("LoRA слои установлены.") |
|
|
|
|
|
OUTPUT_DIR = "outputs/qwen_lora" |
|
|
|
|
|
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) |
|
|
|
|
|
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch size: 2 * 8 = 16
    num_train_epochs=2,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    logging_steps=25,
    save_steps=500,
    eval_strategy="steps" if val_ds else "no",  # named evaluation_strategy in older transformers
    eval_steps=500 if val_ds else None,
    fp16=True,
    save_total_limit=2,
    gradient_checkpointing=True,
    # Non-reentrant checkpointing avoids the "does not require grad" error
    # that reentrant checkpointing can trigger with PEFT models.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    report_to="none",
)
|
|
trainer = Trainer( |
|
|
model=model, |
|
|
args=training_args, |
|
|
train_dataset=train_ds, |
|
|
eval_dataset=val_ds, |
|
|
data_collator=data_collator, |
|
|
) |
|
|
|
|
|
trainer |
|
|
|
|
|
trainer.train() |
|
|
model.save_pretrained(OUTPUT_DIR + "/peft_lora") |
|
|
print("LoRA веса сохранены.") |
|
|
|
|
|
def generate(prompt, max_new_tokens=150): |
|
|
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) |
|
|
|
|
|
out = model.generate( |
|
|
input_ids, |
|
|
max_new_tokens=max_new_tokens, |
|
|
do_sample=True, |
|
|
temperature=0.8, |
|
|
top_p=0.95, |
|
|
top_k=50, |
|
|
repetition_penalty=1.1, |
|
|
pad_token_id=tokenizer.pad_token_id, |
|
|
eos_token_id=tokenizer.eos_token_id, |
|
|
) |
|
|
return tokenizer.decode(out[0], skip_special_tokens=True) |
|
|
|
|
|
prompt = "Объясни простыми словами, что такое градиентный спуск." |
|
|
print(generate(prompt)) |
|
|
|
|
|
"""## Перезагрузка модели с LoRA из сохранённого каталога |
|
|
|
|
|
(for a standalone run / after a kernel restart)
|
|
""" |
|
|
|
|
|
base_model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.float16, device_map="auto") |
|
|
base_tokenizer = AutoTokenizer.from_pretrained(MODEL) |
|
|
|
|
|
peft_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR + "/peft_lora") |
|
|
|
|
|
def infer_lora(prompt): |
|
|
input_ids = base_tokenizer(prompt, return_tensors="pt").input_ids.to(peft_model.device) |
|
|
out = peft_model.generate(input_ids, max_new_tokens=100, do_sample=True) |
|
|
return base_tokenizer.decode(out[0], skip_special_tokens=True) |
|
|
|
|
|
infer_lora("Расскажи, что такое нейронная сеть.")  # "Tell me what a neural network is."
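
# Optional follow-up (an assumption, not part of the original notebook): merge
# the LoRA deltas into the base weights so the result can be served as a plain
# transformers model, without PEFT installed.
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained(OUTPUT_DIR + "/merged")
base_tokenizer.save_pretrained(OUTPUT_DIR + "/merged")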