Spaces:
Runtime error
Runtime error
| import os | |
| import time | |
| import torch | |
| import torch.nn as nn | |
| from torch.nn import functional as F | |
| from torch.utils.data import DataLoader, Dataset | |
| import tiktoken | |
| import gradio as gr | |
| from tqdm import tqdm | |
| import numpy as np | |
| from datasets import load_dataset | |
# ---------- 1. Hard resource limits ----------
# Target machine: 12 CPU cores and ~13 GB RAM. The thread count is clamped to
# the cores actually present so that smaller hosts are not oversubscribed.
_cpu_threads = max(1, min(12, os.cpu_count() or 1))
torch.set_num_threads(_cpu_threads)
torch.set_num_interop_threads(_cpu_threads)
# No CUDA cache handling is needed: everything runs on the CPU.

# --- Model hyperparameters (chosen to fit in ~13 GB RAM) ---
vocab_size = 50257  # must cover every token id produced by the tokenizer
block_size = 256    # maximum context length in tokens
n_embd = 384        # embedding width; must be divisible by n_head
n_head = 6
n_layer = 6
dropout = 0.1

# --- Training hyperparameters (reduced to save memory) ---
batch_size = 24  # was 32 -> lowered
learning_rate = 5e-4
max_iters = 15000
eval_interval = 500
eval_iters = 100
warmup_iters = 500

# --- DataLoader parameters (moderate) ---
num_workers = 6  # was 8 -> lowered
prefetch_factor = 4

device = 'cpu'
# Pinned host memory only helps host-to-accelerator copies; on a CPU-only run
# it buys nothing and makes DataLoader emit a warning.
pin_memory = device != 'cpu'

print(f"Устройство: {device}")
print(f"Используется CPU потоков: {torch.get_num_threads()}")
# ---------- 2. Dataset and tokenization ----------
print("\n[1/5] Загрузка и токенизация датасета...")
dataset = load_dataset("JoshKeesee/Alfred-Indigo", split="train")


def _render_dialogue(messages):
    """Render one dialogue as newline-separated "role: content" lines."""
    return "\n".join(f"{turn['role']}: {turn['content']}" for turn in messages)


# One string per dialogue; dialogues are separated by a blank line in the corpus.
dialogue_texts = [_render_dialogue(record['messages']) for record in dataset]
all_text = "\n\n".join(dialogue_texts)
print(f"Загружено {len(dialogue_texts)} диалогов. Общий объём: {len(all_text)} символов.")

# Tokenize the whole corpus with the GPT-2 BPE encoding and keep it as one
# long tensor of token ids.
enc = tiktoken.get_encoding("gpt2")
data = torch.tensor(enc.encode_ordinary(all_text), dtype=torch.long)

# First 90% of the tokens are used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a flat token sequence.

    Sample ``i`` is the pair
    ``(data[i:i + block_size], data[i + 1:i + block_size + 1])``, i.e. an
    input window and the same window shifted one position to the right.
    """

    def __init__(self, data, block_size):
        """Store the token sequence and the window length.

        Args:
            data: Sliceable sequence of token ids (e.g. a 1-D tensor).
            block_size: Number of tokens in each input/target window.
        """
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Return the number of full windows that have a complete target."""
        # A sequence shorter than block_size + 1 holds no complete sample;
        # clamp at zero because len() rejects negative values.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` windows starting at position ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``. Without
                this, slicing past the end silently yields short or empty
                windows and plain iteration over the dataset never stops.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of size {size}")
        stop = idx + self.block_size
        x = self.data[idx:stop]
        y = self.data[idx + 1:stop + 1]
        return x, y
train_dataset = TextDataset(train_data, block_size)
val_dataset = TextDataset(val_data, block_size)


def _build_loader(source, shuffle):
    """Create a DataLoader over ``source`` using the shared loader settings."""
    return DataLoader(
        source,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        prefetch_factor=prefetch_factor,
    )


# DataLoaders with a moderate number of workers: training batches are
# shuffled, validation batches are read in order.
train_loader = _build_loader(train_dataset, shuffle=True)
val_loader = _build_loader(val_dataset, shuffle=False)
# ---------- 3. Model architecture ----------
class AttentionHead(nn.Module):
    """A single head of causal (masked) self-attention.

    Projects the input to keys, queries and values of width ``head_size`` and
    lets every position attend only to itself and earlier positions.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular mask; a buffer so it follows the module between devices.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scale by 1/sqrt(d_k), where d_k is the key/query width (head_size).
        # The previous code scaled by the input width n_embd, which shrinks the
        # scores by an extra factor of sqrt(n_head) and flattens the softmax.
        # NOTE: checkpoints trained with the old scaling will behave
        # differently under this one.
        wei = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
        # Block attention to future positions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v
class MultiHeadAttention(nn.Module):
    """Run ``n_head`` attention heads in parallel and mix their outputs."""

    def __init__(self):
        super().__init__()
        per_head_width = n_embd // n_head
        head_modules = [AttentionHead(per_head_width) for _ in range(n_head)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs on the last axis, project, apply dropout."""
        head_outputs = [head(x) for head in self.heads]
        merged = torch.cat(head_outputs, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4 * n_embd, GELU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual add."""

    def __init__(self):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = MultiHeadAttention()
        self.ln2 = nn.LayerNorm(n_embd)
        self.ffwd = FeedForward()

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.attn(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings feed a stack of ``n_layer``
    transformer blocks, followed by a final LayerNorm and a linear head that
    produces logits over ``vocab_size`` tokens.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids, shape (B, T) with T <= block_size.
            targets: Optional integer tensor of shape (B, T) with the next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding(idx)
        # Build the positions on the input's own device rather than on a
        # module-level global, so the model keeps working after .to(...).
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=0.8, top_k=40):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Runs without gradient tracking and in eval mode (dropout disabled);
        the previous train/eval mode is restored afterwards.

        Args:
            idx: Integer tensor of shape (B, T), T >= 1, holding the prompt token ids.
            max_new_tokens: Number of tokens to append.
            temperature: Positive softmax temperature; lower is more deterministic.
            top_k: If not None, sample only among the ``top_k`` most likely tokens.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position embedding only covers block_size positions.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / temperature
                if top_k is not None:
                    k = min(top_k, logits.size(-1))
                    kth_best = torch.topk(logits, k).values[:, [-1]]
                    # Everything below the k-th best logit gets zero probability.
                    logits = logits.masked_fill(logits < kth_best, float('-inf'))
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx
# Build the network and wrap it with the PyTorch 2.x compiler. The wrapper
# keeps the original module reachable as `model._orig_mod`.
model = torch.compile(GPTLanguageModel())
param_count = sum(p.numel() for p in model.parameters())
print(f"Модель создана. Параметров: {param_count/1e6:.2f}M")
# ---------- 4. Training ----------
def get_batch_from_loader(loader):
    """Lazily yield ``(x, y)`` tuples, one per two-element batch in ``loader``."""
    yield from ((inputs, targets) for inputs, targets in loader)
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation loaders.

    Averages the loss over ``eval_iters`` batches per split with the model in
    eval mode. Gradient tracking is disabled, so no autograd graph is built
    for these forward passes.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to 0-dim tensors
        holding the mean loss of each split.
    """
    out = {}
    model.eval()
    try:
        for split, loader in (('train', train_loader), ('val', val_loader)):
            losses = torch.zeros(eval_iters)
            loader_iter = iter(loader)
            for k in range(eval_iters):
                try:
                    X, Y = next(loader_iter)
                except StopIteration:
                    # Loader has fewer than eval_iters batches: start over.
                    loader_iter = iter(loader)
                    X, Y = next(loader_iter)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Always hand the model back in training mode, even if evaluation fails.
        model.train()
    return out
# AdamW over every model parameter; the learning rate set here is overwritten
# each step by the schedule in the training loop.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=0.1,
)
def get_lr(it):
    """Return the learning rate for step ``it``: linear warmup, then constant.

    During the first ``warmup_iters`` steps the rate ramps linearly and
    reaches ``learning_rate`` on the last warmup step; afterwards it stays at
    ``learning_rate``.
    """
    if it >= warmup_iters:
        return learning_rate
    completed_steps = it + 1
    return learning_rate * completed_steps / warmup_iters
print("\n[2/5] Старт обучения (ограничение 12 CPU / 13 ГБ RAM)...")
start_time = time.time()

# Keep one iterator alive for the whole run. Calling iter(train_loader) on
# every step would restart all DataLoader workers and throw away their
# prefetched batches each time.
train_batches = iter(train_loader)

for iter_num in tqdm(range(max_iters), desc="Обучение"):
    # Apply the scheduled learning rate to every parameter group.
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Periodic evaluation, plus one on the final step.
    if iter_num % eval_interval == 0 or iter_num == max_iters - 1:
        losses = estimate_loss()
        elapsed = time.time() - start_time
        print(f"\nШаг {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f} (время {elapsed:.2f} с)")

    try:
        xb, yb = next(train_batches)
    except StopIteration:
        # Epoch finished: start a new (reshuffled) pass over the data.
        train_batches = iter(train_loader)
        xb, yb = next(train_batches)

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

print(f"\nОбучение завершено! Время: {(time.time() - start_time)/60:.2f} мин")

# Save the weights. A torch.compile wrapper exposes the real module as
# `_orig_mod`; fall back to the model itself when it is not wrapped, so the
# saved keys never carry the wrapper prefix.
os.makedirs('checkpoints', exist_ok=True)
torch.save(getattr(model, '_orig_mod', model).state_dict(), 'checkpoints/model_final.pth')
print("Модель сохранена в 'checkpoints/model_final.pth'")
# ---------- 5. Gradio interface ----------
def generate_response(prompt, max_new_tokens=150, temperature=0.7, top_k=40):
    """Continue ``prompt`` with the trained model and return the decoded text.

    Args:
        prompt: Text to continue. An empty prompt is replaced by a single
            newline, because the model needs at least one context token.
        max_new_tokens: Number of tokens to sample.
        temperature: Softmax temperature passed to ``model.generate``.
        top_k: Top-k cutoff passed to ``model.generate``.

    Returns:
        The decoded context followed by the sampled continuation.
    """
    token_ids = enc.encode_ordinary(prompt)
    if not token_ids:
        # A zero-length context has no last position to sample from.
        token_ids = enc.encode_ordinary("\n")
    context = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Sample with dropout disabled and without building autograd graphs, then
    # restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            generated = model.generate(
                context,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
            )
    finally:
        model.train(was_training)
    return enc.decode(generated[0].tolist())
def chat_function(message, history):
    """Gradio chat callback: return the model's continuation of ``message``.

    ``history`` is accepted because the chat interface passes it, but it is
    not used; each reply depends on the latest message only.
    """
    full_text = generate_response(message)
    # generate_response returns prompt + continuation. The chat UI already
    # shows the user's message, so drop the echoed prefix from the reply.
    if message and full_text.startswith(message):
        return full_text[len(message):]
    return full_text
# Static texts shown at the top of the chat page.
_CHAT_TITLE = "🤖 GPT обучена с нуля (12 CPU / 13 ГБ RAM)"
_CHAT_DESCRIPTION = "Модель обучена на Alfred-Indigo, 6 слоёв, 6 голов внимания, контекст 256 токенов. Ограничение ресурсов: 12 ядер CPU, ~13 ГБ RAM."

# Chat UI backed by chat_function.
demo = gr.ChatInterface(
    fn=chat_function,
    title=_CHAT_TITLE,
    description=_CHAT_DESCRIPTION,
    theme="soft",
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()