import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import sentencepiece as spm
import requests
import os

TOKENIZER_PATH = "ko_unigram.model"
DATA_PATH = "corpus.txt"  # 36M-sentence text file

# ===============================
# 1️⃣ Download files
# ===============================
def download_file(url, save_path):
    r = requests.get(url, stream=True)
    r.raise_for_status()
    with open(save_path, "wb") as f:
        for chunk in r.iter_content(8192 * 2):
            f.write(chunk)
    print(f"✅ saved {save_path}")

if not os.path.exists(TOKENIZER_PATH):
    download_file(
        "https://huggingface.co/Yuchan5386/inlam-100m/resolve/main/ko_unigram.model?download=true",
        TOKENIZER_PATH
    )
if not os.path.exists(DATA_PATH):
    download_file(
        "https://huggingface.co/datasets/Yuchan5386/1/resolve/main/shuffled_corpus.txt?download=true",
        DATA_PATH
    )

# ===============================
# SentencePiece
# ===============================
sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
# NOTE: the special-token piece names were stripped from the source;
# "<pad>", "<start>" and "<end>" are assumed placeholders — replace them
# with the pieces actually defined in ko_unigram.model.
pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
start_id = sp.piece_to_id("<start>")
end_id = sp.piece_to_id("<end>")
vocab_size = sp.get_piece_size()

max_len = 512
batch_size = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def text_to_ids(text):
    return sp.encode(text, out_type=int)

def ids_to_text(ids):
    return sp.decode(ids)

# ===============================
# Dataset
# ===============================
class TextDataset(Dataset):
    def __init__(self, file_path, num_lines=None):
        self.lines = []
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if num_lines is not None and i >= num_lines:
                    break
                line = line.strip()
                if line:
                    self.lines.append(line)

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        text = self.lines[idx]
        ids = text_to_ids(text)[:max_len - 1]
        full_input = ids + [end_id]
        pad_len = max_len - len(full_input)
        full_input += [pad_id] * pad_len
        target = full_input[1:] + [pad_id]
        return torch.tensor(full_input, dtype=torch.long), torch.tensor(target, dtype=torch.long)

dataset = TextDataset(DATA_PATH, num_lines=100000)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# ===============================
# Model definition
# ===============================
class SwiGLU(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.W = nn.Linear(d_model, 3500)
        self.W1 = nn.Linear(1750, d_model)

    def forward(self, x):
        dtype = x.dtype  # remember the input dtype so we can cast back after the fp32 gating
        h = self.W(x.float())
        a, b = h.chunk(2, dim=-1)
        return self.W1(F.silu(a) * b).to(dtype)

class SparseCausalAttention(nn.Module):
    def __init__(self, num_heads, head_dim, window_size=8):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.window_size = window_size
        self.q = nn.Linear(head_dim * num_heads, num_heads * head_dim)
        self.k = nn.Linear(head_dim * num_heads, num_heads * head_dim)
        self.v = nn.Linear(head_dim * num_heads, num_heads * head_dim)
        self.out = nn.Linear(num_heads * head_dim, head_dim * num_heads)

    def forward(self, x):
        B, L, D = x.shape
        q = self.q(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v(x).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
        q = q / (self.head_dim ** 0.5)
        attn_scores = torch.matmul(q, k.transpose(-2, -1))
        # Causal sliding-window mask: each position attends only to itself
        # and the previous window_size positions.
        mask = torch.tril(torch.ones(L, L, device=x.device))
        band_mask = torch.triu(mask, -self.window_size)
        attn_scores = attn_scores.masked_fill(band_mask == 0, float('-inf'))
        attn_probs = F.softmax(attn_scores, dim=-1)
        out = torch.matmul(attn_probs, v)
        out = out.transpose(1, 2).reshape(B, L, D)
        return self.out(out)
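# ---------------------------------------------------------------
# Illustration (not executed): the causal sliding-window mask above,
# for L=5 and window_size=2 — position i attends to positions i-2..i only:
#   [[1 0 0 0 0]
#    [1 1 0 0 0]
#    [1 1 1 0 0]
#    [0 1 1 1 0]
#    [0 0 1 1 1]]
# Quick shape sanity check for the two modules above (uncomment to run;
# assumes the d_model=128, num_heads=2, head_dim=64 configuration used below):
# _x = torch.randn(2, 16, 128)
# assert SparseCausalAttention(num_heads=2, head_dim=64)(_x).shape == _x.shape
# assert SwiGLU(128)(_x).shape == _x.shape
# ---------------------------------------------------------------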
class Lo(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.d = nn.Linear(d_model, 64)
        self.w = nn.Linear(64, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        return self.norm(self.w(F.silu(self.d(x))) + x)

class Block(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.attn = SparseCausalAttention(num_heads=2, head_dim=64)
        self.glu = SwiGLU(d_model)
        self.norm = nn.LayerNorm(d_model)
        self.lo = Lo(d_model)

    def forward(self, x):
        x = self.attn(x)
        x = self.norm(self.glu(x) + x)
        x = self.lo(x)
        return x

class ReLM(nn.Module):
    def __init__(self, vocab_size, max_seq_len, d_model, n_layers):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(max_seq_len, d_model)
        self.blocks = nn.ModuleList([Block(d_model) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.d_model = d_model

    def forward(self, x):
        B, L = x.shape
        positions = torch.arange(L, device=x.device).unsqueeze(0)
        x = self.token_embedding(x) + self.pos_embedding(positions)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        # Weight-tied output projection: reuse the token embedding matrix.
        logits = x @ self.token_embedding.weight.T
        return logits

# Model, optimizer, scheduler, loss function
model = ReLM(vocab_size, max_len, 128, 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)

# Compile the model with torch.compile
model = torch.compile(model, mode="default")

scaler = torch.cuda.amp.GradScaler()

epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():  # mixed precision
            logits = model(x)
            loss = loss_fn(logits.view(-1, vocab_size), y.view(-1))
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
        if step % 100 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")
    scheduler.step()
    print(f"Epoch {epoch+1} done, average loss: {total_loss/len(dataloader):.4f}")

torch.save(model.state_dict(), "relm_model.pth")
print("✅ Model saved!")

# ===============================
# Top-p sampling generation
# ===============================
def generate_text_topp(model, prompt, max_len=150, max_gen=150, p=0.9, temperature=0.6, min_len=20):
    model.eval()
    # NOTE: the start piece prepended here was stripped from the source;
    # "<start>" is the same assumed placeholder used in the tokenizer setup above.
    model_input = text_to_ids(f"<start> {prompt}")
    model_input = model_input[:max_len]
    generated = list(model_input)
    with torch.no_grad():
        for step in range(max_gen):
            input_seq = generated[-max_len:] if len(generated) > max_len else generated
            input_tensor = torch.tensor([input_seq + [pad_id] * (max_len - len(input_seq))], device=device)
            logits = model(input_tensor)
            next_logits = logits[0, len(input_seq) - 1]
            next_logits[end_id] -= 5.0   # discourage early termination
            next_logits[pad_id] -= 10.0  # never sample padding
            probs = F.softmax(next_logits / temperature, dim=-1).cpu().numpy()
            sorted_indices = np.argsort(probs)[::-1]
            sorted_probs = probs[sorted_indices]
            cumulative_probs = np.cumsum(sorted_probs)
            cutoff = np.searchsorted(cumulative_probs, p)
            top_indices = sorted_indices[:cutoff + 1]
            top_probs = sorted_probs[:cutoff + 1]
            top_probs /= top_probs.sum()
            next_token = np.random.choice(top_indices, p=top_probs)
            if next_token == end_id and len(generated) >= min_len:
                break
            generated.append(int(next_token))
    return ids_to_text(generated)

# Test
print("\n===== Generation result =====")
print(generate_text_topp(model, "지난 2년 동안 출연연이 국가가 필요한 연구를", p=0.9))
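# ===============================
# (Optional) Reload the saved checkpoint — a minimal sketch.
# Because state_dict() above was taken from the torch.compile-wrapped model,
# its keys may carry an "_orig_mod." prefix; stripping it lets the weights
# load into a freshly constructed (uncompiled) ReLM.
# ===============================
reloaded = ReLM(vocab_size, max_len, 128, 2).to(device)
state = torch.load("relm_model.pth", map_location=device)
state = {k.replace("_orig_mod.", "", 1): v for k, v in state.items()}
reloaded.load_state_dict(state)
reloaded.eval()
print(generate_text_topp(reloaded, "지난 2년 동안 출연연이 국가가 필요한 연구를", p=0.9))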