"""Gradio chat demo for the Cygnis Alpha character-level language model.

Loads hyper-parameters and the character vocabulary from ``config.json``,
rebuilds the transformer, restores trained weights from a ``.safetensors``
checkpoint, and serves a ``gr.ChatInterface``.
"""

import json
import os

import gradio as gr
import torch
import torch.nn as nn
from safetensors.torch import load_file
from torch.nn import functional as F

# --- 1. CONFIGURATION LOADING ---
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Hyper-parameters and vocabulary extracted from the JSON config.
n_embd = config["n_embd"]
n_head = config["n_head"]
n_layer = config["n_layer"]
block_size = config["block_size"]
vocab_size = config["vocab_size"]
stoi = config["stoi"]  # char -> token id (JSON keys are strings, i.e. chars)
itos = {int(k): v for k, v in config["itos"].items()}  # token id -> char


# --- 2. MODEL ARCHITECTURE (aligned with the trained checkpoint) ---
class SelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused QKV projection.

    The ``qkv_proj`` / ``out_proj`` layer names must match the key names
    stored in the trained safetensors checkpoint.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.qkv_proj = nn.Linear(n_embd, 3 * n_embd, bias=False)
        self.out_proj = nn.Linear(n_embd, n_embd, bias=False)
        self.n_head = n_head

    def forward(self, x):
        B, T, C = x.size()
        # FIX: split on the local channel dim C, not the module-level
        # global n_embd the original implicitly depended on.
        q, k, v = self.qkv_proj(x).split(C, dim=2)
        head_dim = C // self.n_head
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)
        # Scaled dot-product attention scores: (B, n_head, T, T).
        att = (q @ k.transpose(-2, -1)) * (1.0 / head_dim ** 0.5)
        # Causal mask: -inf strictly above the diagonal blocks attention
        # to future positions; softmax turns those entries into 0.
        mask = torch.triu(
            torch.full((T, T), float("-inf"), device=x.device), diagonal=1
        )
        att = F.softmax(att + mask, dim=-1)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(y)


class Block(nn.Module):
    """Pre-LayerNorm transformer block: attention + GELU MLP, residual adds."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = SelfAttention(n_embd, n_head)
        self.ffwd = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(0.1),
        )
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class CygnisAlpha(nn.Module):
    """Decoder-only character-level transformer language model."""

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx):
        """Return next-token logits of shape (B, T, vocab_size)."""
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = self.blocks(tok_emb + pos_emb)
        logits = self.lm_head(self.ln_f(x))
        return logits


# --- 3. INITIALIZATION ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CygnisAlpha().to(device)

# Name of the final merged checkpoint file.
model_path = "alpha_cycle_8.safetensors"
if os.path.exists(model_path):
    state_dict = load_file(model_path)
    model.load_state_dict(state_dict)
    print(f"✅ Modèle chargé : {model_path}")
else:
    print(f"⚠️ Erreur : {model_path} non trouvé dans le répertoire.")
# FIX: always switch to eval mode. The original only did so when the
# checkpoint was found, leaving Dropout active during generation otherwise.
model.eval()


# --- 4. STABILIZED GENERATION LOGIC ---
def generate_response(message, history, temperature=0.4, max_tokens=150):
    """Generate a character-level continuation of *message*.

    Args:
        message: User prompt (plain string, encoded char-by-char).
        history: Chat history supplied by gr.ChatInterface (unused).
        temperature: Softmax temperature; clamped to >= 0.01.
        max_tokens: Maximum number of characters to sample.

    Returns:
        The generated reply (prompt stripped), whitespace-trimmed.
    """
    # FIX: an empty prompt would produce a T=0 sequence and crash on
    # logits[:, -1, :]; answer nothing instead.
    if not message:
        return ""

    # Safe encoding: unknown characters fall back to ' ' (or id 0).
    input_ids = torch.tensor(
        [stoi.get(c, stoi.get(" ", 0)) for c in message],
        dtype=torch.long,
        device=device,
    ).unsqueeze(0)
    generated = input_ids
    prompt_len = input_ids.size(1)

    # FIX: one no_grad over the whole loop — the original re-entered it
    # per step and ran the penalty arithmetic with autograd enabled.
    with torch.no_grad():
        for _ in range(max_tokens):
            # Crop the context to the model's maximum block size.
            cond = generated[:, -block_size:]
            logits = model(cond)
            logits = logits[:, -1, :] / max(temperature, 0.01)

            # Immediate-repetition filter: penalize a char that just
            # appeared twice in a row.
            if generated.size(1) >= 2 and generated[0, -1] == generated[0, -2]:
                logits[0, generated[0, -1]] -= 15.0

            # Frequency penalty over the response so far (avoids loops).
            response_so_far = generated[0, prompt_len:]
            if response_so_far.numel() > 0:
                char_counts = torch.bincount(
                    response_so_far, minlength=vocab_size
                )
                logits[0] -= char_counts * 0.8

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)

            # Stop on a final period once a minimum length is reached.
            char = itos.get(next_token.item(), "")
            if char == "." and len(generated[0]) > prompt_len + 20:
                break

    # Decode only the generated continuation, not the prompt.
    full_text = "".join(
        itos.get(i.item(), "") for i in generated[0, prompt_len:]
    )
    return full_text.strip()


# --- 5. GRADIO INTERFACE ---
demo = gr.ChatInterface(
    fn=generate_response,
    title="🌌 Cygnis Alpha v1.0",
    description="Identité scellée : Simon Chusseau. Architecture 162M.",
    examples=[
        ["Qui est ton créateur ?", 0.3, 100],
        ["Explique la singularité technologique.", 0.6, 200],
        ["Qui es-tu ?", 0.4, 100],
    ],
    additional_inputs=[
        gr.Slider(0.1, 1.2, value=0.4, label="Température (Stable < 0.5)"),
        gr.Slider(50, 500, value=150, step=10, label="Tokens Max"),
    ],
)

if __name__ == "__main__":
    demo.launch()