"""Gradio chat demo for MoireFormer, a 104.9M-parameter phase-interference LM.

Downloads pretrained weights from the Hugging Face Hub, rebuilds the custom
MoireGPT architecture, and serves a ChatInterface that formats history into
the "User: ... / Bot: ..." structure the model was trained on.
"""

import sys

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Import your custom biological architecture
from moire_conv_trainer_v3 import MoireGPT, MoireGPTConfig

print("Downloading Moiré weights from HF Hub...")
# Automatically fetches the weights you uploaded earlier!
weights_path = hf_hub_download(
    repo_id="Aluode/MoireFormer",
    filename="moire_phase2_weights_final.pt",
)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {device}")

print("Initializing Moiré wave-field (104.9M)...")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# We use the 'large' config from your script (8 layers, 8 heads, 768 embd)
config = MoireGPTConfig(n_layer=8, n_head=8, n_embd=768)
model = MoireGPT(config)

# Load the weights into the field. Some checkpoints wrap the weights in a
# trainer dict under 'model_state_dict'; unwrap if present.
# NOTE(review): torch.load without weights_only=True unpickles arbitrary
# objects — fine for your own checkpoint, unsafe for untrusted files.
state_dict = torch.load(weights_path, map_location=device)
if 'model_state_dict' in state_dict:
    state_dict = state_dict['model_state_dict']
model.load_state_dict(state_dict)
model.to(device)
model.eval()


def generate_text(prompt, max_new_tokens=80, temperature=0.7):
    """The raw physics generation loop.

    Autoregressively samples up to ``max_new_tokens`` tokens from the model,
    conditioning on ``prompt``. Returns the decoded full sequence
    (prompt + continuation).

    Args:
        prompt: Text to condition on.
        max_new_tokens: Maximum number of tokens to sample.
        temperature: Softmax temperature; lower is greedier.
    """
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # CROP THE INPUT: Only look at the most recent max_seq_len tokens
            # so the positional embeddings never go out of bounds (257)
            cond_input = input_ids[:, -config.max_seq_len:]

            # Pass the cropped signal through the Moiré field
            logits, _ = model(cond_input)

            # Grab the prediction for the last token
            next_token_logits = logits[:, -1, :] / temperature

            # Sample the next token
            probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # Append it to the running sequence
            input_ids = torch.cat((input_ids, next_token), dim=1)

            # Stop if the field decides the thought is complete
            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0].tolist())


def chat_interface(message, history):
    """Formats the Gradio UI input to match your dataset's User/Bot structure.

    Args:
        message: The user's latest message.
        history: Prior turns, either Gradio 5 role/content dicts or
            Gradio 4 ``[user_msg, bot_msg]`` pairs.

    Returns:
        Only the bot's newly generated reply (prompt stripped off).
    """
    prompt = ""

    # Inject context memory (Handles both Gradio 4 and Gradio 5 formats)
    for msg in history:
        # Gradio 5 format: dictionaries with 'role' and 'content'
        if isinstance(msg, dict):
            if msg.get("role") == "user":
                prompt += f"User: {msg.get('content')}\n"
            elif msg.get("role") == "assistant":
                prompt += f"Bot: {msg.get('content')}\n"
        # Fallback for Gradio 4 format: [user_msg, bot_msg]
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            prompt += f"User: {msg[0]}\nBot: {msg[1]}\n"

    # Inject current message
    prompt += f"User: {message}\nBot:"

    # Generate the wave-interference response
    full_response = generate_text(prompt)

    # Strip the prompt out so the UI only shows the Bot's new reply
    response_only = full_response[len(prompt):].strip()
    return response_only


# Build the beautiful Gradio Web UI
demo = gr.ChatInterface(
    fn=chat_interface,
    title="MoireFormer (104.9M) - Phase-Interference AI",
    description="This is not a standard Transformer. It does not use dot-product attention. It calculates language via biological **Moiré wave-interference math**, proving AI can run on continuous geometric phase-space. *Note: At 100M parameters, this is a proof-of-substrate for syntax and logic, but it will hallucinate specific facts.*",
    examples=["What is the best way to make a database schema?", "Who are you?", "What is the capital of India?"],
)

if __name__ == "__main__":
    demo.launch()