from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import uvicorn
import os

app = FastAPI()

# Load tokenizer and model once at startup.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()  # inference only — disable dropout/batch-norm training behavior

# BUG FIX: the original passed model=model_name here, which made transformers
# download/load a SECOND full copy of the same weights. Reuse the loaded model.
generator_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
summarizer_pipe = pipeline("summarization", model="facebook/bart-large-cnn")

app.mount("/static", StaticFiles(directory="static"), name="static")


class GenRequest(BaseModel):
    """Request body shared by /generate and /predict_next."""
    text: str
    max_new_tokens: int = 150
    do_sample: bool = False
    mode: str = "generate"  # "generate" or "summarize"


@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve the single-page UI from the templates directory."""
    # Explicit encoding: default is platform-dependent and can mangle the HTML.
    with open("templates/index.html", "r", encoding="utf-8") as f:
        return f.read()


@app.post("/generate")
def generate(req: GenRequest):
    """Dispatch a request to summarization or token-by-token generation."""
    if req.mode == "summarize":
        # BART summarization pipeline; max_length is reused as the cap.
        out = summarizer_pipe(
            req.text,
            max_length=req.max_new_tokens,
            min_length=30,
            do_sample=req.do_sample,
        )
        return {"generated_text": out[0]["summary_text"]}
    # Default: causal-LM generation with per-token alternatives.
    return generate_with_alternatives(req)


def generate_with_alternatives(req: GenRequest):
    """Greedily decode up to ``req.max_new_tokens`` tokens, recording the
    top-5 alternative tokens (with % probabilities) at every step.

    Returns:
        {"generated_text": str,
         "tokens": [{"token": str, "alternatives": [{"token", "probability"}]}]}
    """
    top_k = 5
    # BUG FIX: the original decoded the chosen token to text and re-encoded
    # the whole string every iteration. That is O(n^2) in sequence length and
    # decode->encode is NOT the identity for BPE tokenizers, so the context
    # fed back to the model could silently diverge from the tokens actually
    # chosen. Keep the running context as token ids instead.
    input_ids = tokenizer(req.text, return_tensors="pt").input_ids
    tokens_data = []

    for _ in range(req.max_new_tokens):
        with torch.no_grad():
            logits = model(input_ids=input_ids).logits[0, -1, :]
        probs = torch.softmax(logits, dim=-1)
        top_probs, top_indices = torch.topk(probs, top_k)

        # Greedy choice: highest-probability token.
        chosen_token_id = top_indices[0].item()

        # BUG FIX: stop BEFORE emitting EOS — the original appended the EOS
        # marker text to the visible output and then broke.
        if chosen_token_id == tokenizer.eos_token_id:
            break

        alternatives = [
            {
                "token": tokenizer.decode([top_indices[i].item()]),
                "probability": round(top_probs[i].item() * 100, 2),
            }
            for i in range(top_k)
        ]
        tokens_data.append({
            "token": tokenizer.decode([chosen_token_id]),
            "alternatives": alternatives,
        })

        # Extend the context with the chosen token id.
        input_ids = torch.cat(
            [input_ids,
             torch.tensor([[chosen_token_id]], dtype=input_ids.dtype)],
            dim=1,
        )

    generated_text = "".join(t["token"] for t in tokens_data)
    return {"generated_text": generated_text, "tokens": tokens_data}


@app.post("/predict_next")
def predict_next(req: GenRequest):
    """Return the top-10 next-token predictions for ``req.text``."""
    inputs = tokenizer(req.text, return_tensors="pt")
    with torch.no_grad():
        next_token_logits = model(**inputs).logits[0, -1, :]

    top_k = 10
    probs = torch.softmax(next_token_logits, dim=-1)
    top_probs, top_indices = torch.topk(probs, top_k)

    predictions = [
        {"token": tokenizer.decode([idx]), "probability": round(prob * 100, 2)}
        for prob, idx in zip(top_probs.tolist(), top_indices.tolist())
    ]
    return {"predictions": predictions}


if __name__ == "__main__":
    # PORT is supplied by the hosting environment (e.g. HF Spaces uses 7860).
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)