"""NORA chat service: FastAPI + Gradio frontend over the Adedoyinjames/NORA causal LM."""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
from fastapi.middleware.cors import CORSMiddleware
import gradio as gr

# --- NORA Chat System ---
print("🔄 Loading NORA model from Adedoyinjames/NORA...")

# Load your custom NORA model (downloaded from the Hugging Face Hub at import time).
model_name = "Adedoyinjames/NORA"

try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",  # may place the model on GPU; inputs must be moved to match
        trust_remote_code=True
    )
    print("✅ NORA model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    raise


def generate_response(query):
    """Generate a chat reply for *query* using only the NORA model.

    Returns the assistant's decoded text, or an ``"Error generating response: ..."``
    string if tokenization/generation fails (the function never raises).
    """
    try:
        # Format prompt for chat
        prompt = f"User: {query}\nAssistant:"

        # Tokenize input, capped to the model's context budget.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        # BUGFIX: with device_map="auto" the model may live on GPU; the input
        # tensors must be moved to the same device or generate() raises a
        # device-mismatch RuntimeError.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response (sampling; no gradients needed at inference time).
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # Decode the full sequence (prompt + completion).
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the assistant's response; fall back to stripping the
        # prompt if the marker was lost during truncation/decoding.
        if "Assistant:" in full_text:
            response = full_text.split("Assistant:")[-1].strip()
        else:
            response = full_text.replace(prompt, "").strip()

        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


# --- FastAPI App ---
app = FastAPI(title="NORA AI", description="Chat with your custom NORA model")

# NOTE(review): wildcard CORS with credentials is permissive — confirm this is
# intentional for the deployment target.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class QueryRequest(BaseModel):
    # User's chat message.
    query: str


@app.post("/chat/")
async def chat_with_ai(query_request: QueryRequest):
    """Chat endpoint: run the model on the query and report success/error status."""
    try:
        response = generate_response(query_request.query)
        return {
            "response": response,
            "model_used": "Adedoyinjames/NORA",
            "status": "success"
        }
    except Exception as e:
        return {
            "response": f"Error: {str(e)}",
            "model_used": "Adedoyinjames/NORA",
            "status": "error"
        }


@app.get("/status/")
async def get_status():
    """Health/status endpoint; the model is guaranteed loaded if the app started."""
    return {
        "model_loaded": True,
        "model_name": "Adedoyinjames/NORA",
        "system_ready": True
    }


@app.get("/")
async def root():
    """Root endpoint with a simple liveness message."""
    return {"message": "NORA AI running with custom model"}


# Simple Gradio interface
def chat_interface(message, history):
    """Gradio chat callback; *history* is unused, each turn is independent."""
    try:
        response = generate_response(message)
        return response
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        return "System busy, please try again."


gradio_app = gr.ChatInterface(
    fn=chat_interface,
    title="NORA AI",
    description="Chat with your custom NORA model (Adedoyinjames/NORA)"
)

# Mount the Gradio UI under /gradio on the same ASGI app.
app = gr.mount_gradio_app(app, gradio_app, path="/gradio")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)