import json
import os

import gradio as gr
import requests

def chat_with_ai(message):
    """Send a user message to the chat backend and return its reply as text.

    The message is posted as JSON to the chat endpoint, with a suffix asking
    for a 1-2 sentence answer and a ``max_tokens`` value of 100. Failures are
    converted into human-readable strings so the UI always has something to
    display.

    Args:
        message: The text typed by the user.

    Returns:
        The ``"answer"`` field of the backend's JSON reply as a string, or a
        hint / error string when the input is blank, the request fails, the
        server answers with an HTTP error status, or the reply is not a JSON
        object containing an answer.
    """
    # Don't spend a slow backend round-trip on an empty prompt.
    if not message or not message.strip():
        return "Please enter a message."

    # Default endpoint. Set the CHAT_API_URL environment variable to point at
    # a new AWS public IP without editing this file.
    url = os.environ.get("CHAT_API_URL", "http://16.16.28.165/chat")
    timeout_s = 60

    payload = {
        "message": message + " Answer in 1-2 sentences.",  # Forces a fast response
        "max_tokens": 100,
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout_s)
        # Surface 4xx/5xx replies instead of trying to read an answer from them.
        response.raise_for_status()
    except requests.exceptions.Timeout:
        return f"The AWS server is taking too long (over {timeout_s}s). Try a shorter question."
    except requests.exceptions.HTTPError:
        return f"Error: AWS API returned HTTP {response.status_code}."
    except Exception as e:
        # UI boundary: report any other transport problem rather than crash.
        return f"Error connecting to AWS API: {e}"

    # Parse separately so a malformed body is not reported as a connection error.
    try:
        data = response.json()
    except ValueError:
        return "Error: AWS API returned a response that is not valid JSON."

    # Valid JSON that is not an object (e.g. a list) has no "answer" key.
    if not isinstance(data, dict):
        return "Error: No answer in response."

    answer = data.get("answer")
    if answer is None:
        return "Error: No answer in response."
    return str(answer)

# Visual styling for the demo: Gradio's Soft theme with a blue/gray palette
theme = gr.themes.Soft(
    secondary_hue="gray",
    primary_hue="blue",
)

# Clickable example questions.
# The outer list holds one entry per example; each entry is itself a list of
# input values for that example (a single question string here).
sample_prompts = [
    [question]
    for question in (
        "Where is my order? Can you track package #89432?",
        "How do I return an item that arrived damaged?",
        "When will the wireless headphones be back in stock?",
        "I was overcharged for my last purchase. How do I get a refund?",
        "Can I change the shipping address for an order I just placed?",
    )
]

# Single-textbox demo page: user text goes to chat_with_ai, and whatever it
# returns is shown in the response box.
demo = gr.Interface(
    title="AI Engineering Project: Serverless LLM",
    description="""
### Technical Overview:
* **Model:** Fine-tuned Qwen-2.5 (3B) via QLoRA.
* **Optimization:** GGUF 4-bit quantization.
* **Infrastructure:** Deployed on AWS ECS Fargate (4 vCPU) for cost-efficiency.
* **Backend:** FastAPI with strict JSON schema enforcement.

*Note: This model runs on serverless CPU infrastructure to optimize hosting costs, so inference may take 10-15 seconds.*
    """,
    theme=theme,
    fn=chat_with_ai,
    inputs=gr.Textbox(
        label="User Message",
        placeholder="Ask a technical question...",
    ),
    outputs=gr.Textbox(label="AI Response"),
    # Passing examples makes Gradio render them as clickable buttons.
    examples=sample_prompts,
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()