"""Gradio front end that forwards a user question to a remote chat API."""

import json
import os

import gradio as gr
import requests

# Address of the backend `/chat` endpoint. The public IP changes whenever the
# AWS service is redeployed, so it can be overridden with the CHAT_API_URL
# environment variable instead of editing this file.
# NOTE(review): the default endpoint is plain HTTP, so traffic is unencrypted.
DEFAULT_API_URL = "http://16.16.28.165/chat"
API_URL = os.environ.get("CHAT_API_URL") or DEFAULT_API_URL

# Upper bound (seconds) on how long a single request may take.
REQUEST_TIMEOUT_SECONDS = 60

# Markdown shown under the title of the UI.
DESCRIPTION = """
### Technical Overview:
* **Model:** Fine-tuned Qwen-2.5 (3B) via QLoRA.
* **Optimization:** GGUF 4-bit quantization.
* **Infrastructure:** Deployed on AWS ECS Fargate (4 vCPU) for cost-efficiency.
* **Backend:** FastAPI with strict JSON schema enforcement.

*Note: This model runs on serverless CPU infrastructure to optimize hosting costs, so inference may take 10-15 seconds.*
"""


def chat_with_ai(message: str) -> str:
    """Send *message* to the chat API and return the text to display.

    The function never raises: every failure is converted into a
    human-readable string, because the return value is rendered directly in
    the UI output box.

    Args:
        message: The question typed by the user.

    Returns:
        The value of the ``"answer"`` field of the JSON response, or an
        explanatory error message if the request failed, timed out, returned
        a non-success HTTP status, or contained no ``"answer"`` field.
    """
    # Skip the slow round-trip when there is nothing to ask.
    if not message or not message.strip():
        return "Please enter a question."

    payload = {
        # The suffix and the token cap both push the model towards a short,
        # fast response.
        "message": message + " Answer in 1-2 sentences.",
        "max_tokens": 100,
    }

    try:
        response = requests.post(
            API_URL, json=payload, timeout=REQUEST_TIMEOUT_SECONDS
        )
        # Surface 4xx/5xx replies instead of reporting them as "no answer".
        response.raise_for_status()
        return response.json().get("answer", "Error: No answer in response.")
    except requests.exceptions.Timeout:
        return (
            f"The AWS server is taking too long (over {REQUEST_TIMEOUT_SECONDS}s). "
            "Try a shorter question."
        )
    except requests.exceptions.HTTPError as e:
        return f"Error: AWS API returned HTTP {e.response.status_code}."
    except Exception as e:
        # UI boundary: anything else (connection failure, invalid JSON,
        # unexpected response shape) is reported to the user, not raised.
        return f"Error connecting to AWS API: {e}"


# Professional UI setup
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="gray")

# Clickable example prompts. Each inner list holds the input values for one
# example; the interface has a single input, so each list has one element.
sample_prompts = [
    ["Where is my order? Can you track package #89432?"],
    ["How do I return an item that arrived damaged?"],
    ["When will the wireless headphones be back in stock?"],
    ["I was overcharged for my last purchase. How do I get a refund?"],
    ["Can I change the shipping address for an order I just placed?"],
]

demo = gr.Interface(
    fn=chat_with_ai,
    inputs=gr.Textbox(placeholder="Ask a technical question...", label="User Message"),
    outputs=gr.Textbox(label="AI Response"),
    title="AI Engineering Project: Serverless LLM",
    description=DESCRIPTION,
    theme=theme,
    examples=sample_prompts,  # Rendered as clickable example buttons.
)

if __name__ == "__main__":
    demo.launch()