"""Gradio chat front-end for a medical Q&A adapter served via the
Hugging Face serverless Inference API."""

import os

import gradio as gr
from huggingface_hub import InferenceClient

# 1. Set up the API client.
# The token comes from the HF_TOKEN secret; warn early if it is missing so
# failures aren't a mystery at request time (getenv silently returns None).
_HF_TOKEN = os.getenv("HF_TOKEN")
if not _HF_TOKEN:
    print("WARNING: HF_TOKEN is not set - API calls will fail with an auth error.")
client = InferenceClient(token=_HF_TOKEN)

# 2. Model ID (a fine-tuned adapter). The Inference API detects that this is
# an adapter and loads the corresponding base model automatically.
MODEL_ID = "shri171981/medical_chat_generative"


def ask_api(message, history):
    """Answer one chat turn by querying the Inference API.

    Args:
        message: The user's latest message (str).
        history: Prior chat turns supplied by gr.ChatInterface; unused here
            because each request is answered statelessly.

    Returns:
        The model's generated reply as a string, or a human-readable error
        message (never raises - gr.ChatInterface expects a string back).
    """
    # 3. Build the prompt. NOTE: this is the Alpaca instruction format
    # (### Instruction / ### Input / ### Response), matching how the adapter
    # was fine-tuned - it is NOT the Llama-3 chat template.
    system_prompt = "You are a helpful and empathetic medical doctor. Answer the patient's question based on the input provided."

    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{system_prompt}

### Input:
{message}

### Response:
"""

    try:
        # 4. Send the prompt to the serverless Inference API.
        response = client.text_generation(
            prompt,
            model=MODEL_ID,
            max_new_tokens=128,
            temperature=0.7,
            return_full_text=False,  # only the newly generated continuation
        )
        return response
    except Exception as e:
        # 5. Handle cold starts: a sleeping model returns HTTP 503 with a
        # message like "Model <id> is currently loading". Match loosely
        # (case-insensitive "loading", or the 503 status code) because the
        # exact wording varies between API versions.
        err = str(e)
        if "loading" in err.lower() or "503" in err:
            return "⚠️ The model is waking up (Cold Start). Please wait 30 seconds and try again!"
        return f"Error: {err}"


# 6. Build and launch the chat UI.
demo = gr.ChatInterface(
    fn=ask_api,
    title="🚑 HACK_DOC (API Powered)",
    description="Running on Hugging Face Serverless GPU via API.",
    examples=["I have a sharp pain in my chest.", "What is good for a fever?"],
)

if __name__ == "__main__":
    demo.launch()