import re
import time

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# =======================================================
# Session state to track multi-step questions
# =======================================================
# Module-level dict; it is handed to the response generator on every turn.
session_answers = {}

# =======================================================
# Load Model
# =======================================================
model_name = "augtoma/qCammel-13"

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to the EOS token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): trust_remote_code=True allows Python shipped with the
# checkpoint repository to run at load time -- confirm it is required.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
).eval()  # eval() returns the module itself, so `model` is the loaded model

print("Model loaded successfully!")
print(f"Device map: {model.hf_device_map}")

# Device of the first parameter only; other parameters may live elsewhere
# when the device map spreads the model out.
_first_param_device = next(model.parameters()).device
print(f"Model device: {_first_param_device}")

# Bytes currently allocated by torch on CUDA, converted to GiB.
_allocated_gib = torch.cuda.memory_allocated() / 1024**3
print(f"GPU Memory: {_allocated_gib:.2f} GB")

# =======================================================
# Generate the full response, then replay it to the UI in small chunks
# =======================================================
# Instruction preamble placed at the top of every prompt.
_SYSTEM_PROMPT = """You are an experienced doctor conducting a medical consultation. Your role is to:
1. Ask one follow-up question at a time
2. Provide advice or suggestions if possible
3. Be conversational, caring, and thorough\n\n"""

# Disclaimer text removed from earlier turns before they are replayed to the
# model. Nothing in this block appends it to new replies.
_DISCLAIMER = "⚕️ *Note: This is AI-generated information and not a substitute for professional medical advice. Please consult a healthcare provider for proper diagnosis and treatment.*"

_EMPTY_INPUT_REPLY = "⚠️ Please describe your symptoms or ask a question."
_FALLBACK_REPLY = "I understand your concern. Could you please provide more details about your symptoms so I can assist you better?"

# Markers suggesting the model has started writing the patient's next turn.
# The search runs on the original text (not a lower-cased copy) so the match
# offset is always a valid slice index, and the short "P:" label must start
# at a word boundary so words such as "Tip:" or "help:" do not cut the reply.
# NOTE(review): "Thanks" / "How are you" at the very start of a reply still
# discard the whole reply and trigger the fallback text.
_STOP_RE = re.compile(
    r"Patient:|\nPatient|\bP:|How are you|I am feeling|Thanks",
    re.IGNORECASE,
)

# Simulated streaming: characters revealed per frame and pause between frames.
_CHUNK_CHARS = 4
_CHUNK_DELAY_S = 0.015


def _build_prompt(history):
    """Render the system prompt, up to 10 prior messages, and the latest user message."""
    parts = [_SYSTEM_PROMPT]
    # Slicing clamps on short lists, so this is "everything before the last
    # message, capped at the 10 most recent" (about five exchanges).
    for msg in history[-11:-1]:
        speaker = "Patient" if msg["role"] == "user" else "Doctor"
        text = msg["content"].replace(_DISCLAIMER, "").strip()
        parts.append(f"{speaker}: {text}\n")
    parts.append(f"Patient: {history[-1]['content']}\nDoctor:")
    return "".join(parts)


def _clean_response(raw):
    """Trim decoded model output to the doctor's turn, or return the fallback reply."""
    response = raw.strip()

    # Regex search returns the leftmost match, i.e. the earliest stop marker.
    stop = _STOP_RE.search(response)
    if stop is not None:
        response = response[:stop.start()].strip()

    # Drop an echoed speaker label ("doctor:" is 7 characters).
    if response.lower().startswith("doctor:"):
        response = response[7:].strip()

    # Anything shorter than 10 characters is treated as a failed generation.
    if len(response) < 10:
        response = _FALLBACK_REPLY
    return response


def generate_doctor_response(history, session_answers):
    """Generate the assistant reply for the last message in *history*.

    The whole reply is generated first and then replayed in small character
    chunks so the UI appears to stream.

    Args:
        history: List of ``{"role": ..., "content": ...}`` dicts whose last
            entry is the new user message. It is mutated in place: one
            assistant entry is appended.
        session_answers: Accepted for interface compatibility; not read here.

    Yields:
        The history list after each update. Intermediate frames are shallow
        copies whose last entry ends with a cursor character; the final
        frame is *history* itself with the complete reply.
    """
    user_message = history[-1]["content"]

    if not user_message.strip():
        history.append({"role": "assistant", "content": _EMPTY_INPUT_REPLY})
        yield history
        return

    prompt = _build_prompt(history)

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    gen_config = GenerationConfig(
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        max_new_tokens=120,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )

    # Number of prompt tokens, used below to slice off the echoed prompt.
    input_length = inputs["input_ids"].shape[1]

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            generation_config=gen_config
        )

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # Decode only the newly generated tokens, then clean them up.
    generated_ids = output_ids[0][input_length:]
    response = _clean_response(
        tokenizer.decode(generated_ids, skip_special_tokens=True)
    )

    # Append assistant placeholder for streaming
    history.append({"role": "assistant", "content": ""})

    # Reveal the reply a few characters at a time, with a trailing cursor.
    for start in range(0, len(response), _CHUNK_CHARS):
        history[-1]["content"] = response[:start + _CHUNK_CHARS] + "▌"
        yield history.copy()
        time.sleep(_CHUNK_DELAY_S)

    # Final frame: the complete reply without the cursor.
    history[-1]["content"] = response
    yield history

# =======================================================
# Gradio Interface
# =======================================================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🩺 AI Doctor Chat Assistant")

    # Conversation pane; 'messages' type means history is a list of
    # {"role": ..., "content": ...} dicts.
    chatbot = gr.Chatbot(
        label="💬 Doctor Consultation",
        type='messages',
        avatar_images=(
            "https://cdn-icons-png.flaticon.com/512/706/706830.png",  # Patient
            "https://cdn-icons-png.flaticon.com/512/3774/3774299.png"   # Doctor
        ),
        height=500
    )

    with gr.Row():
        user_input = gr.Textbox(
            placeholder="Type your symptoms or question here...",
            label="🧍 Your Message",
            lines=2,
            scale=4
        )

    with gr.Row():
        send_btn = gr.Button("💬 Send", variant="primary", scale=1)
        clear_btn = gr.Button("🧹 Clear Chat", scale=1)

    gr.Examples(
        examples=[
            "I have a fever of 102°F since yesterday",
            "I've been having headaches for the past week",
            "I feel very tired all the time",
            "I have a sore throat and body aches",
        ],
        inputs=user_input,
        label="💡 Example Questions"
    )

    # Response function
    def respond(message, history):
        """Handle one submitted message and stream the assistant's reply.

        Args:
            message: Current textbox contents.
            history: Chatbot history (list of role/content dicts), or None.

        Yields:
            ``(textbox_value, history)`` pairs; the textbox value is always
            the empty string so the input box is cleared.
        """
        if history is None:
            history = []
        if not message.strip():
            # This function is a generator, so a value passed to `return`
            # would be discarded; yield the update, then stop.
            yield "", history
            return
        history.append({"role": "user", "content": message})
        for updated_history in generate_doctor_response(history, session_answers):
            yield "", updated_history

    # Event handlers: both the button and Enter-in-textbox submit a message.
    send_btn.click(respond, [user_input, chatbot], [user_input, chatbot])
    user_input.submit(respond, [user_input, chatbot], [user_input, chatbot])
    # Clearing resets the chatbot to an empty history, bypassing the queue.
    clear_btn.click(lambda: [], None, chatbot, queue=False)

# Launch
if __name__ == "__main__":
    # Enable request queuing, then start the server with share=True.
    demo.queue().launch(share=True)