"""Gradio chat UI that generates SAP ABAP code via the Hugging Face Inference API."""

import gradio as gr
from huggingface_hub import InferenceClient

# --- Configuration: Model List ---
# Display label (shown in the dropdown) -> Hugging Face model ID.
MODELS = {
    "Qwen 2.5 Coder 32B (Recommended)": "Qwen/Qwen2.5-Coder-32B-Instruct",
    "Llama 3.1 8B (Best Logic)": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "DeepSeek Coder V2 Lite (Expert)": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
    "Mistral Nemo 12B (Strong)": "mistralai/Mistral-Nemo-Instruct-2407",
    "GLM-4 / CodeGeeX4 9B": "THUDM/codegeex4-all-9b",
}

# Maximum number of history entries forwarded to the model (keeps the prompt
# small). NOTE(review): with tuple-format history this counts *turns*; with
# the newer dict ("messages") format it counts individual messages.
MAX_HISTORY = 10

# Fallback when the dropdown label is unknown.
DEFAULT_MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"

SYSTEM_PROMPT = (
    "You are an expert SAP ABAP Developer. Write modern, efficient ABAP 7.4+ "
    "code. Always use inline declarations."
)


def _history_to_messages(history):
    """Normalize Gradio chat history into OpenAI-style message dicts.

    Supports both Gradio history formats:
      * list of ``[user, bot]`` pairs (classic "tuples" format), and
      * list of ``{"role": ..., "content": ...}`` dicts ("messages" format).

    Only the last ``MAX_HISTORY`` entries are kept. Dict entries are copied
    field-by-field rather than appended verbatim: Gradio may attach extra
    keys (e.g. ``metadata``) or non-string content (file uploads) that the
    chat-completion API rejects, so anything that is not plain user/assistant
    text is skipped.
    """
    messages = []
    for turn in history[-MAX_HISTORY:]:
        if isinstance(turn, (list, tuple)):
            # Classic format: [user_text, bot_text_or_None]
            messages.append({"role": "user", "content": str(turn[0])})
            if len(turn) > 1 and turn[1] is not None:
                messages.append({"role": "assistant", "content": str(turn[1])})
        elif isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
    return messages


def generate_abap(message, history, model_choice):
    """Stream an ABAP-focused chat completion for the Gradio ChatInterface.

    Args:
        message: The user's current prompt.
        history: Prior chat history in either Gradio format
            (see ``_history_to_messages``).
        model_choice: Dropdown label; unknown labels fall back to
            ``DEFAULT_MODEL_ID``.

    Yields:
        Progressively longer partial responses (Gradio streaming protocol).
        On API failure a single human-readable error message is yielded
        instead of raising, so the UI stays responsive.
    """
    model_id = MODELS.get(model_choice, DEFAULT_MODEL_ID)
    client = InferenceClient()

    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": str(message)})

    try:
        stream = client.chat_completion(
            model=model_id,
            messages=messages,
            max_tokens=2048,
            temperature=0.1,
            top_p=0.9,
            stream=True,
        )
        partial_message = ""
        for chunk in stream:
            # Some chunks (e.g. the final one) carry no delta content.
            if chunk.choices and chunk.choices[0].delta.content:
                partial_message += chunk.choices[0].delta.content
                yield partial_message
    except Exception as e:
        # Best-effort boundary: surface the failure in the chat window
        # rather than crashing the UI (the free-tier API is often overloaded
        # or the selected model is too large for it).
        yield f"Error: The Free API is overloaded or model is too large. \nDetails: {str(e)}"


# --- The UI ---
with gr.Blocks(theme="soft") as demo:
    gr.Markdown("# 🚀 ABAP Coder Multi-Model")
    gr.Markdown(
        "Select a model below. **Note:** Qwen 32B is large and may timeout "
        "on the free tier. If it fails, try Llama 3.1 8B."
    )

    model_selector = gr.Dropdown(
        choices=list(MODELS.keys()),
        value="Qwen 2.5 Coder 32B (Recommended)",
        label="Select AI Model",
    )

    chat = gr.ChatInterface(
        fn=generate_abap,
        additional_inputs=[model_selector],
        examples=[
            ["Write a report to select data from MARA using inline declarations.", "Qwen 2.5 Coder 32B (Recommended)"],
            ["Create a CDS View for Sales Orders (VBAK/VBAP).", "Llama 3.1 8B (Best Logic)"],
            ["Explain how to use READ TABLE with ASSIGNING FIELD-SYMBOL.", "DeepSeek Coder V2 Lite (Expert)"],
        ],
    )

if __name__ == "__main__":
    demo.launch()