"""Gradio web app serving CodeLlama-7B-Python (GGUF quant) via llama-cpp-python.

Downloads the model on startup, loads it on CPU, and exposes a simple
code-generation UI on port 7860.
"""

import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

print("Starting model download...")

# Download the model file explicitly (better control over caching, and a
# clear error if the filename is wrong for the repo).
try:
    # If your repo has a different filename, change it here.
    model_path = hf_hub_download(
        repo_id="TheBloke/CodeLlama-7B-Python-GGUF",  # Using TheBloke's reliable repo
        filename="codellama-7b-python.Q4_K_M.gguf",  # 4.08GB file
        cache_dir="./models",
    )
    print(f"✓ Model downloaded to: {model_path}")
except Exception as e:
    print(f"Error downloading model: {e}")
    raise

# Load the GGUF model into memory once at startup; `llm` is shared by all
# requests (llama-cpp-python serializes calls internally).
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Context window (tokens)
    n_threads=int(os.getenv("N_THREADS", "2")),  # CPU threads, env-configurable
    n_batch=512,  # Batch size for prompt processing
    verbose=True,
)
print("✓ Model loaded successfully!")


def generate_code(prompt, max_tokens=500, temperature=0.7):
    """Generate code from *prompt* using the globally loaded model.

    Args:
        prompt: Natural-language description of the coding task.
        max_tokens: Upper bound on generated tokens (slider-controlled).
        temperature: Sampling temperature; lower is more deterministic.

    Returns:
        The generated text, or a human-readable error message on failure
        (both are rendered in the same output textbox).
    """
    # Robustness: avoid a full (slow, CPU-bound) generation pass on an
    # empty prompt.
    if not prompt or not prompt.strip():
        return "Error generating code: prompt is empty"
    try:
        response = llm(
            prompt,
            max_tokens=int(max_tokens),
            temperature=temperature,
            # BUG FIX: the stop list previously began with "" (an empty
            # string), which matches immediately and truncates output. It
            # was almost certainly the model's "</s>" EOS token that got
            # stripped as markup; restored here.
            stop=["</s>", "###", "\n\n\n"],
            echo=False,
        )
        return response["choices"][0]["text"]
    except Exception as e:
        return f"Error generating code: {str(e)}"


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="CodeLlama Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🦙 CodeLlama-7B Python Assistant")
    gr.Markdown("AI-powered code generation using CodeLlama-7B (4GB GGUF model)")

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Enter your coding question or task",
                placeholder="Write a Python function to...",
                lines=5,
            )
            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=100,
                    maximum=1000,
                    value=500,
                    step=50,
                    label="Max Tokens",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
            submit_btn = gr.Button("🚀 Generate Code", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear", size="sm")

        with gr.Column():
            output = gr.Textbox(
                label="Generated Code",
                lines=15,
                show_copy_button=True,
            )

    # Button actions
    submit_btn.click(
        fn=generate_code,
        inputs=[prompt_input, max_tokens, temperature],
        outputs=output,
    )
    clear_btn.click(
        fn=lambda: ("", ""),  # blank both the prompt and the output box
        inputs=None,
        outputs=[prompt_input, output],
    )

    # Example prompts
    gr.Examples(
        examples=[
            ["Write a Python function to calculate fibonacci numbers"],
            ["Create a binary search tree class with insert and search methods"],
            ["Write a function to reverse a linked list"],
            ["Implement quicksort algorithm in Python"],
            ["Create a decorator to measure function execution time"],
        ],
        inputs=prompt_input,
    )

    gr.Markdown("""
    ### 💡 Tips:
    - Be specific in your prompts for better results
    - Lower temperature (0.3-0.5) for more focused code
    - Higher temperature (0.7-0.9) for more creative solutions
    - Model works best for Python code generation
    """)

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)