import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import time
import os

# Configuration - Using a 6.7B model that fits the free tier
MODEL_REPO_ID = "TheBloke/DeepSeek-Coder-6.7B-Instruct-GGUF"
MODEL_FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"


# Download model if not already present
def get_model_path():
    """Download model from Hugging Face Hub or use cached version."""
    try:
        model_path = hf_hub_download(
            repo_id=MODEL_REPO_ID,
            filename=MODEL_FILENAME,
            local_dir="./models",          # Save to models folder
            local_dir_use_symlinks=False,  # Avoid symlink issues
            resume_download=True           # Resume if interrupted
        )
        print(f"✅ Model downloaded to: {model_path}")
        return model_path
    except Exception as e:
        print(f"❌ Error downloading model: {e}")
        # Fall back to a local copy if the file was uploaded alongside the app
        if os.path.exists(MODEL_FILENAME):
            return MODEL_FILENAME
        raise


# Resolve the model path once at startup; the model itself is loaded lazily
MODEL_PATH = get_model_path()
llm = None


def load_model():
    """Lazy-load the model only when needed."""
    global llm
    if llm is None:
        print("⏳ Loading model... This may take 1-2 minutes on first run.")
        start_time = time.time()
        # Settings tuned for free-tier constraints
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,      # Context size (smaller = less memory)
            n_threads=2,     # Free tier provides 2 CPU threads
            n_gpu_layers=0,  # CPU only on the free tier
            verbose=True     # Helpful for debugging
        )
        load_time = time.time() - start_time
        print(f"✅ Model loaded in {load_time:.1f} seconds. Ready for inference.")
    return llm


def generate_code(prompt, max_tokens=256, temperature=0.7):
    """Main generation function."""
    try:
        model = load_model()
        # Format the prompt for DeepSeek-Coder Instruct models
        formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
        output = model(
            formatted_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            echo=False,
            stop=["###", "\n\n\n"]
        )
        return output["choices"][0]["text"].strip()
    except Exception as e:
        return f"❌ Error: {e}"


# Create the Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(
            label="Code Prompt",
            placeholder="Write a Python function to reverse a string...",
            lines=4
        ),
        gr.Slider(
            minimum=32,
            maximum=512,
            value=256,
            step=32,
            label="Max Tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        )
    ],
    outputs=gr.Code(
        label="Generated Code",
        language="python"
    ),
    title="💻 DeepSeek Coder 6.7B Instruct - o87Dev",
    description="**CPU Deployment** - Running on Hugging Face Spaces free tier. ⚠️ **First request loads the model (~1-2 min)**",
    # Each example supplies a value for every input: prompt, max tokens, temperature
    examples=[
        ["Write a Python function to check if a number is prime", 256, 0.7],
        ["Create a React component for a login form", 256, 0.7],
        ["Explain the binary search algorithm in Python", 256, 0.7]
    ]
)

# Enable the request queue for better handling on the free tier, then launch
if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
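
# --- Usage note (a hedged sketch, not executed by this app) ---
# Once the Space is running, it can also be queried programmatically with the
# gradio_client package. The Space ID below is hypothetical; substitute the
# actual "<username>/<space-name>" of this deployment. "/predict" is the
# default endpoint name that gr.Interface exposes.
#
#   from gradio_client import Client
#
#   client = Client("o87Dev/deepseek-coder-cpu")  # hypothetical Space ID
#   result = client.predict(
#       "Write a Python function to reverse a string",  # prompt
#       256,   # max_tokens
#       0.7,   # temperature
#       api_name="/predict"
#   )
#   print(result)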