from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr

# 1. Fetch the Gemma 4 E2B GGUF weights from the Hugging Face Hub.
# The Q4_K_M file is a 4-bit quantization, chosen as a speed/quality
# trade-off for CPU-only inference.
MODEL_REPO_ID = "ggml-org/gemma-4-E2B-it-GGUF"
MODEL_FILENAME = "gemma-4-e2b-it-Q4_K_M.gguf"

model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)

# 2. Load the downloaded model through llama.cpp.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # context window size, kept small for memory safety
    n_threads=2,  # matches the 2 vCPUs of the free Hugging Face tier
    chat_format="gemma",  # format chat messages with the "gemma" chat format
)

# 3. Define the generation function
def _content_to_text(content):
    """Return the plain text carried by one chat message, or None if there is none.

    A string is returned unchanged. A list is treated as a sequence of
    content parts: plain strings and dicts with a string "text" field are
    joined with newlines. Any other value (None, file tuples, ...) yields None.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "\n".join(parts) if parts else None
    return None


def _history_to_messages(history):
    """Convert chat history into a list of {"role", "content"} message dicts.

    Two history shapes are accepted:
      * pair style: each entry is a (user_msg, bot_msg) pair;
      * messages style: each entry is a dict with "role" and "content" keys.
    Entries without usable text (e.g. a missing bot reply) are skipped.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            user_msg, bot_msg = entry
            turns = [("user", user_msg), ("assistant", bot_msg)]
        for role, content in turns:
            text = _content_to_text(content)
            if isinstance(role, str) and text is not None:
                messages.append({"role": role, "content": text})
    return messages


def generate_text(prompt: str, history: list) -> str:
    """Generate the assistant's reply to ``prompt`` given the prior conversation.

    Args:
        prompt: The new user message.
        history: Earlier turns of the conversation, either as
            (user_msg, bot_msg) pairs or as role/content dicts. Which shape
            arrives depends on the installed Gradio version and the
            ChatInterface configuration, so both are handled here.

    Returns:
        The text content of the first completion choice.
    """
    # Iterating a role/content dict as if it were a pair would yield its keys,
    # not its values, so history is normalised before building the prompt.
    messages = _history_to_messages(history)

    # Add the current user prompt
    messages.append({"role": "user", "content": prompt})

    # NOTE(review): history is forwarded untrimmed; a long conversation could
    # presumably exceed the model's context window - confirm and trim if needed.
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )

    return response["choices"][0]["message"]["content"]

# 4. Build the Gradio chat UI around generate_text and start the app.
# NOTE(review): nothing below configures an API explicitly; presumably this
# relies on Gradio exposing API endpoints for the app by default - confirm.
APP_TITLE = "Gemma 4 E2B CPU API"
APP_DESCRIPTION = (
    "Running Google's Gemma 4 (E2B) entirely on a free Hugging Face CPU Space."
)

demo = gr.ChatInterface(
    fn=generate_text,
    title=APP_TITLE,
    description=APP_DESCRIPTION,
)

demo.launch()