"""Gradio chat app serving the Gemma 4 E2B GGUF model on CPU through llama.cpp."""

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr

# Context window handed to llama.cpp; kept small for memory safety on a CPU host.
N_CTX = 2048
# Upper bound on tokens generated per reply.
MAX_NEW_TOKENS = 512
# Rough allowance for the chat-template markers wrapped around each message.
# This is an estimate, not a measured value -- TODO confirm against the template.
_PER_MESSAGE_OVERHEAD = 8

# 1. Download the Gemma 4 E2B GGUF model.
# A 4-bit quantization (Q4_K_M) is used as a speed/quality trade-off on a CPU.
model_path = hf_hub_download(
    repo_id="ggml-org/gemma-4-E2B-it-GGUF",
    filename="gemma-4-e2b-it-Q4_K_M.gguf",
)

# 2. Load the model using llama.cpp.
# n_threads=2 is meant to match the 2 vCPUs of the free Hugging Face tier.
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,  # Context window limit for memory safety
    n_threads=2,  # CPU threads
    # Selects llama-cpp-python's built-in Gemma prompt template.
    # NOTE(review): confirm this template matches the Gemma 4 chat format.
    chat_format="gemma",
)


def _content_to_text(content):
    """Return the plain-text portion of a history entry's content.

    Strings are returned as-is; lists are flattened by concatenating the
    text of their parts; dicts contribute their ``"text"`` value when it is
    a string. Anything else (``None``, file tuples, components) yields ``""``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_to_text(part) for part in content)
    return ""


def _history_to_messages(history):
    """Convert Gradio chat history into llama.cpp chat messages.

    Accepts both history layouts: a list of ``(user, assistant)`` pairs and
    a list of ``{"role": ..., "content": ...}`` dicts. Entries without any
    text, and roles other than user/assistant, are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            pairs = [(turn.get("role"), turn.get("content"))]
        else:
            user_msg, bot_msg = turn
            pairs = [("user", user_msg), ("assistant", bot_msg)]
        for role, raw in pairs:
            text = _content_to_text(raw)
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
    return messages


def _trim_to_budget(messages, budget):
    """Drop the oldest messages until the estimated token count fits *budget*.

    The final message (the current prompt) is always kept. After trimming,
    any leading non-user messages are dropped as well so the conversation
    still opens with a user turn.
    """
    costs = [
        len(llm.tokenize(m["content"].encode("utf-8"), add_bos=False))
        + _PER_MESSAGE_OVERHEAD
        for m in messages
    ]
    total = sum(costs)
    last = len(messages) - 1
    start = 0
    while start < last and total > budget:
        total -= costs[start]
        start += 1
    while start < last and messages[start]["role"] != "user":
        start += 1
    return messages[start:]


# 3. Define the generation function
def generate_text(prompt, history):
    """Generate the assistant's reply to *prompt* given the chat *history*.

    Args:
        prompt: The current user message.
        history: Previous turns as supplied by ``gr.ChatInterface`` -- either
            ``(user, assistant)`` pairs or ``{"role", "content"}`` dicts.

    Returns:
        The generated reply text (``""`` if the model returned no content).

    Older turns are discarded when the conversation would not leave room for
    ``MAX_NEW_TOKENS`` of output inside the ``N_CTX`` context window. A single
    prompt larger than that budget is still passed through unchanged.
    """
    messages = _history_to_messages(history)
    # Add the current user prompt
    messages.append({"role": "user", "content": prompt})
    messages = _trim_to_budget(messages, N_CTX - MAX_NEW_TOKENS)

    # Generate the response
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=MAX_NEW_TOKENS,
        temperature=0.7,
    )
    return response["choices"][0]["message"]["content"] or ""


# 4. Launch the Gradio Chat Interface
demo = gr.ChatInterface(
    fn=generate_text,
    title="Gemma 4 E2B CPU API",
    description="Running Google's Gemma 4 (E2B) entirely on a free Hugging Face CPU Space.",
)

demo.launch()