# gemma / app.py
# Page metadata: uploaded by BSJ2004; commit "Create app.py" (589479e, verified).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
# 1. Fetch the Gemma 4 E2B GGUF weights from the Hugging Face Hub
# The Q4_K_M (4-bit) quantization is picked as a speed/quality trade-off for CPU use
_MODEL_REPO_ID = "ggml-org/gemma-4-E2B-it-GGUF"
_MODEL_FILENAME = "gemma-4-e2b-it-Q4_K_M.gguf"
# The value returned here is what gets handed to Llama as its model_path
model_path = hf_hub_download(repo_id=_MODEL_REPO_ID, filename=_MODEL_FILENAME)
# 2. Create the llama.cpp model instance from the downloaded file
# n_threads is 2 to match the 2 vCPUs provided by the free Hugging Face tier
_llama_options = {
    "model_path": model_path,
    "n_ctx": 2048,  # context window cap, kept small for memory safety
    "n_threads": 2,  # CPU threads
    "chat_format": "gemma",  # select the library's "gemma" chat format
}
llm = Llama(**_llama_options)
# 3. Define the generation function
def _message_text(content):
    """Return the plain text of one chat message's content, or None if it has none.

    A ``str`` is returned unchanged. A list/tuple of content blocks is reduced
    to the concatenated ``"text"`` entries of its dict blocks. Anything else
    (``None``, file references, ...) yields ``None`` so the caller can skip it.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        parts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "".join(parts) if parts else None
    return None


def generate_text(prompt, history):
    """Generate the assistant's reply to ``prompt`` given the earlier ``history``.

    Args:
        prompt: The latest user message.
        history: Earlier turns of the conversation. Two layouts are accepted:
            legacy ``(user_msg, bot_msg)`` pairs, and "messages"-style dicts
            carrying ``"role"`` and ``"content"`` keys. ``None`` or an empty
            sequence means there is no prior conversation.

    Returns:
        The ``content`` of the first choice returned by
        ``llm.create_chat_completion``.

    Raises:
        ValueError: If a non-dict history entry does not unpack into exactly
            two items.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages layout: forward only role + text; any other keys on
            # the entry are dropped rather than passed to the model.
            role = turn.get("role")
            text = _message_text(turn.get("content"))
            if role and text is not None:
                messages.append({"role": role, "content": text})
            continue

        # Legacy layout: one (user, assistant) pair per turn. Either side is
        # skipped when it carries no text (e.g. a turn that was never answered).
        user_msg, bot_msg = turn
        for role, content in (("user", user_msg), ("assistant", bot_msg)):
            text = _message_text(content)
            if text is not None:
                messages.append({"role": role, "content": text})

    # Add the current user prompt
    messages.append({"role": "user", "content": prompt})

    # Generate the response
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )
    return response["choices"][0]["message"]["content"]
# 4. Build the Gradio chat interface around generate_text and start serving it
# No API-specific options are passed; launch() is called with its defaults
_APP_TITLE = "Gemma 4 E2B CPU API"
_APP_DESCRIPTION = "Running Google's Gemma 4 (E2B) entirely on a free Hugging Face CPU Space."
demo = gr.ChatInterface(
    fn=generate_text,
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)
demo.launch()