Spaces:

BSJ2004
/

gemma

Build error

gemma / app.py

Create app.py

589479e verified 18 days ago

1.56 kB

	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	import gradio as gr

	# 1. Fetch the Gemma 4 E2B GGUF weights from the Hugging Face Hub.
	# The Q4_K_M file is a 4-bit quantization, picked as a speed/quality
	# trade-off for CPU-only inference. hf_hub_download returns the local
	# path of the downloaded file.
	model_path = hf_hub_download(
	    filename="gemma-4-e2b-it-Q4_K_M.gguf",
	    repo_id="ggml-org/gemma-4-E2B-it-GGUF",
	)

	# 2. Start the llama.cpp runtime on the downloaded weights.
	# n_threads is 2 to line up with the 2 vCPUs of the free Hugging Face tier.
	llm = Llama(
	    model_path=model_path,
	    chat_format="gemma",  # Prompt template used for chat completions
	    n_threads=2,  # CPU worker threads
	    n_ctx=2048,  # Context window, kept small for memory safety
	)

	# 3. Define the generation function
	def _text_of(content):
	    """Return the plain text held by one history entry, or None if it has none."""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, list):
	        # NOTE(review): some Gradio releases appear to deliver content as a
	        # list of {"type": "text", "text": ...} parts -- confirm against the
	        # installed Gradio version.
	        pieces = [
	            part["text"]
	            for part in content
	            if isinstance(part, dict) and isinstance(part.get("text"), str)
	        ]
	        return "".join(pieces) if pieces else None
	    # None, file tuples and other non-text payloads carry nothing to send.
	    return None


	def _history_to_messages(history):
	    """Convert Gradio chat history into llama_cpp role/content messages."""
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # Role/content dict history: one entry per message. Unpacking
	            # such a dict as a pair would yield its *keys*, so it must be
	            # handled separately from pair-style history.
	            entries = [(turn.get("role"), turn.get("content"))]
	        else:
	            # Pair-style history: one (user, assistant) pair per turn.
	            user_msg, bot_msg = turn
	            entries = [("user", user_msg), ("assistant", bot_msg)]

	        for role, content in entries:
	            text = _text_of(content)
	            if role in ("user", "assistant") and text is not None:
	                messages.append({"role": role, "content": text})
	    return messages


	def generate_text(prompt, history):
	    """Produce the assistant's reply for one chat turn.

	    Args:
	        prompt: The newest user message.
	        history: Earlier conversation turns as handed over by the chat UI.
	            Accepts (user, assistant) pairs as well as
	            {"role": ..., "content": ...} dicts; entries without text are
	            skipped. May be empty or None.

	    Returns:
	        The message content of the first completion choice.
	    """
	    messages = _history_to_messages(history)

	    # Add the current user prompt
	    messages.append({"role": "user", "content": prompt})

	    # NOTE(review): nothing here trims old turns, so a long conversation can
	    # presumably outgrow the model's configured context window -- confirm
	    # how llama_cpp reacts and whether trimming is wanted.
	    response = llm.create_chat_completion(
	        messages=messages,
	        max_tokens=512,
	        temperature=0.7,
	    )

	    return response["choices"][0]["message"]["content"]

	# 4. Wrap generate_text in a Gradio chat UI and start serving it
	demo = gr.ChatInterface(
	    generate_text,
	    description="Running Google's Gemma 4 (E2B) entirely on a free Hugging Face CPU Space.",
	    title="Gemma 4 E2B CPU API",
	)

	demo.launch()