| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
| import gradio as gr |
|
|
| |
| |
# Fetch the quantized GGUF weights file from the Hugging Face Hub; the call
# returns the local filesystem path of the downloaded file.
model_path = hf_hub_download(
    filename="gemma-4-e2b-it-Q4_K_M.gguf",
    repo_id="ggml-org/gemma-4-E2B-it-GGUF",
)
|
|
| |
| |
# Load the model once at import time so every chat request reuses it.
llm = Llama(
    model_path=model_path,
    chat_format="gemma",  # prompt template used to render chat messages
    n_threads=2,  # NOTE(review): presumably sized for a 2-vCPU host - confirm
    n_ctx=2048,  # context window size, in tokens
)
|
|
| |
def _message_text(content):
    """Return the plain text carried by a history entry's content, or None.

    A bare string is returned unchanged. A list of content parts is reduced
    to the text of its ``{"text": ...}`` parts, joined with newlines. Any
    other value (``None``, file references, ...) has no usable text and
    yields ``None``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "\n".join(texts) if texts else None
    return None


def generate_text(prompt, history):
    """Produce the assistant's reply for one chat turn.

    Args:
        prompt: The newest user message.
        history: Earlier turns of the conversation. Two shapes are accepted:
            role/content dicts (``{"role": ..., "content": ...}``) and
            legacy ``(user_msg, bot_msg)`` pairs. ``None`` or an empty
            sequence means there are no earlier turns.

    Returns:
        The message content of the first choice returned by
        ``llm.create_chat_completion``.

    Note:
        The history is forwarded untrimmed, so a long conversation can
        outgrow the model's context window.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Role/content dicts may carry extra keys; forward only the two
            # that the chat completion call needs.
            entries = [(turn.get("role"), turn.get("content"))]
        else:
            user_msg, bot_msg = turn
            entries = [("user", user_msg), ("assistant", bot_msg)]

        for role, content in entries:
            text = _message_text(content)
            # Skip turns with no text (e.g. a missing bot reply) instead of
            # sending ``None`` content to the model.
            if role in ("user", "assistant") and text is not None:
                messages.append({"role": role, "content": text})

    # The current user message always comes last.
    messages.append({"role": "user", "content": prompt})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )

    return response["choices"][0]["message"]["content"]
|
|
| |
# Wrap the chat handler in Gradio's ready-made chat UI.
demo = gr.ChatInterface(
    generate_text,
    description="Running Google's Gemma 4 (E2B) entirely on a free Hugging Face CPU Space.",
    title="Gemma 4 E2B CPU API",
)

# Start serving the interface as soon as the script runs.
demo.launch()