# Gemma-4 / app.py
# Valtry's picture
# Create app.py
# 5e9bfc3 verified
import gradio as gr
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# -------- Model configuration --------
# Hugging Face Hub repository and GGUF file that hold the model weights.
REPO_ID = "Valtry/Gemma-4"  # change this
FILENAME = "google_gemma-4-E2B-it-Q4_K_M.gguf"

# Fetch the weights file from the Hub; the result is handed to Llama below
# as its model path.
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# Load the model once at import time so every request shares this instance.
#   n_ctx=2048      context size passed to llama.cpp
#   n_threads=4     adjust based on CPU
#   n_gpu_layers=0  CPU only (HF free tier)
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_gpu_layers=0)

# -------- FastAPI --------
# Base application; the /generate route is registered on it and the Gradio UI
# is mounted onto it at the bottom of the file.
app = FastAPI()
class Request(BaseModel):
    # JSON body accepted by the POST /generate endpoint.
    # `prompt` is the user's message; it is forwarded to the model as a
    # single user turn.
    prompt: str
# -------- Streaming generator --------
def stream_generate(
    prompt: str,
    *,
    max_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
):
    """Yield the model's reply to ``prompt`` one text fragment at a time.

    The prompt is wrapped as a single user turn followed by an open model
    turn, then passed to the module-level ``llm`` with ``stream=True``.

    Args:
        prompt: User message to send to the model.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to the model.
        top_p: Nucleus-sampling cutoff forwarded to the model.

    Yields:
        Non-empty text fragments, in generation order.
    """
    # NOTE(review): `llm` is shared module state; confirm whether concurrent
    # calls from the API and the UI are safe for this backend.
    formatted_prompt = (
        "<start_of_turn>user\n"
        f"{prompt}\n"
        "<end_of_turn>\n"
        "<start_of_turn>model\n"
    )
    output = llm(
        formatted_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    for chunk in output:
        # A chunk may lack "choices" or carry an empty list; skip it instead
        # of raising IndexError on choices[0].
        choices = chunk.get("choices")
        if not choices:
            continue
        text = choices[0].get("text")
        # Skip empty fragments so consumers are not sent updates that add
        # nothing to the reply.
        if text:
            yield text
# -------- API endpoint --------
@app.post("/generate")
def generate(req: Request):
    # Stream the reply back as plain text so the client can display it while
    # the model is still generating.
    token_stream = stream_generate(req.prompt)
    return StreamingResponse(token_stream, media_type="text/plain")
# -------- Gradio UI --------
def chat_fn(message, history):
    """Stream a chat reply for the Gradio chat interface.

    Yields the reply accumulated so far after every fragment, so each yielded
    value is the full text received up to that point.

    Args:
        message: The user's latest message.
        history: Earlier conversation turns supplied by the caller. It is
            accepted but not used; only ``message`` is sent to the model.

    Yields:
        The growing reply string, one update per generated fragment.
    """
    reply_so_far = ""
    for piece in stream_generate(message):
        reply_so_far = reply_so_far + piece
        yield reply_so_far
# Chat front end whose replies are produced by chat_fn.
ui = gr.ChatInterface(fn=chat_fn)

# Mount UI
# Serve the chat interface at the site root on the same FastAPI application.
app = gr.mount_gradio_app(app=app, blocks=ui, path="/")