"""Serve a GGUF model through a FastAPI streaming endpoint and a Gradio chat UI.

At import time this module downloads the model file from the Hugging Face Hub,
loads it with llama.cpp, and builds the ASGI application object ``app``.
"""

from typing import Iterator

import gradio as gr
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

# -------- Config --------
REPO_ID = "Valtry/Gemma-4"  # change this
FILENAME = "google_gemma-4-E2B-it-Q4_K_M.gguf"

# Download model from HF
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# Load model
# NOTE(review): this single Llama instance is shared by the API endpoint and
# the Gradio UI; confirm llama_cpp tolerates overlapping calls or serialize
# access if concurrent requests are expected.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=4,  # adjust based on CPU
    n_gpu_layers=0,  # CPU only (HF free tier)
)

# -------- FastAPI --------
app = FastAPI()


class Request(BaseModel):
    """JSON body accepted by ``POST /generate``."""

    # Raw user text; it is wrapped into the prompt format by stream_generate.
    prompt: str


# -------- Streaming generator --------
def stream_generate(
    prompt: str,
    *,
    max_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> Iterator[str]:
    """Yield the model's completion for *prompt* one text piece at a time.

    Args:
        prompt: User text to wrap in the prompt format and send to the model.
        max_tokens: Forwarded to the model call as ``max_tokens``.
        temperature: Forwarded to the model call as ``temperature``.
        top_p: Forwarded to the model call as ``top_p``.

    Yields:
        The ``text`` field of the first choice of every streamed chunk.
    """
    # NOTE(review): the turn markers are the bare words "user" and "model";
    # confirm this matches the chat template the loaded model expects.
    formatted_prompt = f"user\n{prompt}\n\nmodel\n"

    output = llm(
        formatted_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )

    for chunk in output:
        choices = chunk.get("choices")
        # Skip chunks without any choice instead of raising on index 0.
        if not choices:
            continue
        yield choices[0]["text"]


# -------- API endpoint --------
@app.post("/generate")
def generate(req: Request):
    """Stream the completion for ``req.prompt`` back as ``text/plain``."""
    return StreamingResponse(
        stream_generate(req.prompt),
        media_type="text/plain",
    )


# -------- Gradio UI --------
def chat_fn(message, history):
    """Yield the reply accumulated so far after each newly generated piece.

    ``history`` is accepted because the chat interface passes it, but it is
    not used: every message is answered without earlier turns.
    """
    response = ""
    for token in stream_generate(message):
        response += token
        yield response


ui = gr.ChatInterface(chat_fn)

# Mount UI
app = gr.mount_gradio_app(app, ui, path="/")