# Hugging Face Space – tiny streaming chatbot + JSON API
#   • Gradio UI with incremental token streaming
#   • POST /api/generate → {"response": "..."}
#   • Easily swap the model path / prompt template later
#
# Tested on HF free CPU – 2 vCPU, 16 GB RAM
# Dependencies (requirements.txt): gradio, fastapi, uvicorn,
#   llama-cpp-python, huggingface_hub
# ---------------------------------------------------------
import os

import gradio as gr
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE = "tinyllama-1.1b-chat.q4_K_M.gguf"   # ~0.7 GB, 4-bit quant
N_CTX = 4096        # tokens of context
MAX_TOKENS = 512    # generation limit

# ---------- model load (one-time) ----------
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(model_path=model_path,
            n_ctx=N_CTX,
            n_threads=int(os.getenv("NUM_CPU", str(os.cpu_count() or 4))),  # tune to your vCPU count
            n_gpu_layers=0,       # CPU-only
            logits_all=False,
            use_mlock=True)

SYSTEM_PROMPT = "You are a helpful, concise news assistant."
STOP_TOKENS = ["<|user|>", "<|assistant|>", "</s>"]  # "</s>" ends a TinyLlama-Chat turn

# ---------- streaming generation ----------
def stream_chat(prompt, history):
    # llama.cpp wants the full conversation in a single string
    dialogue = [f"<|system|>{SYSTEM_PROMPT}"]
    for user, bot in history:
        dialogue.append(f"<|user|>{user}")
        dialogue.append(f"<|assistant|>{bot}")
    dialogue.append(f"<|user|>{prompt}")
    dialogue.append("<|assistant|>")   # cue the model to start its reply
    final_prompt = "\n".join(dialogue)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        stop=STOP_TOKENS,
    )
    partial = ""
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        partial += token
        yield partial

# ---------- Gradio interface ----------
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot – streams as it thinks")
    chatbot = gr.Chatbot()
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
        send_btn = gr.Button("Send", variant="primary")

    def user_submit(message, chat_history):
        # append an empty bot slot; bot_reply fills it as tokens stream in
        chat_history = chat_history + [[message, ""]]
        return "", chat_history

    def bot_reply(chat_history):
        user_msg = chat_history[-1][0]
        gen = stream_chat(user_msg, chat_history[:-1])
        for answer in gen:
            chat_history[-1][1] = answer
            yield chat_history

    txt.submit(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
    send_btn.click(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )

# ---------- FastAPI app + JSON API ----------
app = FastAPI()

# optional static assets; mount only if the directory exists
if os.path.isdir("static"):
    app.mount("/static", StaticFiles(directory="static"), name="static")

class GenRequest(BaseModel):
    prompt: str
    max_tokens: int | None = None

@app.post("/api/generate")
def api_generate(req: GenRequest):
    # sync `def` on purpose: FastAPI runs it in a threadpool, so the
    # blocking llama.cpp call does not stall the event loop
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")
    gen = llm.create_completion(
        f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
        max_tokens=req.max_tokens or MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=False,
        stop=STOP_TOKENS,
    )
    return {"response": gen["choices"][0]["text"].strip()}

# mount the Gradio UI last so /api/* routes keep precedence; UI stays on root "/"
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # HF launches `python app.py`
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
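
# ---------- quick smoke test (not part of the app) ----------
# A minimal sketch of how a client might call the JSON API once the Space
# is running; assumes the default local port 7860 — adjust host/port to
# match your deployment:
#
#   curl -X POST http://localhost:7860/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Summarize the top headline in one sentence."}'
#
#   → {"response": "..."}
#
# Or from Python, assuming the `requests` package is installed:
#
#   import requests
#   r = requests.post("http://localhost:7860/api/generate",
#                     json={"prompt": "Hello!", "max_tokens": 64})
#   print(r.json()["response"])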