# SprintReview / app.py
# Author: LeoneNL — commit 8115fb9 ("Create app.py")
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Pick ONE GGUF repo + filename pair that actually exists on the Hub.
# Known-good examples:
# - repo_id="bartowski/gemma-2-2b-it-GGUF"      (multiple quantisations)
# - repo_id="BafS/gemma-2-2b-it-Q4_K_M-GGUF"    (single-file Q4_K_M)
# Both can be overridden at deploy time via environment variables.
REPO_ID = os.getenv("GGUF_REPO_ID", "BafS/gemma-2-2b-it-Q4_K_M-GGUF")
FILENAME = os.getenv("GGUF_FILENAME", "gemma-2-2b-it-q4_k_m.gguf") # adjust if repo uses different name
# Optional: required only when the model repo is gated; harmless if unset.
HF_TOKEN = os.getenv("HF_TOKEN")
llm = None  # lazily-initialised llama_cpp.Llama; populated on first request


def _load_model():
    """Download the GGUF weights from the Hub (once) and build the Llama instance.

    Idempotent: subsequent calls return immediately once ``llm`` is set.
    """
    global llm
    if llm is not None:
        return  # model already resident

    # hf_hub_download caches locally, so repeated cold starts reuse the file.
    weights_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        token=HF_TOKEN,  # used for gated repos; ignored when the repo is public
    )

    # Conservative settings sized for the HF Spaces "CPU Basic" tier.
    llm = Llama(
        model_path=weights_path,
        n_ctx=2048,
        n_threads=max(1, os.cpu_count() or 2),
        n_batch=256,
        verbose=False,
    )
# Instruction prepended to every request; constrains the model to the supplied
# data and forbids fabricated numbers.
SYSTEM_PROMPT = (
    "You are a product delivery analyst. "
    "Write a concise executive summary in English based ONLY on the data provided. "
    "Be specific with numbers and percentages. Do not invent data."
)
def summarize(period_json: str, max_tokens: int = 350, temperature: float = 0.2) -> str:
    """Produce an executive summary of a sprint/period from raw metrics text.

    Args:
        period_json: JSON or free text describing the period's delivery data.
        max_tokens: Generation budget passed to llama.cpp.
        temperature: Sampling temperature (0 = near-deterministic).

    Returns:
        The model's summary with surrounding whitespace stripped.
    """
    _load_model()
    # FIX: gemma-2-it was trained on the <start_of_turn>/<end_of_turn> chat
    # template and has NO system role — the previous <system>/<user>/<assistant>
    # tags are not special tokens for this model, and "</assistant>" would
    # never be emitted as a stop sequence. Fold the system prompt into the
    # single user turn and stop on the model's real end-of-turn marker.
    prompt = (
        "<start_of_turn>user\n"
        f"{SYSTEM_PROMPT}\n"
        "Output format:\n"
        "1) Overall health (1 sentence)\n"
        "2) Capacity vs scope (2 bullets)\n"
        "3) Delivery & predictability (2 bullets)\n"
        "4) Churn & stability (2 bullets)\n"
        "5) Risks / hotspots (2 bullets, name components)\n"
        "6) Recommendation for next period (2 bullets)\n"
        "Data:\n"
        f"{period_json}<end_of_turn>\n"
        "<start_of_turn>model\n"
    )
    out = llm(
        prompt,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=0.9,
        stop=["<end_of_turn>"],
    )
    return out["choices"][0]["text"].strip()
# Minimal Gradio UI: one text input, two generation knobs, one output box.
with gr.Blocks() as demo:
    gr.Markdown("## Gemma 2B (GGUF) – Executive Summary API")
    period_box = gr.Textbox(label="Period summary JSON / text", lines=12, placeholder="{ ... }")
    max_tokens_slider = gr.Slider(64, 700, value=350, step=1, label="max_tokens")
    temperature_slider = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="temperature")
    summary_box = gr.Textbox(label="Summary (EN)", lines=14)
    run_button = gr.Button("Summarize")
    # api_name exposes this handler as a named endpoint for programmatic calls.
    run_button.click(
        summarize,
        inputs=[period_box, max_tokens_slider, temperature_slider],
        outputs=summary_box,
        api_name="summarize",
    )

demo.launch()