import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Pick ONE GGUF repo + filename that exists in that repo.
# Examples you can use:
#   - repo_id="bartowski/gemma-2-2b-it-GGUF"     (various quants)
#   - repo_id="BafS/gemma-2-2b-it-Q4_K_M-GGUF"   (single-file Q4_K_M)
REPO_ID = os.getenv("GGUF_REPO_ID", "BafS/gemma-2-2b-it-Q4_K_M-GGUF")
FILENAME = os.getenv("GGUF_FILENAME", "gemma-2-2b-it-q4_k_m.gguf")  # adjust if the repo uses a different name

# Optional: HF_TOKEN is needed if the repo is gated.
HF_TOKEN = os.getenv("HF_TOKEN")
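# Note: the Space also needs a requirements.txt next to app.py for the three
# imports above. A minimal one (package names only; pin versions as needed):
#
#     gradio
#     huggingface_hub
#     llama-cpp-python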
llm = None

def _load_model():
    """Lazily download and load the GGUF model on the first request,
    so the Space starts fast and only pays the download cost once."""
    global llm
    if llm is not None:
        return
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        token=HF_TOKEN,  # required if the repo is gated; harmless if public
    )
    # Conservative defaults for the HF CPU Basic tier
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=max(1, os.cpu_count() or 2),
        n_batch=256,
        verbose=False,
    )
SYSTEM_PROMPT = (
    "You are a product delivery analyst. "
    "Write a concise executive summary in English based ONLY on the data provided. "
    "Be specific with numbers and percentages. Do not invent data."
)
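# Hypothetical shape of the input (illustrative only; the model receives it
# as plain text, so any JSON or free-form period summary works):
#
#     {
#       "period": "2024-Q3",
#       "planned_points": 120,
#       "delivered_points": 96,
#       "scope_churn_pct": 18,
#       "hotspots": ["checkout-service", "billing-worker"]
#     }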
def summarize(period_json: str, max_tokens: int = 350, temperature: float = 0.2):
    _load_model()
    # Gemma 2 instruction-tuned models use <start_of_turn>/<end_of_turn>
    # chat markers, not generic <system>/<user>/<assistant> tags. There is
    # no separate system role, so the system prompt goes into the user turn.
    prompt = f"""<start_of_turn>user
{SYSTEM_PROMPT}

Output format:
1) Overall health (1 sentence)
2) Capacity vs scope (2 bullets)
3) Delivery & predictability (2 bullets)
4) Churn & stability (2 bullets)
5) Risks / hotspots (2 bullets, name components)
6) Recommendation for next period (2 bullets)

Data:
{period_json}<end_of_turn>
<start_of_turn>model
"""
    out = llm(
        prompt,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        top_p=0.9,
        stop=["<end_of_turn>"],  # matches the Gemma 2 turn delimiter above
    )
    return out["choices"][0]["text"].strip()
with gr.Blocks() as demo:
    gr.Markdown("## Gemma 2B (GGUF) – Executive Summary API")
    inp = gr.Textbox(label="Period summary JSON / text", lines=12, placeholder="{ ... }")
    max_t = gr.Slider(64, 700, value=350, step=1, label="max_tokens")
    temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="temperature")
    out = gr.Textbox(label="Summary (EN)", lines=14)
    btn = gr.Button("Summarize")
    btn.click(summarize, inputs=[inp, max_t, temp], outputs=out, api_name="summarize")

demo.launch()
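# Because btn.click is registered with api_name="summarize", the endpoint can
# also be called programmatically. A minimal sketch, assuming a public Space
# named "your-user/your-space" (swap in your actual Space id):
#
#     from gradio_client import Client
#
#     client = Client("your-user/your-space")
#     summary = client.predict(
#         '{"period": "2024-Q3", "planned_points": 120}',  # period_json
#         350,   # max_tokens
#         0.2,   # temperature
#         api_name="/summarize",
#     )
#     print(summary)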