Spaces:

huggingface-projects
/

trace-reports

Running

mervenoyan committed on 8 days ago

Commit

2d3963a

1 Parent(s): 8844088

Switch back to Inference Providers (Qwen3.5-9B)

Revert the ZeroGPU/transformers local-inference path. The provider has
JSON mode and 8-way parallel digest calls were noticeably faster than
the serial GPU loop. Keeps the model swap to Qwen3.5-9B, temperature=0
(greedy), max_tokens=1500 on the bulletin call, and a per-call user
reminder restating the 3-sins-and-length-budget constraints. Drops
spaces/transformers/accelerate/torch from requirements.

Files changed (3) hide show

analyze.py +75 -152
app.py +13 -3
requirements.txt +0 -4

analyze.py CHANGED Viewed

@@ -1,101 +1,36 @@
-"""Local Qwen3.6-35B-A3B on ZeroGPU: map (per-session digests) + reduce (bulletin)."""
 import datetime as dt
 import hashlib
 import json
-import re
-import spaces
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from extract import event_role, event_tool_names
 MODEL = "Qwen/Qwen3.5-9B"
-# Lazily populated inside the GPU worker on first call.
-_tokenizer = None
-_model = None
-def _load():
-    """Load tokenizer + model on the GPU worker. Cached after first call.
-    Two ZeroGPU-specific bits:
-    - We touch CUDA once (`torch.cuda.init()` + a 1-element alloc) so the
-      caching allocator's NVML query happens in a known-good state before
-      transformers' loader starts hammering it per-tensor.
-    - `low_cpu_mem_usage=True` makes the loader use meta-tensor init and
-      stream shards onto the device, instead of materialising each tensor
-      on CPU and then `.to("cuda")` (which is what triggered the NVML
-      assert under the new core_model_loading path).
-    """
-    global _tokenizer, _model
-    if _model is None:
-        torch.cuda.init()
-        _ = torch.empty(1, device="cuda")
-        torch.cuda.synchronize()
-        _tokenizer = AutoTokenizer.from_pretrained(MODEL)
-        _model = AutoModelForCausalLM.from_pretrained(
-            MODEL,
-            torch_dtype=torch.bfloat16,
-            device_map="cuda",
-            low_cpu_mem_usage=True,
-        )
-        _model.eval()
-    return _tokenizer, _model
-def _chat(
-    tokenizer,
-    model,
-    messages: list[dict],
-    *,
-    max_new_tokens: int,
-    temperature: float,
-) -> str:
-    text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True,
-        enable_thinking=False,
-    )
-    inputs = tokenizer(text, return_tensors="pt").to(model.device)
-    with torch.inference_mode():
-        out = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            do_sample=temperature > 0,
-            pad_token_id=tokenizer.eos_token_id,
-        )
-    completion_ids = out[0][inputs.input_ids.shape[1]:]
-    return tokenizer.decode(completion_ids, skip_special_tokens=True)
-_FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE | re.MULTILINE)
-def _parse_json(text: str) -> dict:
-    """Forgiving JSON parse: strip markdown fences, find the outermost {...} if needed."""
-    text = _FENCE_RE.sub("", text.strip()).strip()
-    try:
-        return json.loads(text)
-    except json.JSONDecodeError:
-        # Fall back to the first balanced { ... } block.
-        start = text.find("{")
-        end = text.rfind("}")
-        if start != -1 and end != -1 and end > start:
-            return json.loads(text[start : end + 1])
-        raise
 # ---------- map: per-session digest ----------
 _DIGEST_SYSTEM = """You are analysing a single coding-agent session transcript. The TRANSCRIPT shows messages between a HUMAN USER and an AGENT (the AI). Return signals about the HUMAN USER only — never about the agent.
-Return STRICT JSON, no prose, no markdown fences:
 {
   "session_id": <echo>,
   "intent": "<one sentence: what the user was trying to do>",
@@ -107,43 +42,43 @@ Return STRICT JSON, no prose, no markdown fences:
 Hard rules:
 - Only include things the user actually said or did. Do not attribute agent behaviour to the user.
 - top_quotes must literally appear in user messages.
-- Be concise and specific. No invented quotes.
-- Emit JSON only. No commentary."""
-def _digest_one(tokenizer, model, transcript: str, session_id: str) -> dict | None:
     user_prompt = f"session_id: {session_id}\n\nTranscript:\n{transcript}"
-    messages = [
-        {"role": "system", "content": _DIGEST_SYSTEM},
-        {"role": "user", "content": user_prompt},
-    ]
-    for attempt in range(2):
-        try:
-            raw = _chat(
-                tokenizer,
-                model,
-                messages,
-                max_new_tokens=800,
-                temperature=0.4 if attempt == 0 else 0.2,
-            )
-            data = _parse_json(raw)
-            data.setdefault("session_id", session_id)
-            return data
-        except Exception:
-            continue
-    return None
-@spaces.GPU(duration=300)
-def digest_all(transcripts: list[tuple[str, str]]) -> list[dict]:
-    """Run a digest for each (session_id, transcript) sequentially on the GPU worker."""
-    tokenizer, model = _load()
-    results = []
-    for sid, text in transcripts:
-        out = _digest_one(tokenizer, model, text, sid)
-        if out is not None:
-            results.append(out)
-    return results
 # ---------- stats from raw events ----------
@@ -192,6 +127,7 @@ def serial_for(user: str) -> str:
 # ---------- reduce: bulletin generation ----------
 _BULLETIN_SYSTEM = """You are the Hugging Face Roastery. You read agent-trace dataset digests and write a gently savage personality bulletin about the HUMAN USER who was prompting the agent — never about the agent itself. The output is a vintage printed card; every field has a strict length budget. Be specific, be funny, never punch down.
 You will receive:
@@ -220,6 +156,8 @@ Field budgets (hard limits — overflow breaks the layout):
 - sins[].meta:   30-60 chars
 - forecast.body: 270-340 chars, ends with "Lucky <x>: <y>. Avoid: <z>."
 Voice:
 - Sharp but loving — group-chat energy, not insult-comic. Roast habits a thoughtful friend would call out.
 - Sentence case for titles. Smart quotes ( " " ), en-dashes ( – ), em-dashes ( — ). No exclamation marks. No emojis.
@@ -242,58 +180,43 @@ Procedure:
 7. Emit JSON only. No code fences. No commentary."""
-def _bulletin_valid(data: dict) -> bool:
-    """The bulletin must have all 3 sins; budgets are best-effort and not enforced here."""
-    sins = data.get("sins")
-    return isinstance(sins, list) and len(sins) >= 3
-@spaces.GPU(duration=180)
-def _bulletin(digests: list[dict], user: str, dataset_id: str) -> dict:
-    tokenizer, model = _load()
     user_prompt = (
         f"user: {user}\n"
         f"dataset: {dataset_id}\n\n"
         f"digests (JSON list):\n{json.dumps(digests, ensure_ascii=False, indent=2)}\n\n"
-        "Reminder: respect every length budget AND emit EXACTLY 3 sins. "
-        "Tagline must be ≤170 chars; forecast.body must be ≤340 chars. "
-        "Output only the JSON object."
     )
-    messages = [
-        {"role": "system", "content": _BULLETIN_SYSTEM},
-        {"role": "user", "content": user_prompt},
-    ]
-    last_err = None
-    last_data = None
-    for attempt in range(3):
-        raw = _chat(
-            tokenizer,
-            model,
-            messages,
-            max_new_tokens=1500,
-            temperature=0.85 if attempt == 0 else 0.4,
-        )
-        try:
-            data = _parse_json(raw)
-        except Exception as e:
-            last_err = e
-            continue
-        last_data = data
-        if _bulletin_valid(data):
-            return data
-    if last_data is not None:
-        return last_data
-    raise RuntimeError(f"Bulletin JSON parse failed: {last_err}")
 def build_report(
     digests: list[dict],
     user: str,
     dataset_id: str,
     stats: dict,
 ) -> dict:
     """Combine model output + computed stats into the full report dict for render.py."""
-    data = _bulletin(digests, user, dataset_id)
     today = dt.date.today().strftime("%b %d, %Y")
     archetype = data.get("archetype") or ["The", "Unreadable"]
     if not isinstance(archetype, list) or len(archetype) < 2:

+"""InferenceClient calls: map (per-session digests) + reduce (bulletin)."""
 import datetime as dt
 import hashlib
 import json
+import os
+from concurrent.futures import ThreadPoolExecutor
+from huggingface_hub import InferenceClient
 from extract import event_role, event_tool_names
 MODEL = "Qwen/Qwen3.5-9B"
+_NO_THINK = {"chat_template_kwargs": {"enable_thinking": False}}
+def get_client(token: str | None = None) -> InferenceClient:
+    """Build the InferenceClient. Centralised so OAuth swap is one place."""
+    if token is None:  # no explicit token: fall back to the HF_TOKEN environment variable
+        token = os.environ.get("HF_TOKEN")
+    if not token:  # also rejects an empty string, whether passed in or read from the environment
+        raise RuntimeError(
+            "HF_TOKEN is not set. Export it in your shell or pass token= explicitly."
+        )
+    return InferenceClient(model=MODEL, token=token)  # client is bound to the module-level MODEL
 # ---------- map: per-session digest ----------
 _DIGEST_SYSTEM = """You are analysing a single coding-agent session transcript. The TRANSCRIPT shows messages between a HUMAN USER and an AGENT (the AI). Return signals about the HUMAN USER only — never about the agent.
+Return STRICT JSON:
 {
   "session_id": <echo>,
   "intent": "<one sentence: what the user was trying to do>",
 Hard rules:
 - Only include things the user actually said or did. Do not attribute agent behaviour to the user.
 - top_quotes must literally appear in user messages.
+- Be concise and specific. No invented quotes."""
+def digest_session(client: InferenceClient, transcript: str, session_id: str) -> dict:  # one chat call per session; failures are returned as a dict, not raised
     user_prompt = f"session_id: {session_id}\n\nTranscript:\n{transcript}"
+    try:
+        resp = client.chat_completion(
+            messages=[
+                {"role": "system", "content": _DIGEST_SYSTEM},
+                {"role": "user", "content": user_prompt},
+            ],
+            response_format={"type": "json_object"},  # request JSON-object output from the provider
+            max_tokens=800,
+            temperature=0,  # greedy decoding, per the commit message
+            extra_body=_NO_THINK,  # forwards chat_template_kwargs with enable_thinking=False
+        )
+        raw = resp.choices[0].message.content or "{}"  # empty/None content becomes "{}" so json.loads still succeeds
+        data = json.loads(raw)
+        data.setdefault("session_id", session_id)  # only fills session_id when the model did not echo it
+        return data
+    except Exception as e:  # request and parse failures alike are folded into an error record
+        return {"session_id": session_id, "error": str(e)}  # the "error" key is what digest_all filters on
+def digest_all(
+    client: InferenceClient,
+    transcripts: list[tuple[str, str]],  # each item is unpacked below as (session_id, transcript_text)
+    max_workers: int = 8,  # upper bound on concurrent digest_session calls
+) -> list[dict]:
+    """Run digest_session over all transcripts in parallel. Drops error entries."""
+    def _one(item):  # adapter: ex.map hands over one tuple, digest_session wants separate arguments
+        sid, text = item
+        return digest_session(client, text, sid)  # note the argument order: transcript first, then session id
+    with ThreadPoolExecutor(max_workers=max_workers) as ex:  # the single client is shared by every worker thread
+        results = list(ex.map(_one, transcripts))  # list() drains the iterator before the pool is shut down
+    return [r for r in results if "error" not in r]  # keep only digests without an "error" key
 # ---------- stats from raw events ----------
 # ---------- reduce: bulletin generation ----------
+# Adapted from the design handoff's CONTENT_PROMPT.md.
 _BULLETIN_SYSTEM = """You are the Hugging Face Roastery. You read agent-trace dataset digests and write a gently savage personality bulletin about the HUMAN USER who was prompting the agent — never about the agent itself. The output is a vintage printed card; every field has a strict length budget. Be specific, be funny, never punch down.
 You will receive:
 - sins[].meta:   30-60 chars
 - forecast.body: 270-340 chars, ends with "Lucky <x>: <y>. Avoid: <z>."
+The sins array MUST contain exactly 3 objects. Do not emit fewer.
 Voice:
 - Sharp but loving — group-chat energy, not insult-comic. Roast habits a thoughtful friend would call out.
 - Sentence case for titles. Smart quotes ( " " ), en-dashes ( – ), em-dashes ( — ). No exclamation marks. No emojis.
 7. Emit JSON only. No code fences. No commentary."""
+def bulletin(
+    client: InferenceClient,
+    digests: list[dict],  # serialised verbatim into the user prompt via json.dumps
+    user: str,
+    dataset_id: str,
+) -> dict:
+    """Generate the report content (archetype, tagline, sins, forecast). One JSON call."""
     user_prompt = (
         f"user: {user}\n"
         f"dataset: {dataset_id}\n\n"
         f"digests (JSON list):\n{json.dumps(digests, ensure_ascii=False, indent=2)}\n\n"
+        "Reminder: emit EXACTLY 3 sins and respect every length budget. "  # per-call restatement of the system-prompt constraints
+        "Tagline ≤170 chars; forecast.body ≤340 chars."
     )
+    resp = client.chat_completion(
+        messages=[
+            {"role": "system", "content": _BULLETIN_SYSTEM},
+            {"role": "user", "content": user_prompt},
+        ],
+        response_format={"type": "json_object"},  # request JSON-object output from the provider
+        max_tokens=1500,
+        temperature=0,  # greedy decoding, per the commit message
+        extra_body=_NO_THINK,  # forwards chat_template_kwargs with enable_thinking=False
+    )
+    raw = resp.choices[0].message.content or "{}"  # empty/None content becomes "{}", i.e. an empty report dict
+    return json.loads(raw)  # no try/except here: request and JSON decode errors propagate to the caller
 def build_report(
+    client: InferenceClient,
     digests: list[dict],
     user: str,
     dataset_id: str,
     stats: dict,
 ) -> dict:
     """Combine model output + computed stats into the full report dict for render.py."""
+    data = bulletin(client, digests, user, dataset_id)
     today = dt.date.today().strftime("%b %d, %Y")
     archetype = data.get("archetype") or ["The", "Unreadable"]
     if not isinstance(archetype, list) or len(archetype) < 2:

app.py CHANGED Viewed

@@ -5,12 +5,13 @@ endpoint below via `@gradio/client`. Report generation logic is unchanged
 from the original Blocks app.
 """
 from pathlib import Path
 from fastapi.responses import HTMLResponse
 from gradio import Server
-from analyze import build_report, compute_stats, digest_all
 from dataset import fetch_sessions, list_sessions
 from extract import events_to_transcript, truncate_transcript
 from render import bulletin_html, empty_bulletin_html
@@ -34,6 +35,12 @@ def generate_bulletin(
     yield "Connecting…", empty_bulletin_html("Connecting…")
     try:
         yield "Listing sessions…", empty_bulletin_html("Listing sessions…")
         paths = list_sessions(repo_id)
@@ -64,10 +71,10 @@ def generate_bulletin(
         ]
         yield (
-            f"Reading {len(transcripts)} sessions on GPU…",
             empty_bulletin_html("Consulting the traces…"),
         )
-        digests = digest_all(transcripts)
         if not digests:
             yield (
                 "Every per-session digest failed. Try again or lower max sessions.",
@@ -83,6 +90,7 @@ def generate_bulletin(
         owner = _owner_from(repo_id)
         try:
             report = build_report(
                 digests=digests,
                 user=owner,
                 dataset_id=repo_id,
@@ -112,4 +120,6 @@ async def homepage():
 if __name__ == "__main__":
     app.launch(show_error=True)

 from the original Blocks app.
 """
+import os
 from pathlib import Path
 from fastapi.responses import HTMLResponse
 from gradio import Server
+from analyze import build_report, compute_stats, digest_all, get_client
 from dataset import fetch_sessions, list_sessions
 from extract import events_to_transcript, truncate_transcript
 from render import bulletin_html, empty_bulletin_html
     yield "Connecting…", empty_bulletin_html("Connecting…")
+    try:
+        client = get_client()
+    except Exception as e:
+        yield f"❌ {e}", empty_bulletin_html("HF_TOKEN missing")
+        return
     try:
         yield "Listing sessions…", empty_bulletin_html("Listing sessions…")
         paths = list_sessions(repo_id)
         ]
         yield (
+            f"Reading {len(transcripts)} sessions in parallel…",
             empty_bulletin_html("Consulting the traces…"),
         )
+        digests = digest_all(client, transcripts)
         if not digests:
             yield (
                 "Every per-session digest failed. Try again or lower max sessions.",
         owner = _owner_from(repo_id)
         try:
             report = build_report(
+                client=client,
                 digests=digests,
                 user=owner,
                 dataset_id=repo_id,
 if __name__ == "__main__":
+    if not os.environ.get("HF_TOKEN"):  # advisory only: startup continues without a token
+        print("warning: HF_TOKEN not set; the app will error on the first click.")
     app.launch(show_error=True)

requirements.txt CHANGED Viewed

@@ -1,7 +1,3 @@
 gradio>=6.14
 huggingface_hub>=0.28
 Pillow>=10.0
-spaces
-transformers>=4.45
-accelerate>=0.30
-torch

 gradio>=6.14
 huggingface_hub>=0.28
 Pillow>=10.0