"""Gradio app: classify text into a historical time period with a SetFit model.

Loads a fine-tuned SetFit classifier from the Hugging Face Hub, pairs each
prediction with lightweight "evidence" extracted from the input text (years
and era keywords via ``extract_evidence``), and serves it behind a Gradio
Blocks UI with a stable ``/api/predict`` HTTP endpoint.
"""

import json
import os
import pathlib
import shutil

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from setfit import SetFitModel

from evidence import extract_evidence

# Optional: only clear the HF cache if CLEAR_HF_CACHE=1 is set in the Space secrets.
if os.getenv("CLEAR_HF_CACHE") == "1":
    CACHE_DIR = os.path.expanduser("~/.cache/huggingface")
    shutil.rmtree(CACHE_DIR, ignore_errors=True)
    pathlib.Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

MODEL_ID = "DelaliScratchwerk/text-period-setfit"

# Decision thresholds (tuned offline).
TOP_K = 3
UNCERTAINTY_THRESHOLD = 0.516  # minimum top-1 probability to report confidently
MARGIN_THRESHOLD = 0.387       # minimum top-1 / top-2 gap to report confidently

# Class labels: prefer the copy shipped with the model on the Hub; fall back
# to a local labels.json next to this script. Files are opened with a context
# manager so handles are always closed.
try:
    labels_path = hf_hub_download(MODEL_ID, "labels.json")
    with open(labels_path) as fh:
        LABELS = json.load(fh)
except Exception:
    with open("labels.json") as fh:
        LABELS = json.load(fh)

model = SetFitModel.from_pretrained(MODEL_ID)


def format_evidence(ev: dict) -> str:
    """Render the evidence dict from ``extract_evidence`` as Markdown.

    Expects optional keys ``"years"`` (list of strings) and ``"keyword_hits"``
    (mapping of bucket name -> list of matched keywords). Returns a
    placeholder sentence when neither yields anything.
    """
    parts = []
    if ev.get("years"):
        parts.append("**Years found:** " + ", ".join(ev["years"]))
    if ev.get("keyword_hits"):
        for bucket, keys in ev["keyword_hits"].items():
            if keys:
                parts.append(f"**{bucket}:** " + ", ".join(keys))
    return "\n\n".join(parts) if parts else "_No explicit time clues found._"


def predict(txt: str):
    """Classify *txt* into a time period.

    Returns a ``(label, markdown_explanation, score_dict)`` triple. When the
    model's top probability is below UNCERTAINTY_THRESHOLD, or its gap to the
    runner-up is below MARGIN_THRESHOLD, the label is ``"uncertain"`` and the
    top-K candidates are listed instead of a single answer.
    """
    txt = (txt or "").strip()
    if not txt:
        return "—", "Paste some text.", {}

    probs = np.asarray(model.predict_proba([txt])[0], dtype=float).ravel()
    if probs.size == 0:
        return "—", "Model returned no probabilities.", {}
    if probs.size != len(LABELS):
        return (
            "—",
            f"Label mismatch: model has {probs.size} classes, labels.json has {len(LABELS)}",
            {},
        )

    order = np.argsort(probs)[::-1]
    top1 = probs[order[0]]
    top2 = probs[order[1]] if probs.size > 1 else 0.0
    ev = extract_evidence(txt)

    # Uncertain mode: low absolute confidence, or a narrow top-1/top-2 margin.
    if top1 < UNCERTAINTY_THRESHOLD or (top1 - top2) < MARGIN_THRESHOLD:
        topk = [{"label": LABELS[i], "score": float(probs[i])} for i in order[:TOP_K]]
        md = "**Uncertain** — top candidates:\n" + "\n".join(
            f"- **{d['label']}**: {d['score']:.3f}" for d in topk
        )
        return (
            "uncertain",
            md + "\n\n" + format_evidence(ev),
            {LABELS[i]: float(probs[i]) for i in order},
        )

    # Confident mode.
    best = LABELS[order[0]]
    md = "**Reasoning hints**\n\n" + format_evidence(ev)
    return best, md, {LABELS[i]: float(probs[i]) for i in order}


with gr.Blocks(title="Text → Time Period (SetFit)") as demo:
    gr.Markdown("# Text → Time Period (SetFit)")
    with gr.Row():
        text = gr.Textbox(lines=8, label="Paste text")
        with gr.Column():
            pred = gr.Label(label="Predicted")
            reason = gr.Markdown(label="Evidence")
            scores = gr.JSON(label="Scores")
    btn = gr.Button("Submit", variant="primary")
    # Stable API route name; HTTP endpoint will be /api/predict
    btn.click(
        predict,
        inputs=text,
        outputs=[pred, reason, scores],
        api_name="predict",
    )
    gr.Examples(
        examples=[
            "Schools went remote during the pandemic; everyone wore N95s and used Zoom.",
            "Sputnik launched and kicked off the space race.",
            "MySpace was the most popular social network for a while.",
            "Creators blew up on TikTok; companies rolled out ChatGPT-powered tools.",
        ],
        inputs=text,
    )

if __name__ == "__main__":
    demo.launch()