Create app.py

app.py ADDED
@@ -0,0 +1,197 @@
"""Gradio Space that compares mediabiasgroup models side by side on the same input."""

import gradio as gr
import pandas as pd
from typing import List, Dict, Any, Tuple
from functools import lru_cache

from huggingface_hub import HfApi
from transformers import pipeline

ORG = "mediabiasgroup"
DEFAULT_TASK = "text-classification"
MAX_MODELS = 10  # safety cap to avoid loading too many models at once on CPU Spaces

api = HfApi()

@lru_cache(maxsize=1)
def list_org_models() -> List[Any]:
    # full=True also fetches pipeline_tag & tags
    return list(api.list_models(author=ORG, full=True))

def discover_tasks_and_models() -> Tuple[List[str], Dict[str, List[str]]]:
    infos = list_org_models()
    task2models: Dict[str, List[str]] = {}
    for info in infos:
        task = getattr(info, "pipeline_tag", None)
        if not task:
            # Try to infer the task from tags if pipeline_tag is missing
            tags = set(getattr(info, "tags", []) or [])
            # Very light heuristic; expand if you add other task types later
            if "text-classification" in tags:
                task = "text-classification"
        if task:
            # .id is the canonical repo id; modelId is a legacy alias
            task2models.setdefault(task, []).append(info.id)
    tasks = sorted(task2models.keys())
    # Keep deterministic sorting of model ids within each task
    for t in task2models:
        task2models[t] = sorted(task2models[t])
    return tasks, task2models

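# Illustrative return value of discover_tasks_and_models (actual repo ids
# depend on what the org hosts; the model name below is hypothetical):
#   tasks       -> ["text-classification"]
#   task2models -> {"text-classification": ["mediabiasgroup/example-model", ...]}
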
@lru_cache(maxsize=256)
def get_card_data(repo_id: str) -> Dict[str, Any]:
    try:
        info = api.model_info(repo_id)
        # card_data may be a plain dict or a ModelCardData object depending on
        # the huggingface_hub version; normalize to a dict either way
        data = getattr(info, "card_data", None) or getattr(info, "cardData", None)
        if data is None:
            return {}
        if hasattr(data, "to_dict"):
            data = data.to_dict()
        return data or {}
    except Exception:
        return {}

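# extract_model_index_metrics below reads the standard `model-index` block from
# a model card's YAML metadata. A made-up example of the expected shape:
#
#   model-index:
#   - name: example-bias-classifier
#     results:
#     - task:
#         type: text-classification
#       dataset:
#         name: BABE
#         type: mediabiasgroup/BABE
#       metrics:
#       - name: F1
#         type: f1
#         value: 0.80
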
def extract_model_index_metrics(repo_id: str) -> pd.DataFrame:
    data = get_card_data(repo_id)
    rows = []
    empty = pd.DataFrame(columns=["model", "dataset", "task", "metric", "value"])
    if not data:
        return empty
    mi = data.get("model-index") or data.get("model_index") or []
    for entry in mi:
        name = entry.get("name", repo_id)
        for res in entry.get("results", []):
            task = res.get("task", {})
            task_type = task.get("type", task.get("name", ""))
            dset = res.get("dataset", {})
            dname = dset.get("name", dset.get("type", ""))
            for m in res.get("metrics", []):
                rows.append({
                    "model": name,
                    "dataset": dname,
                    "task": task_type,
                    "metric": m.get("name", m.get("type", "")),
                    "value": m.get("value"),
                })
    if not rows:
        return empty
    # NOTE: repo_id is deliberately not a column here; predict() inserts it,
    # and pd.DataFrame.insert raises if the column already exists
    return pd.DataFrame(rows)

# Lazy-loaded pipelines cache (pipelines are expensive to construct)
PIPE_CACHE: Dict[str, Any] = {}

def get_pipeline(repo_id: str, task: str):
    key = f"{task}::{repo_id}"
    if key in PIPE_CACHE:
        return PIPE_CACHE[key]
    if task == "text-classification":
        # return_all_scores=True yields per-label scores so models can be
        # compared label by label (newer transformers prefer top_k=None)
        pipe = pipeline(task, model=repo_id, tokenizer=repo_id, return_all_scores=True, truncation=True)
    else:
        # Add more pipeline configurations if you start supporting other tasks
        pipe = pipeline(task, model=repo_id, tokenizer=repo_id)
    PIPE_CACHE[key] = pipe
    return pipe

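# Illustrative output of a two-label classifier with return_all_scores=True
# (label names are model-specific placeholders):
#   [[{"label": "Non-biased", "score": 0.93}, {"label": "Biased", "score": 0.07}]]
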
def predict(models: List[str], task: str, text: str) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
    if not text.strip():
        return "Please enter some text.", pd.DataFrame(), pd.DataFrame()
    if not models:
        return f"Please select 1–{MAX_MODELS} models.", pd.DataFrame(), pd.DataFrame()
    if len(models) > MAX_MODELS:
        models = models[:MAX_MODELS]

    # Run inference
    table_rows = []
    label_union = set()
    per_model_outputs = {}

    for rid in models:
        try:
            pipe = get_pipeline(rid, task)
            out = pipe(text)
            # With return_all_scores=True, text-classification returns
            # [[{label, score}, ...]] for a single input
            if isinstance(out, list) and len(out) and isinstance(out[0], list):
                scores = {d["label"]: float(d["score"]) for d in out[0]}
            elif isinstance(out, list) and len(out) and isinstance(out[0], dict) and "label" in out[0]:
                # Some classifiers return only the top-1 prediction
                scores = {out[0]["label"]: float(out[0]["score"])}
            else:
                scores = {}
            per_model_outputs[rid] = scores
            label_union.update(scores.keys())
        except Exception:
            # Keep the failing model visible in the table instead of aborting the run
            per_model_outputs[rid] = {"<error>": 0.0}
            label_union.add("<error>")

    # Build a table with the union of labels as columns
    label_cols = sorted(label_union)
    for rid in models:
        row = {"model": rid}
        scores = per_model_outputs.get(rid, {})
        for lab in label_cols:
            row[lab] = scores.get(lab, 0.0)
        # Also record the predicted (argmax) label if present
        if scores:
            row["predicted_label"] = max(scores.items(), key=lambda kv: kv[1])[0]
        else:
            row["predicted_label"] = ""
        table_rows.append(row)
    pred_df = pd.DataFrame(table_rows, columns=["model"] + label_cols + ["predicted_label"])

    # Collect metrics reported in model cards, if present
    metrics_frames = []
    for rid in models:
        df = extract_model_index_metrics(rid)
        if not df.empty:
            df = df.copy()
            df.insert(0, "repo_id", rid)
            metrics_frames.append(df)
    metrics_df = pd.concat(metrics_frames, ignore_index=True) if metrics_frames else pd.DataFrame()

    msg = f"✓ Done. Compared {len(models)} model(s) on task: `{task}`"
    return msg, pred_df, metrics_df

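# Illustrative pred_df layout for two hypothetical models:
#   model                      Biased  Non-biased  predicted_label
#   mediabiasgroup/model-a       0.81        0.19  Biased
#   mediabiasgroup/model-b       0.12        0.88  Non-biased
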
def refresh_models(selected_task: str):
    # Drop the cached listing so the button actually re-queries the Hub
    list_org_models.cache_clear()
    tasks, task2models = discover_tasks_and_models()
    task = selected_task if selected_task in task2models else (tasks[0] if tasks else DEFAULT_TASK)
    # Return component updates: returning plain lists would set the dropdowns'
    # values rather than their choices
    return (
        gr.update(choices=tasks or [DEFAULT_TASK], value=task),
        gr.update(choices=task2models.get(task, []), value=[]),
    )

def on_task_change(selected_task: str):
    _, task2models = discover_tasks_and_models()
    return gr.update(choices=task2models.get(selected_task, []), value=[])

with gr.Blocks(fill_height=True, title="MediaBiasGroup — Model Comparator") as demo:
    gr.Markdown(
        "# MediaBiasGroup — Model Comparator\n"
        "Select a **task**, choose multiple models, enter text, and compare outputs side-by-side. "
        "If models provide a `model-index` in their cards, reported metrics are shown below."
    )
    with gr.Row():
        with gr.Column(scale=1):
            tasks, task2models = discover_tasks_and_models()
            task_dd = gr.Dropdown(choices=tasks or [DEFAULT_TASK], value=(tasks[0] if tasks else DEFAULT_TASK), label="Task")
            model_ms = gr.Dropdown(choices=task2models.get(tasks[0], []) if tasks else [], multiselect=True, label="Models")
            refresh_btn = gr.Button("🔄 Refresh list from Hub")
            gr.Markdown(
                f"**Organization:** `{ORG}`  \n"
                f"**Max models per run:** {MAX_MODELS}"
            )
        with gr.Column(scale=2):
            text_in = gr.Textbox(lines=4, placeholder="Paste a sentence…", label="Input text")
            run_btn = gr.Button("Compare")
            status = gr.Markdown("")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Predictions")
            pred_df = gr.Dataframe(wrap=True)
        with gr.Column():
            gr.Markdown("### Reported metrics (from model cards)")
            metrics_df = gr.Dataframe(wrap=True)

    # Event wiring
    task_dd.change(fn=on_task_change, inputs=[task_dd], outputs=[model_ms])
    refresh_btn.click(fn=refresh_models, inputs=[task_dd], outputs=[task_dd, model_ms])
    run_btn.click(fn=predict, inputs=[model_ms, task_dd, text_in], outputs=[status, pred_df, metrics_df])

if __name__ == "__main__":
    demo.launch()
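
# Quick local run (assumes the usual Space dependencies are installed;
# transformers pipelines also need a backend such as torch):
#   pip install gradio pandas huggingface_hub transformers torch
#   python app.py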