bitwise31337 committed on
Commit a971b64 · verified · 1 Parent(s): 507f847

Update app.py

Files changed (1): app.py +227 −80
app.py CHANGED
@@ -1,56 +1,103 @@
- import gradio as gr
- import pandas as pd
- from typing import List, Dict, Any, Tuple
  from functools import lru_cache
- from huggingface_hub import HfApi
  from transformers import pipeline

  ORG = "mediabiasgroup"
  DEFAULT_TASK = "text-classification"
  MAX_MODELS = 10  # safety cap to avoid loading too many models at once on CPU Spaces

  api = HfApi()

  @lru_cache(maxsize=1)
  def list_org_models() -> List[Any]:
      # full=True to fetch pipeline_tag & tags
      return list(api.list_models(author=ORG, full=True))

  def discover_tasks_and_models() -> Tuple[List[str], Dict[str, List[str]]]:
      infos = list_org_models()
      task2models: Dict[str, List[str]] = {}
      for info in infos:
          task = getattr(info, "pipeline_tag", None)
          if not task:
-             # Try to infer from tags if missing
              tags = set(getattr(info, "tags", []) or [])
-             # Very light heuristic; expand if you add other task types later
              if "text-classification" in tags:
                  task = "text-classification"
          if task:
              task2models.setdefault(task, []).append(info.modelId)
-     tasks = sorted(task2models.keys())
-     # Keep deterministic sorting of model ids within each task
      for t in task2models:
          task2models[t] = sorted(task2models[t])
      return tasks, task2models

  @lru_cache(maxsize=256)
  def get_card_data(repo_id: str) -> Dict[str, Any]:
      try:
          info = api.model_info(repo_id)
-         # .cardData is already a parsed dict when available
          data = getattr(info, "cardData", None)
          return data or {}
      except Exception:
          return {}

  def extract_model_index_metrics(repo_id: str) -> pd.DataFrame:
      data = get_card_data(repo_id)
-     rows = []
      if not data:
-         return pd.DataFrame(columns=["model", "dataset", "task", "metric", "value"])
      mi = data.get("model-index") or data.get("model_index") or []
      for entry in mi:
          name = entry.get("name", repo_id)
@@ -60,83 +107,147 @@ def extract_model_index_metrics(repo_id: str) -> pd.DataFrame:
          dset = res.get("dataset", {})
          dname = dset.get("name", dset.get("type", ""))
          for m in res.get("metrics", []):
-             rows.append({
-                 "model": name,
-                 "dataset": dname,
-                 "task": task_type,
-                 "metric": m.get("name", ""),
-                 "value": m.get("value", None),
-                 "repo_id": repo_id
-             })
      if not rows:
-         return pd.DataFrame(columns=["model", "dataset", "task", "metric", "value"])
-     df = pd.DataFrame(rows)
-     # Optional: pivot for nicer viewing in the UI
-     return df

- # Lazy-loaded pipelines cache
  PIPE_CACHE: Dict[str, Any] = {}

  def get_pipeline(repo_id: str, task: str):
      key = f"{task}::{repo_id}"
      if key in PIPE_CACHE:
          return PIPE_CACHE[key]
-     # Use return_all_scores=True so we can compare per-label scores
      if task == "text-classification":
-         pipe = pipeline(task, model=repo_id, tokenizer=repo_id, return_all_scores=True, truncation=True)
      else:
-         # Add more pipelines if you start supporting other tasks
-         pipe = pipeline(task, model=repo_id, tokenizer=repo_id)
      PIPE_CACHE[key] = pipe
      return pipe

  def predict(models: List[str], task: str, text: str) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
      if not text.strip():
          return "Please enter some text.", pd.DataFrame(), pd.DataFrame()
      if not models:
-         return "Please select 1–{} models.".format(MAX_MODELS), pd.DataFrame(), pd.DataFrame()
      if len(models) > MAX_MODELS:
          models = models[:MAX_MODELS]
-
-     # Run inference
-     table_rows = []
-     label_union = set()
-     per_model_outputs = {}
-
      for rid in models:
          try:
              pipe = get_pipeline(rid, task)
              out = pipe(text)
-             # text-classification returns: [ [ {label, score}, ... ] ]
-             if isinstance(out, list) and len(out) and isinstance(out[0], list):
                  scores = {d["label"]: float(d["score"]) for d in out[0]}
-             elif isinstance(out, list) and len(out) and isinstance(out[0], dict) and "label" in out[0]:
-                 # Some classifiers return top-1 only
-                 scores = {out[0]["label"]: float(out[0]["score"])}
              else:
                  scores = {}
              per_model_outputs[rid] = scores
              label_union.update(scores.keys())
          except Exception as e:
-             per_model_outputs[rid] = {"<error>": 0.0}
              label_union.add("<error>")
-
-     # Build a nice table with union of labels as columns
      label_cols = sorted(label_union)
      for rid in models:
          row = {"model": rid}
          scores = per_model_outputs.get(rid, {})
          for lab in label_cols:
              row[lab] = scores.get(lab, 0.0)
-         # Also record the predicted (argmax) label if present
          if scores:
              pred = max(scores.items(), key=lambda kv: kv[1])[0]
              row["predicted_label"] = pred
          else:
              row["predicted_label"] = ""
          table_rows.append(row)
      pred_df = pd.DataFrame(table_rows, columns=["model"] + label_cols + ["predicted_label"])
-
      # Collect reported metrics if present
      metrics_frames = []
      for rid in models:
@@ -146,51 +257,87 @@ def predict(models: List[str], task: str, text: str) -> Tuple[str, pd.DataFrame,
          df.insert(0, "repo_id", rid)
          metrics_frames.append(df)
      metrics_df = pd.concat(metrics_frames, ignore_index=True) if metrics_frames else pd.DataFrame()
-
-     msg = "✓ Done. Compared {} model(s) on task: `{}`".format(len(models), task)
      return msg, pred_df, metrics_df

  def refresh_models(selected_task: str) -> Tuple[List[str], List[str]]:
      tasks, task2models = discover_tasks_and_models()
      models = task2models.get(selected_task, [])
      return tasks, models

  def on_task_change(selected_task: str) -> List[str]:
      _, task2models = discover_tasks_and_models()
      return task2models.get(selected_task, [])

- with gr.Blocks(fill_height=True, title="MediaBiasGroup — Model Comparator") as demo:
-     gr.Markdown(
-         "# MediaBiasGroup — Model Comparator\n"
-         "Select a **task**, choose multiple models, enter text, and compare outputs side-by-side. "
-         "If models provide a `model-index` in their cards, reported metrics are shown below."
-     )
-     with gr.Row():
-         with gr.Column(scale=1):
-             tasks, task2models = discover_tasks_and_models()
-             task_dd = gr.Dropdown(choices=tasks or [DEFAULT_TASK], value=(tasks[0] if tasks else DEFAULT_TASK), label="Task")
-             model_ms = gr.Dropdown(choices=task2models.get(tasks[0], []) if tasks else [], multiselect=True, label="Models")
-             refresh_btn = gr.Button("🔄 Refresh list from Hub")
-             gr.Markdown(
-                 f"**Organization:** `{ORG}` \n"
-                 f"**Max models per run:** {MAX_MODELS}"
-             )
-         with gr.Column(scale=2):
-             text_in = gr.Textbox(lines=4, placeholder="Paste a sentence…", label="Input text")
-             run_btn = gr.Button("Compare")
-             status = gr.Markdown("")
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("### Predictions")
-             pred_df = gr.Dataframe(wrap=True)
-         with gr.Column():
-             gr.Markdown("### Reported metrics (from model cards)")
-             metrics_df = gr.Dataframe(wrap=True)
-
-     # Events wiring
-     task_dd.change(fn=on_task_change, inputs=[task_dd], outputs=[model_ms])
-     refresh_btn.click(fn=refresh_models, inputs=[task_dd], outputs=[task_dd, model_ms])
-     run_btn.click(fn=predict, inputs=[model_ms, task_dd, text_in], outputs=[status, pred_df, metrics_df])

  if __name__ == "__main__":
-     demo.launch()
+ """
2
+ MediaBiasGroup Model Comparator (Gradio Space)
3
+ - Discovers models under the org and groups them by pipeline_tag
4
+ - Lets users pick a task, select multiple models, and compare outputs on the same input
5
+ - Reads any 'model-index' metrics from model cards and shows them in a table
6
+ - Falls back to base_model's tokenizer if a fine-tuned repo lacks tokenizer files
7
+ - Canonicalizes label names across models (e.g., LABEL_0 -> neutral)
8
+
9
+ Requirements (see requirements.txt):
10
+ gradio>=4.31.4
11
+ transformers>=4.42.0
12
+ huggingface_hub>=0.23.0
13
+ torch>=2.2.0
14
+ pandas>=2.0.0
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import os
20
  from functools import lru_cache
21
+ from typing import Any, Dict, List, Tuple
22
 
23
+ import gradio as gr
24
+ import pandas as pd
25
+ from huggingface_hub import HfApi, list_repo_files
26
  from transformers import pipeline
27
 
28
+ # =========================
29
+ # Configuration
30
+ # =========================
31
  ORG = "mediabiasgroup"
32
  DEFAULT_TASK = "text-classification"
33
  MAX_MODELS = 10 # safety cap to avoid loading too many models at once on CPU Spaces
34
 
35
  api = HfApi()
36
 
37
+ # Canonical label mapping (expand as needed)
38
+ CANON = {
39
+ "LABEL_0": "neutral",
40
+ "LABEL_1": "lexical_bias",
41
+ "NEGATIVE": "neutral",
42
+ "POSITIVE": "lexical_bias",
43
+ "neutral": "neutral",
44
+ "not_biased": "neutral",
45
+ "non-biased": "neutral",
46
+ "unbiased": "neutral",
47
+ "biased": "lexical_bias",
48
+ "lexical_bias": "lexical_bias",
49
+ }
50
+
51
+
52
+ # =========================
53
+ # Discovery & metadata
54
+ # =========================
55
  @lru_cache(maxsize=1)
56
  def list_org_models() -> List[Any]:
57
  # full=True to fetch pipeline_tag & tags
58
  return list(api.list_models(author=ORG, full=True))
59
 
60
+
61
  def discover_tasks_and_models() -> Tuple[List[str], Dict[str, List[str]]]:
62
  infos = list_org_models()
63
  task2models: Dict[str, List[str]] = {}
64
  for info in infos:
65
+ # Prefer the explicit pipeline_tag
66
  task = getattr(info, "pipeline_tag", None)
67
+
68
+ # Heuristic fallback via tags if pipeline_tag is missing
69
  if not task:
 
70
  tags = set(getattr(info, "tags", []) or [])
 
71
  if "text-classification" in tags:
72
  task = "text-classification"
73
+
74
  if task:
75
  task2models.setdefault(task, []).append(info.modelId)
76
+
77
+ tasks = sorted(task2models.keys()) or [DEFAULT_TASK]
78
  for t in task2models:
79
  task2models[t] = sorted(task2models[t])
80
  return tasks, task2models
81
 
82
+
83
  @lru_cache(maxsize=256)
84
  def get_card_data(repo_id: str) -> Dict[str, Any]:
85
  try:
86
  info = api.model_info(repo_id)
 
87
  data = getattr(info, "cardData", None)
88
+ if hasattr(data, "data"): # ModelCardData -> dict
89
+ return dict(data.data)
90
  return data or {}
91
  except Exception:
92
  return {}
93
 
94
+
95
  def extract_model_index_metrics(repo_id: str) -> pd.DataFrame:
96
  data = get_card_data(repo_id)
97
+ rows: List[Dict[str, Any]] = []
98
  if not data:
99
+ return pd.DataFrame(columns=["model", "dataset", "task", "metric", "value", "repo_id"])
100
+
101
  mi = data.get("model-index") or data.get("model_index") or []
102
  for entry in mi:
103
  name = entry.get("name", repo_id)
 
107
  dset = res.get("dataset", {})
108
  dname = dset.get("name", dset.get("type", ""))
109
  for m in res.get("metrics", []):
110
+ rows.append(
111
+ {
112
+ "model": name,
113
+ "dataset": dname,
114
+ "task": task_type,
115
+ "metric": m.get("name", ""),
116
+ "value": m.get("value", None),
117
+ "repo_id": repo_id,
118
+ }
119
+ )
120
+
121
  if not rows:
122
+ return pd.DataFrame(columns=["model", "dataset", "task", "metric", "value", "repo_id"])
123
+ return pd.DataFrame(rows)
124
+
125
+
126
+ # =========================
127
+ # Tokenizer fallback logic
128
+ # =========================
129
+ def _has_tokenizer_files(repo_id: str) -> bool:
130
+ try:
131
+ files = set(list_repo_files(repo_id, repo_type="model"))
132
+ except Exception:
133
+ return False
134
+
135
+ if "tokenizer.json" in files:
136
+ return True
137
+ if {"vocab.json", "merges.txt"}.issubset(files):
138
+ return True
139
+ if "spiece.model" in files:
140
+ return True
141
+ return False
142
+
143
+
144
+ def _base_model_from_card(repo_id: str) -> str | None:
145
+ data = get_card_data(repo_id) or {}
146
+ base = data.get("base_model")
147
+ if isinstance(base, list):
148
+ base = base[0] if base else None
149
+ return base
150
+
151
+
152
+ def _tokenizer_source(repo_id: str) -> str:
153
+ # prefer repo tokenizer; else fall back to base_model; else repo_id
154
+ if _has_tokenizer_files(repo_id):
155
+ return repo_id
156
+ base = _base_model_from_card(repo_id)
157
+ return base or repo_id
158
+
159
 
160
+ # =========================
161
+ # Pipelines & prediction
162
+ # =========================
163
  PIPE_CACHE: Dict[str, Any] = {}
164
 
165
+
166
  def get_pipeline(repo_id: str, task: str):
167
  key = f"{task}::{repo_id}"
168
  if key in PIPE_CACHE:
169
  return PIPE_CACHE[key]
170
+
171
+ tok_src = _tokenizer_source(repo_id)
172
+
173
  if task == "text-classification":
174
+ pipe = pipeline(
175
+ task,
176
+ model=repo_id,
177
+ tokenizer=tok_src,
178
+ return_all_scores=True,
179
+ truncation=True,
180
+ )
181
  else:
182
+ # Add more tasks if you release them later
183
+ pipe = pipeline(task, model=repo_id, tokenizer=tok_src)
184
+
185
  PIPE_CACHE[key] = pipe
186
  return pipe
187
 
188
+
189
+ def _canonicalize(scores: Dict[str, float]) -> Dict[str, float]:
190
+ out: Dict[str, float] = {}
191
+ for raw_label, sc in scores.items():
192
+ lab = CANON.get(raw_label, raw_label)
193
+ out[lab] = max(sc, out.get(lab, 0.0))
194
+ return out
195
+
196
+
197
  def predict(models: List[str], task: str, text: str) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
198
  if not text.strip():
199
  return "Please enter some text.", pd.DataFrame(), pd.DataFrame()
200
  if not models:
201
+ return f"Please select 1–{MAX_MODELS} models.", pd.DataFrame(), pd.DataFrame()
202
  if len(models) > MAX_MODELS:
203
  models = models[:MAX_MODELS]
204
+
205
+ table_rows: List[Dict[str, Any]] = []
206
+ label_union: set[str] = set()
207
+ per_model_outputs: Dict[str, Dict[str, float]] = {}
208
+ errors: Dict[str, str] = {}
209
+
210
  for rid in models:
211
  try:
212
  pipe = get_pipeline(rid, task)
213
  out = pipe(text)
214
+
215
+ # text-classification pipeline:
216
+ # typical shape: [ [ {label, score}, ... ] ] or [ {label, score}, ... ]
217
+ scores: Dict[str, float]
218
+ if isinstance(out, list) and out and isinstance(out[0], list):
219
  scores = {d["label"]: float(d["score"]) for d in out[0]}
220
+ elif isinstance(out, list) and out and isinstance(out[0], dict) and "label" in out[0]:
221
+ # some classifiers return flat list
222
+ scores = {d["label"]: float(d["score"]) for d in out}
223
  else:
224
  scores = {}
225
+
226
+ scores = _canonicalize(scores) or {"<no_output>": 1.0}
227
  per_model_outputs[rid] = scores
228
  label_union.update(scores.keys())
229
+
230
  except Exception as e:
231
+ per_model_outputs[rid] = {"<error>": 1.0}
232
  label_union.add("<error>")
233
+ errors[rid] = str(e)
234
+
235
+ # Build table with union of labels as columns
236
  label_cols = sorted(label_union)
237
  for rid in models:
238
  row = {"model": rid}
239
  scores = per_model_outputs.get(rid, {})
240
  for lab in label_cols:
241
  row[lab] = scores.get(lab, 0.0)
 
242
  if scores:
243
  pred = max(scores.items(), key=lambda kv: kv[1])[0]
244
  row["predicted_label"] = pred
245
  else:
246
  row["predicted_label"] = ""
247
  table_rows.append(row)
248
+
249
  pred_df = pd.DataFrame(table_rows, columns=["model"] + label_cols + ["predicted_label"])
250
+
251
  # Collect reported metrics if present
252
  metrics_frames = []
253
  for rid in models:
 
257
  df.insert(0, "repo_id", rid)
258
  metrics_frames.append(df)
259
  metrics_df = pd.concat(metrics_frames, ignore_index=True) if metrics_frames else pd.DataFrame()
260
+
261
+ msg = f"✓ Done. Compared {len(models)} model(s) on task: `{task}`"
262
+ if errors:
263
+ msg += "\n\n**Errors**:\n" + "\n".join(f"- {k}: {v}" for k, v in errors.items())
264
+
265
  return msg, pred_df, metrics_df
266
 
267
+
268
+ # =========================
269
+ # UI wiring
270
+ # =========================
271
  def refresh_models(selected_task: str) -> Tuple[List[str], List[str]]:
272
  tasks, task2models = discover_tasks_and_models()
273
  models = task2models.get(selected_task, [])
274
  return tasks, models
275
 
276
+
277
  def on_task_change(selected_task: str) -> List[str]:
278
  _, task2models = discover_tasks_and_models()
279
  return task2models.get(selected_task, [])
280
 
281
+
282
+ def build_ui() -> gr.Blocks:
283
+ with gr.Blocks(fill_height=True, title="MediaBiasGroup — Model Comparator") as demo:
284
+ gr.Markdown(
285
+ "# MediaBiasGroup Model Comparator\n"
286
+ "Select a **task**, choose multiple models, enter text, and compare outputs side-by-side. "
287
+ "If models provide a `model-index` in their cards, reported metrics appear below."
288
+ )
289
+
290
+ with gr.Row():
291
+ with gr.Column(scale=1):
292
+ tasks, task2models = discover_tasks_and_models()
293
+ task_choices = tasks or [DEFAULT_TASK]
294
+ task_default = task_choices[0] if task_choices else DEFAULT_TASK
295
+
296
+ task_dd = gr.Dropdown(
297
+ choices=task_choices,
298
+ value=task_default,
299
+ label="Task",
300
+ )
301
+ model_ms = gr.Dropdown(
302
+ choices=task2models.get(task_default, []),
303
+ multiselect=True,
304
+ label="Models",
305
+ )
306
+ refresh_btn = gr.Button("🔄 Refresh list from Hub")
307
+ gr.Markdown(f"**Organization:** `{ORG}` \n**Max models per run:** {MAX_MODELS}")
308
+
309
+ with gr.Column(scale=2):
310
+ text_in = gr.Textbox(lines=4, placeholder="Paste a sentence…", label="Input text")
311
+ examples = gr.Examples(
312
+ examples=[
313
+ ["The bill passed the House on Tuesday in a 220–210 vote."], # unbiased/factual
314
+ ["Lawmakers shamelessly rammed the bill through the House on Tuesday."], # biased/loaded
315
+ ["Unemployment fell from 5.2% to 5.0% in July, according to government figures."],
316
+ ["The corrupt regime bragged unemployment fell, but it's just cooking the books."],
317
+ ],
318
+ inputs=[text_in],
319
+ label="Examples",
320
+ )
321
+ run_btn = gr.Button("Compare")
322
+ status = gr.Markdown("")
323
+
324
+ with gr.Row():
325
+ with gr.Column():
326
+ gr.Markdown("### Predictions")
327
+ pred_df = gr.Dataframe(interactive=False)
328
+ with gr.Column():
329
+ gr.Markdown("### Reported metrics (from model cards)")
330
+ metrics_df = gr.Dataframe(interactive=False)
331
+
332
+ # Events
333
+ task_dd.change(fn=on_task_change, inputs=[task_dd], outputs=[model_ms])
334
+ refresh_btn.click(fn=refresh_models, inputs=[task_dd], outputs=[task_dd, model_ms])
335
+ run_btn.click(fn=predict, inputs=[model_ms, task_dd, text_in], outputs=[status, pred_df, metrics_df])
336
+
337
+ return demo
338
+
339
 
340
  if __name__ == "__main__":
341
+ demo = build_ui()
342
+ # queue() allows concurrent requests; adjust concurrency per Space hardware
343
+ demo.queue(max_size=16).launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
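
A note on the label canonicalization this commit introduces: models fine-tuned at different times expose different label schemes (generic LABEL_0/LABEL_1, biased/not_biased, even sentiment-style POSITIVE/NEGATIVE), and `_canonicalize` folds them onto shared columns so the comparison table lines up. A minimal, self-contained sketch of the behaviour; the score values are invented for illustration:

    from typing import Dict

    # Trimmed copy of the mapping and helper from app.py above.
    CANON = {
        "LABEL_0": "neutral",
        "LABEL_1": "lexical_bias",
        "biased": "lexical_bias",
        "not_biased": "neutral",
    }

    def _canonicalize(scores: Dict[str, float]) -> Dict[str, float]:
        out: Dict[str, float] = {}
        for raw_label, sc in scores.items():
            lab = CANON.get(raw_label, raw_label)  # unknown labels pass through
            out[lab] = max(sc, out.get(lab, 0.0))  # keep the max on collisions
        return out

    print(_canonicalize({"LABEL_0": 0.91, "LABEL_1": 0.09}))
    # {'neutral': 0.91, 'lexical_bias': 0.09}
    print(_canonicalize({"biased": 0.72, "not_biased": 0.28}))
    # {'lexical_bias': 0.72, 'neutral': 0.28}

The `max` matters when two raw labels map to the same canonical label; taking the larger score is a conservative merge.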
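
The metrics table is driven entirely by card metadata. A hedged sketch of the round trip, with every name and value invented: if `get_card_data()` returns a parsed `model-index` like the dict below, `extract_model_index_metrics()` emits one row per metric (`task_type` is read from each result's `task` entry in the unchanged part of the function):

    # Hypothetical parsed card data; all names and values are invented.
    card = {
        "model-index": [{
            "name": "hypothetical-bias-classifier",
            "results": [{
                "task": {"type": "text-classification"},
                "dataset": {"name": "some-bias-dataset", "type": "text"},
                "metrics": [{"name": "F1", "value": 0.88}],
            }],
        }]
    }
    # extract_model_index_metrics() would yield a single DataFrame row:
    #   model=hypothetical-bias-classifier, dataset=some-bias-dataset,
    #   task=text-classification, metric=F1, value=0.88, repo_id=<the repo queried>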
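
The tokenizer fallback resolves in a fixed order: the repo's own tokenizer files if any exist (`tokenizer.json`, the `vocab.json` + `merges.txt` pair, or `spiece.model`), then the `base_model` declared in the card, then the repo id itself as a last resort. A condensed sketch of that decision, with the Hub lookups stubbed out and a hypothetical repo id:

    from typing import Optional

    def resolve_tokenizer(repo_id: str, has_tokenizer_files: bool, base_model: Optional[str]) -> str:
        # Mirrors _tokenizer_source() above.
        if has_tokenizer_files:   # 1) tokenizer shipped with the fine-tuned repo
            return repo_id
        if base_model:            # 2) base_model named in the model card
            return base_model
        return repo_id            # 3) let transformers try the repo anyway

    # A fine-tune without tokenizer files whose card says base_model: roberta-base
    # (the repo id here is hypothetical) resolves to the base model's tokenizer:
    assert resolve_tokenizer("mediabiasgroup/hypothetical-finetune", False, "roberta-base") == "roberta-base"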
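
A forward-looking caveat: `return_all_scores=True` still works on the pinned transformers line, but newer releases deprecate it in favour of `top_k`, where `top_k=None` returns all per-label scores. If the pin is ever bumped, the classification branch of `get_pipeline()` could read as the sketch below; `predict()` already tolerates both the nested and the flat output shapes, so the parsing code would not need to change.

    from transformers import pipeline

    # Sketch, assuming a transformers version where return_all_scores is
    # deprecated; top_k=None yields scores for every label.
    pipe = pipeline(
        "text-classification",
        model="mediabiasgroup/hypothetical-model",  # hypothetical repo id
        top_k=None,
        truncation=True,
    )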
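
Finally, one behaviour a reviewer might flag: the 🔄 button re-runs `discover_tasks_and_models()`, but `list_org_models()` is memoized with `@lru_cache(maxsize=1)`, so the Hub is never actually re-queried after the first call. Clearing the cache first would make the refresh live; a minimal sketch, reusing app.py's imports and helpers:

    from typing import List, Tuple

    def refresh_models(selected_task: str) -> Tuple[List[str], List[str]]:
        # Drop the memoized org listing so the next discovery call hits the Hub again.
        list_org_models.cache_clear()
        tasks, task2models = discover_tasks_and_models()
        return tasks, task2models.get(selected_task, [])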