Spaces:

tomerz14
/

BERT_Text_Source_Classifier

Sleeping

App Files Files Community

tomerz14 committed on Oct 4, 2025

Commit

4cf9509

verified ·

1 Parent(s): 4c97017

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -62

app.py CHANGED Viewed

@@ -1,48 +1,44 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
-Gradio App — Binary Text Classifier (Chunked Inference)
--------------------------------------------------------
-- Users upload a document file (txt, md, html, pdf*), we read the text, chunk if needed,
-  and return a prediction with probability.
-- Designed for Hugging Face Spaces.
-* For PDFs, this app uses a simple text extraction via pypdf. For production-quality
-  extraction, consider using `pymupdf` (fitz) or `pdfminer.six`.
 """
 import os
 import io
 import re
-from typing import Dict, Any
 import numpy as np
 import torch
 import gradio as gr
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-)
 # -----------------------------
 # Config
 # -----------------------------
-MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased")   # e.g., "tomerz14/human-vs-AI_bert-classifier"
 MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
 STRIDE = int(os.getenv("STRIDE", "128"))
-# Device selection (CPU by default on Spaces)
 device = torch.device("cuda" if torch.cuda.is_available() else
                       "mps" if torch.backends.mps.is_available() else "cpu")
 if device.type == "mps":
     try:
         torch.set_float32_matmul_precision("high")
     except Exception:
         pass
-# Load model & tokenizer at startup
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, torch_dtype=torch.float32).to(device)
 model.eval()
@@ -50,7 +46,6 @@ model.eval()
 # -----------------------------
 # Utilities
 # -----------------------------
 TEXT_EXTS = {".txt", ".md", ".rtf", ".html", ".htm"}
 PDF_EXTS = {".pdf"}
@@ -87,7 +82,7 @@ def read_text_from_file(file_obj) -> str:
         except Exception as e:
             return f"[PDF parse error] {e}"
-    # Fallback: try to treat as text
     data = file_obj.read()
     if isinstance(data, bytes):
         data = data.decode("utf-8", errors="ignore")
@@ -96,8 +91,8 @@ def read_text_from_file(file_obj) -> str:
 def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
     """
-    Chunk the document using tokenizer overflow, run the classifier on each chunk,
-    and aggregate probabilities (mean or max).
     """
     if not text or not text.strip():
         return {"error": "Empty document."}
@@ -122,27 +117,32 @@ def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: st
             out = model(**batch)
             logits_list.append(out.logits)
-        logits = torch.cat(logits_list, dim=0)            # [num_chunks, num_labels]
-        probs = torch.softmax(logits, dim=-1).cpu().numpy()
         num_chunks = int(probs.shape[0])
-    doc_probs = probs.mean(axis=0) if agg == "mean" else probs.max(axis=0)
-    pred_id = int(np.argmax(doc_probs))
-    id2label = getattr(model.config, "id2label", {0: "LABEL_0", 1: "LABEL_1"})
-    label = id2label.get(pred_id, str(pred_id))
-    score = float(doc_probs[pred_id])
-    all_scores = {id2label.get(i, str(i)): float(doc_probs[i]) for i in range(len(doc_probs))}
     return {
-        "label": label,
-        "score": round(score, 6),
-        "all_scores": all_scores,
         "num_chunks": num_chunks,
-        "tokens_per_chunk": max_length,
         "stride": stride,
-        "model": MODEL_ID,
-        "device": str(device),
     }
@@ -153,10 +153,9 @@ def predict_from_upload(file, aggregation, max_length, stride):
     # Work around gradio temp file behavior
     if hasattr(file, "name") and isinstance(file.name, str):
         with open(file.name, "rb") as f:
-            raw_bytes = f.read()
-        mem = io.BytesIO(raw_bytes)
-        mem.name = os.path.basename(file.name)
-        text = read_text_from_file(mem)
     else:
         text = read_text_from_file(file)
@@ -164,36 +163,109 @@ def predict_from_upload(file, aggregation, max_length, stride):
 # -----------------------------
-# Gradio UI
 # -----------------------------
-DESCRIPTION = """
-## Binary Document Classifier (Chunked)
-Upload a document (TXT/MD/HTML/PDF) and get a **document-level prediction**.
-Long files are **split into overlapping 512-token chunks**, each chunk is classified,
-and probabilities are **aggregated** (mean or max).
-**Tip:** This Space expects a binary classifier with two labels in the loaded checkpoint.
-"""
-with gr.Blocks(title="Binary Document Classifier") as demo:
-    gr.Markdown(DESCRIPTION)
-    file_in = gr.File(label="Upload a document", file_types=[".txt", ".md", ".rtf", ".html", ".htm", ".pdf"])
-    aggregation = gr.Radio(choices=["mean", "max"], value="mean", label="Aggregation over chunks")
-    with gr.Accordion("Advanced", open=False):
-        max_len_in = gr.Slider(128, 1024, value=MAX_LENGTH, step=32, label="Tokens per chunk (max_length)")
-        stride_in  = gr.Slider(0, 512, value=STRIDE, step=16, label="Stride / overlap")
-    btn = gr.Button("Predict")
-    out_json = gr.JSON(label="Prediction")
     btn.click(
-        fn=predict_from_upload,
-        inputs=[file_in, aggregation, max_len_in, stride_in],
-        outputs=[out_json],
-        api_name="predict",
     )
 if __name__ == "__main__":
-    demo.launch()

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
+Gradio App — AI vs Human Document Classifier (Chunked Inference)
+----------------------------------------------------------------
+Features:
+- Upload a document (TXT/MD/HTML/PDF); it is chunked if needed, each chunk is classified, and the chunk results are aggregated into a document-level prediction.
+- Shows:
+  1) Probability bars with raw numbers (AI generated / Human written)
+  2) Confidence badge ("Likely AI" / "Likely Human") with traffic-light color
+  3) Tabs for Basic / Advanced controls
+  4) Chunk details accordion with per-chunk probabilities
 """
 import os
 import io
 import re
+from typing import Dict, Any, List, Tuple
 import numpy as np
 import torch
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 # -----------------------------
 # Config
 # -----------------------------
+MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased")  # e.g., "username/bert-binclass"
 MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
 STRIDE = int(os.getenv("STRIDE", "128"))
+# Device
 device = torch.device("cuda" if torch.cuda.is_available() else
                       "mps" if torch.backends.mps.is_available() else "cpu")
 if device.type == "mps":
     try:
         torch.set_float32_matmul_precision("high")
     except Exception:
         pass
+# Load model & tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, torch_dtype=torch.float32).to(device)
 model.eval()
 # -----------------------------
 # Utilities
 # -----------------------------
 TEXT_EXTS = {".txt", ".md", ".rtf", ".html", ".htm"}
 PDF_EXTS = {".pdf"}
         except Exception as e:
             return f"[PDF parse error] {e}"
+    # Fallback: try as text
     data = file_obj.read()
     if isinstance(data, bytes):
         data = data.decode("utf-8", errors="ignore")
 def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
     """
+    Chunk the document using tokenizer overflow, run classifier on each chunk,
+    aggregate probabilities, and return both doc-level and chunk-level results.
     """
     if not text or not text.strip():
         return {"error": "Empty document."}
             out = model(**batch)
             logits_list.append(out.logits)
+        logits = torch.cat(logits_list, dim=0)  # [num_chunks, num_labels]
+        probs  = torch.softmax(logits, dim=-1).cpu().numpy()
         num_chunks = int(probs.shape[0])
+    # Aggregate
+    if agg == "max":
+        doc_probs = probs.max(axis=0)
+    else:
+        doc_probs = probs.mean(axis=0)
+    # Assumes the checkpoint's label order is 0 -> Human, 1 -> AI (the indices are hard-coded below)
+    prob_human = float(doc_probs[0])
+    prob_ai    = float(doc_probs[1])
+    # Per-chunk table rows
+    chunk_rows = []
+    for i, p in enumerate(probs):
+        chunk_rows.append([i + 1, float(p[1]), float(p[0])])  # [chunk, AI, Human]
     return {
+        "ai_prob": prob_ai,
+        "human_prob": prob_human,
         "num_chunks": num_chunks,
+        "chunk_rows": chunk_rows,   # list of [chunk, AI, Human]
+        "max_length": max_length,
         "stride": stride,
     }
     # Work around gradio temp file behavior
     if hasattr(file, "name") and isinstance(file.name, str):
         with open(file.name, "rb") as f:
+            raw = io.BytesIO(f.read())
+        raw.name = os.path.basename(file.name)
+        text = read_text_from_file(raw)
     else:
         text = read_text_from_file(file)
 # -----------------------------
+# UI Helpers (HTML formatting)
 # -----------------------------
+def probability_bar_html(label: str, prob: float) -> str:
+    """Return an HTML row with label, percent, and a bar."""
+    pct = prob * 100.0
+    return f"""
+      <div class="prob-row"><div class="prob-label"><b>{label}</b></div>
+        <div class="prob-value">{pct:.2f}%</div>
+        <div class="prob-bar">
+          <div class="prob-fill" style="width:{pct:.2f}%"></div>
+        </div>
+      </div>
+    """
+def verdict_badge_html(prob_ai: float, threshold: float = 0.5) -> str:
+    label = "Likely AI" if prob_ai >= threshold else "Likely Human"
+    color = "#ef4444" if prob_ai >= threshold else "#10b981"  # red / green
+    return f"<span class='pill' style='background:{color}22;color:{color}'>{label}</span>"
+def format_outputs(result: Dict[str, Any], threshold: float = 0.5):
+    """Produce (verdict_html, probs_html, chunk_table_data, details_md)."""
+    if "error" in result:
+        return f"<span style='color:#ef4444'>{result['error']}</span>", "", [], ""
+    ai, human = result["ai_prob"], result["human_prob"]
+    verdict_html = verdict_badge_html(ai, threshold=threshold)
+    probs_html = ""
+    probs_html += probability_bar_html("AI generated", ai)
+    probs_html += probability_bar_html("Human written", human)
+    # Chunk table rows
+    table_data = result["chunk_rows"]
+    details_md = (
+        f"**Chunks:** `{result['num_chunks']}`  \n"
+        f"**Tokens per chunk:** `{result['max_length']}`  \n"
+        f"**Stride:** `{result['stride']}`"
+    )
+    return verdict_html, probs_html, table_data, details_md
+# -----------------------------
+# Gradio Interface
+# -----------------------------
+CSS = """
+.pill {padding:6px 12px; border-radius:999px; display:inline-block; margin: 6px 0; font-weight:600;}
+.prob-row {display:flex; align-items:center; gap:10px; margin:6px 0;}
+.prob-label {min-width:140px;}
+.prob-value {min-width:80px; text-align:right; font-variant-numeric: tabular-nums;}
+.prob-bar {flex:1; background:#e5e7eb; height:12px; border-radius:6px; overflow:hidden;}
+.prob-fill {height:12px; background:#6366f1;}
+.small-note {font-size:0.9rem; color:#6b7280;}
+"""
+DESCRIPTION = """
+### 🔎 AI vs Human — Document Classifier
+Upload a file to get **document-level probabilities**.
+Long inputs are **chunked** into overlapping windows; chunk predictions are **aggregated**.
+"""
+with gr.Blocks(
+    title="AI vs Human Document Classifier",
+    theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate"),
+    css=CSS
+) as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Tabs():
+        with gr.Tab("Predict"):
+            file_in = gr.File(label="Upload a document", file_types=[".txt", ".md", ".rtf", ".html", ".htm", ".pdf"])
+            agg_in = gr.Radio(choices=["mean", "max"], value="mean", label="Aggregation over chunks")
+            btn = gr.Button("Predict", variant="primary")
+            verdict_html = gr.HTML(label="Verdict")
+            probs_html = gr.HTML(label="Probabilities")
+            with gr.Accordion("Chunk details", open=False):
+                chunk_table = gr.Dataframe(
+                    headers=["Chunk", "AI generated", "Human written"],
+                    datatype=["number", "number", "number"],
+                    label="Per-chunk probabilities",
+                    wrap=True,
+                    interactive=False,
+                    height=240
+                )
+                details_md = gr.Markdown("", elem_classes=["small-note"])
+        with gr.Tab("Advanced"):
+            gr.Markdown("Adjust chunking parameters below.")
+            max_len_in = gr.Slider(128, 1024, value=MAX_LENGTH, step=32, label="Tokens per chunk (max_length)")
+            stride_in  = gr.Slider(0, 512, value=STRIDE, step=16, label="Stride / overlap")
+            gr.Markdown("You can also set `MODEL_ID`, `MAX_LENGTH`, and `STRIDE` via Space Variables.")
+    def predict_and_prettify(file, aggregation, max_length=MAX_LENGTH, stride=STRIDE):
+        res = predict_from_upload(file, aggregation, max_length, stride)
+        return format_outputs(res)
     btn.click(
+        fn=predict_and_prettify,
+        inputs=[file_in, agg_in, max_len_in, stride_in],
+        outputs=[verdict_html, probs_html, chunk_table, details_md],
     )
 if __name__ == "__main__":
+    demo.launch()