Spaces:

tomerz14
/

BERT_Text_Source_Classifier

Sleeping

App Files Files Community

tomerz14 committed on Oct 4, 2025

Commit

cfaed4d

verified ·

1 Parent(s): 4d68a43

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -17

app.py CHANGED Viewed

@@ -5,11 +5,12 @@ Gradio App — AI vs Human Document Classifier (Chunked Inference)
 ----------------------------------------------------------------
 Features:
 - Upload a document (TXT/MD/HTML/PDF), chunk if needed, classify each chunk, aggregate to document.
-- Shows:
   1) Probability bars with raw numbers (AI generated / Human written)
   2) Confidence badge ("Likely AI" / "Likely Human") with traffic-light color
   3) Tabs for Basic / Advanced controls
   4) Chunk details accordion with per-chunk probabilities
 """
 import os
@@ -25,7 +26,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 # -----------------------------
 # Config
 # -----------------------------
-MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased")  # e.g., "username/bert-binclass"
 MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
 STRIDE = int(os.getenv("STRIDE", "128"))
@@ -92,7 +93,8 @@ def read_text_from_file(file_obj) -> str:
 def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
     """
     Chunk the document using tokenizer overflow, run classifier on each chunk,
-    aggregate probabilities, and return both doc-level and chunk-level results.
     """
     if not text or not text.strip():
         return {"error": "Empty document."}
@@ -105,6 +107,7 @@ def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: st
             return_overflowing_tokens=True,
             stride=stride,
             padding=True,
             return_tensors="pt",
         )
@@ -131,16 +134,39 @@ def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: st
     prob_human = float(doc_probs[0])
     prob_ai    = float(doc_probs[1])
-    # Per-chunk table rows
-    chunk_rows = []
     for i, p in enumerate(probs):
-        chunk_rows.append([i + 1, float(p[1]), float(p[0])])  # [chunk, AI, Human]
     return {
         "ai_prob": prob_ai,
         "human_prob": prob_human,
         "num_chunks": num_chunks,
-        "chunk_rows": chunk_rows,   # list of [chunk, AI, Human]
         "max_length": max_length,
         "stride": stride,
     }
@@ -194,7 +220,7 @@ def format_outputs(result: Dict[str, Any], threshold: float = 0.5):
     probs_html += probability_bar_html("AI generated", ai)
     probs_html += probability_bar_html("Human written", human)
-    # Chunk table rows
     table_data = result["chunk_rows"]
     details_md = (
@@ -217,9 +243,11 @@ CSS = """
 .prob-bar {flex:1; background:#e5e7eb; height:12px; border-radius:6px; overflow:hidden;}
 .prob-fill {height:12px; background:#6366f1;}
 .small-note {font-size:0.9rem; color:#6b7280;}
-#chunkbox {max-height:260px; overflow:auto;}
 #details_note { font-size: 0.9rem; color: #6b7280; }
-.gr-group { max-height: 260px; overflow: auto; }
 """
 DESCRIPTION = """
@@ -244,17 +272,17 @@ with gr.Blocks(
             probs_html = gr.HTML(label="Probabilities")
             with gr.Accordion("Chunk details", open=False):
-                with gr.Group():
                     chunk_table = gr.Dataframe(
-                        headers=["Chunk", "AI generated", "Human written"],
-                        datatype=["number", "number", "number"],
                         label="Per-chunk probabilities",
                         wrap=True,
                         interactive=False,
-                        row_count=(0, "dynamic"),   # let rows grow
-                        col_count=(3, "fixed"),     # 3 fixed columns
-        )
-                details_md = gr.Markdown("", elem_id="details_note")  # use elem_id for broad compatibility
         with gr.Tab("Advanced"):
             gr.Markdown("Adjust chunking parameters below.")

 ----------------------------------------------------------------
 Features:
 - Upload a document (TXT/MD/HTML/PDF), chunk if needed, classify each chunk, aggregate to document.
+- UI includes:
   1) Probability bars with raw numbers (AI generated / Human written)
   2) Confidence badge ("Likely AI" / "Likely Human") with traffic-light color
   3) Tabs for Basic / Advanced controls
   4) Chunk details accordion with per-chunk probabilities
+  5) NEW: Per-chunk **snippet** extracted using tokenizer offset_mapping
 """
 import os
 # -----------------------------
 # Config
 # -----------------------------
+MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased")
 MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
 STRIDE = int(os.getenv("STRIDE", "128"))
 def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
     """
     Chunk the document using tokenizer overflow, run classifier on each chunk,
+    aggregate probabilities, and return both doc-level and chunk-level results,
+    including a short snippet per chunk derived from offset_mapping.
     """
     if not text or not text.strip():
         return {"error": "Empty document."}
             return_overflowing_tokens=True,
             stride=stride,
             padding=True,
+            return_offsets_mapping=True,   # NEW: get character offsets per token
             return_tensors="pt",
         )
     prob_human = float(doc_probs[0])
     prob_ai    = float(doc_probs[1])
+    # --- Build snippets per chunk from offset mapping ---
+    offsets = enc["offset_mapping"]           # tensor of pairs
+    attn    = enc["attention_mask"]           # [num_chunks, seq_len]
+    snippets: List[str] = []
+    PREVIEW = 120
+    for i in range(offsets.shape[0]):
+        offs = offsets[i].tolist()
+        mask = attn[i].tolist()
+        spans = [(s, e) for (s, e), m in zip(offs, mask) if m == 1 and not (s == 0 and e == 0)]
+        if spans:
+            s0 = min(s for s, _ in spans)
+            e0 = max(e for _, e in spans)
+            raw = text[s0:e0].strip()
+            raw = " ".join(raw.split())
+            if len(raw) > PREVIEW:
+                raw = raw[:PREVIEW].rstrip() + "…"
+            snippets.append(raw)
+        else:
+            snippets.append("")
+    # Per-chunk rows: [chunk#, AI prob, Human prob, Snippet]
+    chunk_rows: List[List[Any]] = []
     for i, p in enumerate(probs):
+        ai_p = float(p[1])
+        hu_p = float(p[0])
+        chunk_rows.append([i + 1, ai_p, hu_p, snippets[i]])
     return {
         "ai_prob": prob_ai,
         "human_prob": prob_human,
         "num_chunks": num_chunks,
+        "chunk_rows": chunk_rows,   # list of [chunk, AI, Human, Snippet]
         "max_length": max_length,
         "stride": stride,
     }
     probs_html += probability_bar_html("AI generated", ai)
     probs_html += probability_bar_html("Human written", human)
+    # Chunk table rows (already built server-side)
     table_data = result["chunk_rows"]
     details_md = (
 .prob-bar {flex:1; background:#e5e7eb; height:12px; border-radius:6px; overflow:hidden;}
 .prob-fill {height:12px; background:#6366f1;}
 .small-note {font-size:0.9rem; color:#6b7280;}
+/* Wrap long snippet text within the DataFrame cells */
+.gr-dataframe table td { white-space: normal; }
+/* Scrollable chunk table container */
+#chunkgroup { max-height: 260px; overflow: auto; }
 #details_note { font-size: 0.9rem; color: #6b7280; }
 """
 DESCRIPTION = """
             probs_html = gr.HTML(label="Probabilities")
             with gr.Accordion("Chunk details", open=False):
+                with gr.Group(elem_id="chunkgroup"):
                     chunk_table = gr.Dataframe(
+                        headers=["Chunk", "AI generated", "Human written", "Snippet"],
+                        datatype=["number", "number", "number", "str"],
                         label="Per-chunk probabilities",
                         wrap=True,
                         interactive=False,
+                        row_count=(0, "dynamic"),
+                        col_count=(4, "fixed"),
+                    )
+                details_md = gr.Markdown("", elem_id="details_note")
         with gr.Tab("Advanced"):
             gr.Markdown("Adjust chunking parameters below.")