Spaces:

tomerz14
/

BERT_Text_Source_Classifier

Sleeping

App Files Files Community

tomerz14 commited on Oct 4, 2025

Commit

c6f65f2

verified ·

1 Parent(s): 220b4b1

Upload 3 files

Browse files

Files changed (3) hide show

README.md +27 -0
app.py +199 -0
requirements.txt +6 -0

README.md ADDED Viewed

	@@ -0,0 +1,27 @@

+# Binary Document Classifier — Gradio Space
+This Space hosts a Gradio app for **binary text classification** on uploaded documents.
+It supports long documents by **chunking** (512-token windows with overlap) and aggregates
+chunk probabilities into a **document-level** prediction.
+## Configure
+Set the environment variable `MODEL_ID` in your Space to point to your trained model,
+e.g. `your-username/bert-binclass`. You can also set:
+- `MAX_LENGTH` — tokens per chunk (default: 512)
+- `STRIDE` — overlap tokens between chunks (default: 128)
+## Run locally
+```bash
+pip install -r requirements.txt
+python app.py
+```
+Then open the printed Gradio URL.
+## Notes
+- PDF extraction uses `pypdf` for simplicity. For higher-quality results or OCR,
+  consider `pymupdf` (fitz) or `unstructured`.

app.py ADDED Viewed

	@@ -0,0 +1,199 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Gradio App — Binary Text Classifier (Chunked Inference)
+-------------------------------------------------------
+- Users upload a document file (txt, md, html, pdf*), we read the text, chunk if needed,
+  and return a prediction with probability.
+- Designed for Hugging Face Spaces.
+* For PDFs, this app uses a simple text extraction via pypdf. For production-quality
+  extraction, consider using `pymupdf` (fitz) or `pdfminer.six`.
+"""
+import os
+import io
+import re
+from typing import Dict, Any
+import numpy as np
+import torch
+import gradio as gr
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+)
+# -----------------------------
+# Config
+# -----------------------------
+MODEL_ID = os.getenv("MODEL_ID", "bert-base-uncased")   # e.g., "tomerz14/human-vs-AI_bert-classifier"
+MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
+STRIDE = int(os.getenv("STRIDE", "128"))
+# Device selection (CPU by default on Spaces)
+device = torch.device("cuda" if torch.cuda.is_available() else
+                      "mps" if torch.backends.mps.is_available() else "cpu")
+if device.type == "mps":
+    try:
+        torch.set_float32_matmul_precision("high")
+    except Exception:
+        pass
+# Load model & tokenizer at startup
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, torch_dtype=torch.float32).to(device)
+model.eval()
+# -----------------------------
+# Utilities
+# -----------------------------
+TEXT_EXTS = {".txt", ".md", ".rtf", ".html", ".htm"}
+PDF_EXTS = {".pdf"}
+def read_text_from_file(file_obj) -> str:
+    """
+    Read text content from an uploaded file.
+    Supports: .txt, .md, .rtf, .html, .htm, .pdf (via pypdf).
+    """
+    name = getattr(file_obj, "name", "") or ""
+    ext = os.path.splitext(name)[-1].lower()
+    if ext in TEXT_EXTS:
+        data = file_obj.read()
+        if isinstance(data, bytes):
+            data = data.decode("utf-8", errors="ignore")
+        if ext in {".html", ".htm"}:
+            data = re.sub(r"<[^>]+>", " ", data)
+            data = re.sub(r"\s+", " ", data).strip()
+        return data
+    if ext in PDF_EXTS:
+        try:
+            from pypdf import PdfReader
+            reader = PdfReader(io.BytesIO(file_obj.read()))
+            pages = []
+            for p in reader.pages:
+                try:
+                    pages.append(p.extract_text() or "")
+                except Exception:
+                    pages.append("")
+            text = "\n".join(pages)
+            text = re.sub(r"\s+", " ", text).strip()
+            return text
+        except Exception as e:
+            return f"[PDF parse error] {e}"
+    # Fallback: try to treat as text
+    data = file_obj.read()
+    if isinstance(data, bytes):
+        data = data.decode("utf-8", errors="ignore")
+    return data
+def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
+    """
+    Chunk the document using tokenizer overflow, run the classifier on each chunk,
+    and aggregate probabilities (mean or max).
+    """
+    if not text or not text.strip():
+        return {"error": "Empty document."}
+    with torch.no_grad():
+        enc = tokenizer(
+            text,
+            truncation=True,
+            max_length=max_length,
+            return_overflowing_tokens=True,
+            stride=stride,
+            padding=True,
+            return_tensors="pt",
+        )
+        allowed = {"input_ids", "attention_mask", "token_type_ids"}
+        inputs = {k: v.to(model.device) for k, v in enc.items() if k in allowed}
+        logits_list = []
+        for i in range(inputs["input_ids"].size(0)):
+            batch = {k: v[i:i+1] for k, v in inputs.items()}
+            out = model(**batch)
+            logits_list.append(out.logits)
+        logits = torch.cat(logits_list, dim=0)            # [num_chunks, num_labels]
+        probs = torch.softmax(logits, dim=-1).cpu().numpy()
+        num_chunks = int(probs.shape[0])
+    doc_probs = probs.mean(axis=0) if agg == "mean" else probs.max(axis=0)
+    pred_id = int(np.argmax(doc_probs))
+    id2label = getattr(model.config, "id2label", {0: "LABEL_0", 1: "LABEL_1"})
+    label = id2label.get(pred_id, str(pred_id))
+    score = float(doc_probs[pred_id])
+    all_scores = {id2label.get(i, str(i)): float(doc_probs[i]) for i in range(len(doc_probs))}
+    return {
+        "label": label,
+        "score": round(score, 6),
+        "all_scores": all_scores,
+        "num_chunks": num_chunks,
+        "tokens_per_chunk": max_length,
+        "stride": stride,
+        "model": MODEL_ID,
+        "device": str(device),
+    }
+def predict_from_upload(file, aggregation, max_length, stride):
+    if file is None:
+        return {"error": "Please upload a file."}
+    # Work around gradio temp file behavior
+    if hasattr(file, "name") and isinstance(file.name, str):
+        with open(file.name, "rb") as f:
+            raw_bytes = f.read()
+        mem = io.BytesIO(raw_bytes)
+        mem.name = os.path.basename(file.name)
+        text = read_text_from_file(mem)
+    else:
+        text = read_text_from_file(file)
+    return chunked_predict(text, max_length=int(max_length), stride=int(stride), agg=aggregation)
+# -----------------------------
+# Gradio UI
+# -----------------------------
+DESCRIPTION = """
+## Binary Document Classifier (Chunked)
+Upload a document (TXT/MD/HTML/PDF) and get a **document-level prediction**.
+Long files are **split into overlapping 512-token chunks**, each chunk is classified,
+and probabilities are **aggregated** (mean or max).
+**Tip:** This Space expects a binary classifier with two labels in the loaded checkpoint.
+"""
+with gr.Blocks(title="Binary Document Classifier") as demo:
+    gr.Markdown(DESCRIPTION)
+    file_in = gr.File(label="Upload a document", file_types=[".txt", ".md", ".rtf", ".html", ".htm", ".pdf"])
+    aggregation = gr.Radio(choices=["mean", "max"], value="mean", label="Aggregation over chunks")
+    with gr.Accordion("Advanced", open=False):
+        max_len_in = gr.Slider(128, 1024, value=MAX_LENGTH, step=32, label="Tokens per chunk (max_length)")
+        stride_in  = gr.Slider(0, 512, value=STRIDE, step=16, label="Stride / overlap")
+    btn = gr.Button("Predict")
+    out_json = gr.JSON(label="Prediction")
+    btn.click(
+        fn=predict_from_upload,
+        inputs=[file_in, aggregation, max_len_in, stride_in],
+        outputs=[out_json],
+        api_name="predict",
+    )
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+transformers>=4.44
+torch
+evaluate>=0.4.0
+datasets>=2.20
+gradio>=4.0
+pypdf>=4.0