Spaces:

KavinduHansaka
/

Toxic_Comment_Classifier

Sleeping

App Files Files Community

KavinduHansaka committed on Jan 23

Commit

4d7e6ac

verified ·

1 Parent(s): 2a6aea4

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -66

app.py CHANGED Viewed

@@ -1,32 +1,49 @@
 import gradio as gr
 import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from typing import List, Tuple
-from pathlib import Path
-import fitz  # PyMuPDF
 import docx
 # =========================
-# Configuration
 # =========================
 MODEL_NAME = "openai-community/roberta-base-openai-detector"
 AI_THRESHOLD = 0.5
-MAX_LENGTH = 512
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
 # =========================
-# Model Loading (once)
 # =========================
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 model.to(DEVICE)
 model.eval()
 # =========================
-# File Loaders
 # =========================
 def load_text_from_file(file_path: str) -> str:
     path = Path(file_path)
@@ -38,26 +55,19 @@ def load_text_from_file(file_path: str) -> str:
         return path.read_text(encoding="utf-8", errors="ignore")
     if path.suffix == ".pdf":
-        return load_pdf(path)
     if path.suffix == ".docx":
-        return load_docx(path)
-def load_pdf(path: Path) -> str:
-    text = []
-    with fitz.open(path) as pdf:
-        for page in pdf:
-            text.append(page.get_text())
-    return "\n".join(text)
-def load_docx(path: Path) -> str:
-    document = docx.Document(path)
-    return "\n".join(p.text for p in document.paragraphs if p.text.strip())
 # =========================
-# Text Utilities
 # =========================
 def chunk_text(text: str, max_words: int = 200) -> List[str]:
     words = text.split()
@@ -70,26 +80,52 @@ def chunk_text(text: str, max_words: int = 200) -> List[str]:
     return chunks
 # =========================
-# Core Logic
 # =========================
 @torch.no_grad()
-def detect_ai_probability(texts: List[str]) -> List[float]:
-    inputs = tokenizer(
-        texts,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=MAX_LENGTH
-    ).to(DEVICE)
-    logits = model(**inputs).logits
-    probs = torch.softmax(logits, dim=1)[:, 1]  # AI-generated class
-    return probs.cpu().tolist()
-def classify_chunks(chunks: List[str]) -> pd.DataFrame:
-    probabilities = detect_ai_probability(chunks)
     df = pd.DataFrame({
         "Text Chunk": chunks,
@@ -97,76 +133,88 @@ def classify_chunks(chunks: List[str]) -> pd.DataFrame:
         "Prediction": [
             "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
             for p in probabilities
         ]
     })
     return df
-def run_detector(text_input: str, uploaded_files) -> Tuple[pd.DataFrame, Tuple[str, bytes]]:
-    texts: List[str] = []
-    # Manual text input
     if text_input.strip():
         texts.append(text_input.strip())
-    # File inputs
     if uploaded_files:
         for file in uploaded_files:
-            extracted_text = load_text_from_file(file.name)
-            texts.append(extracted_text)
     if not texts:
         return pd.DataFrame({"Error": ["No input provided"]}), None
-    # Chunk all inputs
-    all_chunks = []
     for text in texts:
-        all_chunks.extend(chunk_text(text))
-    if not all_chunks:
         return pd.DataFrame({"Error": ["Text too short for analysis"]}), None
-    result_df = classify_chunks(all_chunks)
-    # Document-level summary
-    avg_score = result_df["AI Probability"].mean()
-    summary_row = pd.DataFrame([{
-        "Text Chunk": "📄 Document Summary",
-        "AI Probability": round(avg_score, 4),
-        "Prediction": "🤖 Likely AI" if avg_score >= AI_THRESHOLD else "🧍 Human"
-    }])
-    final_df = pd.concat([result_df, summary_row], ignore_index=True)
-    csv_bytes = final_df.to_csv(index=False).encode("utf-8")
-    return final_df, ("ai_document_detection.csv", csv_bytes)
 # =========================
-# Gradio UI (HF Space)
 # =========================
 with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
     gr.Markdown("## 🧪 Offline AI Document Detector")
     gr.Markdown(
-        "Analyze **PDF, Word, TXT, or pasted text** to detect whether content is AI-generated. "
-        "Runs fully offline using an open-source RoBERTa model."
     )
     text_input = gr.Textbox(
         lines=6,
-        label="✍️ Paste Text (optional)",
-        placeholder="Paste any text here..."
     )
     file_input = gr.File(
-        label="📂 Upload Documents (PDF, DOCX, TXT)",
         file_types=[".pdf", ".docx", ".txt"],
         file_count="multiple"
     )
     analyze_btn = gr.Button("🔍 Analyze")
-    output_table = gr.Dataframe(label="📊 Detection Results")
-    download_file = gr.File(label="⬇️ Download Results")
     analyze_btn.click(
         fn=run_detector,

+import os
+import tempfile
+from pathlib import Path
+from typing import List
 import gradio as gr
 import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import docx
+try:
+    import fitz  # PyMuPDF
+except ImportError as e:
+    raise ImportError("Missing dependency: PyMuPDF") from e
 # =========================
+# CPU OPTIMIZATION
+# =========================
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+torch.set_num_threads(2)
+torch.set_grad_enabled(False)
+# =========================
+# CONFIGURATION
 # =========================
 MODEL_NAME = "openai-community/roberta-base-openai-detector"
 AI_THRESHOLD = 0.5
+MAX_LENGTH = 256
+BATCH_SIZE = 8
+DEVICE = "cpu"
 SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
 # =========================
+# MODEL LOADING (ONCE)
 # =========================
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 model.to(DEVICE)
 model.eval()
 # =========================
+# FILE LOADERS
 # =========================
 def load_text_from_file(file_path: str) -> str:
     path = Path(file_path)
         return path.read_text(encoding="utf-8", errors="ignore")
     if path.suffix == ".pdf":
+        text = []
+        with fitz.open(path) as pdf:
+            for page in pdf:
+                text.append(page.get_text())
+        return "\n".join(text)
     if path.suffix == ".docx":
+        document = docx.Document(path)
+        return "\n".join(p.text for p in document.paragraphs if p.text.strip())
 # =========================
+# TEXT UTILITIES
 # =========================
 def chunk_text(text: str, max_words: int = 200) -> List[str]:
     words = text.split()
     return chunks
 # =========================
+# CONFIDENCE CALIBRATION
+# =========================
def calibrate_confidence(prob: float) -> str:
    """Map a probability to a coarse confidence label.

    The label depends only on how far ``prob`` lies from ``AI_THRESHOLD``:
    a margin of at least 0.35 is "High", at least 0.15 is "Medium", and
    anything closer to the threshold is "Low".
    """
    margin = abs(prob - AI_THRESHOLD)
    # Bands are checked from the widest margin down; the first match wins.
    for cutoff, label in ((0.35, "High"), (0.15, "Medium")):
        if margin >= cutoff:
            return label
    return "Low"
+# =========================
+# AI DETECTION (BATCHED)
 # =========================
 @torch.no_grad()
def detect_ai_probability(texts: List[str], progress=gr.Progress()) -> List[float]:
    """Score each text with the detector model, one batch at a time.

    Args:
        texts: Text chunks to score.
        progress: Gradio progress tracker; called with ``(done, total)``
            before each batch and once more after the last batch.

    Returns:
        One float per input text, in input order: the softmax value at
        class index 1 of the model's logits. An empty input yields an
        empty list.
    """
    if not texts:
        # Nothing to score; skip the tokenizer, model and progress calls.
        return []

    probabilities: List[float] = []
    total = len(texts)
    for start in range(0, total, BATCH_SIZE):
        progress((start, total))
        batch = texts[start:start + BATCH_SIZE]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(DEVICE)  # keep the inputs on the same device as the model
        logits = model(**inputs).logits
        # NOTE(review): index 1 is used as the "AI-generated" score by the
        # rest of the app; confirm against model.config.id2label that this
        # checkpoint does not map index 1 to the human-written class.
        scores = torch.softmax(logits, dim=1)[:, 1]
        probabilities.extend(scores.tolist())
    progress((total, total))
    return probabilities
+# =========================
+# CLASSIFICATION LOGIC
+# =========================
+def classify_chunks(chunks: List[str], progress=gr.Progress()) -> pd.DataFrame:
+    probabilities = detect_ai_probability(chunks, progress)
     df = pd.DataFrame({
         "Text Chunk": chunks,
         "Prediction": [
             "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
             for p in probabilities
+        ],
+        "Confidence": [
+            calibrate_confidence(p) for p in probabilities
         ]
     })
     return df
def document_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Append a document-level summary row to the per-chunk results.

    The document is labelled "🤖 Likely AI" only when at least 60% of the
    chunks are high-confidence *AI* predictions. Counting every
    high-confidence chunk regardless of its label would tag a confidently
    human document as AI. The summary confidence is "High" when either
    label reaches that 60% share of high-confidence chunks, otherwise
    "Medium".

    Args:
        df: Per-chunk results with the columns "Text Chunk",
            "AI Probability", "Prediction" and "Confidence".

    Returns:
        A new frame holding the rows of ``df`` followed by one summary row
        whose "AI Probability" is the mean chunk probability rounded to
        four decimals.
    """
    total = len(df)
    is_high = df["Confidence"] == "High"
    is_ai = df["Prediction"] == "🤖 Likely AI"
    high_ai = int((is_high & is_ai).sum())
    high_human = int((is_high & ~is_ai).sum())
    quorum = total * 0.6

    # `total > 0` keeps an empty frame from trivially meeting a quorum of 0.
    likely_ai = total > 0 and high_ai >= quorum
    decisive = total > 0 and max(high_ai, high_human) >= quorum

    avg_score = df["AI Probability"].mean()
    summary = pd.DataFrame([{
        "Text Chunk": "📄 Document Summary",
        "AI Probability": round(avg_score, 4),
        "Prediction": "🤖 Likely AI" if likely_ai else "🧍 Human",
        "Confidence": "High" if decisive else "Medium",
    }])
    return pd.concat([df, summary], ignore_index=True)
+# =========================
+# GRADIO ENTRY FUNCTION
+# =========================
def run_detector(text_input: str, uploaded_files, progress=gr.Progress()):
    """Gradio entry point: classify pasted text and uploaded documents.

    Args:
        text_input: Text pasted into the textbox; may be empty or None.
        uploaded_files: Uploaded files, or a falsy value when none were
            given. Each item may be an object exposing its path as
            ``.name`` or a plain path string.
        progress: Gradio progress tracker forwarded to the classifier.

    Returns:
        A ``(results, csv_path)`` pair. ``results`` is the per-chunk table
        with the document summary row appended and ``csv_path`` is a
        temporary CSV file holding the same table. When there is no input,
        or no chunks could be produced, ``results`` is a single-column
        "Error" frame and ``csv_path`` is None.
    """
    texts = []

    pasted = (text_input or "").strip()  # tolerate a None textbox value
    if pasted:
        texts.append(pasted)

    for file in uploaded_files or []:
        # Fall back to the item itself when it has no `.name` attribute.
        texts.append(load_text_from_file(getattr(file, "name", file)))

    if not texts:
        return pd.DataFrame({"Error": ["No input provided"]}), None

    chunks = []
    for text in texts:
        chunks.extend(chunk_text(text))
    if not chunks:
        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None

    final_df = document_summary(classify_chunks(chunks, progress))

    # Write through the open handle rather than re-opening the file by name:
    # a second open of a still-open NamedTemporaryFile fails on Windows.
    # newline="" lets pandas control the line endings itself.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    ) as tmp:
        final_df.to_csv(tmp, index=False)
    return final_df, tmp.name
 # =========================
+# GRADIO UI (HF SPACE)
 # =========================
 with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
     gr.Markdown("## 🧪 Offline AI Document Detector")
     gr.Markdown(
+        "Analyze **PDF, DOCX, TXT, or pasted text** using an open-source AI detector. "
+        "Optimized for **CPU-only Hugging Face Spaces**."
     )
     text_input = gr.Textbox(
         lines=6,
+        label="✍️ Paste Text (optional)"
     )
     file_input = gr.File(
+        label="📂 Upload Documents",
         file_types=[".pdf", ".docx", ".txt"],
         file_count="multiple"
     )
     analyze_btn = gr.Button("🔍 Analyze")
+    output_table = gr.Dataframe(label="📊 Results")
+    download_file = gr.File(label="⬇️ Download CSV")
     analyze_btn.click(
         fn=run_detector,