Spaces:

KavinduHansaka
/

Toxic_Comment_Classifier

Sleeping

App Files Files Community

KavinduHansaka committed on Jan 23

Commit

c3b4434

verified ·

1 Parent(s): f1f7273

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -41

app.py CHANGED Viewed

@@ -3,14 +3,20 @@ import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from typing import List, Tuple
 # =========================
 # Configuration
 # =========================
 MODEL_NAME = "openai-community/roberta-base-openai-detector"
 AI_THRESHOLD = 0.5
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # =========================
 # Model Loading (once)
 # =========================
@@ -19,20 +25,62 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 model.to(DEVICE)
 model.eval()
 # =========================
 # Core Logic
 # =========================
 @torch.no_grad()
 def detect_ai_probability(texts: List[str]) -> List[float]:
-    """
-    Returns probability that each text is AI-generated.
-    """
     inputs = tokenizer(
         texts,
         return_tensors="pt",
         padding=True,
         truncation=True,
-        max_length=512
     ).to(DEVICE)
     logits = model(**inputs).logits
@@ -40,14 +88,11 @@ def detect_ai_probability(texts: List[str]) -> List[float]:
     return probs.cpu().tolist()
-def classify_texts(texts: List[str]) -> pd.DataFrame:
-    """
-    Classify texts as AI or Human.
-    """
-    probabilities = detect_ai_probability(texts)
     df = pd.DataFrame({
-        "Comment": texts,
         "AI Probability": [round(p, 4) for p in probabilities],
         "Prediction": [
             "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
@@ -58,55 +103,74 @@ def classify_texts(texts: List[str]) -> pd.DataFrame:
     return df
-def run_detector(text_input: str, csv_file) -> Tuple[pd.DataFrame, Tuple[str, bytes]]:
-    """
-    Handles UI input and output.
-    """
     texts: List[str] = []
     if text_input.strip():
-        texts.extend([t.strip() for t in text_input.split("\n") if t.strip()])
-    if csv_file:
-        df = pd.read_csv(csv_file.name)
-        if "comment" not in df.columns:
-            return pd.DataFrame({"Error": ["CSV must contain a 'comment' column"]}), None
-        texts.extend(df["comment"].astype(str).tolist())
     if not texts:
         return pd.DataFrame({"Error": ["No input provided"]}), None
-    result_df = classify_texts(texts)
-    csv_bytes = result_df.to_csv(index=False).encode("utf-8")
-    return result_df, ("ai_detection_results.csv", csv_bytes)
 # =========================
-# Gradio UI
 # =========================
-with gr.Blocks(title="🧪 AI Text Detector") as app:
-    gr.Markdown("## 🧪 AI Text Detector")
-    gr.Markdown("Detect whether text is **AI-generated or human-written**.")
-    with gr.Row():
-        text_input = gr.Textbox(
-            lines=8,
-            label="✍️ Paste Text (one per line)",
-            placeholder="Enter multiple comments, one per line..."
-        )
-        csv_input = gr.File(
-            label="📄 Upload CSV",
-            file_types=[".csv"]
-        )
     analyze_btn = gr.Button("🔍 Analyze")
-    output_table = gr.Dataframe(label="📊 Results")
-    download_file = gr.File(label="⬇️ Download CSV")
     analyze_btn.click(
         fn=run_detector,
-        inputs=[text_input, csv_input],
         outputs=[output_table, download_file]
     )

 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from typing import List, Tuple
+from pathlib import Path
+import fitz  # PyMuPDF
+import docx
 # =========================
 # Configuration
 # =========================
 MODEL_NAME = "openai-community/roberta-base-openai-detector"
 AI_THRESHOLD = 0.5
+MAX_LENGTH = 512
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
 # =========================
 # Model Loading (once)
 # =========================
 model.to(DEVICE)
 model.eval()
+# =========================
+# File Loaders
+# =========================
def load_text_from_file(file_path: str) -> str:
    """Extract plain text from a .txt, .pdf or .docx file.

    The extension is matched case-insensitively for both the support check
    and the loader dispatch, so ``REPORT.PDF`` is handled like ``report.pdf``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        ValueError: If the file extension has no matching loader.
    """
    path = Path(file_path)
    # Normalise once so the support check and the dispatch below agree;
    # comparing the raw suffix would let upper-case extensions fall through
    # every branch and return None.
    suffix = path.suffix.lower()
    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {path.suffix}")
    if suffix == ".txt":
        # Undecodable bytes are dropped rather than aborting the whole read.
        return path.read_text(encoding="utf-8", errors="ignore")
    if suffix == ".pdf":
        return load_pdf(path)
    if suffix == ".docx":
        return load_docx(path)
    # Only reachable if SUPPORTED_EXTENSIONS lists an extension without a loader.
    raise ValueError(f"Unsupported file type: {path.suffix}")
def load_pdf(path: Path) -> str:
    """Return the text of every page of the PDF at *path*, joined by newlines."""
    with fitz.open(path) as document:
        page_texts = [page.get_text() for page in document]
    return "\n".join(page_texts)
def load_docx(path: Path) -> str:
    """Return the non-blank paragraphs of the Word file at *path*, joined by newlines."""
    kept = []
    for paragraph in docx.Document(path).paragraphs:
        # Paragraphs that are empty or whitespace-only are left out.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)
+# =========================
+# Text Utilities
+# =========================
def chunk_text(text: str, max_words: int = 200, min_words: int = 20) -> List[str]:
    """Split *text* into consecutive chunks of at most *max_words* words.

    Words are whitespace-separated tokens; each chunk is re-joined with single
    spaces. Chunks with fewer than *min_words* words are discarded, which in
    practice drops a short trailing remainder (or the whole text if it is
    shorter than *min_words*).

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.
        min_words: Minimum number of words a chunk needs to be kept.

    Returns:
        The kept chunks, in document order.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    if max_words < 1:
        # A non-positive step would either error inside range() or silently
        # produce no chunks at all.
        raise ValueError("max_words must be a positive integer")
    words = text.split()
    windows = (words[i:i + max_words] for i in range(0, len(words), max_words))
    # Count on the word slice directly instead of re-splitting the joined string.
    return [" ".join(window) for window in windows if len(window) >= min_words]
 # =========================
 # Core Logic
 # =========================
 @torch.no_grad()
 def detect_ai_probability(texts: List[str]) -> List[float]:
     inputs = tokenizer(
         texts,
         return_tensors="pt",
         padding=True,
         truncation=True,
+        max_length=MAX_LENGTH
     ).to(DEVICE)
     logits = model(**inputs).logits
     return probs.cpu().tolist()
+def classify_chunks(chunks: List[str]) -> pd.DataFrame:
+    probabilities = detect_ai_probability(chunks)
     df = pd.DataFrame({
+        "Text Chunk": chunks,
         "AI Probability": [round(p, 4) for p in probabilities],
         "Prediction": [
             "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
     return df
def run_detector(text_input: str, uploaded_files) -> Tuple[pd.DataFrame, Tuple[str, bytes]]:
    """Run AI detection over pasted text and/or uploaded documents.

    Every input is split into chunks with ``chunk_text``; the chunks are
    scored with ``classify_chunks`` and a document-level summary row (mean
    of the per-chunk probabilities) is appended.

    Args:
        text_input: Text pasted into the UI; may be empty or ``None``.
        uploaded_files: Uploaded files, or ``None``. Each item is either a
            path string or an object exposing its path as ``.name``.

    Returns:
        ``(table, download)``. On success ``table`` is the per-chunk result
        frame plus the summary row and ``download`` is
        ``("ai_document_detection.csv", csv_bytes)``. On failure ``table`` is
        a one-row ``Error`` frame and ``download`` is ``None``.
    """
    texts: List[str] = []
    # Manual text input (tolerate None as well as an empty string)
    pasted = (text_input or "").strip()
    if pasted:
        texts.append(pasted)
    # File inputs
    for file in uploaded_files or []:
        # Uploads may arrive as plain path strings or as file wrappers that
        # carry the path in `.name`; accept both.
        file_path = getattr(file, "name", file)
        try:
            extracted_text = load_text_from_file(file_path)
        except Exception as exc:  # UI boundary: report the failure in the table
            message = f"Could not read {Path(file_path).name}: {exc}"
            return pd.DataFrame({"Error": [message]}), None
        # Guard against a loader yielding None so chunking never sees a non-string.
        texts.append(extracted_text or "")
    if not texts:
        return pd.DataFrame({"Error": ["No input provided"]}), None
    # Chunk all inputs
    all_chunks = []
    for text in texts:
        all_chunks.extend(chunk_text(text))
    if not all_chunks:
        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None
    result_df = classify_chunks(all_chunks)
    # Document-level summary: mean of the (already rounded) chunk probabilities
    avg_score = result_df["AI Probability"].mean()
    summary_row = pd.DataFrame([{
        "Text Chunk": "📄 Document Summary",
        "AI Probability": round(avg_score, 4),
        "Prediction": "🤖 Likely AI" if avg_score >= AI_THRESHOLD else "🧍 Human"
    }])
    final_df = pd.concat([result_df, summary_row], ignore_index=True)
    csv_bytes = final_df.to_csv(index=False).encode("utf-8")
    # NOTE(review): this value feeds a gr.File output, which is normally given
    # a file path; confirm the installed Gradio accepts a (name, bytes) tuple.
    return final_df, ("ai_document_detection.csv", csv_bytes)
 # =========================
+# Gradio UI (HF Space)
 # =========================
with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
    # Page header and one-paragraph description.
    gr.Markdown(value="## 🧪 Offline AI Document Detector")
    gr.Markdown(
        value="Analyze **PDF, Word, TXT, or pasted text** to detect whether content is AI-generated. Runs fully offline using an open-source RoBERTa model."
    )

    # Inputs: free-form text and/or any number of uploaded documents.
    text_input = gr.Textbox(
        label="✍️ Paste Text (optional)",
        placeholder="Paste any text here...",
        lines=6,
    )
    file_input = gr.File(
        label="📂 Upload Documents (PDF, DOCX, TXT)",
        file_count="multiple",
        file_types=[".pdf", ".docx", ".txt"],
    )

    analyze_btn = gr.Button(value="🔍 Analyze")

    # Outputs: the results table and a downloadable copy of it.
    output_table = gr.Dataframe(label="📊 Detection Results")
    download_file = gr.File(label="⬇️ Download Results")

    # Clicking the button sends both inputs to run_detector and shows its
    # two return values in the table and the download slot.
    analyze_btn.click(
        run_detector,
        [text_input, file_input],
        [output_table, download_file],
    )