Spaces:

VictorM-Coder
/

AIDetector

Running

App Files Files Community

VictorM-Coder committed 23 days ago

Commit

2b59ac0

verified ·

1 Parent(s): f2f742a

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -63

app.py CHANGED Viewed

@@ -1,113 +1,141 @@
 import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import pandas as pd
 import gradio as gr
-# -----------------------------
-# WORKING PUBLIC AI DETECTOR
-# -----------------------------
-MODEL_NAME = "openai-community/roberta-base-openai-detector"
-# -----------------------------
-# LOAD MODEL
-# -----------------------------
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, torch_dtype=dtype)
-model.to(device).eval()
-# -----------------------------
-# SENTENCE SPLITTER (SAFE)
-# -----------------------------
 def sentence_split(text):
-    # Replace newlines with periods to avoid broken sentences
     text = text.replace("\n", ". ")
-    # Regex split on . ! ? but keep them
     sentences = re.split(r'(?<=[.!?])\s+', text)
-    # Clean and filter
     return [s.strip() for s in sentences if s.strip()]
-# -----------------------------
-# AI DETECTION FUNCTION
-# -----------------------------
 def classify_text(text):
     if not text.strip():
         return "⚠️ Please enter some text.", None, None
     sentences = sentence_split(text)
-    if not sentences:
-        return "⚠️ No content detected.", None, None
-    # Tokenize per sentence
-    inputs = tokenizer(
-        sentences,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=512
-    ).to(device)
-    with torch.no_grad():
-        logits = model(**inputs).logits
-        probs = F.softmax(logits, dim=-1).cpu()
-        preds = torch.argmax(probs, dim=-1).cpu()
     results = []
-    highlighted_sentences = []
-    for i, sentence in enumerate(sentences):
-        pred = preds[i].item()
-        conf = probs[i, pred].item()
-        # Model: 1 = AI, 0 = Human
-        label = "AI" if pred == 1 else "Human"
-        conf_text = f"{conf:.2f}"
-        results.append([sentence, label, conf_text])
         if label == "AI":
-            highlighted_sentences.append(f"<p style='color:red; font-weight:bold'>{sentence}</p>")
         else:
-            highlighted_sentences.append(f"<p style='color:green; font-weight:bold'>{sentence}</p>")
-    # -----------------------------
-    # DOCUMENT AI SCORE
-    # -----------------------------
-    avg = torch.mean(probs, dim=0)
-    ai_percent = avg[1].item() * 100
-    highlighted_html = "\n".join(highlighted_sentences)
-    df = pd.DataFrame(results, columns=["Sentence", "Classification", "Confidence"])
-    return f"⚖️ Document AI Likelihood: {ai_percent:.1f}%", highlighted_html, df
-# -----------------------------
 # GRADIO UI
-# -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 Writenix AI Detector (Sentence-Level, Stable Version)")
     text_input = gr.Textbox(
         label="Enter text",
         lines=14,
-        placeholder="Paste your essay, article, or content here…"
     )
     classify_btn = gr.Button("🚀 Detect AI")
     ai_score = gr.Label(label="Overall AI Likelihood")
     highlighted = gr.HTML()
-    table = gr.Dataframe(headers=["Sentence", "Classification", "Confidence"], wrap=True)
-    classify_btn.click(classify_text, inputs=text_input, outputs=[ai_score, highlighted, table])
 if __name__ == "__main__":
     demo.launch()

 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import re
+import numpy as np
 import pandas as pd
 import gradio as gr
+# ----------------------------------------------------
+# LOAD CAUSAL LM (DetectGPT requires a generative LM)
+# ----------------------------------------------------
+# Hugging Face model id shared by the tokenizer and the causal LM below.
+MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Device that tokenized inputs are moved to: CUDA when available, else CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Weights are loaded in float16 on CUDA and float32 otherwise, and the
+# module is put in eval mode right after loading.
+# NOTE(review): device_map="auto" lets the loader pick weight placement;
+# presumably that matches `device` on a single-GPU or CPU-only host --
+# confirm for multi-GPU setups.
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
+    device_map="auto"
+).eval()
+# ----------------------------------------------------
+# SENTENCE SPLITTER
+# ----------------------------------------------------
 def sentence_split(text):
+    """Split ``text`` into a list of stripped sentences.
+
+    Newlines are treated as sentence boundaries, then the text is split on
+    whitespace that follows ``.``, ``!`` or ``?``.
+
+    Args:
+        text: Raw input text.
+
+    Returns:
+        The sentences in order. Fragments without any letter or digit
+        (for example the lone "." produced by a blank line) are dropped.
+    """
     text = text.replace("\n", ". ")
     sentences = re.split(r'(?<=[.!?])\s+', text)
+    # Blank lines turn into ". " above, which would otherwise yield
+    # punctuation-only "sentences"; keep only fragments with real content.
+    return [s.strip() for s in sentences if any(ch.isalnum() for ch in s)]
+# ----------------------------------------------------
+# PERPLEXITY FUNCTION
+# ----------------------------------------------------
+def perplexity(sentence):
+    """Return the exponential of the language-model loss for ``sentence``.
+
+    The sentence is tokenized, moved to ``device``, and passed through
+    ``model`` with its own input ids as labels; gradients are disabled.
+
+    Args:
+        sentence: Text to score.
+
+    Returns:
+        ``exp(loss)`` as a Python float.
+    """
+    encoded = tokenizer(sentence, return_tensors="pt").to(device)
+    with torch.no_grad():
+        # Passing the input ids as labels makes the model report a loss;
+        # presumably the mean next-token cross-entropy -- confirm.
+        lm_loss = model(**encoded, labels=encoded["input_ids"]).loss
+    return torch.exp(lm_loss).item()
+# ----------------------------------------------------
+# SIMPLE TEXT PERTURBATION (whitespace noise)
+# ----------------------------------------------------
+def perturb(text):
+    """Return ``text`` with one extra space after a randomly chosen word.
+
+    Texts with fewer than four whitespace-separated words are returned
+    unchanged. Longer texts are re-joined with single spaces after the
+    chosen word has had a space appended.
+
+    NOTE(review): the words themselves are never altered, so this is only
+    whitespace noise -- confirm it perturbs the model input enough.
+    """
+    tokens = text.split()
+    if len(tokens) >= 4:
+        pick = np.random.randint(0, len(tokens))
+        tokens[pick] += " "  # the only change: one additional space
+        return " ".join(tokens)
+    return text  # too short to perturb
+# ----------------------------------------------------
+# DETECTGPT SCORE
+# ----------------------------------------------------
+def detectgpt_score(sentence, perturbations=5):
+    """Return a DetectGPT-style score for one sentence.
+
+    The score is the mean perplexity of ``perturbations`` perturbed copies
+    of ``sentence`` minus the perplexity of the original sentence.
+
+    Args:
+        sentence: Text to score.
+        perturbations: Number of perturbed copies to evaluate.
+
+    Returns:
+        The score as a float, or 0 when the original cannot be scored or
+        no perturbed copy yields a finite perplexity (best-effort fallback).
+    """
+    try:
+        orig = perplexity(sentence)
+    except Exception:
+        # Best-effort fallback, but KeyboardInterrupt/SystemExit still
+        # propagate instead of being swallowed by a bare except.
+        return 0
+    if not np.isfinite(orig):
+        return 0  # a NaN/inf baseline cannot produce a meaningful score
+    perturbed_scores = []
+    for _ in range(perturbations):
+        p = perturb(sentence)
+        try:
+            pp = perplexity(p)
+        except Exception:
+            continue  # skip this sample, keep the remaining ones
+        if np.isfinite(pp):
+            perturbed_scores.append(pp)
+    if not perturbed_scores:
+        return 0
+    return float(np.mean(perturbed_scores) - orig)  # DetectGPT signal
+# ----------------------------------------------------
+# MAIN CLASSIFIER
+# ----------------------------------------------------
 def classify_text(text):
+    """Score every sentence of ``text`` and summarise the document.
+
+    Args:
+        text: Raw user input from the text box.
+
+    Returns:
+        A 3-tuple ``(summary, html, table)``: the summary string, the
+        sentences as colour-coded HTML paragraphs (red for "AI", green for
+        "Human"), and a DataFrame with one row per sentence. When there is
+        nothing to score, a warning string and two ``None`` values.
+    """
+    # Local import keeps this function self-contained.
+    from html import escape
     if not text.strip():
         return "⚠️ Please enter some text.", None, None
     sentences = sentence_split(text)
+    if not sentences:
+        # Avoids averaging an empty list further down.
+        return "⚠️ No content detected.", None, None
     results = []
+    highlighted = []
+    detectgpt_scores = []
+    for s in sentences:
+        score = detectgpt_score(s)
+        detectgpt_scores.append(score)
+        label = "AI" if score > 0 else "Human"
+        conf = abs(score)
+        results.append([s, label, f"{conf:.4f}"])
+        # The sentence is user input; escape it before embedding in HTML.
+        safe = escape(s)
         if label == "AI":
+            highlighted.append(f"<p style='color:red;font-weight:bold'>{safe}</p>")
         else:
+            highlighted.append(f"<p style='color:green;font-weight:bold'>{safe}</p>")
+    # -------------------------
+    # DOCUMENT-LEVEL SCORE
+    # -------------------------
+    # Ignore non-finite scores: min(100, nan) returns 100, so a single NaN
+    # would otherwise report the document as 100% AI.
+    finite_scores = [x for x in detectgpt_scores if np.isfinite(x)]
+    avg_score = np.mean(finite_scores) if finite_scores else 0.0
+    # Maps an average of -1 to 0% and +1 to 100%, clamped to that range.
+    doc_ai_percent = max(0, min(100, (avg_score + 1) * 50))
+    df = pd.DataFrame(results, columns=["Sentence", "Label", "Score"])
+    html = "\n".join(highlighted)
+    return f"⚖️ Document AI Likelihood: {doc_ai_percent:.1f}%", html, df
+# ----------------------------------------------------
 # GRADIO UI
+# ----------------------------------------------------
 with gr.Blocks() as demo:
+    # Layout: title, input text box, trigger button, then the three outputs.
+    gr.Markdown("## 🧠 Writenix DetectGPT (Turnitin-like Detector)")
     text_input = gr.Textbox(
         label="Enter text",
         lines=14,
+        placeholder="Paste your essay here…"
     )
     classify_btn = gr.Button("🚀 Detect AI")
+    # Output components, in the order classify_text returns its values:
+    # summary label, highlighted HTML, per-sentence table.
     ai_score = gr.Label(label="Overall AI Likelihood")
     highlighted = gr.HTML()
+    table = gr.Dataframe(headers=["Sentence", "Label", "Score"], wrap=True)
+    # Clicking the button runs classify_text on the text box contents.
+    classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])
+# Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()