Spaces:

VictorM-Coder
/

Test

Sleeping

App Files Files Community

VictorM-Coder committed on 30 days ago

Commit

dfecc14

verified ·

1 Parent(s): c059497

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -27

app.py CHANGED Viewed

@@ -1,14 +1,38 @@
 import torch
 import torch.nn.functional as F
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import re
 import pandas as pd
 import gradio as gr
 # -----------------------------
 # MODEL INITIALIZATION
 # -----------------------------
-# desklib/ai-text-detector-v1.01 is highly robust for academic/essay detection.
 MODEL_NAME = "desklib/ai-text-detector-v1.01"
 tokenizer = None
 model = None
@@ -17,26 +41,20 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def get_model():
     global tokenizer, model
     if model is None:
-        print(f"Loading High-Performance Model: {MODEL_NAME} on {device}")
-        # DeBERTa-v3 requires use_fast=False for stable SentencePiece tokenization.
-        # Ensure 'sentencepiece' is installed (pip install sentencepiece).
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
-        dtype = torch.float32
-        if device.type == "cuda" and torch.cuda.is_bf16_supported():
-            dtype = torch.bfloat16
-        model = AutoModelForSequenceClassification.from_pretrained(
-            MODEL_NAME, torch_dtype=dtype
         ).to(device).eval()
     return tokenizer, model
-# Only 81% and above is flagged as AI
 THRESHOLD = 0.81
 # -----------------------------
-# PROTECT STRUCTURE (Regex)
 # -----------------------------
 ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
 ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
@@ -95,7 +113,6 @@ def analyze(text):
     if not pure_sents:
         return "—", "—", "<em>No sentences detected.</em>", None
-    # Sliding window inference (Contextual for better accuracy)
     windows = []
     for i in range(len(pure_sents)):
         start = max(0, i - 1)
@@ -103,17 +120,16 @@ def analyze(text):
         windows.append(" ".join(pure_sents[start:end]))
     inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
-    logits = mod(**inputs).logits
-    # Note: Desklib uses Label 1 for AI-generated and Label 0 for Human.
-    probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().numpy().tolist()
     lengths = [len(s.split()) for s in pure_sents]
     total_words = sum(lengths)
     weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
-    # -----------------------------
-    # HTML RECONSTRUCTION (Strict Binary)
-    # -----------------------------
     highlighted_html = "<div style='font-family: sans-serif; line-height: 1.8;'>"
     prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
@@ -124,12 +140,10 @@ def analyze(text):
         if i in prob_map:
             score = prob_map[i]
-            # Binary logic: Threshold applied to color
             if score >= THRESHOLD:
-                color, bg = "#b80d0d", "rgba(184, 13, 13, 0.15)" # RED (AI)
             else:
-                color, bg = "#11823b", "rgba(17, 130, 59, 0.15)" # GREEN (Human)
             highlighted_html += (
                 f"<span style='background:{bg}; padding:2px 4px; border-radius:4px; border-bottom: 2px solid {color};' "
@@ -140,7 +154,6 @@ def analyze(text):
             highlighted_html += block
     highlighted_html += "</div>"
-    # --- FINAL VERDICT (Masking below 81%) ---
     if weighted_avg >= THRESHOLD:
         label = f"{weighted_avg:.0%} AI Content Detected"
         display_score = f"{weighted_avg:.1%}"
@@ -156,7 +169,7 @@ def analyze(text):
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
-    gr.Markdown(f"Using **{MODEL_NAME}**. Threshold: **{THRESHOLD*100:.0f}%**. Scores below this are marked as Human.")
     with gr.Row():
         with gr.Column(scale=3):

 import torch
+import torch.nn as nn
 import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel
 import re
 import pandas as pd
 import gradio as gr
+# -----------------------------
+# CUSTOM MODEL DEFINITION
+# -----------------------------
+# The Desklib model uses a custom architecture: Mean Pooling + Linear Classifier.
+class DesklibAIDetectionModel(PreTrainedModel):
+    config_class = AutoConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = AutoModel.from_config(config)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        self.init_weights()
+    def forward(self, input_ids, attention_mask=None):
+        outputs = self.model(input_ids, attention_mask=attention_mask)
+        last_hidden_state = outputs[0]
+        # Mean Pooling logic
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
+        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        mean_pooled = sum_embeddings / sum_mask
+        logits = self.classifier(mean_pooled)
+        return logits
 # -----------------------------
 # MODEL INITIALIZATION
 # -----------------------------
 MODEL_NAME = "desklib/ai-text-detector-v1.01"
 tokenizer = None
 model = None
 # Return the shared (tokenizer, model) pair, loading both on the first call.
 # The loaded objects are stored in the module-level `tokenizer` and `model`
 # globals, so later calls skip the load and return the same objects.
 def get_model():
     global tokenizer, model
     # `model` doubles as the "already loaded" flag: it stays None until the
     # first successful load.
     if model is None:
+        print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
+        # Load the weights into our custom class
+        model = DesklibAIDetectionModel.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.float32 # Use float16/bfloat16 if your GPU supports it
         # Move the model to `device` and switch it to eval mode before caching it.
         ).to(device).eval()
     return tokenizer, model
 THRESHOLD = 0.81
 # -----------------------------
+# UTILITIES (Sentence Splitting & Structure)
 # -----------------------------
 ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
 ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
     if not pure_sents:
         return "—", "—", "<em>No sentences detected.</em>", None
     windows = []
     for i in range(len(pure_sents)):
         start = max(0, i - 1)
         windows.append(" ".join(pure_sents[start:end]))
     inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+    logits = mod(inputs['input_ids'], inputs['attention_mask'])
+    # Sigmoid for single-logit probability
+    probs = torch.sigmoid(logits).cpu().numpy().flatten().tolist()
     lengths = [len(s.split()) for s in pure_sents]
     total_words = sum(lengths)
     weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
+    # HTML Heatmap
     highlighted_html = "<div style='font-family: sans-serif; line-height: 1.8;'>"
     prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
         if i in prob_map:
             score = prob_map[i]
             if score >= THRESHOLD:
+                color, bg = "#b80d0d", "rgba(184, 13, 13, 0.15)" # RED
             else:
+                color, bg = "#11823b", "rgba(17, 130, 59, 0.15)" # GREEN
             highlighted_html += (
                 f"<span style='background:{bg}; padding:2px 4px; border-radius:4px; border-bottom: 2px solid {color};' "
             highlighted_html += block
     highlighted_html += "</div>"
     if weighted_avg >= THRESHOLD:
         label = f"{weighted_avg:.0%} AI Content Detected"
         display_score = f"{weighted_avg:.1%}"
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
+    gr.Markdown(f"Using **{MODEL_NAME}** (DeBERTa-v3-Large). Threshold: **{THRESHOLD*100:.0f}%**.")
     with gr.Row():
         with gr.Column(scale=3):