VictorM-Coder committed on
Commit
38debf0
·
verified ·
1 Parent(s): dfecc14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -34
app.py CHANGED
@@ -1,35 +1,10 @@
1
  import torch
2
- import torch.nn as nn
3
  import torch.nn.functional as F
4
- from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel
5
  import re
6
  import pandas as pd
7
  import gradio as gr
8
 
9
- # -----------------------------
10
- # CUSTOM MODEL DEFINITION
11
- # -----------------------------
12
- # The Desklib model uses a custom architecture: Mean Pooling + Linear Classifier.
13
- class DesklibAIDetectionModel(PreTrainedModel):
14
- config_class = AutoConfig
15
- def __init__(self, config):
16
- super().__init__(config)
17
- self.model = AutoModel.from_config(config)
18
- self.classifier = nn.Linear(config.hidden_size, 1)
19
- self.init_weights()
20
-
21
- def forward(self, input_ids, attention_mask=None):
22
- outputs = self.model(input_ids, attention_mask=attention_mask)
23
- last_hidden_state = outputs[0]
24
- # Mean Pooling logic
25
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
26
- sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
27
- sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
28
- mean_pooled = sum_embeddings / sum_mask
29
-
30
- logits = self.classifier(mean_pooled)
31
- return logits
32
-
33
  # -----------------------------
34
  # MODEL INITIALIZATION
35
  # -----------------------------
@@ -42,19 +17,25 @@ def get_model():
42
  global tokenizer, model
43
  if model is None:
44
  print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
 
 
45
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
46
 
47
- # Load the weights into our custom class
48
- model = DesklibAIDetectionModel.from_pretrained(
 
49
  MODEL_NAME,
50
- torch_dtype=torch.float32 # Use float16/bfloat16 if your GPU supports it
 
51
  ).to(device).eval()
 
52
  return tokenizer, model
53
 
 
54
  THRESHOLD = 0.81
55
 
56
  # -----------------------------
57
- # UTILITIES (Sentence Splitting & Structure)
58
  # -----------------------------
59
  ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
60
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
@@ -113,6 +94,7 @@ def analyze(text):
113
  if not pure_sents:
114
  return "—", "—", "<em>No sentences detected.</em>", None
115
 
 
116
  windows = []
117
  for i in range(len(pure_sents)):
118
  start = max(0, i - 1)
@@ -120,10 +102,10 @@ def analyze(text):
120
  windows.append(" ".join(pure_sents[start:end]))
121
 
122
  inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
123
- logits = mod(inputs['input_ids'], inputs['attention_mask'])
124
 
125
- # Sigmoid for single-logit probability
126
- probs = torch.sigmoid(logits).cpu().numpy().flatten().tolist()
127
 
128
  lengths = [len(s.split()) for s in pure_sents]
129
  total_words = sum(lengths)
@@ -169,7 +151,7 @@ def analyze(text):
169
  # -----------------------------
170
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
171
  gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
172
- gr.Markdown(f"Using **{MODEL_NAME}** (DeBERTa-v3-Large). Threshold: **{THRESHOLD*100:.0f}%**.")
173
 
174
  with gr.Row():
175
  with gr.Column(scale=3):
 
1
  import torch
 
2
  import torch.nn.functional as F
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import re
5
  import pandas as pd
6
  import gradio as gr
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # -----------------------------
9
  # MODEL INITIALIZATION
10
  # -----------------------------
 
17
  global tokenizer, model
18
  if model is None:
19
  print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
20
+
21
+ # DeBERTa-v3 requires use_fast=False for stable tokenization.
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
23
 
24
+ # We load as Sequence Classification with 1 label (Single Logit).
25
+ # ignore_mismatched_sizes=True allows us to load the custom Desklib head.
26
+ model = AutoModelForSequenceClassification.from_pretrained(
27
  MODEL_NAME,
28
+ num_labels=1,
29
+ ignore_mismatched_sizes=True
30
  ).to(device).eval()
31
+
32
  return tokenizer, model
33
 
34
+ # Only 81% and above is flagged as AI
35
  THRESHOLD = 0.81
36
 
37
  # -----------------------------
38
+ # UTILITIES (Regex & Structure)
39
  # -----------------------------
40
  ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
41
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
 
94
  if not pure_sents:
95
  return "—", "—", "<em>No sentences detected.</em>", None
96
 
97
+ # Contextual Sliding Window
98
  windows = []
99
  for i in range(len(pure_sents)):
100
  start = max(0, i - 1)
 
102
  windows.append(" ".join(pure_sents[start:end]))
103
 
104
  inputs = tok(windows, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
105
+ output = mod(**inputs)
106
 
107
+ # Since num_labels=1, we use Sigmoid on the single logit per window
108
+ probs = torch.sigmoid(output.logits).cpu().numpy().flatten().tolist()
109
 
110
  lengths = [len(s.split()) for s in pure_sents]
111
  total_words = sum(lengths)
 
151
  # -----------------------------
152
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
153
  gr.Markdown("## 🕵️ AI Detector Pro (Academic Edition)")
154
+ gr.Markdown(f"Using **{MODEL_NAME}**. Threshold: **{THRESHOLD*100:.0f}%**.")
155
 
156
  with gr.Row():
157
  with gr.Column(scale=3):