Spaces:

VictorM-Coder
/

Test

Sleeping

App Files Files Community

VictorM-Coder committed on Feb 12

Commit

0a84024

verified ·

1 Parent(s): 5237e13

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -10

app.py CHANGED Viewed

@@ -8,7 +8,8 @@ import gradio as gr
 # -----------------------------
 # MODEL INITIALIZATION
 # -----------------------------
-MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
 tokenizer = None
 model = None
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -16,21 +17,24 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def get_model():
     global tokenizer, model
     if model is None:
-        print(f"Loading model: {MODEL_NAME} on {device}")
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
         dtype = torch.float32
         if device.type == "cuda" and torch.cuda.is_bf16_supported():
             dtype = torch.bfloat16
         model = AutoModelForSequenceClassification.from_pretrained(
             MODEL_NAME, torch_dtype=dtype
         ).to(device).eval()
     return tokenizer, model
-# UPDATED THRESHOLD: Only 81% and above is flagged as AI
 THRESHOLD = 0.81
 # -----------------------------
-# PROTECT STRUCTURE
 # -----------------------------
 ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
 ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
@@ -89,6 +93,7 @@ def analyze(text):
     if not pure_sents:
         return "—", "—", "<em>No sentences detected.</em>", None
     windows = []
     for i in range(len(pure_sents)):
         start = max(0, i - 1)
@@ -104,7 +109,7 @@ def analyze(text):
     weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
     # -----------------------------
-    # HTML RECONSTRUCTION
     # -----------------------------
     highlighted_html = "<div style='font-family: sans-serif; line-height: 1.8;'>"
     prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
@@ -117,7 +122,7 @@ def analyze(text):
         if i in prob_map:
             score = prob_map[i]
-            # Logic: Red for > 0.81, Green for everything else (<= 0.81)
             if score >= THRESHOLD:
                 color, bg = "#b80d0d", "rgba(184, 13, 13, 0.15)" # RED
             else:
@@ -132,7 +137,7 @@ def analyze(text):
             highlighted_html += block
     highlighted_html += "</div>"
-    # --- FINAL VERDICT ---
     if weighted_avg >= THRESHOLD:
         label = f"{weighted_avg:.0%} AI Content Detected"
         display_score = f"{weighted_avg:.1%}"
@@ -147,8 +152,8 @@ def analyze(text):
 # GRADIO INTERFACE
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🕵️ AI Detector Pro")
-    gr.Markdown(f"Strict Analysis. Threshold: **{THRESHOLD*100:.0f}%**. Everything below this is considered Human.")
     with gr.Row():
         with gr.Column(scale=3):

 # -----------------------------
 # MODEL INITIALIZATION
 # -----------------------------
+# This is a DeBERTa-v3-Large model fine-tuned on the DAIGT (Student Writing vs AI) dataset.
+MODEL_NAME = "Hamidreza/DeBERTa-v3-large-AI-Detector-v2"
 tokenizer = None
 model = None
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def get_model():
     global tokenizer, model
     if model is None:
+        print(f"Loading High-Performance Model: {MODEL_NAME} on {device}")
+        # DeBERTa-v3 requires use_fast=False for stable SentencePiece tokenization
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
         dtype = torch.float32
         if device.type == "cuda" and torch.cuda.is_bf16_supported():
             dtype = torch.bfloat16
         model = AutoModelForSequenceClassification.from_pretrained(
             MODEL_NAME, torch_dtype=dtype
         ).to(device).eval()
     return tokenizer, model
+# Only 81% and above is flagged as AI
 THRESHOLD = 0.81
 # -----------------------------
+# PROTECT STRUCTURE (Regex)
 # -----------------------------
 ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
 ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
     if not pure_sents:
         return "—", "—", "<em>No sentences detected.</em>", None
+    # Sliding window inference (Contextual)
     windows = []
     for i in range(len(pure_sents)):
         start = max(0, i - 1)
     weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
     # -----------------------------
+    # HTML RECONSTRUCTION (Strict Binary)
     # -----------------------------
     highlighted_html = "<div style='font-family: sans-serif; line-height: 1.8;'>"
     prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
         if i in prob_map:
             score = prob_map[i]
+            # Binary logic: scores at or above THRESHOLD are highlighted red
             if score >= THRESHOLD:
                 color, bg = "#b80d0d", "rgba(184, 13, 13, 0.15)" # RED
             else:
             highlighted_html += block
     highlighted_html += "</div>"
+    # --- FINAL VERDICT (Masking below 81%) ---
     if weighted_avg >= THRESHOLD:
         label = f"{weighted_avg:.0%} AI Content Detected"
         display_score = f"{weighted_avg:.1%}"
 # GRADIO INTERFACE
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🕵️ AI Detector Pro (DeBERTa Edition)")
+    gr.Markdown(f"Advanced Academic Analysis. Threshold: **{THRESHOLD*100:.0f}%**. Everything below is categorized as Human.")
     with gr.Row():
         with gr.Column(scale=3):