Spaces:

VictorM-Coder
/

Test

Sleeping

File size: 7,732 Bytes

ea2c6a2
 
38debf0
ea2c6a2
 
 
24abfdf
ea2c6a2
 
 
c059497
ea2c6a2
 
205f405
ea2c6a2
 
 
 
205f405
 
 
38debf0
dfecc14
205f405
ea2c6a2
 
 
34ea950
ea2c6a2
 
c1d0bb0
ea2c6a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205f405
ea2c6a2
 
205f405
7e7e4c6
 
ea2c6a2
205f405
7e7e4c6
205f405
 
 
7e7e4c6
ea2c6a2
 
 
205f405
ea2c6a2
 
 
 
f6d6e24
ea2c6a2
205f405
ea2c6a2
205f405
ea2c6a2
 
 
 
 
 
205f405
 
24abfdf
 
 
 
 
205f405
 
 
 
 
 
 
24abfdf
 
205f405
ea2c6a2
 
 
 
205f405
 
 
 
ea2c6a2
f6d6e24
ea2c6a2
 
 
 
205f405
ea2c6a2
 
205f405
 
 
 
 
 
 
 
ea2c6a2
205f405
 
 
 
ea2c6a2
 
 
41bba56
205f405
 
 
c1d0bb0
205f405
 
ea2c6a2
 
 
 
205f405
 
 
f6d6e24
ea2c6a2
 
205f405
 
 
 
 
ea2c6a2
205f405
 
 
 
 
 
ea2c6a2
 
205f405
 
ea2c6a2
 
 
205f405
ea2c6a2

import html
import re

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------
# MODEL INITIALIZATION
# -----------------------------
MODEL_NAME = "desklib/ai-text-detector-v1.01"
tokenizer = None
model = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_model():
    global tokenizer, model
    if model is None:
        print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Load with default labels; if the model has 2 (Human/AI), we handle it in analyze()
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, 
            ignore_mismatched_sizes=True
        ).to(device).eval()
    return tokenizer, model

# Sentence-level AI probability at or above which analyze() highlights a
# sentence in red; also shown (as a percentage) in the UI description.
THRESHOLD = 0.59

# -----------------------------
# UTILITIES
# -----------------------------
ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
# Matches an abbreviation from ABBR (case-insensitive) followed by its dot;
# group 1 is the abbreviation without the trailing dot.
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)

# Placeholders that temporarily stand in for dots which must NOT be treated as
# sentence terminators. Shared by _protect() and _restore() so the two can
# never drift apart.
_ELLIPSIS_MARK = "⟨ELLIPSIS⟩"
_DECIMAL_MARK = "⟨DECIMAL⟩"
_ABBR_DOT_MARK = "⟨ABBRDOT⟩"

# A dot sitting directly between two digits (e.g. the one in "3.14").
_DECIMAL_POINT_REGEX = re.compile(r"(?<=\d)\.(?=\d)")
# Sentence terminator followed by whitespace; both are captured so that
# re.split() keeps them in its output.
_SENTENCE_BREAK_REGEX = re.compile(r"([.?!])(\s+)")
# One or more consecutive newlines, captured so re.split() keeps them.
_NEWLINE_RUN_REGEX = re.compile(r"(\n+)")


def _protect(text):
    """Replace non-terminating dots (ellipses, decimals, abbreviations) with placeholders."""
    text = text.replace("...", _ELLIPSIS_MARK)
    text = _DECIMAL_POINT_REGEX.sub(_DECIMAL_MARK, text)
    return ABBR_REGEX.sub(r"\1" + _ABBR_DOT_MARK, text)


def _restore(text):
    """Undo _protect(): turn every placeholder back into the dots it replaced."""
    return (
        text.replace(_ABBR_DOT_MARK, ".")
        .replace(_DECIMAL_MARK, ".")
        .replace(_ELLIPSIS_MARK, "...")
    )


def split_preserving_structure(text):
    """Split ``text`` into sentences, whitespace runs and newline runs.

    The text is first cut on runs of newlines, then each remaining piece is
    cut after ``.``, ``?`` or ``!`` when followed by whitespace. Dots that
    belong to ellipses, decimal numbers or the abbreviations in ``ABBR`` do
    not end a sentence.

    Returns:
        A list of non-empty strings in document order: sentences (with their
        terminating punctuation), the whitespace between them, and newline
        runs. Concatenating the list reproduces ``text`` exactly, provided
        ``text`` does not itself contain one of the placeholder strings.
    """
    pieces = []
    for block in _NEWLINE_RUN_REGEX.split(text):
        if block.startswith("\n"):
            pieces.append(block)
            continue

        # With two capture groups, split() yields
        # [sentence, punct, space, sentence, punct, space, ..., last_sentence].
        parts = _SENTENCE_BREAK_REGEX.split(_protect(block))
        for i in range(0, len(parts), 3):
            sentence = parts[i]
            has_terminator = i + 2 < len(parts)
            punct = parts[i + 1] if has_terminator else ""
            space = parts[i + 2] if has_terminator else ""

            # Keep every non-empty chunk - including a lone punctuation mark
            # or a whitespace-only line - so no input character is lost.
            chunk = sentence + punct
            if chunk:
                pieces.append(_restore(chunk))
            if space:
                pieces.append(space)
    return pieces

# -----------------------------
# ANALYSIS
# -----------------------------
# Minimum number of whitespace-separated words analyze() will score.
_MIN_WORDS = 300
# Number of context windows sent through the model per forward pass.
_BATCH_SIZE = 8


def _context_windows(sentences):
    """Return one window per sentence: the sentence joined with its direct neighbours."""
    # Slicing clips at the end of the list, so only the start needs clamping.
    return [" ".join(sentences[max(0, i - 1):i + 2]) for i in range(len(sentences))]


def _score_windows(tok, mod, windows, batch_size=_BATCH_SIZE):
    """Run the classifier over ``windows`` in batches; return one AI probability per window."""
    probs = []
    for start in range(0, len(windows), batch_size):
        batch = windows[start:start + batch_size]
        inputs = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        logits = mod(**inputs).logits

        if logits.shape[1] > 1:
            # Multi-logit head: assumes label index 1 is 'AI'.
            batch_probs = F.softmax(logits, dim=-1)[:, 1]
        else:
            # Single-logit head: squash the logit to a probability.
            batch_probs = torch.sigmoid(logits)
        probs.extend(batch_probs.flatten().cpu().tolist())
    return probs


def _render_heatmap(blocks, prob_map):
    """Build the highlighted HTML for ``blocks``; ``prob_map`` maps block index -> AI probability."""
    parts = [
        "<div style='font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, sans-serif; line-height: 1.8;'>"
    ]
    for idx, block in enumerate(blocks):
        if block.startswith("\n") or block.isspace():
            parts.append(block.replace("\n", "<br>"))
            continue

        # The text comes straight from the user: escape it so characters such
        # as '<' or '&' are displayed literally instead of parsed as markup.
        safe_block = html.escape(block)
        score = prob_map.get(idx)
        if score is None:
            parts.append(safe_block)
            continue

        # Color logic based on THRESHOLD.
        if score >= THRESHOLD:
            color, bg = "#d32f2f", "rgba(211, 47, 47, 0.12)"  # Soft Red
            border = "2px solid #d32f2f"
        else:
            color, bg = "#2e7d32", "rgba(46, 125, 50, 0.08)"  # Soft Green
            border = "1px solid transparent"

        parts.append(
            f"<span style='background:{bg}; padding:1px 2px; border-radius:3px; border-bottom: {border}; cursor: help;' "
            f"title='AI Confidence: {score:.2%}'>"
            f"<span style='color:{color}; font-weight: bold; font-size: 0.75em; vertical-align: super; margin-right: 2px;'>{score:.0%}</span>"
            f"{safe_block}</span>"
        )
    parts.append("</div>")
    return "".join(parts)


@torch.inference_mode()
def analyze(text):
    """Score ``text`` sentence by sentence and build the UI outputs.

    Each sentence is scored inside a window made of itself and its immediate
    neighbours; the overall score is the average of the sentence scores
    weighted by each sentence's word count.

    Args:
        text: Raw user input. ``None`` is treated like an empty string.

    Returns:
        A 4-tuple ``(verdict, score, html, table)``:
        verdict label text, overall score text, an HTML fragment (heatmap or
        a message), and a ``pandas.DataFrame`` with one row per sentence - or
        ``None`` when no analysis was performed (empty/too-short input, model
        load failure, no sentences).
    """
    text = (text or "").strip()
    if not text:
        return "—", "—", "<em>Please enter text...</em>", None

    word_count = len(text.split())
    if word_count < _MIN_WORDS:
        warning_msg = f"⚠️ <b>Insufficient Text:</b> Your input has {word_count} words. Please enter at least {_MIN_WORDS} words for accurate results."
        return "Too Short", "N/A", f"<div style='color: #b80d0d; padding: 20px; border: 1px solid #b80d0d; border-radius: 8px;'>{warning_msg}</div>", None

    # UI boundary: report any load failure to the user instead of crashing.
    try:
        tok, mod = get_model()
    except Exception as e:
        # The message is rendered as HTML, so escape it.
        return "ERROR", "0%", f"Failed to load model: {html.escape(str(e))}", None

    blocks = split_preserving_structure(text)
    # Indices of the blocks that are real sentences (not whitespace/newlines).
    pure_sents_indices = [i for i, b in enumerate(blocks) if b.strip() and not b.startswith("\n")]
    pure_sents = [blocks[i] for i in pure_sents_indices]

    if not pure_sents:
        return "—", "—", "<em>No sentences detected.</em>", None

    # One probability per sentence, computed on its 3-sentence context window.
    probs = _score_windows(tok, mod, _context_windows(pure_sents))

    # Word-count-weighted average of the per-sentence probabilities.
    lengths = [len(s.split()) for s in pure_sents]
    total_words = sum(lengths)
    weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0

    highlighted_html = _render_heatmap(blocks, dict(zip(pure_sents_indices, probs)))

    label = f"{weighted_avg:.1%} AI Written"
    display_score = f"{weighted_avg:.2%}"
    df = pd.DataFrame({"Sentence": pure_sents, "AI Confidence": [f"{p:.2%}" for p in probs]})

    return label, display_score, highlighted_html, df

# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# Declarative UI: components are created in the order and nesting in which
# they should appear, so statement order inside this block is significant.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Detector Pro") as demo:
    # Page header and a one-line description showing the model name and the
    # highlight threshold as a percentage.
    gr.Markdown("# 🕵️ AI Detector Pro")
    gr.Markdown(f"Utilizing **{MODEL_NAME}**. Values above **{THRESHOLD*100:.0f}%** are flagged as highly likely AI.")
    
    with gr.Row():
        # Wider column (scale=3): the text input plus the two action buttons.
        with gr.Column(scale=3):
            # NOTE(review): the placeholder mentions a 250-word minimum while
            # analyze() rejects input under 300 words - confirm which is intended.
            text_input = gr.Textbox(label="Input Text", lines=15, placeholder="Paste your essay here (minimum 250 words for accuracy)...")
            with gr.Row():
                clear_btn = gr.Button("Clear")
                run_btn = gr.Button("Analyze Text", variant="primary")
        
        # Narrower column (scale=1): overall verdict, overall score and a legend.
        with gr.Column(scale=1):
            verdict_out = gr.Label(label="Global Verdict")
            score_out = gr.Label(label="Weighted Probability")
            gr.Markdown("---")
            gr.Markdown("### How to read:")
            gr.Markdown("- **Red Highlight:** High AI probability\n- **Green Highlight:** Likely Human\n- **Super-script:** Exact sentence-level AI score")
    
    # Detailed results: highlighted HTML in one tab, per-sentence table in the other.
    with gr.Tabs():
        with gr.TabItem("Visual Heatmap"):
            html_out = gr.HTML(label="Heatmap")
        with gr.TabItem("Data Breakdown"):
            table_out = gr.Dataframe(headers=["Sentence", "AI Confidence"], wrap=True)
            
    # analyze() returns a 4-tuple that maps positionally onto the four outputs.
    run_btn.click(analyze, inputs=text_input, outputs=[verdict_out, score_out, html_out, table_out])
    # Clear supplies one reset value per component: the input box plus the four outputs.
    clear_btn.click(lambda: ["", "", "", "", None], outputs=[text_input, verdict_out, score_out, html_out, table_out])

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()