VictorM-Coder commited on
Commit
46d3fde
·
verified ·
1 Parent(s): eea664e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -83
app.py CHANGED
@@ -1,91 +1,105 @@
1
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import re
3
  import shutil
4
  import torch
5
  import torch.nn.functional as F
6
  import pandas as pd
7
  import gradio as gr
 
 
8
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
 
10
 
11
  # -----------------------------
12
  # MODEL INITIALIZATION
13
  # -----------------------------
14
- MODEL_NAME = "desklib/ai-text-detector-v1.03"
 
15
  tokenizer = None
16
  model = None
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
 
 
19
 
20
- def purge_model_cache(model_id: str) -> None:
21
- """
22
- Remove cached weights/tokenizer for this model from common HF cache locations.
23
- This fixes the 'state dictionary ... corrupted' error caused by partial downloads.
24
- """
25
- safe = model_id.replace("/", "--")
26
-
27
- candidates = [
28
- os.path.expanduser(f"~/.cache/huggingface/hub/models--{safe}"),
29
- os.path.expanduser(f"~/.cache/huggingface/transformers/models--{safe}"),
30
- os.path.expanduser(f"~/.cache/huggingface/modules/models--{safe}"),
31
- ]
32
 
33
- for path in candidates:
34
- if os.path.exists(path):
35
- try:
36
- shutil.rmtree(path, ignore_errors=True)
37
- print(f"🧹 Removed cache: {path}")
38
- except Exception as e:
39
- print(f"⚠️ Failed to remove cache at {path}: {e}")
40
 
41
 
42
- def get_model():
43
  """
44
- Loads tokenizer + model with safetensors preferred.
45
- If load fails (often due to corrupted HF cache), purge cache + force download.
46
  """
47
- global tokenizer, model
48
-
49
- if model is not None and tokenizer is not None:
50
- return tokenizer, model
 
 
 
 
 
 
 
51
 
52
- print(f"🚀 Loading Model: {MODEL_NAME} on {device}")
 
 
 
53
 
54
- try:
55
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
56
 
57
- model = AutoModelForSequenceClassification.from_pretrained(
58
- MODEL_NAME,
59
- use_safetensors=True, # prefer safetensors
60
- ignore_mismatched_sizes=True,
61
- low_cpu_mem_usage=True,
62
- ).to(device).eval()
 
63
 
64
- return tokenizer, model
65
 
66
- except Exception as e:
67
- print(f"⚠️ Initial load failed: {e}")
68
- print("🔁 Attempting recovery: purge cache + force re-download...")
69
 
70
- purge_model_cache(MODEL_NAME)
 
 
 
71
 
72
- # Redownload everything cleanly
73
- tokenizer = AutoTokenizer.from_pretrained(
74
- MODEL_NAME,
75
- force_download=True,
76
- )
77
 
78
- model = AutoModelForSequenceClassification.from_pretrained(
79
- MODEL_NAME,
80
- use_safetensors=True, # ✅ keep safetensors on recovery too
81
- ignore_mismatched_sizes=True,
82
- force_download=True,
83
- ).to(device).eval()
84
 
85
- return tokenizer, model
86
 
 
 
 
 
 
 
87
 
88
- THRESHOLD = 0.59
89
 
90
 
91
  # -----------------------------
@@ -98,22 +112,18 @@ ABBR = [
98
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
99
 
100
 
101
- def _protect(text: str) -> str:
102
  text = text.replace("...", "⟨ELLIPSIS⟩")
103
  text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
104
  text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
105
  return text
106
 
107
 
108
- def _restore(text: str) -> str:
109
- return (
110
- text.replace("⟨ABBRDOT⟩", ".")
111
- .replace("⟨DECIMAL⟩", ".")
112
- .replace("⟨ELLIPSIS⟩", "...")
113
- )
114
 
115
 
116
- def split_preserving_structure(text: str):
117
  blocks = re.split(r"(\n+)", text)
118
  final_blocks = []
119
  for block in blocks:
@@ -177,35 +187,21 @@ def analyze(text):
177
 
178
  batch_size = 8
179
  probs = []
180
-
181
  for i in range(0, len(windows), batch_size):
182
  batch = windows[i: i + batch_size]
183
- inputs = tok(
184
- batch,
185
- return_tensors="pt",
186
- padding=True,
187
- truncation=True,
188
- max_length=512,
189
- ).to(device)
190
-
191
  output = mod(**inputs)
192
 
193
  if output.logits.shape[1] > 1:
194
  batch_probs = F.softmax(output.logits, dim=-1)[:, 1].detach().cpu().numpy().tolist()
195
  else:
196
  batch_probs = torch.sigmoid(output.logits).detach().cpu().numpy().flatten().tolist()
197
-
198
  probs.extend(batch_probs)
199
 
200
  lengths = [len(s.split()) for s in pure_sents]
201
  total_words = sum(lengths)
202
- weighted_avg = (
203
- sum(p * l for p, l in zip(probs, lengths)) / total_words
204
- if total_words > 0
205
- else 0
206
- )
207
 
208
- # HTML Heatmap
209
  highlighted_html = "<div style='font-family:sans-serif; line-height:1.8;'>"
210
  prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
211
 
@@ -250,11 +246,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Detector Pro") as demo:
250
 
251
  with gr.Row():
252
  with gr.Column(scale=3):
253
- text_input = gr.Textbox(
254
- label="Input Text",
255
- lines=15,
256
- placeholder="Enter at least 250 words..."
257
- )
258
  with gr.Row():
259
  clear_btn = gr.Button("Clear")
260
  run_btn = gr.Button("Analyze Text", variant="primary")
 
1
  import os
2
+
3
+ # ============================================================
4
+ # ✅ FIX FOR "state dict corrupted" ON SPACES (Xet downloads)
5
+ # ============================================================
6
+ # Disable hf-xet usage (forces download via LFS bridge instead).
7
+ # HF docs: HF_HUB_DISABLE_XET disables using hf-xet.
8
+ os.environ["HF_HUB_DISABLE_XET"] = "1"
9
+
10
+ # Optional: place HF cache in a writable/temp location (Spaces friendly)
11
+ # You can comment this out if you prefer default cache locations.
12
+ os.environ.setdefault("HF_HOME", "/tmp/hf")
13
+ os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/tmp/hf/hub")
14
+ os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf/transformers")
15
+
16
+ # (Optional) reduce parallelism issues
17
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
18
+
19
+
20
  import re
21
  import shutil
22
  import torch
23
  import torch.nn.functional as F
24
  import pandas as pd
25
  import gradio as gr
26
+
27
+ from huggingface_hub import snapshot_download
28
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
29
 
30
 
31
  # -----------------------------
32
  # MODEL INITIALIZATION
33
  # -----------------------------
34
+ MODEL_NAME = "desklib/ai-text-detector-v1.01"
35
+ LOCAL_MODEL_DIR = "/tmp/desklib_ai_text_detector_v1_01" # local snapshot dir
36
  tokenizer = None
37
  model = None
38
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
39
 
40
+ THRESHOLD = 0.59
41
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ def _rm_dir(path: str) -> None:
44
+ if os.path.exists(path):
45
+ shutil.rmtree(path, ignore_errors=True)
 
 
 
 
46
 
47
 
48
+ def download_model_snapshot() -> str:
49
  """
50
+ Download the HF repo snapshot to a local folder, forcing a clean download.
51
+ Disabling Xet via env var helps avoid corrupted large-file downloads on some envs.
52
  """
53
+ # wipe local dir to ensure truly clean download
54
+ _rm_dir(LOCAL_MODEL_DIR)
55
+
56
+ print(f"⬇️ Downloading snapshot for: {MODEL_NAME}")
57
+ local_dir = snapshot_download(
58
+ repo_id=MODEL_NAME,
59
+ local_dir=LOCAL_MODEL_DIR,
60
+ local_dir_use_symlinks=False,
61
+ force_download=True,
62
+ resume_download=False,
63
+ )
64
 
65
+ # Basic integrity sanity check: make sure model.safetensors looks real
66
+ st_path = os.path.join(local_dir, "model.safetensors")
67
+ if not os.path.exists(st_path):
68
+ raise RuntimeError(f"model.safetensors not found in snapshot at: {st_path}")
69
 
70
+ size_gb = os.path.getsize(st_path) / (1024**3)
71
+ print(f"✅ model.safetensors size: {size_gb:.2f} GB")
72
 
73
+ # The HF repo shows ~1.74GB for model.safetensors.
74
+ # If the file is drastically smaller, it's likely truncated.
75
+ if size_gb < 1.0:
76
+ raise RuntimeError(
77
+ f"Downloaded model.safetensors looks too small ({size_gb:.2f} GB). "
78
+ "Likely truncated download."
79
+ )
80
 
81
+ return local_dir
82
 
 
 
 
83
 
84
+ def get_model():
85
+ global tokenizer, model
86
+ if model is not None and tokenizer is not None:
87
+ return tokenizer, model
88
 
89
+ print(f"🚀 Loading Model: {MODEL_NAME} on {device}")
 
 
 
 
90
 
91
+ local_dir = download_model_snapshot()
 
 
 
 
 
92
 
93
+ tokenizer = AutoTokenizer.from_pretrained(local_dir)
94
 
95
+ model = AutoModelForSequenceClassification.from_pretrained(
96
+ local_dir,
97
+ use_safetensors=True,
98
+ ignore_mismatched_sizes=True,
99
+ low_cpu_mem_usage=True,
100
+ ).to(device).eval()
101
 
102
+ return tokenizer, model
103
 
104
 
105
  # -----------------------------
 
112
  ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
113
 
114
 
115
+ def _protect(text):
116
  text = text.replace("...", "⟨ELLIPSIS⟩")
117
  text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
118
  text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
119
  return text
120
 
121
 
122
+ def _restore(text):
123
+ return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")
 
 
 
 
124
 
125
 
126
+ def split_preserving_structure(text):
127
  blocks = re.split(r"(\n+)", text)
128
  final_blocks = []
129
  for block in blocks:
 
187
 
188
  batch_size = 8
189
  probs = []
 
190
  for i in range(0, len(windows), batch_size):
191
  batch = windows[i: i + batch_size]
192
+ inputs = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
 
 
 
 
 
 
 
193
  output = mod(**inputs)
194
 
195
  if output.logits.shape[1] > 1:
196
  batch_probs = F.softmax(output.logits, dim=-1)[:, 1].detach().cpu().numpy().tolist()
197
  else:
198
  batch_probs = torch.sigmoid(output.logits).detach().cpu().numpy().flatten().tolist()
 
199
  probs.extend(batch_probs)
200
 
201
  lengths = [len(s.split()) for s in pure_sents]
202
  total_words = sum(lengths)
203
+ weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
 
 
 
 
204
 
 
205
  highlighted_html = "<div style='font-family:sans-serif; line-height:1.8;'>"
206
  prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
207
 
 
246
 
247
  with gr.Row():
248
  with gr.Column(scale=3):
249
+ text_input = gr.Textbox(label="Input Text", lines=15, placeholder="Enter at least 250 words...")
 
 
 
 
250
  with gr.Row():
251
  clear_btn = gr.Button("Clear")
252
  run_btn = gr.Button("Analyze Text", variant="primary")