Update app.py
app.py CHANGED

@@ -1,44 +1,28 @@
-import os
-
-# Put HF caches somewhere "fresh" (avoid reusing an old corrupt cache)
-os.environ["HF_HOME"] = "/tmp/hf"
-os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/hf/hub"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"
-os.environ["HF_HUB_DISABLE_XET"] = "1"  # also avoids xet-related partial downloads
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-def wipe_model_cache(model_id: str):
-    safe = model_id.replace("/", "--")
-    paths = [
-        f"/tmp/hf/hub/models--{safe}",
-        f"/tmp/hf/transformers/models--{safe}",
-        # also wipe common defaults in case something else wrote there
-        os.path.expanduser(f"~/.cache/huggingface/hub/models--{safe}"),
-        os.path.expanduser(f"~/.cache/huggingface/transformers/models--{safe}"),
-    ]
-    for p in paths:
-        if os.path.exists(p):
-            shutil.rmtree(p, ignore_errors=True)
-
-# wipe the specific model cache on startup
-wipe_model_cache("desklib/ai-text-detector-v1.01")
-
+import os
 import re
 import shutil
+
 import torch
 import torch.nn.functional as F
 import pandas as pd
 import gradio as gr
-
-from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
+# ============================================================
+# ENV (set BEFORE loading models)
+# ============================================================
+# Use a predictable cache location (helps avoid reusing a corrupt home cache)
+os.environ.setdefault("HF_HOME", "/tmp/hf")
+os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/tmp/hf/hub")
+os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf/transformers")
+# Disable Xet (helps avoid partial/corrupt downloads in some environments)
+os.environ.setdefault("HF_HUB_DISABLE_XET", "1")
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
 # -----------------------------
 # MODEL INITIALIZATION
 # -----------------------------
 MODEL_NAME = "desklib/ai-text-detector-v1.01"
-LOCAL_MODEL_DIR = "/tmp/desklib_ai_text_detector_v1_01"  # local snapshot dir
 tokenizer = None
 model = None
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
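One ordering caveat in this hunk: the env block now sits below the `from transformers import ...` line, while its own comment says "set BEFORE loading models". Recent huggingface_hub releases (and transformers on top of them) resolve `HF_HOME` and the cache directories at import time, so `setdefault` calls made after those imports are likely ignored. A minimal probe of that assumption, not part of the commit:

import os

# Cache env vars must be in place BEFORE the first HF import.
os.environ["HF_HOME"] = "/tmp/hf"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/hf/hub"

from huggingface_hub import constants  # paths resolve against the env above

# If the override took effect, the resolved cache dir lives under /tmp/hf.
print(constants.HF_HUB_CACHE)  # expected: /tmp/hf/hub

If this prints a path under ~/.cache/huggingface instead, the env block needs to move above all the imports, which is where the removed code had it.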
@@ -46,63 +30,74 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 THRESHOLD = 0.59
 
 
-def …
-    if os.path.exists(path):
-        shutil.rmtree(path, ignore_errors=True)
-
-
-def download_model_snapshot() -> str:
+def wipe_model_cache(model_id: str) -> int:
     """
-    …
+    Delete cached files for this model from common HF cache locations.
+    Returns number of cache directories removed.
     """
-    …
-    )
-
-    # Basic integrity sanity check: make sure model.safetensors looks real
-    st_path = os.path.join(local_dir, "model.safetensors")
-    if not os.path.exists(st_path):
-        raise RuntimeError(f"model.safetensors not found in snapshot at: {st_path}")
-
-    size_gb = os.path.getsize(st_path) / (1024**3)
-    print(f"✅ model.safetensors size: {size_gb:.2f} GB")
-
-    # The HF repo shows ~1.74GB for model.safetensors. :contentReference[oaicite:3]{index=3}
-    # If the file is drastically smaller, it's likely truncated.
-    if size_gb < 1.0:
-        raise RuntimeError(
-            f"Downloaded model.safetensors looks too small ({size_gb:.2f} GB). "
-            "Likely truncated download."
-        )
+    safe = model_id.replace("/", "--")
+    candidates = [
+        # our /tmp cache (recommended)
+        f"/tmp/hf/hub/models--{safe}",
+        f"/tmp/hf/transformers/models--{safe}",
+        # default home cache (in case something wrote there)
+        os.path.expanduser(f"~/.cache/huggingface/hub/models--{safe}"),
+        os.path.expanduser(f"~/.cache/huggingface/transformers/models--{safe}"),
+        os.path.expanduser(f"~/.cache/huggingface/modules/models--{safe}"),
+    ]
 
-
+    removed = 0
+    for path in candidates:
+        if os.path.exists(path):
+            try:
+                shutil.rmtree(path, ignore_errors=True)
+                removed += 1
+            except Exception:
+                # ignore deletion errors (permissions etc.)
+                pass
+    return removed
+
+
+def _build_error_card(msg: str) -> str:
+    return (
+        "<div style='color:#b80d0d; padding:14px; border:1px solid #b80d0d; "
+        "border-radius:10px; background:rgba(184,13,13,0.06);'>"
+        f"{msg}</div>"
+    )
 
 
-def get_model():
+def get_model(force_redownload: bool = False):
+    """
+    Normal load uses cache (fast).
+    If force_redownload=True (from the Nuke button), we wipe cache + re-download.
+    """
     global tokenizer, model
-    …
+
+    if (not force_redownload) and model is not None and tokenizer is not None:
         return tokenizer, model
 
-    …
+    if force_redownload:
+        print("💣 NUKE requested: wiping cache + forcing re-download...")
+        removed = wipe_model_cache(MODEL_NAME)
+        print(f"🧹 Cache dirs removed: {removed}")
+        tokenizer = None
+        model = None
 
-    …
+    print(f"🔄 Loading Model: {MODEL_NAME} on {device}")
 
-    …
+    # Tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_NAME,
+        force_download=force_redownload,
+    )
 
+    # Model (prefer safetensors)
     model = AutoModelForSequenceClassification.from_pretrained(
-    …
+        MODEL_NAME,
         use_safetensors=True,
         ignore_mismatched_sizes=True,
         low_cpu_mem_usage=True,
+        force_download=force_redownload,
     ).to(device).eval()
 
     return tokenizer, model
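The rewritten `wipe_model_cache` builds the hub's `models--{org}--{name}` directory names by hand, which matches the hub cache layout. As a sketch of an alternative that avoids path string-building, huggingface_hub ships a cache-management API (assuming huggingface_hub >= 0.14, which introduced `scan_cache_dir`; `wipe_via_hub_api` is an illustrative name, not from this commit):

from huggingface_hub import scan_cache_dir

def wipe_via_hub_api(model_id: str) -> int:
    """Delete every cached revision of `model_id`; returns revisions removed."""
    cache_info = scan_cache_dir()  # walks the hub cache and parses repo dirs
    revisions = [
        rev.commit_hash
        for repo in cache_info.repos
        if repo.repo_id == model_id and repo.repo_type == "model"
        for rev in repo.revisions
    ]
    if not revisions:
        return 0
    # Two-step by design: build a delete plan, then execute it.
    cache_info.delete_revisions(*revisions).execute()
    return len(revisions)

Note that the wipe is mostly belt-and-braces here anyway: `force_download=force_redownload` on the `from_pretrained` calls already re-fetches files even when a cached copy exists.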
@@ -158,7 +153,7 @@ def split_preserving_structure(text):
 def analyze(text):
     text = (text or "").strip()
     if not text:
-        return "–", "–", "<em>Please enter text...</em>", None
+        return "–", "–", "<em>Please enter text...</em>", None, ""
 
     word_count = len(text.split())
     if word_count < 250:
@@ -166,24 +161,19 @@ def analyze(text):
         f"⚠️ <b>Insufficient Text:</b> Your input has {word_count} words. "
         f"Please enter at least 250 words for accurate results."
     )
-    return (
-        "Too Short",
-        "N/A",
-        f"<div style='color:#b80d0d; padding:20px; border:1px solid #b80d0d; border-radius:8px;'>{warning_msg}</div>",
-        None,
-    )
+    return "Too Short", "N/A", _build_error_card(warning_msg), None, ""
 
     try:
-        tok, mod = get_model()
+        tok, mod = get_model(force_redownload=False)
     except Exception as e:
-        return "ERROR", "0%", f"Failed to load model: …
+        return "ERROR", "0%", _build_error_card(f"<b>Failed to load model:</b><br>{str(e)}"), None, ""
 
     blocks = split_preserving_structure(text)
     pure_sents_indices = [i for i, b in enumerate(blocks) if b.strip() and not b.startswith("\n")]
     pure_sents = [blocks[i] for i in pure_sents_indices]
 
     if not pure_sents:
-        return "–", "–", "<em>No sentences detected.</em>", None
+        return "–", "–", "<em>No sentences detected.</em>", None, ""
 
     windows = []
     for i in range(len(pure_sents)):
@@ -240,7 +230,29 @@ def analyze(text):
     display_score = f"{weighted_avg:.2%}"
     df = pd.DataFrame({"Sentence": pure_sents, "AI Confidence": [f"{p:.2%}" for p in probs]})
 
-    return label, display_score, highlighted_html, df
+    return label, display_score, highlighted_html, df, ""
+
+
+def nuke_and_reload():
+    """
+    UI button: wipe cache + force re-download + try to load.
+    Returns a status message.
+    """
+    try:
+        get_model(force_redownload=True)
+        return (
+            "✅ **Nuked cache and reloaded model successfully.**\n\n"
+            "- Cache wiped\n"
+            "- Fresh download forced\n"
+            "- Model ready ✅"
+        )
+    except Exception as e:
+        return (
+            "❌ **Nuke attempted but model still failed to load.**\n\n"
+            f"**Error:** `{str(e)}`\n\n"
+            "If this keeps happening, it usually means the downloaded weights are getting truncated "
+            "(network/storage) or the runtime stack (Python/Torch) is incompatible."
+        )
 
 
 # -----------------------------
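Since `status_out` joins the outputs list of both buttons below, every return path in `analyze` (and `_clear`) now has to yield five values instead of four; the trailing empty string leaves the status panel untouched on normal runs. A throwaway arity check along these lines (hypothetical test, not in the commit) would catch any branch still returning four:

def test_analyze_returns_five_values():
    # Early-exit branches run without loading the model, so this stays cheap.
    for sample in ["", "far too short to analyze"]:
        result = analyze(sample)
        assert isinstance(result, tuple) and len(result) == 5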
@@ -256,23 +268,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Detector Pro") as demo:
             with gr.Row():
                 clear_btn = gr.Button("Clear")
                 run_btn = gr.Button("Analyze Text", variant="primary")
+                nuke_btn = gr.Button("💣 Nuke Model Cache", variant="stop")
 
         with gr.Column(scale=1):
             verdict_out = gr.Label(label="Global Verdict")
             score_out = gr.Label(label="Weighted Probability")
 
+            status_out = gr.Markdown()
+
     with gr.Tabs():
         with gr.TabItem("Visual Heatmap"):
             html_out = gr.HTML()
         with gr.TabItem("Data Breakdown"):
             table_out = gr.Dataframe(headers=["Sentence", "AI Confidence"], wrap=True)
 
-    run_btn.click(analyze, inputs=text_input, outputs=[verdict_out, score_out, html_out, table_out])
+    run_btn.click(analyze, inputs=text_input, outputs=[verdict_out, score_out, html_out, table_out, status_out])
 
     def _clear():
-        return "", "–", "–", "<em>Please enter text...</em>", None
+        return "", "–", "–", "<em>Please enter text...</em>", None, ""
 
-    clear_btn.click(_clear, outputs=[text_input, verdict_out, score_out, html_out, table_out])
+    clear_btn.click(_clear, outputs=[text_input, verdict_out, score_out, html_out, table_out, status_out])
+    nuke_btn.click(nuke_and_reload, outputs=status_out)
 
 if __name__ == "__main__":
     demo.launch()
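The nuke wiring follows the standard Gradio pattern: a no-input handler returning one string, mapped onto a single Markdown component. Reduced to a self-contained sketch (gradio 4.x assumed; the names are illustrative, not from app.py):

import gradio as gr

def run_maintenance() -> str:
    # Stand-in for nuke_and_reload(): do the slow work, report back as Markdown.
    return "✅ **Done.**"

with gr.Blocks() as demo:
    status = gr.Markdown()
    nuke = gr.Button("💣 Nuke Model Cache", variant="stop")
    nuke.click(run_maintenance, inputs=None, outputs=status)

if __name__ == "__main__":
    demo.launch()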