Spaces:

AlyanAkram
/

StealthWriter

Sleeping

App Files Community

AlyanAkram committed on Jul 9, 2025

Commit

de65627

verified ·

1 Parent(s): 029cea2

Update detector/custom_model.py

Browse files

Files changed (1) hide show

detector/custom_model.py +23 -33

detector/custom_model.py CHANGED Viewed

@@ -1,31 +1,22 @@
 import os
 import torch
-import nltk
 from pathlib import Path
 from nltk.tokenize import sent_tokenize
 from transformers import RobertaTokenizer, RobertaForSequenceClassification
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfgen import canvas
 from reportlab.lib import colors
-# === SAFE PATHS: Hugging Face only allows writing to /tmp ===
-HF_HOME = "/tmp/hf_home"
-NLTK_DATA = "/tmp/nltk_data"
-REPORT_DIR = Path("/tmp/reports")
-os.environ["HF_HOME"] = HF_HOME
-os.environ["TRANSFORMERS_CACHE"] = HF_HOME
-os.environ["NLTK_DATA"] = NLTK_DATA
-Path(HF_HOME).mkdir(parents=True, exist_ok=True)
-Path(NLTK_DATA).mkdir(parents=True, exist_ok=True)
-REPORT_DIR.mkdir(parents=True, exist_ok=True)
-# === NLTK ===
-nltk.data.path.append(NLTK_DATA)
-nltk.download("punkt", download_dir=NLTK_DATA)
-# === Load model ===
 USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
 if USE_HF_MODEL:
@@ -38,21 +29,23 @@ else:
     tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
     model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.eval().to(device)
 AI_THRESHOLD = 0.5
 def analyze_text(text: str):
     results = []
     paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
-    ai_count = 0
-    total_sentences = 0
     for paragraph in paragraphs:
         sentence_results = []
-        sentences = sent_tokenize(paragraph)
-        for sentence in sentences:
             inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
             with torch.no_grad():
                 outputs = model(**inputs)
@@ -61,38 +54,34 @@ def analyze_text(text: str):
             is_ai = ai_prob >= AI_THRESHOLD
             sentence_results.append((sentence, is_ai, ai_prob))
             total_sentences += 1
             if is_ai:
                 ai_count += 1
         results.append(sentence_results)
-    overall_score = round((ai_count / total_sentences) * 100, 2) if total_sentences else 0
     return {
-        "overall_ai_percent": overall_score,
         "total_sentences": total_sentences,
         "ai_sentences": ai_count,
         "results": results
     }
 def generate_pdf_report(results: dict, filename: str) -> str:
     pdf_path = REPORT_DIR / f"{filename}.pdf"
     c = canvas.Canvas(str(pdf_path), pagesize=A4)
     width, height = A4
     x, y = 40, height - 60
-    line_height = 18
-    font_size = 12
     c.setFont("Helvetica-Bold", 14)
     c.drawString(x, y, f"📄 AI Detection Report: {filename}")
     y -= 25
-    c.setFont("Helvetica", 12)
     c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
     y -= 30
-    c.setFont("Helvetica", font_size)
     for para_result in results["results"]:
         if not para_result:
@@ -112,8 +101,9 @@ def generate_pdf_report(results: dict, filename: str) -> str:
             max_width = width - 80
             words = sentence.split()
             current_line = ""
             for word in words:
-                test_line = current_line + " " + word if current_line else word
                 if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
                     if is_ai:
                         text_width = c.stringWidth(current_line, "Helvetica", font_size)
@@ -142,4 +132,4 @@ def generate_pdf_report(results: dict, filename: str) -> str:
         y -= line_height
     c.save()
-    return f"{filename}.pdf"

 import os
 import torch
 from pathlib import Path
 from nltk.tokenize import sent_tokenize
 from transformers import RobertaTokenizer, RobertaForSequenceClassification
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfgen import canvas
 from reportlab.lib import colors
+import nltk
+# === Environment (cache locations only; no directories are created at import time) ===
+os.environ["HF_HOME"] = "/tmp/hf_home"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home"
+os.environ["NLTK_DATA"] = "/tmp/nltk_data"
+# Register the pre-installed NLTK data directory; the "punkt" tokenizer data must already exist there because it is no longer downloaded at runtime
+nltk.data.path.append("/tmp/nltk_data")
+# === Model loading: Hugging Face (Remote) vs Local ===
 USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
 if USE_HF_MODEL:
     tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
     model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
+model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
+device = next(model.parameters()).device
+# === AI classification threshold ===
 AI_THRESHOLD = 0.5
+# === Output reports location (/tmp must exist and be writable; the reports directory itself is created on demand by generate_pdf_report) ===
+REPORT_DIR = Path("/tmp/reports")
 def analyze_text(text: str):
     results = []
     paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
+    ai_count, total_sentences = 0, 0
     for paragraph in paragraphs:
         sentence_results = []
+        for sentence in sent_tokenize(paragraph):
             inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
             with torch.no_grad():
                 outputs = model(**inputs)
             is_ai = ai_prob >= AI_THRESHOLD
             sentence_results.append((sentence, is_ai, ai_prob))
             total_sentences += 1
             if is_ai:
                 ai_count += 1
         results.append(sentence_results)
     return {
+        "overall_ai_percent": round((ai_count / total_sentences) * 100, 2) if total_sentences else 0,
         "total_sentences": total_sentences,
         "ai_sentences": ai_count,
         "results": results
     }
 def generate_pdf_report(results: dict, filename: str) -> str:
+    REPORT_DIR.mkdir(exist_ok=True)
     pdf_path = REPORT_DIR / f"{filename}.pdf"
     c = canvas.Canvas(str(pdf_path), pagesize=A4)
     width, height = A4
     x, y = 40, height - 60
+    line_height, font_size = 18, 12
     c.setFont("Helvetica-Bold", 14)
     c.drawString(x, y, f"📄 AI Detection Report: {filename}")
     y -= 25
+    c.setFont("Helvetica", font_size)
     c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
     y -= 30
     for para_result in results["results"]:
         if not para_result:
             max_width = width - 80
             words = sentence.split()
             current_line = ""
             for word in words:
+                test_line = f"{current_line} {word}".strip()
                 if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
                     if is_ai:
                         text_width = c.stringWidth(current_line, "Helvetica", font_size)
         y -= line_height
     c.save()
+    return str(pdf_path.name)