AlyanAkram committed on
Commit
de65627
·
verified ·
1 Parent(s): 029cea2

Update detector/custom_model.py

Browse files
Files changed (1) hide show
  1. detector/custom_model.py +23 -33
detector/custom_model.py CHANGED
@@ -1,31 +1,22 @@
1
  import os
2
  import torch
3
- import nltk
4
  from pathlib import Path
5
  from nltk.tokenize import sent_tokenize
6
  from transformers import RobertaTokenizer, RobertaForSequenceClassification
7
  from reportlab.lib.pagesizes import A4
8
  from reportlab.pdfgen import canvas
9
  from reportlab.lib import colors
 
10
 
11
- # === SAFE PATHS: Hugging Face only allows writing to /tmp ===
12
- HF_HOME = "/tmp/hf_home"
13
- NLTK_DATA = "/tmp/nltk_data"
14
- REPORT_DIR = Path("/tmp/reports")
15
-
16
- os.environ["HF_HOME"] = HF_HOME
17
- os.environ["TRANSFORMERS_CACHE"] = HF_HOME
18
- os.environ["NLTK_DATA"] = NLTK_DATA
19
-
20
- Path(HF_HOME).mkdir(parents=True, exist_ok=True)
21
- Path(NLTK_DATA).mkdir(parents=True, exist_ok=True)
22
- REPORT_DIR.mkdir(parents=True, exist_ok=True)
23
 
24
- # === NLTK ===
25
- nltk.data.path.append(NLTK_DATA)
26
- nltk.download("punkt", download_dir=NLTK_DATA)
27
 
28
- # === Load model ===
29
  USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
30
 
31
  if USE_HF_MODEL:
@@ -38,21 +29,23 @@ else:
38
  tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
39
  model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
40
 
41
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
42
- model.eval().to(device)
43
 
 
44
  AI_THRESHOLD = 0.5
45
 
 
 
 
46
  def analyze_text(text: str):
47
  results = []
48
  paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
49
- ai_count = 0
50
- total_sentences = 0
51
 
52
  for paragraph in paragraphs:
53
  sentence_results = []
54
- sentences = sent_tokenize(paragraph)
55
- for sentence in sentences:
56
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
57
  with torch.no_grad():
58
  outputs = model(**inputs)
@@ -61,38 +54,34 @@ def analyze_text(text: str):
61
 
62
  is_ai = ai_prob >= AI_THRESHOLD
63
  sentence_results.append((sentence, is_ai, ai_prob))
64
-
65
  total_sentences += 1
66
  if is_ai:
67
  ai_count += 1
68
 
69
  results.append(sentence_results)
70
 
71
- overall_score = round((ai_count / total_sentences) * 100, 2) if total_sentences else 0
72
-
73
  return {
74
- "overall_ai_percent": overall_score,
75
  "total_sentences": total_sentences,
76
  "ai_sentences": ai_count,
77
  "results": results
78
  }
79
 
80
  def generate_pdf_report(results: dict, filename: str) -> str:
 
81
  pdf_path = REPORT_DIR / f"{filename}.pdf"
82
 
83
  c = canvas.Canvas(str(pdf_path), pagesize=A4)
84
  width, height = A4
85
  x, y = 40, height - 60
86
- line_height = 18
87
- font_size = 12
88
 
89
  c.setFont("Helvetica-Bold", 14)
90
  c.drawString(x, y, f"📄 AI Detection Report: {filename}")
91
  y -= 25
92
- c.setFont("Helvetica", 12)
93
  c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
94
  y -= 30
95
- c.setFont("Helvetica", font_size)
96
 
97
  for para_result in results["results"]:
98
  if not para_result:
@@ -112,8 +101,9 @@ def generate_pdf_report(results: dict, filename: str) -> str:
112
  max_width = width - 80
113
  words = sentence.split()
114
  current_line = ""
 
115
  for word in words:
116
- test_line = current_line + " " + word if current_line else word
117
  if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
118
  if is_ai:
119
  text_width = c.stringWidth(current_line, "Helvetica", font_size)
@@ -142,4 +132,4 @@ def generate_pdf_report(results: dict, filename: str) -> str:
142
  y -= line_height
143
 
144
  c.save()
145
- return f"{filename}.pdf"
 
1
  import os
2
  import torch
 
3
  from pathlib import Path
4
  from nltk.tokenize import sent_tokenize
5
  from transformers import RobertaTokenizer, RobertaForSequenceClassification
6
  from reportlab.lib.pagesizes import A4
7
  from reportlab.pdfgen import canvas
8
  from reportlab.lib import colors
9
+ import nltk
10
 
11
+ # === Environment (no runtime directory creation) ===
12
+ os.environ["HF_HOME"] = "/tmp/hf_home"
13
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_home"
14
+ os.environ["NLTK_DATA"] = "/tmp/nltk_data"
 
 
 
 
 
 
 
 
15
 
16
+ # Register the pre-installed punkt data directory with NLTK (nothing is downloaded at runtime, so the data must already exist there)
17
+ nltk.data.path.append("/tmp/nltk_data")
 
18
 
19
+ # === Model loading: Hugging Face (Remote) vs Local ===
20
  USE_HF_MODEL = os.getenv("USE_HF_MODEL") == "1"
21
 
22
  if USE_HF_MODEL:
 
29
  tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
30
  model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH, local_files_only=True)
31
 
32
+ model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
33
+ device = next(model.parameters()).device
34
 
35
+ # === AI classification threshold ===
36
  AI_THRESHOLD = 0.5
37
 
38
+ # === Output reports location (/tmp must be writable; the directory itself is created on demand in generate_pdf_report) ===
39
+ REPORT_DIR = Path("/tmp/reports")
40
+
41
  def analyze_text(text: str):
42
  results = []
43
  paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
44
+ ai_count, total_sentences = 0, 0
 
45
 
46
  for paragraph in paragraphs:
47
  sentence_results = []
48
+ for sentence in sent_tokenize(paragraph):
 
49
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
50
  with torch.no_grad():
51
  outputs = model(**inputs)
 
54
 
55
  is_ai = ai_prob >= AI_THRESHOLD
56
  sentence_results.append((sentence, is_ai, ai_prob))
 
57
  total_sentences += 1
58
  if is_ai:
59
  ai_count += 1
60
 
61
  results.append(sentence_results)
62
 
 
 
63
  return {
64
+ "overall_ai_percent": round((ai_count / total_sentences) * 100, 2) if total_sentences else 0,
65
  "total_sentences": total_sentences,
66
  "ai_sentences": ai_count,
67
  "results": results
68
  }
69
 
70
  def generate_pdf_report(results: dict, filename: str) -> str:
71
+ REPORT_DIR.mkdir(exist_ok=True)
72
  pdf_path = REPORT_DIR / f"{filename}.pdf"
73
 
74
  c = canvas.Canvas(str(pdf_path), pagesize=A4)
75
  width, height = A4
76
  x, y = 40, height - 60
77
+ line_height, font_size = 18, 12
 
78
 
79
  c.setFont("Helvetica-Bold", 14)
80
  c.drawString(x, y, f"📄 AI Detection Report: {filename}")
81
  y -= 25
82
+ c.setFont("Helvetica", font_size)
83
  c.drawString(x, y, f"🧠 AI Detected: {results['overall_ai_percent']}% of {results['total_sentences']} sentences")
84
  y -= 30
 
85
 
86
  for para_result in results["results"]:
87
  if not para_result:
 
101
  max_width = width - 80
102
  words = sentence.split()
103
  current_line = ""
104
+
105
  for word in words:
106
+ test_line = f"{current_line} {word}".strip()
107
  if c.stringWidth(test_line, "Helvetica", font_size) > max_width:
108
  if is_ai:
109
  text_width = c.stringWidth(current_line, "Helvetica", font_size)
 
132
  y -= line_height
133
 
134
  c.save()
135
+ return str(pdf_path.name)