KavinduHansaka committed on
Commit
4d7e6ac
·
verified ·
1 Parent(s): 2a6aea4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -66
app.py CHANGED
@@ -1,32 +1,49 @@
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
- from typing import List, Tuple
6
- from pathlib import Path
7
- import fitz # PyMuPDF
8
  import docx
9
 
 
 
 
 
 
 
10
  # =========================
11
- # Configuration
 
 
 
 
 
 
 
12
  # =========================
13
  MODEL_NAME = "openai-community/roberta-base-openai-detector"
14
  AI_THRESHOLD = 0.5
15
- MAX_LENGTH = 512
16
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
17
 
18
  SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
19
 
20
  # =========================
21
- # Model Loading (once)
22
  # =========================
23
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
24
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
25
  model.to(DEVICE)
26
  model.eval()
27
 
 
28
  # =========================
29
- # File Loaders
30
  # =========================
31
  def load_text_from_file(file_path: str) -> str:
32
  path = Path(file_path)
@@ -38,26 +55,19 @@ def load_text_from_file(file_path: str) -> str:
38
  return path.read_text(encoding="utf-8", errors="ignore")
39
 
40
  if path.suffix == ".pdf":
41
- return load_pdf(path)
 
 
 
 
42
 
43
  if path.suffix == ".docx":
44
- return load_docx(path)
 
45
 
46
 
47
- def load_pdf(path: Path) -> str:
48
- text = []
49
- with fitz.open(path) as pdf:
50
- for page in pdf:
51
- text.append(page.get_text())
52
- return "\n".join(text)
53
-
54
-
55
- def load_docx(path: Path) -> str:
56
- document = docx.Document(path)
57
- return "\n".join(p.text for p in document.paragraphs if p.text.strip())
58
-
59
  # =========================
60
- # Text Utilities
61
  # =========================
62
  def chunk_text(text: str, max_words: int = 200) -> List[str]:
63
  words = text.split()
@@ -70,26 +80,52 @@ def chunk_text(text: str, max_words: int = 200) -> List[str]:
70
 
71
  return chunks
72
 
 
73
  # =========================
74
- # Core Logic
 
 
 
 
 
 
 
 
 
 
 
 
75
  # =========================
76
  @torch.no_grad()
77
- def detect_ai_probability(texts: List[str]) -> List[float]:
78
- inputs = tokenizer(
79
- texts,
80
- return_tensors="pt",
81
- padding=True,
82
- truncation=True,
83
- max_length=MAX_LENGTH
84
- ).to(DEVICE)
85
 
86
- logits = model(**inputs).logits
87
- probs = torch.softmax(logits, dim=1)[:, 1] # AI-generated class
88
- return probs.cpu().tolist()
 
 
 
 
89
 
 
 
 
90
 
91
- def classify_chunks(chunks: List[str]) -> pd.DataFrame:
92
- probabilities = detect_ai_probability(chunks)
 
 
 
 
 
 
 
93
 
94
  df = pd.DataFrame({
95
  "Text Chunk": chunks,
@@ -97,76 +133,88 @@ def classify_chunks(chunks: List[str]) -> pd.DataFrame:
97
  "Prediction": [
98
  "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
99
  for p in probabilities
 
 
 
100
  ]
101
  })
102
 
103
  return df
104
 
105
 
106
- def run_detector(text_input: str, uploaded_files) -> Tuple[pd.DataFrame, Tuple[str, bytes]]:
107
- texts: List[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- # Manual text input
110
  if text_input.strip():
111
  texts.append(text_input.strip())
112
 
113
- # File inputs
114
  if uploaded_files:
115
  for file in uploaded_files:
116
- extracted_text = load_text_from_file(file.name)
117
- texts.append(extracted_text)
118
 
119
  if not texts:
120
  return pd.DataFrame({"Error": ["No input provided"]}), None
121
 
122
- # Chunk all inputs
123
- all_chunks = []
124
  for text in texts:
125
- all_chunks.extend(chunk_text(text))
126
 
127
- if not all_chunks:
128
  return pd.DataFrame({"Error": ["Text too short for analysis"]}), None
129
 
130
- result_df = classify_chunks(all_chunks)
 
131
 
132
- # Document-level summary
133
- avg_score = result_df["AI Probability"].mean()
134
- summary_row = pd.DataFrame([{
135
- "Text Chunk": "📄 Document Summary",
136
- "AI Probability": round(avg_score, 4),
137
- "Prediction": "🤖 Likely AI" if avg_score >= AI_THRESHOLD else "🧍 Human"
138
- }])
139
 
140
- final_df = pd.concat([result_df, summary_row], ignore_index=True)
141
 
142
- csv_bytes = final_df.to_csv(index=False).encode("utf-8")
143
- return final_df, ("ai_document_detection.csv", csv_bytes)
144
 
145
  # =========================
146
- # Gradio UI (HF Space)
147
  # =========================
148
  with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
149
  gr.Markdown("## 🧪 Offline AI Document Detector")
150
  gr.Markdown(
151
- "Analyze **PDF, Word, TXT, or pasted text** to detect whether content is AI-generated. "
152
- "Runs fully offline using an open-source RoBERTa model."
153
  )
154
 
155
  text_input = gr.Textbox(
156
  lines=6,
157
- label="✍️ Paste Text (optional)",
158
- placeholder="Paste any text here..."
159
  )
160
 
161
  file_input = gr.File(
162
- label="📂 Upload Documents (PDF, DOCX, TXT)",
163
  file_types=[".pdf", ".docx", ".txt"],
164
  file_count="multiple"
165
  )
166
 
167
  analyze_btn = gr.Button("🔍 Analyze")
168
- output_table = gr.Dataframe(label="📊 Detection Results")
169
- download_file = gr.File(label="⬇️ Download Results")
170
 
171
  analyze_btn.click(
172
  fn=run_detector,
 
1
+ import os
2
+ import tempfile
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
  import gradio as gr
7
  import pandas as pd
8
  import torch
9
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
 
10
  import docx
11
 
12
+ try:
13
+ import fitz # PyMuPDF
14
+ except ImportError as e:
15
+ raise ImportError("Missing dependency: PyMuPDF") from e
16
+
17
+
18
  # =========================
19
+ # CPU OPTIMIZATION
20
+ # =========================
21
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
22
+ torch.set_num_threads(2)
23
+ torch.set_grad_enabled(False)
24
+
25
+ # =========================
26
+ # CONFIGURATION
27
  # =========================
28
  MODEL_NAME = "openai-community/roberta-base-openai-detector"
29
  AI_THRESHOLD = 0.5
30
+ MAX_LENGTH = 256
31
+ BATCH_SIZE = 8
32
+ DEVICE = "cpu"
33
 
34
  SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
35
 
36
  # =========================
37
+ # MODEL LOADING (ONCE)
38
  # =========================
39
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
40
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
41
  model.to(DEVICE)
42
  model.eval()
43
 
44
+
45
  # =========================
46
+ # FILE LOADERS
47
  # =========================
48
  def load_text_from_file(file_path: str) -> str:
49
  path = Path(file_path)
 
55
  return path.read_text(encoding="utf-8", errors="ignore")
56
 
57
  if path.suffix == ".pdf":
58
+ text = []
59
+ with fitz.open(path) as pdf:
60
+ for page in pdf:
61
+ text.append(page.get_text())
62
+ return "\n".join(text)
63
 
64
  if path.suffix == ".docx":
65
+ document = docx.Document(path)
66
+ return "\n".join(p.text for p in document.paragraphs if p.text.strip())
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # =========================
70
+ # TEXT UTILITIES
71
  # =========================
72
  def chunk_text(text: str, max_words: int = 200) -> List[str]:
73
  words = text.split()
 
80
 
81
  return chunks
82
 
83
+
84
  # =========================
85
+ # CONFIDENCE CALIBRATION
86
+ # =========================
87
def calibrate_confidence(prob: float) -> str:
    """Map an AI-probability to a qualitative confidence label.

    Confidence is measured as the distance of ``prob`` from the decision
    boundary ``AI_THRESHOLD``: a score far from the threshold (in either
    direction — confidently AI or confidently human) means the classifier
    is more certain.

    Args:
        prob: AI-generated probability in [0, 1].

    Returns:
        "High" (distance >= 0.35), "Medium" (>= 0.15), otherwise "Low".
    """
    margin = abs(prob - AI_THRESHOLD)
    # Check the tightest band last: fall through from High to Medium to Low.
    for cutoff, label in ((0.35, "High"), (0.15, "Medium")):
        if margin >= cutoff:
            return label
    return "Low"
94
+
95
+
96
+ # =========================
97
+ # AI DETECTION (BATCHED)
98
  # =========================
99
@torch.no_grad()
def detect_ai_probability(texts: List[str], progress=gr.Progress()):
    """Score each text with the probability that it is AI-generated.

    Texts are processed in batches of ``BATCH_SIZE`` on CPU; the Gradio
    progress tracker is updated once per batch and once on completion.
    (``progress=gr.Progress()`` is Gradio's dependency-injection idiom,
    not an ordinary mutable default.)

    Args:
        texts: chunks of text to classify.
        progress: Gradio progress callback (injected by Gradio).

    Returns:
        A list of floats in [0, 1], one per input text — the softmax
        probability of the AI-generated class (index 1).
    """
    scores: List[float] = []
    total = len(texts)

    for start in range(0, total, BATCH_SIZE):
        progress((start, total))

        encoded = tokenizer(
            texts[start:start + BATCH_SIZE],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )

        batch_probs = torch.softmax(model(**encoded).logits, dim=1)[:, 1]
        scores.extend(batch_probs.tolist())

    progress((total, total))
    return scores
122
+
123
+
124
+ # =========================
125
+ # CLASSIFICATION LOGIC
126
+ # =========================
127
+ def classify_chunks(chunks: List[str], progress=gr.Progress()) -> pd.DataFrame:
128
+ probabilities = detect_ai_probability(chunks, progress)
129
 
130
  df = pd.DataFrame({
131
  "Text Chunk": chunks,
 
133
  "Prediction": [
134
  "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
135
  for p in probabilities
136
+ ],
137
+ "Confidence": [
138
+ calibrate_confidence(p) for p in probabilities
139
  ]
140
  })
141
 
142
  return df
143
 
144
 
145
def document_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Append a document-level verdict row to the per-chunk results.

    BUG FIX: the previous logic counted ALL "High"-confidence chunks,
    regardless of which class they were confidently assigned to — a
    document made of confidently-HUMAN chunks (probability near 0, hence
    "High" confidence) was labeled "🤖 Likely AI". The verdict now counts
    only chunks that are BOTH high-confidence AND predicted AI.

    Args:
        df: per-chunk results with columns "Text Chunk", "AI Probability",
            "Prediction", and "Confidence".

    Returns:
        ``df`` with one extra "📄 Document Summary" row. The summary's
        prediction is "🤖 Likely AI" when at least 60% of chunks are
        high-confidence AI detections; its probability is the mean chunk
        probability. An empty frame is returned unchanged (callers guard
        against this, but a NaN summary row would otherwise be appended).
    """
    if df.empty:
        return df

    # Only high-confidence chunks that were actually flagged as AI.
    high_conf_ai = df[
        (df["Confidence"] == "High") & (df["Prediction"] == "🤖 Likely AI")
    ]
    avg_score = df["AI Probability"].mean()
    mostly_ai = len(high_conf_ai) >= len(df) * 0.6

    summary = pd.DataFrame([{
        "Text Chunk": "📄 Document Summary",
        "AI Probability": round(avg_score, 4),
        "Prediction": "🤖 Likely AI" if mostly_ai else "🧍 Human",
        "Confidence": "High" if mostly_ai else "Medium",
    }])

    return pd.concat([df, summary], ignore_index=True)
157
+
158
+
159
+ # =========================
160
+ # GRADIO ENTRY FUNCTION
161
+ # =========================
162
def run_detector(text_input: str, uploaded_files, progress=gr.Progress()):
    """Gradio entry point: collect text, classify chunks, export CSV.

    Args:
        text_input: text pasted into the textbox (may be empty).
        uploaded_files: Gradio file objects (or None); each is loaded via
            ``load_text_from_file(file.name)``.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(results_dataframe, csv_path)`` on success; on missing or
        too-short input, ``(single-row error frame, None)``.
    """
    texts: List[str] = []

    # Manual text input.
    if text_input.strip():
        texts.append(text_input.strip())

    # Uploaded documents (PDF / DOCX / TXT).
    if uploaded_files:
        for file in uploaded_files:
            texts.append(load_text_from_file(file.name))

    if not texts:
        return pd.DataFrame({"Error": ["No input provided"]}), None

    # Chunk every input so long documents are scored piecewise.
    chunks: List[str] = []
    for text in texts:
        chunks.extend(chunk_text(text))

    if not chunks:
        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None

    df = classify_chunks(chunks, progress)
    final_df = document_summary(df)

    # BUG FIX: write through the already-open handle instead of reopening
    # tmp.name — reopening a NamedTemporaryFile by path while it is still
    # open fails on Windows. newline="" is what pandas expects when given
    # an open text handle. delete=False keeps the file for Gradio to serve.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    ) as tmp:
        final_df.to_csv(tmp, index=False)
        output_path = tmp.name

    return final_df, output_path
192
 
 
 
193
 
194
  # =========================
195
+ # GRADIO UI (HF SPACE)
196
  # =========================
197
  with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
198
  gr.Markdown("## 🧪 Offline AI Document Detector")
199
  gr.Markdown(
200
+ "Analyze **PDF, DOCX, TXT, or pasted text** using an open-source AI detector. "
201
+ "Optimized for **CPU-only Hugging Face Spaces**."
202
  )
203
 
204
  text_input = gr.Textbox(
205
  lines=6,
206
+ label="✍️ Paste Text (optional)"
 
207
  )
208
 
209
  file_input = gr.File(
210
+ label="📂 Upload Documents",
211
  file_types=[".pdf", ".docx", ".txt"],
212
  file_count="multiple"
213
  )
214
 
215
  analyze_btn = gr.Button("🔍 Analyze")
216
+ output_table = gr.Dataframe(label="📊 Results")
217
+ download_file = gr.File(label="⬇️ Download CSV")
218
 
219
  analyze_btn.click(
220
  fn=run_detector,