KavinduHansaka committed on
Commit
c3b4434
·
verified ·
1 Parent(s): f1f7273

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -41
app.py CHANGED
@@ -3,14 +3,20 @@ import pandas as pd
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
  from typing import List, Tuple
 
 
 
6
 
7
  # =========================
8
  # Configuration
9
  # =========================
10
  MODEL_NAME = "openai-community/roberta-base-openai-detector"
11
  AI_THRESHOLD = 0.5
 
12
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
13
 
 
 
14
  # =========================
15
  # Model Loading (once)
16
  # =========================
@@ -19,20 +25,62 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
19
  model.to(DEVICE)
20
  model.eval()
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # =========================
23
  # Core Logic
24
  # =========================
25
  @torch.no_grad()
26
  def detect_ai_probability(texts: List[str]) -> List[float]:
27
- """
28
- Returns probability that each text is AI-generated.
29
- """
30
  inputs = tokenizer(
31
  texts,
32
  return_tensors="pt",
33
  padding=True,
34
  truncation=True,
35
- max_length=512
36
  ).to(DEVICE)
37
 
38
  logits = model(**inputs).logits
@@ -40,14 +88,11 @@ def detect_ai_probability(texts: List[str]) -> List[float]:
40
  return probs.cpu().tolist()
41
 
42
 
43
- def classify_texts(texts: List[str]) -> pd.DataFrame:
44
- """
45
- Classify texts as AI or Human.
46
- """
47
- probabilities = detect_ai_probability(texts)
48
 
49
  df = pd.DataFrame({
50
- "Comment": texts,
51
  "AI Probability": [round(p, 4) for p in probabilities],
52
  "Prediction": [
53
  "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
@@ -58,55 +103,74 @@ def classify_texts(texts: List[str]) -> pd.DataFrame:
58
  return df
59
 
60
 
61
- def run_detector(text_input: str, csv_file) -> Tuple[pd.DataFrame, Tuple[str, bytes]]:
62
- """
63
- Handles UI input and output.
64
- """
65
  texts: List[str] = []
66
 
 
67
  if text_input.strip():
68
- texts.extend([t.strip() for t in text_input.split("\n") if t.strip()])
69
 
70
- if csv_file:
71
- df = pd.read_csv(csv_file.name)
72
- if "comment" not in df.columns:
73
- return pd.DataFrame({"Error": ["CSV must contain a 'comment' column"]}), None
74
- texts.extend(df["comment"].astype(str).tolist())
75
 
76
  if not texts:
77
  return pd.DataFrame({"Error": ["No input provided"]}), None
78
 
79
- result_df = classify_texts(texts)
 
 
 
 
 
 
 
 
80
 
81
- csv_bytes = result_df.to_csv(index=False).encode("utf-8")
82
- return result_df, ("ai_detection_results.csv", csv_bytes)
 
 
 
 
 
83
 
 
 
 
 
84
 
85
  # =========================
86
- # Gradio UI
87
  # =========================
88
- with gr.Blocks(title="🧪 AI Text Detector") as app:
89
- gr.Markdown("## 🧪 AI Text Detector")
90
- gr.Markdown("Detect whether text is **AI-generated or human-written**.")
91
-
92
- with gr.Row():
93
- text_input = gr.Textbox(
94
- lines=8,
95
- label="✍️ Paste Text (one per line)",
96
- placeholder="Enter multiple comments, one per line..."
97
- )
98
- csv_input = gr.File(
99
- label="📄 Upload CSV",
100
- file_types=[".csv"]
101
- )
 
 
 
 
102
 
103
  analyze_btn = gr.Button("🔍 Analyze")
104
- output_table = gr.Dataframe(label="📊 Results")
105
- download_file = gr.File(label="⬇️ Download CSV")
106
 
107
  analyze_btn.click(
108
  fn=run_detector,
109
- inputs=[text_input, csv_input],
110
  outputs=[output_table, download_file]
111
  )
112
 
 
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
  from typing import List, Tuple
6
+ from pathlib import Path
7
+ import fitz # PyMuPDF
8
+ import docx
9
 
10
  # =========================
11
  # Configuration
12
  # =========================
13
  MODEL_NAME = "openai-community/roberta-base-openai-detector"
14
  AI_THRESHOLD = 0.5
15
+ MAX_LENGTH = 512
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
 
18
+ SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".docx"}
19
+
20
  # =========================
21
  # Model Loading (once)
22
  # =========================
 
25
  model.to(DEVICE)
26
  model.eval()
27
 
28
+ # =========================
29
+ # File Loaders
30
+ # =========================
31
def load_text_from_file(file_path: str) -> str:
    """Extract the plain text of a supported document.

    Args:
        file_path: Path to a ``.txt``, ``.pdf`` or ``.docx`` file. The
            extension is matched case-insensitively.

    Returns:
        The text content of the file.

    Raises:
        ValueError: If the file extension is not in ``SUPPORTED_EXTENSIONS``
            or no loader is available for it.
    """
    path = Path(file_path)

    # Normalize once so the support check and the dispatch below agree.
    # Comparing the raw suffix let "REPORT.PDF" pass the check and then fall
    # through every branch, silently returning None.
    suffix = path.suffix.lower()

    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {path.suffix}")

    if suffix == ".txt":
        return path.read_text(encoding="utf-8", errors="ignore")

    if suffix == ".pdf":
        return load_pdf(path)

    if suffix == ".docx":
        return load_docx(path)

    # Only reachable if SUPPORTED_EXTENSIONS lists a type without a loader;
    # fail loudly rather than return None from a function annotated -> str.
    raise ValueError(f"Unsupported file type: {path.suffix}")
45
+
46
+
47
def load_pdf(path: Path) -> str:
    """Return the text of every page of the PDF at *path*, joined by newlines."""
    with fitz.open(path) as pdf_doc:
        page_texts = [page.get_text() for page in pdf_doc]
    return "\n".join(page_texts)
53
+
54
+
55
def load_docx(path: Path) -> str:
    """Return the non-blank paragraphs of the Word file at *path*, joined by newlines."""
    paragraphs = docx.Document(path).paragraphs
    kept = [para.text for para in paragraphs if para.text.strip()]
    return "\n".join(kept)
58
+
59
+ # =========================
60
+ # Text Utilities
61
+ # =========================
62
def chunk_text(text: str, max_words: int = 200, *, min_words: int = 20) -> List[str]:
    """Split *text* into consecutive chunks of whitespace-separated words.

    Args:
        text: The text to split. Words are whatever ``str.split()`` yields.
        max_words: Maximum number of words per chunk; must be at least 1.
        min_words: Chunks with fewer words than this are dropped. In practice
            only the trailing chunk can be shorter than ``max_words``.

    Returns:
        The chunks in document order, each a single-space-joined string.
        Empty when *text* has fewer than ``min_words`` words.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # range() would otherwise fail with an unrelated-looking message
        # (step 0) or silently produce no chunks (negative step).
        raise ValueError(f"max_words must be a positive integer, got {max_words}")

    words = text.split()
    chunks = []

    for start in range(0, len(words), max_words):
        window = words[start:start + max_words]
        # Words contain no whitespace, so len(window) is already the word
        # count; there is no need to re-split the joined chunk.
        if len(window) >= min_words:
            chunks.append(" ".join(window))

    return chunks
72
+
73
  # =========================
74
  # Core Logic
75
  # =========================
76
  @torch.no_grad()
77
  def detect_ai_probability(texts: List[str]) -> List[float]:
 
 
 
78
  inputs = tokenizer(
79
  texts,
80
  return_tensors="pt",
81
  padding=True,
82
  truncation=True,
83
+ max_length=MAX_LENGTH
84
  ).to(DEVICE)
85
 
86
  logits = model(**inputs).logits
 
88
  return probs.cpu().tolist()
89
 
90
 
91
+ def classify_chunks(chunks: List[str]) -> pd.DataFrame:
92
+ probabilities = detect_ai_probability(chunks)
 
 
 
93
 
94
  df = pd.DataFrame({
95
+ "Text Chunk": chunks,
96
  "AI Probability": [round(p, 4) for p in probabilities],
97
  "Prediction": [
98
  "🤖 Likely AI" if p >= AI_THRESHOLD else "🧍 Human"
 
103
  return df
104
 
105
 
106
def run_detector(text_input: str, uploaded_files) -> Tuple[pd.DataFrame, Tuple[str, bytes]]:
    """Classify pasted text and/or uploaded documents chunk by chunk.

    Args:
        text_input: Text pasted by the user; ``None`` or blank means none.
        uploaded_files: Iterable of uploads, each either an object exposing
            its path as ``.name`` or a plain path string; ``None`` or empty
            means no uploads.

    Returns:
        A ``(table, download)`` pair. On success ``table`` has one row per
        chunk plus a final "📄 Document Summary" row carrying the mean AI
        probability over all chunks, and ``download`` is
        ``(filename, csv_bytes)`` for that same table. On failure ``table``
        is a one-column "Error" frame and ``download`` is ``None``.
    """
    texts: List[str] = []

    # Manual text input; tolerate None as well as a blank string.
    pasted = (text_input or "").strip()
    if pasted:
        texts.append(pasted)

    # File inputs
    for upload in uploaded_files or []:
        file_path = getattr(upload, "name", upload)
        try:
            texts.append(load_text_from_file(file_path))
        except Exception as exc:
            # UI boundary: report unreadable/unsupported files in the same
            # "Error" frame used below instead of letting the handler crash.
            shown_name = Path(str(file_path)).name
            return pd.DataFrame({"Error": [f"Could not read {shown_name}: {exc}"]}), None

    if not texts:
        return pd.DataFrame({"Error": ["No input provided"]}), None

    # Chunk all inputs
    all_chunks = []
    for text in texts:
        all_chunks.extend(chunk_text(text))

    if not all_chunks:
        return pd.DataFrame({"Error": ["Text too short for analysis"]}), None

    result_df = classify_chunks(all_chunks)

    # Document-level summary: mean of the per-chunk probabilities.
    avg_score = result_df["AI Probability"].mean()
    summary_row = pd.DataFrame([{
        "Text Chunk": "📄 Document Summary",
        "AI Probability": round(avg_score, 4),
        "Prediction": "🤖 Likely AI" if avg_score >= AI_THRESHOLD else "🧍 Human"
    }])

    final_df = pd.concat([result_df, summary_row], ignore_index=True)

    csv_bytes = final_df.to_csv(index=False).encode("utf-8")
    # NOTE(review): this value feeds a gr.File output, which is documented as
    # taking a file path. Confirm the installed Gradio version accepts a
    # (filename, bytes) tuple; otherwise write the CSV to a temp file and
    # return its path.
    return final_df, ("ai_document_detection.csv", csv_bytes)
144
 
145
  # =========================
146
+ # Gradio UI (HF Space)
147
  # =========================
148
with gr.Blocks(title="🧪 Offline AI Document Detector") as app:
    # Page header and short description.
    gr.Markdown("## 🧪 Offline AI Document Detector")
    gr.Markdown(
        "Analyze **PDF, Word, TXT, or pasted text** to detect whether content is AI-generated. "
        "Runs fully offline using an open-source RoBERTa model."
    )

    # Inputs: free-form pasted text and/or any number of uploaded documents.
    text_input = gr.Textbox(
        label="✍️ Paste Text (optional)",
        placeholder="Paste any text here...",
        lines=6,
    )
    file_input = gr.File(
        label="📂 Upload Documents (PDF, DOCX, TXT)",
        file_count="multiple",
        file_types=[".pdf", ".docx", ".txt"],
    )

    # Trigger and outputs: results table plus a downloadable copy.
    analyze_btn = gr.Button("🔍 Analyze")
    output_table = gr.Dataframe(label="📊 Detection Results")
    download_file = gr.File(label="⬇️ Download Results")

    # Both inputs go to run_detector; its two return values fill the outputs.
    analyze_btn.click(
        fn=run_detector,
        inputs=[text_input, file_input],
        outputs=[output_table, download_file],
    )
176