msmaje commited on
Commit
adf71b3
·
verified ·
1 Parent(s): 31b076e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -284
app.py CHANGED
@@ -4,312 +4,95 @@ Detects whether text is human-written or AI-generated
4
  Supports multiple African languages
5
  """
6
 
 
7
  import os
 
 
 
8
  os.environ["GRADIO_DISABLE_PYDUB"] = "1"
9
 
 
 
 
 
 
 
 
10
  import gradio as gr
11
  import torch
12
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
  import numpy as np
 
14
 
15
- # Load model and tokenizer
16
- MODEL_NAME = "msmaje/phdhatamodel"
 
 
 
17
 
18
- print("Loading model...")
19
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
20
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 
21
  model.eval()
22
- print("Model loaded successfully!")
23
 
24
- # Language examples
25
- EXAMPLES = [
26
- ["Ìwé yìí jẹ́ ìwé tó dára púpọ̀ fún àwọn akẹ́kọ̀ọ́.", "Yoruba"],
27
- ["Wannan littafi mai kyau ne ga ɗalibai.", "Hausa"],
28
- ["Akwụkwọ a dị mma maka ụmụ akwụkwọ.", "Igbo"],
29
- ["Dis book dey very good for students wey wan learn.", "Nigerian Pidgin"],
30
-
31
- ]
 
32
 
33
- def classify_text(text, show_probabilities=True):
34
- """
35
- Classify text as human-written or AI-generated
36
-
37
- Args:
38
- text: Input text to classify
39
- show_probabilities: Whether to show probability scores
40
-
41
- Returns:
42
- Classification result with confidence scores
43
- """
44
- if not text or len(text.strip()) == 0:
45
- return "⚠️ Please enter some text to classify.", None
46
-
47
- # Tokenize
48
  inputs = tokenizer(
49
  text,
50
  return_tensors="pt",
51
  truncation=True,
52
- max_length=128,
53
- padding=True
54
- )
55
-
56
- # Get prediction
57
- with torch.no_grad():
58
- outputs = model(**inputs)
59
- probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
60
- predicted_class = torch.argmax(probabilities, dim=-1).item()
61
- confidence = probabilities[0][predicted_class].item()
62
-
63
- # Labels
64
- labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}
65
-
66
- # Create result text
67
- result = f"## Prediction: {labels[predicted_class]}\n"
68
- result += f"**Confidence:** {confidence:.2%}\n\n"
69
-
70
- # Add interpretation
71
- if confidence > 0.9:
72
- result += "✅ **High confidence** - The model is very certain about this prediction."
73
- elif confidence > 0.7:
74
- result += "⚠️ **Moderate confidence** - The model is fairly certain, but there's some uncertainty."
75
- else:
76
- result += "❓ **Low confidence** - The model is uncertain. The text may have mixed characteristics."
77
-
78
- # Probability chart data
79
- prob_data = {
80
- "Human-written": float(probabilities[0][0].item()),
81
- "AI-generated": float(probabilities[0][1].item())
82
- }
83
-
84
- if show_probabilities:
85
- return result, prob_data
86
- else:
87
- return result, None
88
 
89
- def batch_classify(file):
90
- """
91
- Classify multiple texts from uploaded file
92
- """
93
- if file is None:
94
- return "⚠️ Please upload a text file."
95
-
96
- # Read file
97
- try:
98
- with open(file.name, 'r', encoding='utf-8') as f:
99
- texts = f.readlines()
100
- except Exception as e:
101
- return f"❌ Error reading file: {e}"
102
-
103
- # Process each text
104
- results = []
105
- for i, text in enumerate(texts, 1):
106
- text = text.strip()
107
- if not text:
108
- continue
109
-
110
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
111
-
112
- with torch.no_grad():
113
- outputs = model(**inputs)
114
- probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
115
- predicted_class = torch.argmax(probabilities, dim=-1).item()
116
- confidence = probabilities[0][predicted_class].item()
117
-
118
- label = "Human" if predicted_class == 0 else "AI"
119
- results.append(f"{i}. [{label} - {confidence:.2%}] {text[:100]}...")
120
-
121
- return "\n".join(results)
122
 
123
- # Custom CSS
124
- custom_css = """
125
- #title {
126
- text-align: center;
127
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
128
- -webkit-background-clip: text;
129
- -webkit-text-fill-color: transparent;
130
- font-size: 2.5em;
131
- font-weight: bold;
132
- margin-bottom: 0.5em;
133
- }
134
 
135
- #subtitle {
136
- text-align: center;
137
- color: #666;
138
- font-size: 1.2em;
139
- margin-bottom: 1em;
140
- }
 
141
 
142
- .output-box {
143
- border: 2px solid #667eea;
144
- border-radius: 10px;
145
- padding: 15px;
146
- }
147
 
148
- .gradio-container {
149
- max-width: 900px;
150
- margin: auto;
151
- }
152
- """
153
-
154
- # Create Gradio interface
155
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
156
-
157
- # Header
158
- gr.Markdown("<h1 id='title'>🔍 Human vs AI Text Detector</h1>")
159
- gr.Markdown(
160
- "<p id='subtitle'>Detect whether text is human-written or AI-generated | "
161
- "Supports African Languages 🌍</p>"
162
  )
163
-
164
- # Main interface
165
- with gr.Tabs():
166
- # Tab 1: Single text classification
167
- with gr.Tab("📝 Single Text"):
168
- with gr.Row():
169
- with gr.Column(scale=2):
170
- text_input = gr.Textbox(
171
- label="Enter text to classify",
172
- placeholder="Type or paste your text here...",
173
- lines=6,
174
- max_lines=10
175
- )
176
-
177
- show_probs = gr.Checkbox(
178
- label="Show probability distribution",
179
- value=True
180
- )
181
-
182
- with gr.Row():
183
- classify_btn = gr.Button("🔍 Classify Text", variant="primary")
184
- clear_btn = gr.ClearButton([text_input])
185
-
186
- with gr.Column(scale=2):
187
- result_output = gr.Markdown(label="Result")
188
- prob_plot = gr.BarPlot(
189
- x="label",
190
- y="probability",
191
- title="Probability Distribution",
192
- y_lim=[0, 1],
193
- height=300,
194
- visible=True
195
- )
196
-
197
- # Examples
198
- gr.Markdown("### 📚 Try these examples:")
199
- gr.Examples(
200
- examples=EXAMPLES,
201
- inputs=[text_input],
202
- label="Example texts in different languages"
203
- )
204
-
205
- # Connect classification function
206
- classify_btn.click(
207
- fn=classify_text,
208
- inputs=[text_input, show_probs],
209
- outputs=[result_output, prob_plot]
210
- )
211
-
212
- # Tab 2: Batch classification
213
- with gr.Tab("📄 Batch Processing"):
214
- gr.Markdown("""
215
- ### Upload a text file for batch classification
216
-
217
- Upload a `.txt` file with one text sample per line.
218
- The app will classify each line and show the results.
219
- """)
220
-
221
- with gr.Row():
222
- with gr.Column():
223
- file_input = gr.File(
224
- label="Upload text file (.txt)",
225
- file_types=[".txt"]
226
- )
227
- batch_btn = gr.Button("🔍 Classify All", variant="primary")
228
-
229
- with gr.Column():
230
- batch_output = gr.Textbox(
231
- label="Batch Results",
232
- lines=15,
233
- max_lines=20
234
- )
235
-
236
- batch_btn.click(
237
- fn=batch_classify,
238
- inputs=file_input,
239
- outputs=batch_output
240
  )
241
-
242
- # Tab 3: About
243
- with gr.Tab("ℹ️ About"):
244
- gr.Markdown("""
245
- # About This Model
246
-
247
- ## 🎯 Purpose
248
- This model detects whether text is **human-written** or **AI-generated**.
249
- It has been specifically trained on African languages to ensure fair and
250
- accurate detection across diverse linguistic contexts.
251
-
252
- ## 🌍 Supported Languages
253
- - **English**
254
- - **Yoruba** (yo)
255
- - **Hausa** (ha)
256
- - **Igbo** (ig)
257
- - **Swahili** (sw)
258
- - **Amharic** (am)
259
- - **Nigerian Pidgin** (pcm)
260
-
261
- ## 📊 Performance
262
- - **Accuracy:** 100%
263
- - **F1 Score:** 100%
264
- - **Fairness Metrics:** EOD = 0.0, AAOD = 0.0 (Perfect fairness)
265
-
266
- ## 🔬 Model Details
267
- - **Base Model:** [AfroXLMR-base](https://huggingface.co/davlan/afro-xlmr-base)
268
- - **Parameters:** ~270M (0.3B)
269
- - **Max Sequence Length:** 128 tokens
270
- - **Training Dataset:** PhD HATA African Dataset
271
-
272
- ## ⚖️ Fairness & Ethics
273
- This model has been trained with explicit fairness constraints to ensure:
274
- - Equal performance across all supported languages
275
- - No bias toward high-resource languages
276
- - Fair treatment of diverse linguistic communities
277
-
278
- ## ⚠️ Limitations
279
- - Performance may vary on languages outside the training distribution
280
- - AI detection capabilities are tied to the AI systems present in training data
281
- - Should be used as one component in content verification, not sole determinant
282
- - Text length and domain may affect accuracy
283
-
284
- ## 📚 Citation
285
- ```bibtex
286
- @misc{msmaje2025hata,
287
- author = {Maje, M.S.},
288
- title = {AfroXLMR for Human-AI Text Attribution},
289
- year = {2025},
290
- publisher = {HuggingFace},
291
- url = {https://huggingface.co/msmaje/phdhatamodel}
292
- }
293
- ```
294
-
295
- ## 🔗 Links
296
- - [Model on HuggingFace](https://huggingface.co/msmaje/phdhatamodel)
297
- - [Training Visualizations](https://huggingface.co/msmaje/phdhatamodel/tree/main/visualizations)
298
- - [Dataset](https://huggingface.co/datasets/msmaje/phd-hata-african-dataset)
299
-
300
- ## 👤 Contact
301
- For questions or feedback, please open an issue on the model repository.
302
- """)
303
-
304
- # Footer
305
- gr.Markdown("""
306
- ---
307
- <div style='text-align: center; color: #666; padding: 20px;'>
308
- <p>Built with 💜 for African Language NLP | Powered by AfroXLMR</p>
309
- <p>Model: <a href='https://huggingface.co/msmaje/phdhatamodel'>msmaje/phdhatamodel</a></p>
310
- </div>
311
- """)
312
 
313
- # Launch
 
 
314
  if __name__ == "__main__":
315
- demo.launch()
 
4
  Supports multiple African languages
5
  """
6
 
7
+ # --- Deterministic suppression of Gradio audio stack under Python 3.13 ---
8
  import os
9
+ import sys
10
+ import types
11
+
12
  os.environ["GRADIO_DISABLE_PYDUB"] = "1"
13
 
14
+ # Provide stubs so that pydub cannot fail on audioop / pyaudioop
15
+ if "audioop" not in sys.modules:
16
+ sys.modules["audioop"] = types.ModuleType("audioop")
17
+ if "pyaudioop" not in sys.modules:
18
+ sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")
19
+
20
+ # Now it is safe to import Gradio and the rest of the stack
21
  import gradio as gr
22
  import torch
 
23
  import numpy as np
24
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
25
 
26
+ # ----------------------------------------------------------------------
27
+ # Model configuration
28
+ # ----------------------------------------------------------------------
29
+ MODEL_NAME = "distilbert-base-multilingual-cased" # replace with your fine-tuned HATA checkpoint if available
30
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
 
 
32
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
33
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
34
+ model.to(DEVICE)
35
  model.eval()
 
36
 
37
+ LABELS = ["Human-written", "AI-generated"]
38
+
39
+ # ----------------------------------------------------------------------
40
+ # Inference routine
41
+ # ----------------------------------------------------------------------
42
+ @torch.no_grad()
43
+ def hata_predict(text: str):
44
+ if not text or not text.strip():
45
+ return {"Human-written": 0.0, "AI-generated": 0.0}
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  inputs = tokenizer(
48
  text,
49
  return_tensors="pt",
50
  truncation=True,
51
+ padding=True,
52
+ max_length=512,
53
+ ).to(DEVICE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ outputs = model(**inputs)
56
+ logits = outputs.logits.squeeze(0)
57
+ probs = torch.softmax(logits, dim=-1).cpu().numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
+ return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
 
 
 
 
 
 
 
 
 
 
60
 
61
+ # ----------------------------------------------------------------------
62
+ # Gradio interface
63
+ # ----------------------------------------------------------------------
64
+ with gr.Blocks(title="Multilingual HATA System") as demo:
65
+ gr.Markdown(
66
+ """
67
+ # Multilingual Human–AI Text Attribution (HATA)
68
 
69
+ This system estimates whether an input passage is **human-written** or
70
+ **AI-generated**, with a focus on multilingual and African-language use
71
+ cases (e.g., Hausa, Yoruba, Igbo, Pidgin).
 
 
72
 
73
+ The backend is a Transformer-based classifier fine-tuned for attribution.
74
+ """
 
 
 
 
 
 
 
 
 
 
 
 
75
  )
76
+
77
+ with gr.Row():
78
+ with gr.Column(scale=3):
79
+ text_input = gr.Textbox(
80
+ label="Input Text",
81
+ placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
82
+ lines=8,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  )
84
+ submit_btn = gr.Button("Analyze")
85
+ with gr.Column(scale=2):
86
+ output = gr.Label(label="Attribution Probabilities")
87
+
88
+ submit_btn.click(
89
+ fn=hata_predict,
90
+ inputs=text_input,
91
+ outputs=output,
92
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ # ----------------------------------------------------------------------
95
+ # Entry point
96
+ # ----------------------------------------------------------------------
97
  if __name__ == "__main__":
98
+ demo.launch()