msmaje committed on
Commit
a1f7a6b
·
verified ·
1 Parent(s): 47549c3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +312 -0
app.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Space for Human-AI Text Attribution (HATA) Model
3
+ Detects whether text is human-written or AI-generated
4
+ Supports multiple African languages
5
+ """
6
+
7
+ import gradio as gr
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+ import numpy as np
11
+
12
+ # Load model and tokenizer
13
+ MODEL_NAME = "msmaje/phdhatamodel"
14
+
15
+ print("Loading model...")
16
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
17
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
18
+ model.eval()
19
+ print("Model loaded successfully!")
20
+
21
# Language examples
# Each entry is [example_text, language_name].
# NOTE(review): entries have two columns but gr.Examples below is wired to a
# single input component (inputs=[text_input]) — confirm Gradio tolerates the
# extra column or drop the language column here.
EXAMPLES = [
    ["ÌwΓ© yìí jẹ́ Γ¬wΓ© tΓ³ dΓ‘ra pΓΊpọ̀ fΓΊn Γ wọn akẹ́kọ̀ọ́.", "Yoruba"],
    ["Wannan littafi mai kyau ne ga Ι—alibai.", "Hausa"],
    ["Akwα»₯kwọ a dα»‹ mma maka α»₯mα»₯ akwα»₯kwọ.", "Igbo"],
    ["Dis book dey very good for students wey wan learn.", "Nigerian Pidgin"],

]
29
+
30
def classify_text(text, show_probabilities=True):
    """
    Classify text as human-written or AI-generated.

    Args:
        text: Input text to classify.
        show_probabilities: Whether to also return data for the probability
            chart (second element of the returned tuple).

    Returns:
        Tuple of (markdown result string, chart data or None). The chart data
        is a pandas DataFrame with "label" and "probability" columns, which is
        what the gr.BarPlot(x="label", y="probability") output component
        expects — a plain dict keyed by class name does not match that schema
        and the chart never renders.
    """
    # Local import: pandas is only needed to build the chart payload.
    import pandas as pd

    # Guard: empty / whitespace-only input.
    if not text or len(text.strip()) == 0:
        return "⚠️ Please enter some text to classify.", None

    # Tokenize (truncate to the model's 128-token training length).
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True,
    )

    # Inference without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    # Class-index -> display label mapping (0 = human, 1 = AI).
    labels = {0: "πŸ‘€ Human-written", 1: "πŸ€– AI-generated"}

    # Create result text.
    result = f"## Prediction: {labels[predicted_class]}\n"
    result += f"**Confidence:** {confidence:.2%}\n\n"

    # Add a plain-language interpretation of the confidence score.
    if confidence > 0.9:
        result += "βœ… **High confidence** - The model is very certain about this prediction."
    elif confidence > 0.7:
        result += "⚠️ **Moderate confidence** - The model is fairly certain, but there's some uncertainty."
    else:
        result += "❓ **Low confidence** - The model is uncertain. The text may have mixed characteristics."

    if not show_probabilities:
        return result, None

    # Tabular payload matching the BarPlot's x="label" / y="probability" axes.
    prob_data = pd.DataFrame(
        {
            "label": ["Human-written", "AI-generated"],
            "probability": [
                float(probabilities[0][0].item()),
                float(probabilities[0][1].item()),
            ],
        }
    )
    return result, prob_data
85
+
86
def batch_classify(file):
    """
    Classify multiple texts from an uploaded file.

    Args:
        file: Gradio file object; a UTF-8 text file with one sample per line.

    Returns:
        A newline-joined report with one entry per non-empty input line, or a
        warning/error message string.
    """
    if file is None:
        return "⚠️ Please upload a text file."

    # Read file (Gradio hands us a temp-file wrapper; .name is its path).
    try:
        with open(file.name, 'r', encoding='utf-8') as f:
            texts = f.readlines()
    except Exception as e:
        return f"❌ Error reading file: {e}"

    # Process each text. Enumerate over the raw lines so the reported index
    # matches the file's line number even when blank lines are skipped.
    results = []
    for i, text in enumerate(texts, 1):
        text = text.strip()
        if not text:
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_class].item()

        label = "Human" if predicted_class == 0 else "AI"
        # Only append an ellipsis when the preview was actually truncated.
        preview = text[:100] + ("..." if len(text) > 100 else "")
        results.append(f"{i}. [{label} - {confidence:.2%}] {preview}")

    # A file containing only blank lines previously produced an empty string;
    # return an explicit message instead.
    if not results:
        return "⚠️ The file contains no non-empty lines to classify."

    return "\n".join(results)
119
+
120
# Custom CSS
# Injected via gr.Blocks(css=...): gradient page title, centered subtitle,
# a bordered output box class, and a capped container width.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 0.5em;
}

#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 1em;
}

.output-box {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 15px;
}

.gradio-container {
    max-width: 900px;
    margin: auto;
}
"""
150
+
151
# Create Gradio interface
# Three tabs: single-text classification, batch file processing, and a static
# "About" model card. `demo` is launched at the bottom of the file.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    # Header
    gr.Markdown("<h1 id='title'>πŸ” Human vs AI Text Detector</h1>")
    gr.Markdown(
        "<p id='subtitle'>Detect whether text is human-written or AI-generated | "
        "Supports African Languages 🌍</p>"
    )

    # Main interface
    with gr.Tabs():
        # Tab 1: Single text classification
        with gr.Tab("πŸ“ Single Text"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Enter text to classify",
                        placeholder="Type or paste your text here...",
                        lines=6,
                        max_lines=10
                    )

                    # Toggle forwarded to classify_text's second argument.
                    show_probs = gr.Checkbox(
                        label="Show probability distribution",
                        value=True
                    )

                    with gr.Row():
                        classify_btn = gr.Button("πŸ” Classify Text", variant="primary")
                        clear_btn = gr.ClearButton([text_input])

                with gr.Column(scale=2):
                    result_output = gr.Markdown(label="Result")
                    # Receives the second return value of classify_text;
                    # expects tabular data with "label"/"probability" columns.
                    prob_plot = gr.BarPlot(
                        x="label",
                        y="probability",
                        title="Probability Distribution",
                        y_lim=[0, 1],
                        height=300,
                        visible=True
                    )

            # Examples
            gr.Markdown("### πŸ“š Try these examples:")
            gr.Examples(
                examples=EXAMPLES,
                inputs=[text_input],
                label="Example texts in different languages"
            )

            # Connect classification function
            classify_btn.click(
                fn=classify_text,
                inputs=[text_input, show_probs],
                outputs=[result_output, prob_plot]
            )

        # Tab 2: Batch classification
        with gr.Tab("πŸ“„ Batch Processing"):
            gr.Markdown("""
            ### Upload a text file for batch classification

            Upload a `.txt` file with one text sample per line.
            The app will classify each line and show the results.
            """)

            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload text file (.txt)",
                        file_types=[".txt"]
                    )
                    batch_btn = gr.Button("πŸ” Classify All", variant="primary")

                with gr.Column():
                    batch_output = gr.Textbox(
                        label="Batch Results",
                        lines=15,
                        max_lines=20
                    )

            batch_btn.click(
                fn=batch_classify,
                inputs=file_input,
                outputs=batch_output
            )

        # Tab 3: About — static model card content, no event handlers.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            # About This Model

            ## 🎯 Purpose
            This model detects whether text is **human-written** or **AI-generated**.
            It has been specifically trained on African languages to ensure fair and
            accurate detection across diverse linguistic contexts.

            ## 🌍 Supported Languages
            - **English**
            - **Yoruba** (yo)
            - **Hausa** (ha)
            - **Igbo** (ig)
            - **Swahili** (sw)
            - **Amharic** (am)
            - **Nigerian Pidgin** (pcm)

            ## πŸ“Š Performance
            - **Accuracy:** 100%
            - **F1 Score:** 100%
            - **Fairness Metrics:** EOD = 0.0, AAOD = 0.0 (Perfect fairness)

            ## πŸ”¬ Model Details
            - **Base Model:** [AfroXLMR-base](https://huggingface.co/davlan/afro-xlmr-base)
            - **Parameters:** ~270M (0.3B)
            - **Max Sequence Length:** 128 tokens
            - **Training Dataset:** PhD HATA African Dataset

            ## βš–οΈ Fairness & Ethics
            This model has been trained with explicit fairness constraints to ensure:
            - Equal performance across all supported languages
            - No bias toward high-resource languages
            - Fair treatment of diverse linguistic communities

            ## ⚠️ Limitations
            - Performance may vary on languages outside the training distribution
            - AI detection capabilities are tied to the AI systems present in training data
            - Should be used as one component in content verification, not sole determinant
            - Text length and domain may affect accuracy

            ## πŸ“š Citation
            ```bibtex
            @misc{msmaje2025hata,
                author = {Maje, M.S.},
                title = {AfroXLMR for Human-AI Text Attribution},
                year = {2025},
                publisher = {HuggingFace},
                url = {https://huggingface.co/msmaje/phdhatamodel}
            }
            ```

            ## πŸ”— Links
            - [Model on HuggingFace](https://huggingface.co/msmaje/phdhatamodel)
            - [Training Visualizations](https://huggingface.co/msmaje/phdhatamodel/tree/main/visualizations)
            - [Dataset](https://huggingface.co/datasets/msmaje/phd-hata-african-dataset)

            ## πŸ‘€ Contact
            For questions or feedback, please open an issue on the model repository.
            """)

    # Footer
    gr.Markdown("""
    ---
    <div style='text-align: center; color: #666; padding: 20px;'>
        <p>Built with πŸ’œ for African Language NLP | Powered by AfroXLMR</p>
        <p>Model: <a href='https://huggingface.co/msmaje/phdhatamodel'>msmaje/phdhatamodel</a></p>
    </div>
    """)
309
+
310
# Launch
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()