File size: 10,530 Bytes
31c6aa2
9850f4b
 
 
31c6aa2
 
5310029
 
 
9850f4b
676c241
9850f4b
5310029
676c241
9850f4b
 
 
 
 
5310029
9850f4b
 
 
 
 
 
 
 
 
676c241
9850f4b
 
 
0225ee0
9850f4b
 
 
0225ee0
9850f4b
 
 
0225ee0
9850f4b
0225ee0
9850f4b
 
 
 
 
 
 
 
0225ee0
9850f4b
5310029
 
0225ee0
 
 
 
9850f4b
5310029
0225ee0
9850f4b
 
 
 
 
0225ee0
9850f4b
0225ee0
9850f4b
5310029
9850f4b
0225ee0
9850f4b
 
 
 
 
0225ee0
9850f4b
 
5310029
9850f4b
676c241
9850f4b
 
 
 
 
 
0225ee0
9850f4b
5310029
9850f4b
 
 
 
 
 
 
 
 
 
 
0225ee0
9850f4b
0225ee0
9850f4b
 
 
 
 
0225ee0
9850f4b
 
 
 
676c241
9850f4b
5310029
 
 
 
 
 
 
 
9850f4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5310029
 
 
9850f4b
5310029
0225ee0
9850f4b
 
 
 
 
 
0225ee0
9850f4b
5310029
9850f4b
 
5310029
9850f4b
0225ee0
9850f4b
 
 
 
0225ee0
9850f4b
 
 
 
0225ee0
9850f4b
 
 
 
0225ee0
9850f4b
 
 
 
 
 
0c0c1de
 
9850f4b
0225ee0
 
9850f4b
 
0c0c1de
9850f4b
 
 
0c0c1de
0225ee0
9850f4b
0225ee0
9850f4b
 
 
0225ee0
 
9850f4b
 
0225ee0
9850f4b
0225ee0
9850f4b
 
0225ee0
 
5310029
 
9850f4b
 
 
 
 
0225ee0
5310029
9850f4b
 
 
 
 
0225ee0
9850f4b
 
 
 
0225ee0
 
5310029
 
 
9850f4b
0225ee0
9850f4b
 
 
 
0225ee0
9850f4b
 
 
 
 
 
 
 
0225ee0
9850f4b
 
 
 
0225ee0
9850f4b
 
 
 
 
0225ee0
9850f4b
 
 
 
 
0225ee0
9850f4b
 
 
 
 
0225ee0
 
 
 
 
9850f4b
0225ee0
 
 
 
 
9850f4b
 
 
 
 
 
 
 
5310029
0225ee0
9850f4b
0225ee0
 
9850f4b
 
 
0225ee0
 
5310029
9850f4b
a1f7a6b
0c0c1de
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
"""
Gradio Space for Human-AI Text Attribution (HATA) Model
Detects whether text is human-written or AI-generated
Supports multiple African languages
"""

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer
# Repository id handed to both from_pretrained() calls below.
MODEL_NAME = "msmaje/phdhatamodel"

# NOTE: loading runs at import time, so importing this module triggers the
# from_pretrained() calls (and the two progress prints) before anything else.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Put the module in evaluation mode; this app only runs inference.
model.eval()
print("Model loaded successfully!")

# Language examples
# Each row is [sample text, language name].
# NOTE(review): the gr.Examples call in this file lists a single input
# component (the textbox), so the language column is presumably not consumed
# by the UI -- confirm against the Gradio version in use.
EXAMPLES = [
   
    ["Ìwé yìí jẹ́ ìwé tó dára púpọ̀ fún àwọn akẹ́kọ̀ọ́.", "Yoruba"],
    ["Wannan littafi mai kyau ne ga ɗalibai.", "Hausa"],
    ["Akwụkwọ a dị mma maka ụmụ akwụkwọ.", "Igbo"],
    ["Dis book dey very good for students wey wan learn.", "Nigerian Pidgin"],
   
]

def classify_text(text, show_probabilities=True):
    """
    Classify text as human-written or AI-generated

    Args:
        text: Input text to classify. ``None``, empty and whitespace-only
            input is rejected with a warning message.
        show_probabilities: Whether to also return the per-class
            probability table that feeds the bar plot

    Returns:
        A ``(markdown, plot_data)`` tuple. ``markdown`` is the formatted
        prediction with its confidence and an interpretation line.
        ``plot_data`` is a pandas DataFrame with the columns ``label`` and
        ``probability`` (one row per class), or ``None`` when the input is
        empty or ``show_probabilities`` is false.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to classify.", None

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True,
    )

    # Get prediction
    with torch.no_grad():
        logits = model(**inputs).logits
        # A single input string gives a batch of one; row 0 holds the
        # per-class probabilities.
        class_probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        predicted_class = int(torch.argmax(class_probs).item())
        confidence = class_probs[predicted_class].item()

    # Labels
    # NOTE(review): assumes class index 0 = human and 1 = AI -- confirm
    # against the model's id2label mapping.
    labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}

    # Create result text
    result = f"## Prediction: {labels[predicted_class]}\n"
    result += f"**Confidence:** {confidence:.2%}\n\n"

    # Add interpretation
    if confidence > 0.9:
        result += "✅ **High confidence** - The model is very certain about this prediction."
    elif confidence > 0.7:
        result += "⚠️ **Moderate confidence** - The model is fairly certain, but there's some uncertainty."
    else:
        result += "❓ **Low confidence** - The model is uncertain. The text may have mixed characteristics."

    if not show_probabilities:
        return result, None

    # The bar plot in this file is declared with x="label" and
    # y="probability", so the data must be a table with exactly those
    # columns; a plain {class name: probability} dict is not a valid plot
    # value.
    prob_data = pd.DataFrame(
        {
            "label": ["Human-written", "AI-generated"],
            "probability": [class_probs[0].item(), class_probs[1].item()],
        }
    )
    return result, prob_data

def batch_classify(file):
    """
    Classify multiple texts from uploaded file

    Args:
        file: The uploaded file, given either as a filesystem path string
            or as an object exposing that path through a ``name``
            attribute. ``None`` means nothing was uploaded.

    Returns:
        A single string for display: one result line per non-empty input
        line, formatted as ``"<line no>. [<label> - <confidence>] <preview>"``,
        or a warning/error message when there is nothing to classify or the
        file cannot be read. Line numbers refer to positions in the file, so
        skipped blank lines still count.
    """
    if file is None:
        return "⚠️ Please upload a text file."

    # Read file
    try:
        # The upload may arrive as a plain path string or as a file-like
        # object carrying the path in `.name`; accept both.
        path = file if isinstance(file, str) else file.name
        with open(path, 'r', encoding='utf-8') as f:
            texts = f.readlines()
    except (AttributeError, OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non-UTF-8 files.
        return f"❌ Error reading file: {e}"

    # Process each text
    results = []
    for i, line in enumerate(texts, 1):
        text = line.strip()
        if not text:
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_class].item()

        # NOTE(review): assumes class index 0 = human -- confirm against the
        # model's id2label mapping.
        label = "Human" if predicted_class == 0 else "AI"
        # Only mark the preview as shortened when text was actually cut off.
        preview = text if len(text) <= 100 else f"{text[:100]}..."
        results.append(f"{i}. [{label} - {confidence:.2%}] {preview}")

    if not results:
        return "⚠️ No non-empty lines found in the file."

    return "\n".join(results)

# Custom CSS
# Stylesheet passed to gr.Blocks(css=...). `#title` and `#subtitle` match the
# ids set on the header HTML; `.gradio-container` limits the page to 900px and
# centers it.
# NOTE(review): no component visible in this file assigns the `output-box`
# class -- confirm whether that rule is still needed.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 0.5em;
}

#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 1em;
}

.output-box {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 15px;
}

.gradio-container {
    max-width: 900px;
    margin: auto;
}
"""

# Create Gradio interface
# Layout: header, three tabs (single-text classification, batch file
# classification, static "About" page), then a footer. Event handlers are
# wired to classify_text and batch_classify.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    
    # Header
    gr.Markdown("<h1 id='title'>🔍 Human vs AI Text Detector</h1>")
    gr.Markdown(
        "<p id='subtitle'>Detect whether text is human-written or AI-generated | "
        "Supports African Languages 🌍</p>"
    )
    
    # Main interface
    with gr.Tabs():
        # Tab 1: Single text classification
        with gr.Tab("📝 Single Text"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Enter text to classify",
                        placeholder="Type or paste your text here...",
                        lines=6,
                        max_lines=10
                    )
                    
                    # Forwarded to classify_text as its second argument.
                    show_probs = gr.Checkbox(
                        label="Show probability distribution",
                        value=True
                    )
                    
                    with gr.Row():
                        classify_btn = gr.Button("🔍 Classify Text", variant="primary")
                        # Only the input textbox is registered for clearing;
                        # the result and plot are not in this list.
                        clear_btn = gr.ClearButton([text_input])
                
                with gr.Column(scale=2):
                    result_output = gr.Markdown(label="Result")
                    # x / y name the data columns the plot reads.
                    # NOTE(review): the value classify_text returns for this
                    # output must provide "label" and "probability" -- confirm.
                    prob_plot = gr.BarPlot(
                        x="label",
                        y="probability",
                        title="Probability Distribution",
                        y_lim=[0, 1],
                        height=300,
                        visible=True
                    )
            
            # Examples
            gr.Markdown("### 📚 Try these examples:")
            # NOTE(review): EXAMPLES rows hold two values (text, language) but
            # only one input component is listed here -- confirm how the
            # installed Gradio version treats the extra column.
            gr.Examples(
                examples=EXAMPLES,
                inputs=[text_input],
                label="Example texts in different languages"
            )
            
            # Connect classification function
            # classify_text returns a 2-tuple, mapped positionally onto the
            # Markdown result and the bar plot.
            classify_btn.click(
                fn=classify_text,
                inputs=[text_input, show_probs],
                outputs=[result_output, prob_plot]
            )
        
        # Tab 2: Batch classification
        with gr.Tab("📄 Batch Processing"):
            gr.Markdown("""
            ### Upload a text file for batch classification
            
            Upload a `.txt` file with one text sample per line. 
            The app will classify each line and show the results.
            """)
            
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload text file (.txt)",
                        file_types=[".txt"]
                    )
                    batch_btn = gr.Button("🔍 Classify All", variant="primary")
                
                with gr.Column():
                    batch_output = gr.Textbox(
                        label="Batch Results",
                        lines=15,
                        max_lines=20
                    )
            
            # batch_classify returns one string, shown in the results textbox.
            batch_btn.click(
                fn=batch_classify,
                inputs=file_input,
                outputs=batch_output
            )
        
        # Tab 3: About
        # Static Markdown only; no event handlers are attached in this tab.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            # About This Model
            
            ## 🎯 Purpose
            This model detects whether text is **human-written** or **AI-generated**. 
            It has been specifically trained on African languages to ensure fair and 
            accurate detection across diverse linguistic contexts.
            
            ## 🌍 Supported Languages
            - **English**
            - **Yoruba** (yo)
            - **Hausa** (ha)
            - **Igbo** (ig)
            - **Swahili** (sw)
            - **Amharic** (am)
            - **Nigerian Pidgin** (pcm)
            
            ## 📊 Performance
            - **Accuracy:** 100%
            - **F1 Score:** 100%
            - **Fairness Metrics:** EOD = 0.0, AAOD = 0.0 (Perfect fairness)
            
            ## 🔬 Model Details
            - **Base Model:** [AfroXLMR-base](https://huggingface.co/davlan/afro-xlmr-base)
            - **Parameters:** ~270M (0.3B)
            - **Max Sequence Length:** 128 tokens
            - **Training Dataset:** PhD HATA African Dataset
            
            ## ⚖️ Fairness & Ethics
            This model has been trained with explicit fairness constraints to ensure:
            - Equal performance across all supported languages
            - No bias toward high-resource languages
            - Fair treatment of diverse linguistic communities
            
            ## ⚠️ Limitations
            - Performance may vary on languages outside the training distribution
            - AI detection capabilities are tied to the AI systems present in training data
            - Should be used as one component in content verification, not sole determinant
            - Text length and domain may affect accuracy
            
            ## 📚 Citation
            ```bibtex
            @misc{msmaje2025hata,
              author = {Maje, M.S.},
              title = {AfroXLMR for Human-AI Text Attribution},
              year = {2025},
              publisher = {HuggingFace},
              url = {https://huggingface.co/msmaje/phdhatamodel}
            }
            ```
            
            ## 🔗 Links
            - [Model on HuggingFace](https://huggingface.co/msmaje/phdhatamodel)
            - [Training Visualizations](https://huggingface.co/msmaje/phdhatamodel/tree/main/visualizations)
            - [Dataset](https://huggingface.co/datasets/msmaje/phd-hata-african-dataset)
            
            ## 👤 Contact
            For questions or feedback, please open an issue on the model repository.
            """)
    
    # Footer
    gr.Markdown("""
    ---
    <div style='text-align: center; color: #666; padding: 20px;'>
        <p>Built with 💜 for African Language NLP | Powered by AfroXLMR</p>
        <p>Model: <a href='https://huggingface.co/msmaje/phdhatamodel'>msmaje/phdhatamodel</a></p>
    </div>
    """)

# Launch
# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    demo.launch()