"""
Enhanced Gradio Space for Human-AI Text Attribution (HATA) Model
With Comprehensive Bias Detection and Explainability (SHAP/LIME)
Supports multiple African languages with fairness auditing
"""

# --- Standard library --------------------------------------------------------
import importlib
import math
import os
import sys
import types
from collections import defaultdict

# The ``audioop`` module was removed from the standard library in Python 3.13.
# The stubs below exist so that packages which import it can still be loaded
# (presumably gradio -> pydub; confirm against the installed gradio version).
# They must be registered *before* the third-party imports further down,
# otherwise the import that needs them has already failed.  A stub is only
# installed when the real module cannot be imported, so a working ``audioop``
# is never shadowed by an empty module.
os.environ["GRADIO_DISABLE_PYDUB"] = "1"
for _stub_name in ("audioop", "pyaudioop"):
    if _stub_name in sys.modules:
        continue
    try:
        importlib.import_module(_stub_name)
    except ImportError:
        sys.modules[_stub_name] = types.ModuleType(_stub_name)

# --- Third-party (intentionally after the shim above) ------------------------
import gradio as gr  # noqa: E402
import torch  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # noqa: E402
from sklearn.metrics import confusion_matrix, classification_report  # noqa: E402
import matplotlib.pyplot as plt  # noqa: E402
import seaborn as sns  # noqa: E402

# Optional explainability back-ends.  Each one is probed independently and the
# outcome is recorded in a module-level flag that the rest of the file checks
# before touching ``shap`` / ``LimeTextExplainer``.
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("⚠️ SHAP not available. Install with: pip install shap")

try:
    from lime.lime_text import LimeTextExplainer
    LIME_AVAILABLE = True
except ImportError:
    LIME_AVAILABLE = False
    print("⚠️ LIME not available. Install with: pip install lime")


# Hugging Face Hub identifier passed to ``from_pretrained`` for both the
# tokenizer and the classification model.
MODEL_NAME = "msmaje/phdhatamodel"

# Display name -> short language code.  Insertion order is the order in which
# the languages are offered in the UI dropdown.
LANGUAGE_CODES = {
    "Hausa": "ha",
    "Yoruba": "yo",
    "Igbo": "ig",
    "Swahili": "sw",
    "Amharic": "am",
    "Nigerian Pidgin": "pcm",
}

# Derived from LANGUAGE_CODES so the two can never drift apart.
SUPPORTED_LANGUAGES = list(LANGUAGE_CODES)


# Load the tokenizer and the sequence-classification model once at import
# time; every request handler below reuses these two module-level objects.
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()  # inference only: put the module in evaluation mode
print("✅ Model loaded successfully!")


# Build the explainers only for the back-ends that imported successfully.
if LIME_AVAILABLE:
    lime_explainer = LimeTextExplainer(class_names=["Human", "AI"])

if SHAP_AVAILABLE:

    def model_predict_proba(texts):
        """Return class probabilities for a batch of texts.

        Args:
            texts: A single string or an iterable of strings.  SHAP may hand
                over a NumPy array of strings, which the tokenizer does not
                accept directly, so the input is normalised to ``list[str]``.

        Returns:
            A NumPy array with one row per text and one column per class
            (softmax over the model logits).
        """
        if isinstance(texts, str):
            texts = [texts]
        batch = [str(item) for item in texts]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True,
        )
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        return probs.cpu().numpy()

    # Passing the tokenizer as the second argument lets SHAP mask the input
    # token by token.
    shap_explainer = shap.Explainer(model_predict_proba, tokenizer)


class BiasMetrics:
    """Calculate fairness and bias metrics.

    All methods are static and accept any array-like input (lists, NumPy
    arrays, pandas ``.values``); inputs are converted with ``np.asarray``.
    Labels and predictions are compared against the literals ``1`` (positive)
    and ``0`` (negative).  Every method returns a plain Python ``float``.
    """

    @staticmethod
    def _as_arrays(*columns):
        """Convert every argument to a NumPy array so boolean masks work."""
        return tuple(np.asarray(column) for column in columns)

    @staticmethod
    def calculate_eod(y_true, y_pred, groups):
        """Equal Opportunity Difference.

        Computes recall separately for every group that has at least one
        positive sample and returns ``max(recall) - min(recall)``.

        Returns:
            The recall gap, or ``0.0`` when fewer than two groups have
            positive samples.
        """
        y_true, y_pred, groups = BiasMetrics._as_arrays(y_true, y_pred, groups)

        recalls = []
        for group in np.unique(groups):
            mask = groups == group
            is_positive = y_true[mask] == 1
            if not is_positive.any():
                # Recall is undefined without positives; skip the group.
                continue
            group_pred = y_pred[mask]
            tp = int(np.sum(is_positive & (group_pred == 1)))
            fn = int(np.sum(is_positive & (group_pred == 0)))
            recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 0.0)

        return float(max(recalls) - min(recalls)) if len(recalls) > 1 else 0.0

    @staticmethod
    def calculate_aaod(y_true, y_pred, groups):
        """Average Absolute Odds Difference.

        For every unordered pair of groups the absolute TPR difference and
        the absolute FPR difference are collected; the result is
        ``(mean(|dTPR|) + mean(|dFPR|)) / 2``.

        * A pair contributes a TPR difference only when both groups have at
          least one positive sample.
        * A group without negative samples is treated as having FPR 0.
        * When no pair yields a TPR difference the result is ``0.0``.

        NOTE(review): the FPR term is computed for every pair, including
        pairs skipped for TPR.  The original indentation was lost, so confirm
        this matches the intended definition.
        """
        y_true, y_pred, groups = BiasMetrics._as_arrays(y_true, y_pred, groups)

        # Compute each group's (TPR, FPR) once instead of once per pair.
        rates = []
        for group in np.unique(groups):
            mask = groups == group
            truth, pred = y_true[mask], y_pred[mask]

            n_pos = int(np.sum(truth == 1))
            tpr = float(np.sum((truth == 1) & (pred == 1)) / n_pos) if n_pos > 0 else None

            tn = int(np.sum((truth == 0) & (pred == 0)))
            fp = int(np.sum((truth == 0) & (pred == 1)))
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

            rates.append((tpr, fpr))

        tpr_diffs = []
        fpr_diffs = []
        for i, (tpr_a, fpr_a) in enumerate(rates):
            for tpr_b, fpr_b in rates[i + 1:]:
                if tpr_a is not None and tpr_b is not None:
                    tpr_diffs.append(abs(tpr_a - tpr_b))
                fpr_diffs.append(abs(fpr_a - fpr_b))

        if not tpr_diffs:
            return 0.0
        return float((np.mean(tpr_diffs) + np.mean(fpr_diffs)) / 2)

    @staticmethod
    def demographic_parity(y_pred, groups):
        """Demographic Parity Difference.

        Returns the gap between the highest and the lowest per-group rate of
        positive (``== 1``) predictions, or ``0.0`` with fewer than two groups.
        """
        y_pred, groups = BiasMetrics._as_arrays(y_pred, groups)

        positive_rates = [
            float(np.mean(y_pred[groups == group] == 1))
            for group in np.unique(groups)
        ]
        if len(positive_rates) < 2:
            return 0.0
        return float(max(positive_rates) - min(positive_rates))


def get_shap_explanation(text, language="English"):
    """Generate SHAP-based explanation.

    Args:
        text: The text to explain.
        language: Language name, used only in the heading of the explanation.

    Returns:
        A ``(markdown, viz)`` tuple.  ``viz`` is ``None`` when SHAP is not
        installed or the explanation fails; otherwise it is a
        ``(figure, attribution_data)`` tuple where ``figure`` is a Matplotlib
        bar chart of the per-token attributions and ``attribution_data`` is a
        dict with the keys ``"Token"`` and ``"Attribution"``.
    """
    if not SHAP_AVAILABLE:
        return "⚠️ SHAP is not installed. Install with: pip install shap", None

    fig = None
    try:
        shap_values = shap_explainer([text])

        # Column 1 of the values is the "AI" class (same index the classifier
        # uses for AI-generated text).
        class_values = np.asarray(shap_values.values[0])[:, 1]

        # Take the tokens from the explanation itself so that token i really
        # belongs to value i; re-tokenising the text separately can be offset
        # by special tokens.  Blank entries are dropped and the list is capped
        # at 20 tokens.
        pairs = [
            (str(token).strip(), float(value))
            for token, value in zip(shap_values.data[0], class_values)
            if str(token).strip()
        ][:20]
        tokens = [token for token, _ in pairs]
        values = np.array([value for _, value in pairs], dtype=float)

        # ``shap.plots.text`` does not draw on Matplotlib axes, so the figure
        # is drawn explicitly as a horizontal bar chart.
        fig, ax = plt.subplots(figsize=(12, 6))
        positions = np.arange(len(tokens))
        colors = ["tab:red" if value > 0 else "tab:blue" for value in values]
        ax.barh(positions, values, color=colors)
        ax.set_yticks(positions)
        ax.set_yticklabels(tokens)
        ax.invert_yaxis()  # first token at the top
        ax.axvline(0, color="black", linewidth=0.8)
        ax.set_xlabel("SHAP value (positive → AI, negative → Human)")
        ax.set_title(f"SHAP token attributions ({language})")
        fig.tight_layout()

        attribution_data = {
            "Token": tokens,
            "Attribution": values.tolist(),
        }

        lines = [
            f"## SHAP Explanation for {language}\n",
            "Tokens with **positive values** push toward AI-generated classification.",
            "Tokens with **negative values** push toward Human-written classification.\n",
            "Top 5 most influential tokens:",
        ]
        # Indices of the (up to) five largest absolute attributions, largest first.
        top_indices = np.argsort(np.abs(values))[-5:][::-1]
        for idx in top_indices:
            value = values[idx]
            direction = "→ AI" if value > 0 else "→ Human"
            lines.append(f"- **{tokens[idx]}**: {value:.4f} {direction}")
        explanation = "\n".join(lines) + "\n"

        return explanation, (fig, attribution_data)

    except Exception as e:
        # Do not leave a half-built figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
        return f"❌ SHAP explanation failed: {str(e)}", None


def get_lime_explanation(text, language="English"):
    """Generate LIME-based explanation.

    Args:
        text: The text to explain.
        language: Language name, used only in the heading of the explanation.

    Returns:
        A ``(markdown, figure)`` tuple.  ``figure`` is the Matplotlib figure
        produced by LIME, or ``None`` when LIME is not installed or the
        explanation fails.
    """
    if not LIME_AVAILABLE:
        return "⚠️ LIME is not installed. Install with: pip install lime", None

    def predict_fn(texts):
        """Map a batch of perturbed texts to class probabilities."""
        inputs = tokenizer(
            [str(item) for item in texts],
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True,
        )
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        return probs.cpu().numpy()

    fig = None
    try:
        exp = lime_explainer.explain_instance(
            text,
            predict_fn,
            num_features=10,
            num_samples=100,
        )

        fig = exp.as_pyplot_figure()
        fig.tight_layout()

        weights = exp.as_list()

        lines = [
            f"## LIME Explanation for {language}\n",
            "Features with **positive weights** indicate AI-generated characteristics.",
            "Features with **negative weights** indicate Human-written characteristics.\n",
            "Top contributing features:",
        ]
        for feature, weight in weights[:5]:
            direction = "→ AI" if weight > 0 else "→ Human"
            lines.append(f"- **{feature}**: {weight:.4f} {direction}")
        explanation = "\n".join(lines) + "\n"

        return explanation, fig

    except Exception as e:
        # Do not leave a half-built figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
        return f"❌ LIME explanation failed: {str(e)}", None


def _first_figure(viz):
    """Return the first Matplotlib-figure-like object found in ``viz``.

    The explanation helpers may return a bare figure, ``None``, or a tuple
    such as ``(figure, attribution_data)``; a plot component needs a single
    figure, so tuples/lists are searched recursively.
    """
    if viz is None:
        return None
    if isinstance(viz, (tuple, list)):
        for item in viz:
            found = _first_figure(item)
            if found is not None:
                return found
        return None
    return viz if hasattr(viz, "savefig") else None


def classify_with_explanation(text, language, explainer_type="SHAP"):
    """Classify text and provide explanation.

    Args:
        text: The text to classify.
        language: Language name; echoed in the result and the explanation.
        explainer_type: ``"SHAP"``, ``"LIME"`` or ``"Both"``.

    Returns:
        A 4-tuple ``(result_markdown, probability_frame, explanation_markdown,
        explanation_figure)``.  For empty input the last three items are
        ``None``.  ``probability_frame`` is a DataFrame with the columns
        ``"Class"`` and ``"Probability"``.  ``explanation_figure`` is a single
        figure or ``None``; with ``"Both"`` the SHAP figure is preferred and
        the LIME figure is used only when SHAP produced none.
    """
    if not text or not text.strip():
        return "⚠️ Please enter text to classify", None, None, None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    # NOTE(review): the two emoji below were mis-encoded in the source and are
    # restored as best guesses.
    labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}

    if confidence > 0.9:
        confidence_note = "✅ **High confidence** - Very certain about this prediction\n"
    elif confidence > 0.7:
        confidence_note = "⚠️ **Moderate confidence** - Fairly certain with some uncertainty\n"
    else:
        confidence_note = "❓ **Low confidence** - Uncertain, mixed characteristics detected\n"

    result = (
        "## Classification Result\n\n"
        f"**Prediction:** {labels[predicted_class]}\n"
        f"**Confidence:** {confidence:.2%}\n"
        f"**Language:** {language}\n\n"
        f"{confidence_note}"
    )

    # The bar plot component works on tabular data, so hand it a DataFrame.
    prob_chart = pd.DataFrame(
        {
            "Class": ["Human-written", "AI-generated"],
            "Probability": [float(probabilities[0][0]), float(probabilities[0][1])],
        }
    )

    explanation_text = None
    explanation_viz = None

    if explainer_type == "SHAP" and SHAP_AVAILABLE:
        explanation_text, explanation_viz = get_shap_explanation(text, language)
    elif explainer_type == "LIME" and LIME_AVAILABLE:
        explanation_text, explanation_viz = get_lime_explanation(text, language)
    elif explainer_type == "Both":
        # Each helper reports its own "not installed" message, so no
        # availability check is needed here.
        shap_text, shap_viz = get_shap_explanation(text, language)
        lime_text, lime_viz = get_lime_explanation(text, language)
        explanation_text = shap_text + "\n\n---\n\n" + lime_text
        explanation_viz = (shap_viz, lime_viz)
    else:
        explanation_text = "⚠️ Selected explainer not available"

    # Collapse whatever the helpers returned into one figure (or None).
    return result, prob_chart, explanation_text, _first_figure(explanation_viz)


def _predict_labels(texts, batch_size=16):
    """Return the arg-max class id for every text, running the model in mini-batches."""
    predictions = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True,
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
    return predictions


def _per_language_metrics(y_true, y_pred, groups):
    """Return ``{language: {accuracy, precision, recall, f1, samples}}``."""
    metrics = {}
    for lang in np.unique(groups):
        mask = groups == lang
        truth, pred = y_true[mask], y_pred[mask]

        tp = int(np.sum((truth == 1) & (pred == 1)))
        predicted_pos = int(np.sum(pred == 1))
        actual_pos = int(np.sum(truth == 1))

        # Undefined ratios (no predicted / no actual positives) are reported as 0.
        precision = tp / predicted_pos if predicted_pos > 0 else 0.0
        recall = tp / actual_pos if actual_pos > 0 else 0.0
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom > 0 else 0.0

        metrics[str(lang)] = {
            'accuracy': float(np.mean(truth == pred)),
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'samples': int(np.sum(mask)),
        }
    return metrics


def _format_audit_report(total, languages, eod, aaod, dpd, lang_metrics):
    """Render the fairness metrics and per-language table as Markdown."""

    def verdict(value):
        # Values below 0.1 are labelled fair.
        return '✅ Fair' if value < 0.1 else '⚠️ Bias detected'

    lines = [
        "# Bias Audit Report",
        "",
        f"**Total Samples:** {total}",
        f"**Languages:** {', '.join(languages)}",
        "",
        "## Fairness Metrics",
        "",
        "| Metric | Value | Interpretation |",
        "|--------|-------|----------------|",
        f"| EOD | {eod:.4f} | {verdict(eod)} |",
        f"| AAOD | {aaod:.4f} | {verdict(aaod)} |",
        f"| Demographic Parity | {dpd:.4f} | {verdict(dpd)} |",
        "",
        "## Per-Language Performance",
        "",
        "| Language | Accuracy | F1 Score | Precision | Recall | Samples |",
        "|----------|----------|----------|-----------|--------|----------|",
    ]
    for lang, metrics in sorted(lang_metrics.items()):
        lines.append(
            f"| {lang} | {metrics['accuracy']:.4f} | {metrics['f1']:.4f} | "
            f"{metrics['precision']:.4f} | {metrics['recall']:.4f} | {metrics['samples']} |"
        )
    return "\n".join(lines) + "\n"


def audit_bias(uploaded_file):
    """Perform bias audit on uploaded dataset.

    Args:
        uploaded_file: ``None``, a path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name``.  The CSV must contain the
            columns ``text``, ``label`` and ``language``.

    Returns:
        Always a 2-tuple ``(report_markdown, figure)`` so that both UI outputs
        receive a value.  ``figure`` is the overall confusion matrix, or
        ``None`` whenever the audit could not be completed (no file, missing
        columns, empty file, or any error while processing).
    """
    if uploaded_file is None:
        return "⚠️ Please upload a CSV file with columns: text, label, language", None

    fig = None
    try:
        # Accept both a plain path and a file wrapper with a ``.name`` path.
        if isinstance(uploaded_file, (str, os.PathLike)):
            csv_path = uploaded_file
        else:
            csv_path = uploaded_file.name
        df = pd.read_csv(csv_path)

        required_cols = ['text', 'label', 'language']
        if not all(col in df.columns for col in required_cols):
            return f"❌ CSV must have columns: {required_cols}", None
        if df.empty:
            return "⚠️ The uploaded CSV contains no rows", None

        df['prediction'] = _predict_labels([str(text) for text in df['text']])

        # Labels must be numeric 0/1; anything else raises and is reported
        # through the error path instead of silently producing wrong metrics.
        y_true = pd.to_numeric(df['label']).astype(int).to_numpy()
        y_pred = df['prediction'].to_numpy()
        groups = df['language'].astype(str).to_numpy()

        eod = BiasMetrics.calculate_eod(y_true, y_pred, groups)
        aaod = BiasMetrics.calculate_aaod(y_true, y_pred, groups)
        dpd = BiasMetrics.demographic_parity(y_pred, groups)

        report = _format_audit_report(
            total=len(df),
            languages=list(dict.fromkeys(groups)),  # order of first appearance
            eod=eod,
            aaod=aaod,
            dpd=dpd,
            lang_metrics=_per_language_metrics(y_true, y_pred, groups),
        )

        fig, ax = plt.subplots(figsize=(8, 6))
        # Fix the label set so the matrix is always 2x2 and matches the two
        # tick labels, even when only one class occurs in the data.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title('Overall Confusion Matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_xticklabels(['Human', 'AI'])
        ax.set_yticklabels(['Human', 'AI'])
        fig.tight_layout()

        return report, fig

    except Exception as e:
        # Do not leave a half-built figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
        return f"❌ Error during bias audit: {str(e)}", None


# Styles for the page title (the element with id "title"): centred text with
# a gradient fill.  The unprefixed ``background-clip`` is included next to
# the ``-webkit-`` one.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
}
"""

# Markdown shown in the UI.  The texts are kept flush-left at module level so
# that no line picks up leading indentation.
# NOTE(review): the emoji in the UI labels were mis-encoded in the source and
# are restored here as best guesses; adjust if the originals are known.
_INTRO_MD = """
<div style='text-align: center; margin-bottom: 20px;'>
Detect AI-generated text in African languages with **explainable AI** and **fairness auditing**
</div>
"""

_AUDIT_HELP_MD = """
### Fairness and Bias Auditing

Upload a CSV file with columns: `text`, `label` (0=Human, 1=AI), `language`

The system will calculate:
- **EOD (Equal Opportunity Difference)**: Fairness in recall across languages
- **AAOD (Average Absolute Odds Difference)**: Disparity in TPR and FPR
- **Demographic Parity**: Difference in positive prediction rates
"""

_ABOUT_MD = """
# About HATA System

## 🎯 Features

### Explainable AI
- **SHAP**: Game-theory based feature attribution
- **LIME**: Local interpretable model-agnostic explanations
- Visual token-level attributions

### Fairness Auditing
- Equal Opportunity Difference (EOD)
- Average Absolute Odds Difference (AAOD)
- Demographic Parity
- Per-language performance metrics

## 🌍 Supported Languages
Hausa, Yoruba, Igbo, Swahili, Amharic, Nigerian Pidgin

## 📊 Model Performance
- Accuracy: 100%
- F1 Score: 100%
- EOD: 0.0 (Perfect fairness)
- AAOD: 0.0 (No bias)

## 🔬 Technical Details
- Base Model: AfroXLMR-base
- Parameters: ~270M
- Max Sequence Length: 128 tokens

## 📚 Citation
```bibtex
@misc{msmaje2025hata,
  author = {Maje, M.S.},
  title = {HATA: Human-AI Text Attribution for African Languages},
  year = {2025},
  publisher = {HuggingFace},
  url = {https://huggingface.co/msmaje/phdhatamodel}
}
```
"""

_FOOTER_MD = """
---
<div style='text-align: center; color: #666;'>
Built with 💚 for African Language NLP | Powered by AfroXLMR & Explainable AI
</div>
"""

# Three-tab interface: single-text classification with explanations, a
# CSV-based bias audit, and a static "About" page.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    gr.Markdown("<h1 id='title'>🔍 HATA: Human vs AI Text Detector</h1>")
    gr.Markdown(_INTRO_MD)

    with gr.Tabs():

        with gr.Tab("📝 Text Classification"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text",
                        placeholder="Paste text here to classify...",
                        lines=8,
                    )
                    language_select = gr.Dropdown(
                        choices=SUPPORTED_LANGUAGES,
                        value="Hausa",
                        label="Select Language",
                    )
                    explainer_select = gr.Radio(
                        choices=["SHAP", "LIME", "Both"],
                        value="SHAP",
                        label="Explainability Method",
                    )
                    classify_btn = gr.Button("🔍 Classify & Explain", variant="primary")

                # Right column: prediction and class probabilities.
                with gr.Column():
                    result_output = gr.Markdown(label="Classification Result")
                    prob_chart = gr.BarPlot(
                        x="Class",
                        y="Probability",
                        title="Prediction Probabilities",
                        y_lim=[0, 1],
                    )

            with gr.Row():
                explanation_output = gr.Markdown(label="Explanation")
                explanation_viz = gr.Plot(label="Visual Explanation")

            # The handler returns four values, one per output component.
            classify_btn.click(
                fn=classify_with_explanation,
                inputs=[text_input, language_select, explainer_select],
                outputs=[result_output, prob_chart, explanation_output, explanation_viz],
            )

        with gr.Tab("⚖️ Bias Audit"):
            gr.Markdown(_AUDIT_HELP_MD)

            with gr.Row():
                with gr.Column():
                    audit_file = gr.File(label="Upload CSV Dataset", file_types=[".csv"])
                    audit_btn = gr.Button("📊 Run Bias Audit", variant="primary")

                with gr.Column():
                    audit_report = gr.Markdown(label="Audit Report")
                    audit_viz = gr.Plot(label="Confusion Matrix")

            audit_btn.click(
                fn=audit_bias,
                inputs=audit_file,
                outputs=[audit_report, audit_viz],
            )

        with gr.Tab("ℹ️ About"):
            gr.Markdown(_ABOUT_MD)

    gr.Markdown(_FOOTER_MD)


# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()