"""
Enhanced Gradio Space for the Human-AI Text Attribution (HATA) model,
with comprehensive bias detection and explainability (SHAP/LIME).

Supports multiple African languages with fairness auditing.
"""

import os
import sys
import types

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification
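
# Python 3.13 removed the stdlib `audioop` module (PEP 594), but pydub, which
# gradio can pull in for audio handling, still imports it (and its `pyaudioop`
# fallback) at startup; the stubs below let the Space boot on newer interpreters.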
os.environ["GRADIO_DISABLE_PYDUB"] = "1" |
|
|
if "audioop" not in sys.modules: |
|
|
sys.modules["audioop"] = types.ModuleType("audioop") |
|
|
if "pyaudioop" not in sys.modules: |
|
|
sys.modules["pyaudioop"] = types.ModuleType("pyaudioop") |
|
|
|
|
|
|
|
|

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("⚠️ SHAP not available. Install with: pip install shap")

try:
    from lime.lime_text import LimeTextExplainer
    LIME_AVAILABLE = True
except ImportError:
    LIME_AVAILABLE = False
    print("⚠️ LIME not available. Install with: pip install lime")
MODEL_NAME = "msmaje/phdhatamodel" |
|
|
SUPPORTED_LANGUAGES = ["Hausa", "Yoruba", "Igbo", "Swahili", "Amharic", "Nigerian Pidgin"] |
|
|
LANGUAGE_CODES = { |
|
|
"Hausa": "ha", |
|
|
"Yoruba": "yo", |
|
|
"Igbo": "ig", |
|
|
"Swahili": "sw", |
|
|
"Amharic": "am", |
|
|
"Nigerian Pidgin": "pcm" |
|
|
} |
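
# Note: `pcm` is the ISO 639-3 code for Nigerian Pidgin; the others are ISO 639-1.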


print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()
print("✅ Model loaded successfully!")


if LIME_AVAILABLE:
    lime_explainer = LimeTextExplainer(class_names=["Human", "AI"])

if SHAP_AVAILABLE:
    def model_predict_proba(texts):
        inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=128, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        return probs.numpy()

    shap_explainer = shap.Explainer(model_predict_proba, tokenizer)
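    # Passing the HF tokenizer as the masker makes shap build a Text masker,
    # so attributions are computed over token-level spans of the input string.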


class BiasMetrics:
    """Group-fairness metrics computed across language groups."""

    @staticmethod
    def calculate_eod(y_true, y_pred, groups):
        """Equal Opportunity Difference: max-min recall (TPR on the AI class) across groups."""
        unique_groups = np.unique(groups)
        recalls = []
        for group in unique_groups:
            mask = groups == group
            if np.sum(y_true[mask] == 1) > 0:
                tp = np.sum((y_true[mask] == 1) & (y_pred[mask] == 1))
                fn = np.sum((y_true[mask] == 1) & (y_pred[mask] == 0))
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0
                recalls.append(recall)
        return max(recalls) - min(recalls) if len(recalls) > 1 else 0.0
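
    # Worked example with hypothetical numbers: if recall on AI-written text is
    # 0.92 for Hausa, 0.85 for Yoruba, and 0.78 for Swahili, then
    # EOD = 0.92 - 0.78 = 0.14, above the 0.1 fairness threshold used in the audit.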

    @staticmethod
    def calculate_aaod(y_true, y_pred, groups):
        """Average Absolute Odds Difference: mean absolute gap in TPR and FPR over all group pairs."""
        unique_groups = np.unique(groups)
        tpr_diffs = []
        fpr_diffs = []
        for i, g1 in enumerate(unique_groups):
            for g2 in unique_groups[i + 1:]:
                m1 = groups == g1
                m2 = groups == g2
                if np.sum(y_true[m1] == 1) > 0 and np.sum(y_true[m2] == 1) > 0:
                    tpr1 = np.sum((y_true[m1] == 1) & (y_pred[m1] == 1)) / np.sum(y_true[m1] == 1)
                    tpr2 = np.sum((y_true[m2] == 1) & (y_pred[m2] == 1)) / np.sum(y_true[m2] == 1)
                    tpr_diffs.append(abs(tpr1 - tpr2))
                    tn1 = np.sum((y_true[m1] == 0) & (y_pred[m1] == 0))
                    fp1 = np.sum((y_true[m1] == 0) & (y_pred[m1] == 1))
                    tn2 = np.sum((y_true[m2] == 0) & (y_pred[m2] == 0))
                    fp2 = np.sum((y_true[m2] == 0) & (y_pred[m2] == 1))
                    fpr1 = fp1 / (fp1 + tn1) if (fp1 + tn1) > 0 else 0
                    fpr2 = fp2 / (fp2 + tn2) if (fp2 + tn2) > 0 else 0
                    fpr_diffs.append(abs(fpr1 - fpr2))
        return (np.mean(tpr_diffs) + np.mean(fpr_diffs)) / 2 if tpr_diffs else 0.0
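
    # Worked example with hypothetical numbers: for one pair of groups A and B with
    # TPR_A = 0.90, TPR_B = 0.80 and FPR_A = 0.20, FPR_B = 0.10,
    # AAOD = (|0.90 - 0.80| + |0.20 - 0.10|) / 2 = 0.10.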

    @staticmethod
    def demographic_parity(y_pred, groups):
        """Demographic parity difference: max-min positive (AI) prediction rate across groups."""
        unique_groups = np.unique(groups)
        positive_rates = []
        for group in unique_groups:
            mask = groups == group
            positive_rates.append(np.mean(y_pred[mask] == 1))
        return max(positive_rates) - min(positive_rates) if len(positive_rates) > 1 else 0.0
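
    # Worked example with hypothetical numbers: if 60% of Hausa samples but only
    # 45% of Amharic samples are predicted AI, the demographic parity difference
    # is 0.60 - 0.45 = 0.15.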


def get_shap_explanation(text, language="English"):
    """Explain one prediction with SHAP; returns (markdown summary, matplotlib figure)."""
    if not SHAP_AVAILABLE:
        return "⚠️ SHAP not installed", None
    try:
        shap_values = shap_explainer([text])
        tokens = tokenizer.tokenize(text)[:20]
        token_values = np.array(shap_values.values[0])
        n = min(len(tokens), token_values.shape[0])
        tokens, values = tokens[:n], token_values[:n, 1]
        # shap.plots.text renders HTML rather than matplotlib, so draw the
        # per-token attributions as a bar chart that gr.Plot can display.
        fig, ax = plt.subplots(figsize=(12, 6))
        colors = ["crimson" if v > 0 else "steelblue" for v in values]
        ax.barh(range(len(tokens)), values, color=colors)
        ax.set_yticks(range(len(tokens)))
        ax.set_yticklabels(tokens)
        ax.invert_yaxis()
        ax.set_xlabel("SHAP value (positive → AI)")
        ax.set_title(f"Token attributions ({language})")
        plt.tight_layout()
        explanation = f"## SHAP Explanation for {language}\n\n"
        explanation += "Top 5 tokens influencing the AI/Human prediction:\n"
        top_indices = np.argsort(np.abs(values))[-5:][::-1]
        for idx in top_indices:
            direction = "→ AI" if values[idx] > 0 else "→ Human"
            explanation += f"- **{tokens[idx]}**: {values[idx]:.4f} {direction}\n"
        return explanation, fig
    except Exception as e:
        return f"❌ SHAP explanation failed: {str(e)}", None


def get_lime_explanation(text, language="English"):
    """Explain one prediction with LIME; returns (markdown summary, matplotlib figure)."""
    if not LIME_AVAILABLE:
        return "⚠️ LIME not installed", None
    try:
        def predict_fn(texts):
            inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=128, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            return probs.numpy()

        exp = lime_explainer.explain_instance(text, predict_fn, num_features=10, num_samples=100)
        fig = exp.as_pyplot_figure()
        plt.tight_layout()
        weights = exp.as_list()
        explanation = f"## LIME Explanation for {language}\n\nTop contributing features:\n"
        for feature, weight in weights[:5]:
            direction = "→ AI" if weight > 0 else "→ Human"
            explanation += f"- **{feature}**: {weight:.4f} {direction}\n"
        return explanation, fig
    except Exception as e:
        return f"❌ LIME explanation failed: {str(e)}", None


def classify_with_explanation(text, language, explainer_type="SHAP"):
    """Classify text as human- or AI-written and attach the chosen explanation."""
    if not text or len(text.strip()) == 0:
        return "⚠️ Please enter text", None, None, None
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_class = torch.argmax(probs, dim=-1).item()
    confidence = probs[0][pred_class].item()
    labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}
    result = (
        f"## Classification Result\n\n"
        f"**Prediction:** {labels[pred_class]}  \n"
        f"**Confidence:** {confidence:.2%}  \n"
        f"**Language:** {language}\n\n"
    )
    if confidence > 0.9:
        result += "✅ High confidence\n"
    elif confidence > 0.7:
        result += "⚠️ Moderate confidence\n"
    else:
        result += "❌ Low confidence\n"
    # gr.BarPlot expects a DataFrame, so build one rather than a plain dict
    prob_chart = pd.DataFrame({
        "Class": ["Human-written", "AI-generated"],
        "Probability": [float(probs[0][0]), float(probs[0][1])],
    })
    explanation_text, explanation_viz = None, None
    if explainer_type == "SHAP" and SHAP_AVAILABLE:
        explanation_text, explanation_viz = get_shap_explanation(text, language)
    elif explainer_type == "LIME" and LIME_AVAILABLE:
        explanation_text, explanation_viz = get_lime_explanation(text, language)
    elif explainer_type == "Both":
        shap_text, shap_viz = get_shap_explanation(text, language)
        lime_text, lime_viz = get_lime_explanation(text, language)
        explanation_text = shap_text + "\n\n---\n\n" + lime_text
        # gr.Plot renders a single figure, so prefer the SHAP figure when both exist
        explanation_viz = shap_viz if shap_viz is not None else lime_viz
    else:
        explanation_text = "⚠️ Selected explainer not available"
    return result, prob_chart, explanation_text, explanation_viz
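
# Usage sketch (hypothetical input):
#   result, chart, expl_text, fig = classify_with_explanation("Bawo ni", "Yoruba", "LIME")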


def audit_bias(uploaded_file):
    """Run the model over an uploaded CSV and report fairness metrics per language."""
    if uploaded_file is None:
        return "⚠️ Please upload a CSV file", None
    try:
        # gr.File may yield a path string or a tempfile-like object depending on version
        path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        df = pd.read_csv(path)
        required_cols = ['text', 'label', 'language']
        if not all(col in df.columns for col in required_cols):
            return f"❌ CSV must have columns: {required_cols}", None
        preds = []
        for text in df['text']:
            inputs = tokenizer(str(text), return_tensors="pt", truncation=True, max_length=128)
            with torch.no_grad():
                outputs = model(**inputs)
            preds.append(torch.argmax(outputs.logits, dim=-1).item())
        df['prediction'] = preds
        y_true = df['label'].values
        y_pred = df['prediction'].values
        groups = df['language'].values
        eod = BiasMetrics.calculate_eod(y_true, y_pred, groups)
        aaod = BiasMetrics.calculate_aaod(y_true, y_pred, groups)
        dpd = BiasMetrics.demographic_parity(y_pred, groups)
        lang_metrics = {}
        for lang in df['language'].unique():
            mask = (df['language'] == lang).values
            lang_true = y_true[mask]
            lang_pred = y_pred[mask]
            accuracy = np.mean(lang_true == lang_pred)
            precision = np.sum((lang_true == 1) & (lang_pred == 1)) / np.sum(lang_pred == 1) if np.sum(lang_pred == 1) > 0 else 0
            recall = np.sum((lang_true == 1) & (lang_pred == 1)) / np.sum(lang_true == 1) if np.sum(lang_true == 1) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            lang_metrics[lang] = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'samples': int(np.sum(mask))}
        report = f"# Bias Audit Report\nTotal Samples: {len(df)}\nLanguages: {', '.join(df['language'].unique())}\n\n"
        report += "## Fairness Metrics\n| Metric | Value | Interpretation |\n|--------|-------|----------------|\n"
        report += f"| EOD | {eod:.4f} | {'✅ Fair' if eod < 0.1 else '⚠️ Bias detected'} |\n"
        report += f"| AAOD | {aaod:.4f} | {'✅ Fair' if aaod < 0.1 else '⚠️ Bias detected'} |\n"
        report += f"| Demographic Parity | {dpd:.4f} | {'✅ Fair' if dpd < 0.1 else '⚠️ Bias detected'} |\n\n"
        report += "## Per-Language Performance\n| Language | Accuracy | F1 Score | Precision | Recall | Samples |\n|----------|----------|----------|-----------|--------|---------|\n"
        for lang, metrics in sorted(lang_metrics.items()):
            report += f"| {lang} | {metrics['accuracy']:.4f} | {metrics['f1']:.4f} | {metrics['precision']:.4f} | {metrics['recall']:.4f} | {metrics['samples']} |\n"
        fig, ax = plt.subplots(figsize=(8, 6))
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title('Overall Confusion Matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_xticklabels(['Human', 'AI'])
        ax.set_yticklabels(['Human', 'AI'])
        plt.tight_layout()
        return report, fig
    except Exception as e:
        return f"❌ Error during bias audit: {str(e)}", None


custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
}
"""


with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("<h1 id='title'>🌍 HATA: Human vs AI Text Detector</h1>")
    gr.Markdown("<div style='text-align: center; margin-bottom: 20px;'>Detect AI-generated text in African languages with explainable AI and fairness auditing</div>")

    with gr.Tabs():

        with gr.Tab("🔍 Text Classification"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Enter Text", placeholder="Paste text here...", lines=8)
                    language_select = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="Hausa", label="Select Language")
                    explainer_select = gr.Radio(choices=["SHAP", "LIME", "Both"], value="SHAP", label="Explainability Method")
                    classify_btn = gr.Button("🔍 Classify & Explain", variant="primary")
                with gr.Column():
                    result_output = gr.Markdown(label="Classification Result")
                    prob_chart = gr.BarPlot(x="Class", y="Probability", title="Prediction Probabilities", y_lim=[0, 1])
            with gr.Row():
                explanation_output = gr.Markdown(label="Explanation")
                explanation_viz = gr.Plot(label="Visual Explanation")
            classify_btn.click(
                fn=classify_with_explanation,
                inputs=[text_input, language_select, explainer_select],
                outputs=[result_output, prob_chart, explanation_output, explanation_viz],
            )

        with gr.Tab("⚖️ Bias Audit"):
            gr.Markdown("Upload a CSV with columns: `text`, `label` (0 = Human, 1 = AI), `language`")
            with gr.Row():
                with gr.Column():
                    audit_file = gr.File(label="Upload CSV", file_types=[".csv"])
                    audit_btn = gr.Button("📊 Run Bias Audit", variant="primary")
                with gr.Column():
                    audit_report = gr.Markdown(label="Audit Report")
                    audit_viz = gr.Plot(label="Confusion Matrix")
            audit_btn.click(fn=audit_bias, inputs=audit_file, outputs=[audit_report, audit_viz])

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            # About the HATA System
            - SHAP & LIME explainability
            - Bias auditing across languages
            - Supported languages: Hausa, Yoruba, Igbo, Swahili, Amharic, Nigerian Pidgin
            - Base model: AfroXLMR-base
            - Citation: [HuggingFace Model](https://huggingface.co/msmaje/phdhatamodel)
            """)

    gr.Markdown("<div style='text-align: center; color:#666;'>Built with 💙 for African Language NLP</div>")


if __name__ == "__main__":
    demo.queue(api_open=False)
    # share=True is ignored when hosted on Hugging Face Spaces; it only matters
    # for local runs, where it requests a public gradio.live link.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True, share=True)