| | """ |
| | Enhanced Gradio Space for Human-AI Text Attribution (HATA) Model |
| | With Comprehensive Bias Detection and Explainability (SHAP/LIME) |
| | Supports multiple African languages with fairness auditing |
| | """ |
| |
|
import os
import sys
import types

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
# Workaround: gradio pulls in pydub, which imports the stdlib audioop module
# that was removed in Python 3.13. Stub the module out so imports don't fail;
# this Space never processes audio.
os.environ["GRADIO_DISABLE_PYDUB"] = "1"
if "audioop" not in sys.modules:
    sys.modules["audioop"] = types.ModuleType("audioop")
if "pyaudioop" not in sys.modules:
    sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")
|
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("⚠️ SHAP not available. Install with: pip install shap")
|
try:
    from lime.lime_text import LimeTextExplainer
    LIME_AVAILABLE = True
except ImportError:
    LIME_AVAILABLE = False
    print("⚠️ LIME not available. Install with: pip install lime")
|
# =============================================================================
# CONFIGURATION
# =============================================================================
MODEL_NAME = "msmaje/phdhatamodel"
SUPPORTED_LANGUAGES = ["Hausa", "Yoruba", "Igbo", "Swahili", "Amharic", "Nigerian Pidgin"]
LANGUAGE_CODES = {
    "Hausa": "ha",
    "Yoruba": "yo",
    "Igbo": "ig",
    "Swahili": "sw",
    "Amharic": "am",
    "Nigerian Pidgin": "pcm",
}
|
# =============================================================================
# MODEL LOADING
# =============================================================================
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()
print("✅ Model loaded successfully!")
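
# Optional sketch (not enabled here): move the model to a GPU when available.
# The helpers below call .numpy() on CPU tensors, so enabling this would also
# require moving outputs back with .cpu() first.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)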

# Initialize explainers (only if the optional dependencies are installed)
if LIME_AVAILABLE:
    lime_explainer = LimeTextExplainer(class_names=["Human", "AI"])
|
if SHAP_AVAILABLE:
    def model_predict_proba(texts):
        """Return class probabilities for a batch of raw strings."""
        # SHAP may pass a numpy array of strings; the tokenizer wants a list
        inputs = tokenizer(list(texts), return_tensors="pt", truncation=True,
                           max_length=128, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        return probs.numpy()

    # The tokenizer doubles as SHAP's text masker
    shap_explainer = shap.Explainer(model_predict_proba, tokenizer)
|
# =============================================================================
# BIAS / FAIRNESS METRICS
# =============================================================================
class BiasMetrics:
    """Calculate fairness and bias metrics across language groups."""

    @staticmethod
    def calculate_eod(y_true, y_pred, groups):
        """Equal Opportunity Difference: max recall gap between groups."""
        unique_groups = np.unique(groups)
        recalls = []

        for group in unique_groups:
            mask = groups == group
            if np.sum(y_true[mask] == 1) > 0:
                tp = np.sum((y_true[mask] == 1) & (y_pred[mask] == 1))
                fn = np.sum((y_true[mask] == 1) & (y_pred[mask] == 0))
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0
                recalls.append(recall)

        return max(recalls) - min(recalls) if len(recalls) > 1 else 0.0

    @staticmethod
    def calculate_aaod(y_true, y_pred, groups):
        """Average Absolute Odds Difference: mean of pairwise TPR and FPR gaps."""
        unique_groups = np.unique(groups)
        tpr_diffs = []
        fpr_diffs = []

        for i, g1 in enumerate(unique_groups):
            for g2 in unique_groups[i + 1:]:
                m1 = groups == g1
                m2 = groups == g2

                # TPR gap (only defined when both groups contain positives)
                if np.sum(y_true[m1] == 1) > 0 and np.sum(y_true[m2] == 1) > 0:
                    tpr1 = np.sum((y_true[m1] == 1) & (y_pred[m1] == 1)) / np.sum(y_true[m1] == 1)
                    tpr2 = np.sum((y_true[m2] == 1) & (y_pred[m2] == 1)) / np.sum(y_true[m2] == 1)
                    tpr_diffs.append(abs(tpr1 - tpr2))

                # FPR gap
                tn1 = np.sum((y_true[m1] == 0) & (y_pred[m1] == 0))
                fp1 = np.sum((y_true[m1] == 0) & (y_pred[m1] == 1))
                tn2 = np.sum((y_true[m2] == 0) & (y_pred[m2] == 0))
                fp2 = np.sum((y_true[m2] == 0) & (y_pred[m2] == 1))

                fpr1 = fp1 / (fp1 + tn1) if (fp1 + tn1) > 0 else 0
                fpr2 = fp2 / (fp2 + tn2) if (fp2 + tn2) > 0 else 0
                fpr_diffs.append(abs(fpr1 - fpr2))

        if not tpr_diffs or not fpr_diffs:
            return 0.0
        return (np.mean(tpr_diffs) + np.mean(fpr_diffs)) / 2

    @staticmethod
    def demographic_parity(y_pred, groups):
        """Demographic Parity Difference: max gap in positive prediction rates."""
        unique_groups = np.unique(groups)
        positive_rates = []

        for group in unique_groups:
            mask = groups == group
            positive_rates.append(np.mean(y_pred[mask] == 1))

        return max(positive_rates) - min(positive_rates) if len(positive_rates) > 1 else 0.0
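# Worked example (synthetic labels, illustrative only):
#   y_true = np.array([1, 1, 0, 1, 0, 1])
#   y_pred = np.array([1, 0, 0, 1, 1, 1])
#   groups = np.array(["Hausa"] * 3 + ["Yoruba"] * 3)
# Hausa recall on the AI class is 1/2 = 0.5 and Yoruba recall is 2/2 = 1.0,
# so BiasMetrics.calculate_eod(y_true, y_pred, groups) returns 0.5.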
|
# =============================================================================
# EXPLAINABILITY
# =============================================================================
def get_shap_explanation(text, language="English"):
    """Generate SHAP-based explanation"""
    if not SHAP_AVAILABLE:
        return "⚠️ SHAP is not installed. Install with: pip install shap", None

    try:
        shap_values = shap_explainer([text])

        # Token-level attributions toward the "AI" class (column 1).
        # Note: SHAP uses its own masker tokenization, so alignment with
        # tokenizer.tokenize() is approximate.
        tokens = tokenizer.tokenize(text)[:20]
        values = shap_values.values[0][:len(tokens), 1]
        n = min(len(tokens), len(values))
        tokens, values = tokens[:n], values[:n]

        # shap.plots.text() renders HTML rather than drawing on a matplotlib
        # figure, so build a bar chart of token attributions for gr.Plot instead.
        fig, ax = plt.subplots(figsize=(12, 6))
        colors = ["#d62728" if v > 0 else "#1f77b4" for v in values]
        ax.barh(range(n), values, color=colors)
        ax.set_yticks(range(n))
        ax.set_yticklabels(tokens)
        ax.invert_yaxis()
        ax.set_xlabel("SHAP attribution (positive = AI)")
        ax.set_title(f"Token-level SHAP attributions ({language})")
        plt.tight_layout()

        explanation = f"## SHAP Explanation for {language}\n\n"
        explanation += "Tokens with **positive values** push toward AI-generated classification.\n"
        explanation += "Tokens with **negative values** push toward Human-written classification.\n\n"
        explanation += "Top 5 most influential tokens:\n"

        top_indices = np.argsort(np.abs(values))[-5:][::-1]
        for idx in top_indices:
            token = tokens[idx]
            value = values[idx]
            direction = "→ AI" if value > 0 else "→ Human"
            explanation += f"- **{token}**: {value:.4f} {direction}\n"

        return explanation, fig

    except Exception as e:
        return f"❌ SHAP explanation failed: {str(e)}", None
|
def get_lime_explanation(text, language="English"):
    """Generate LIME-based explanation"""
    if not LIME_AVAILABLE:
        return "⚠️ LIME is not installed. Install with: pip install lime", None

    try:
        def predict_fn(texts):
            inputs = tokenizer(list(texts), return_tensors="pt", truncation=True,
                               max_length=128, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            return probs.numpy()

        # num_samples=100 keeps the demo responsive; larger values give more
        # stable weights at the cost of latency.
        exp = lime_explainer.explain_instance(
            text,
            predict_fn,
            num_features=10,
            num_samples=100
        )

        fig = exp.as_pyplot_figure()
        plt.tight_layout()

        weights = exp.as_list()

        explanation = f"## LIME Explanation for {language}\n\n"
        explanation += "Features with **positive weights** indicate AI-generated characteristics.\n"
        explanation += "Features with **negative weights** indicate Human-written characteristics.\n\n"
        explanation += "Top contributing features:\n"

        for feature, weight in weights[:5]:
            direction = "→ AI" if weight > 0 else "→ Human"
            explanation += f"- **{feature}**: {weight:.4f} {direction}\n"

        return explanation, fig

    except Exception as e:
        return f"❌ LIME explanation failed: {str(e)}", None
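# Example call (sketch; the sentence below is a made-up placeholder):
#   text_expl, fig = get_lime_explanation("Su na zuwa gobe da safe.", "Hausa")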
|
# =============================================================================
# CLASSIFICATION
# =============================================================================
def classify_with_explanation(text, language, explainer_type="SHAP"):
    """Classify text and provide explanation"""

    if not text or len(text.strip()) == 0:
        return "⚠️ Please enter text to classify", None, None, None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}
    result = "## Classification Result\n\n"
    result += f"**Prediction:** {labels[predicted_class]}\n"
    result += f"**Confidence:** {confidence:.2%}\n"
    result += f"**Language:** {language}\n\n"

    if confidence > 0.9:
        result += "✅ **High confidence** - Very certain about this prediction\n"
    elif confidence > 0.7:
        result += "⚠️ **Moderate confidence** - Fairly certain with some uncertainty\n"
    else:
        result += "❓ **Low confidence** - Uncertain, mixed characteristics detected\n"

    # gr.BarPlot expects a DataFrame, not a plain dict
    prob_chart = pd.DataFrame({
        "Class": ["Human-written", "AI-generated"],
        "Probability": [float(probabilities[0][0]), float(probabilities[0][1])]
    })

    explanation_text = None
    explanation_viz = None

    if explainer_type == "SHAP" and SHAP_AVAILABLE:
        explanation_text, explanation_viz = get_shap_explanation(text, language)
    elif explainer_type == "LIME" and LIME_AVAILABLE:
        explanation_text, explanation_viz = get_lime_explanation(text, language)
    elif explainer_type == "Both":
        shap_text, shap_viz = get_shap_explanation(text, language)
        lime_text, lime_viz = get_lime_explanation(text, language)
        explanation_text = shap_text + "\n\n---\n\n" + lime_text
        # gr.Plot renders a single figure, so prefer SHAP and fall back to LIME
        explanation_viz = shap_viz or lime_viz
    else:
        explanation_text = "⚠️ Selected explainer not available"

    return result, prob_chart, explanation_text, explanation_viz
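# Quick smoke test (sketch, not run by the app; the input text is invented):
#   result_md, chart_df, expl_md, expl_fig = classify_with_explanation(
#       "Ina kwana, yaya aiki?", "Hausa", explainer_type="LIME")
#   print(result_md)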
|
# =============================================================================
# BIAS AUDIT
# =============================================================================
def audit_bias(uploaded_file):
    """Perform bias audit on uploaded dataset"""

    if uploaded_file is None:
        return "⚠️ Please upload a CSV file with columns: text, label, language", None

    try:
        # gr.File may hand over a filepath string or a tempfile-like object
        path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        df = pd.read_csv(path)

        required_cols = ['text', 'label', 'language']
        if not all(col in df.columns for col in required_cols):
            return f"❌ CSV must have columns: {required_cols}", None

        # Run the classifier over every row
        predictions = []
        for text in df['text']:
            inputs = tokenizer(str(text), return_tensors="pt", truncation=True, max_length=128)
            with torch.no_grad():
                outputs = model(**inputs)
                pred = torch.argmax(outputs.logits, dim=-1).item()
            predictions.append(pred)

        df['prediction'] = predictions

        y_true = df['label'].values
        y_pred = df['prediction'].values
        groups = df['language'].values

        eod = BiasMetrics.calculate_eod(y_true, y_pred, groups)
        aaod = BiasMetrics.calculate_aaod(y_true, y_pred, groups)
        dpd = BiasMetrics.demographic_parity(y_pred, groups)

        # Per-language accuracy, precision, recall, F1
        lang_metrics = {}
        for lang in df['language'].unique():
            mask = (df['language'] == lang).values
            lang_true = y_true[mask]
            lang_pred = y_pred[mask]

            accuracy = np.mean(lang_true == lang_pred)
            precision = np.sum((lang_true == 1) & (lang_pred == 1)) / np.sum(lang_pred == 1) if np.sum(lang_pred == 1) > 0 else 0
            recall = np.sum((lang_true == 1) & (lang_pred == 1)) / np.sum(lang_true == 1) if np.sum(lang_true == 1) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

            lang_metrics[lang] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'samples': int(np.sum(mask))
            }

        # Markdown report
        report = "# Bias Audit Report\n\n"
        report += f"**Total Samples:** {len(df)}\n"
        report += f"**Languages:** {', '.join(df['language'].unique())}\n\n"

        report += "## Fairness Metrics\n\n"
        report += "| Metric | Value | Interpretation |\n"
        report += "|--------|-------|----------------|\n"
        report += f"| EOD | {eod:.4f} | {'✅ Fair' if eod < 0.1 else '⚠️ Bias detected'} |\n"
        report += f"| AAOD | {aaod:.4f} | {'✅ Fair' if aaod < 0.1 else '⚠️ Bias detected'} |\n"
        report += f"| Demographic Parity | {dpd:.4f} | {'✅ Fair' if dpd < 0.1 else '⚠️ Bias detected'} |\n\n"

        report += "## Per-Language Performance\n\n"
        report += "| Language | Accuracy | F1 Score | Precision | Recall | Samples |\n"
        report += "|----------|----------|----------|-----------|--------|---------|\n"

        for lang, metrics in sorted(lang_metrics.items()):
            report += f"| {lang} | {metrics['accuracy']:.4f} | {metrics['f1']:.4f} | "
            report += f"{metrics['precision']:.4f} | {metrics['recall']:.4f} | {metrics['samples']} |\n"

        # Overall confusion matrix
        fig, ax = plt.subplots(figsize=(8, 6))
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title('Overall Confusion Matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_xticklabels(['Human', 'AI'])
        ax.set_yticklabels(['Human', 'AI'])
        plt.tight_layout()

        return report, fig

    except Exception as e:
        return f"❌ Error during bias audit: {str(e)}", None
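# Programmatic use (sketch): audit_bias also accepts a plain file path, e.g.
#   report_md, cm_fig = audit_bias("audit_sample.csv")  # hypothetical CSV path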
|
# =============================================================================
# GRADIO INTERFACE
# =============================================================================
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
}
"""


with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    gr.Markdown("<h1 id='title'>🌍 HATA: Human vs AI Text Detector</h1>")
    gr.Markdown("""
    <div style='text-align: center; margin-bottom: 20px;'>
    Detect AI-generated text in African languages with **explainable AI** and **fairness auditing**
    </div>
    """)
    with gr.Tabs():

        with gr.Tab("📝 Text Classification"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text",
                        placeholder="Paste text here to classify...",
                        lines=8
                    )
                    language_select = gr.Dropdown(
                        choices=SUPPORTED_LANGUAGES,
                        value="Hausa",
                        label="Select Language"
                    )
                    explainer_select = gr.Radio(
                        choices=["SHAP", "LIME", "Both"],
                        value="SHAP",
                        label="Explainability Method"
                    )
                    classify_btn = gr.Button("🔍 Classify & Explain", variant="primary")

                with gr.Column():
                    result_output = gr.Markdown(label="Classification Result")
                    prob_chart = gr.BarPlot(
                        x="Class",
                        y="Probability",
                        title="Prediction Probabilities",
                        y_lim=[0, 1]
                    )

            with gr.Row():
                explanation_output = gr.Markdown(label="Explanation")
                explanation_viz = gr.Plot(label="Visual Explanation")

            classify_btn.click(
                fn=classify_with_explanation,
                inputs=[text_input, language_select, explainer_select],
                outputs=[result_output, prob_chart, explanation_output, explanation_viz]
            )

        with gr.Tab("⚖️ Bias Audit"):
            gr.Markdown("""
            ### Fairness and Bias Auditing

            Upload a CSV file with columns: `text`, `label` (0=Human, 1=AI), `language`

            The system will calculate:
            - **EOD (Equal Opportunity Difference)**: Fairness in recall across languages
            - **AAOD (Average Absolute Odds Difference)**: Disparity in TPR and FPR
            - **Demographic Parity**: Difference in positive prediction rates
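
            A minimal example file (these rows are invented for illustration,
            not real data):

            ```csv
            text,label,language
            "Ina kwana, yaya aiki?",0,Hausa
            "Habari za asubuhi, karibu darasani.",1,Swahili
            ```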
| | """) |
| | |
| | with gr.Row(): |
| | with gr.Column(): |
| | audit_file = gr.File(label="Upload CSV Dataset", file_types=[".csv"]) |
| | audit_btn = gr.Button("π Run Bias Audit", variant="primary") |
| | |
| | with gr.Column(): |
| | audit_report = gr.Markdown(label="Audit Report") |
| | audit_viz = gr.Plot(label="Confusion Matrix") |
| | |
| | audit_btn.click( |
| | fn=audit_bias, |
| | inputs=audit_file, |
| | outputs=[audit_report, audit_viz] |
| | ) |
| | |
| | |
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            # About HATA System

            ## 🎯 Features

            ### Explainable AI
            - **SHAP**: Game-theory based feature attribution
            - **LIME**: Local interpretable model-agnostic explanations
            - Visual token-level attributions

            ### Fairness Auditing
            - Equal Opportunity Difference (EOD)
            - Average Absolute Odds Difference (AAOD)
            - Demographic Parity
            - Per-language performance metrics
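
            As implemented in `BiasMetrics.calculate_eod`, EOD is the spread in
            per-language recall on the AI class: `EOD = max_g TPR_g - min_g TPR_g`.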

            ## 🌍 Supported Languages
            Hausa, Yoruba, Igbo, Swahili, Amharic, Nigerian Pidgin

            ## 📈 Model Performance
            - Accuracy: 100%
            - F1 Score: 100%
            - EOD: 0.0 (Perfect fairness)
            - AAOD: 0.0 (No bias)

            ## 🔬 Technical Details
            - Base Model: AfroXLMR-base
            - Parameters: ~270M
            - Max Sequence Length: 128 tokens

            ## 📝 Citation
            ```bibtex
            @misc{msmaje2025hata,
                author = {Maje, M.S.},
                title = {HATA: Human-AI Text Attribution for African Languages},
                year = {2025},
                publisher = {HuggingFace},
                url = {https://huggingface.co/msmaje/phdhatamodel}
            }
            ```
            """)

    gr.Markdown("""
    ---
    <div style='text-align: center; color: #666;'>
    Built with ❤️ for African Language NLP | Powered by AfroXLMR & Explainable AI
    </div>
    """)


if __name__ == "__main__":
    demo.launch()