File size: 3,295 Bytes
31c6aa2 e2d8b9b 31c6aa2 e2d8b9b 2735c1a e2d8b9b 2735c1a e2d8b9b 676c241 e2d8b9b 2735c1a e2d8b9b 0c0c1de e2d8b9b 5310029 e2d8b9b a1f7a6b e2d8b9b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
"""
Gradio Space for Human-AI Text Attribution (HATA) Model
Detects whether text is human-written or AI-generated
Supports multiple African languages
"""
# --- Deterministic suppression of Gradio audio stack under Python 3.13 ---
import os
import sys
import types
import importlib.util

# Ask Gradio to skip its pydub-based audio stack.
os.environ["GRADIO_DISABLE_PYDUB"] = "1"

# pydub needs ``audioop`` (or ``pyaudioop``), and the stdlib ``audioop`` module
# was removed in Python 3.13. Register an empty placeholder ONLY when no real
# module can be found: stubbing unconditionally would mask a working
# ``audioop`` (older Python, or a backport package) with an empty module.
for _audio_module in ("audioop", "pyaudioop"):
    if _audio_module in sys.modules:
        # Already imported (or already stubbed) -- leave it alone.
        continue
    if importlib.util.find_spec(_audio_module) is None:
        sys.modules[_audio_module] = types.ModuleType(_audio_module)
# Now it is safe to import Gradio and the rest of the stack
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ----------------------------------------------------------------------
# Model configuration
# ----------------------------------------------------------------------
# Checkpoint served by this app. Defaults to the base multilingual DistilBERT;
# set the HATA_MODEL_NAME environment variable to a fine-tuned HATA checkpoint
# (hub id or local path) to switch models without editing this file.
# NOTE(review): the default is a base checkpoint, so its 2-way classification
# head is presumably freshly initialised and the probabilities it produces are
# not meaningful until a fine-tuned checkpoint is configured -- confirm.
MODEL_NAME = os.environ.get("HATA_MODEL_NAME") or "distilbert-base-multilingual-cased"

# Run on GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)
model.eval()  # inference mode: disables dropout

# Display names for the classifier outputs; list position i names logit i.
LABELS = ["Human-written", "AI-generated"]
# ----------------------------------------------------------------------
# Inference routine
# ----------------------------------------------------------------------
@torch.no_grad()
def hata_predict(text: str) -> dict[str, float]:
    """Score *text* as human-written versus AI-generated.

    Args:
        text: Passage to classify. The tokenizer truncates it to at most
            512 tokens.

    Returns:
        A dict mapping each label in ``LABELS`` to its softmax probability.
        For empty, whitespace-only, or ``None`` input the model is not run
        and both labels map to ``0.0``.
    """
    # Guard clause: nothing to classify, so skip tokenisation and inference.
    if not text or not text.strip():
        return {"Human-written": 0.0, "AI-generated": 0.0}

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(DEVICE)

    # A single passage is a batch of one; drop the batch axis before softmax.
    logits = model(**inputs).logits.squeeze(0)
    probs = torch.softmax(logits, dim=-1).tolist()

    # Pair labels with probabilities positionally instead of indexing by hand.
    return dict(zip(LABELS, probs))
# ----------------------------------------------------------------------
# Gradio interface
# ----------------------------------------------------------------------
# Introductory copy rendered at the top of the page.
_INTRO_MARKDOWN = """
# Multilingual Human–AI Text Attribution (HATA)
This system estimates whether an input passage is **human-written** or
**AI-generated**, with a focus on multilingual and African-language use
cases (e.g., Hausa, Yoruba, Igbo, Pidgin).
The backend is a Transformer-based classifier fine-tuned for attribution.
"""

with gr.Blocks(title="Multilingual HATA System") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: text entry and the trigger button.
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                lines=8,
                label="Input Text",
                placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
            )
            submit_btn = gr.Button("Analyze")

        # Right column: per-label probabilities returned by hata_predict.
        with gr.Column(scale=2):
            output = gr.Label(label="Attribution Probabilities")

    # Clicking the button runs the classifier on the textbox contents.
    submit_btn.click(fn=hata_predict, inputs=text_input, outputs=output)
# ----------------------------------------------------------------------
# Entry point
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script,
    # not when it is imported as a module.
    demo.launch()
|