"""
Gradio Space for Human-AI Text Attribution (HATA) Model
Detects whether text is human-written or AI-generated
Supports multiple African languages
"""
# --- Deterministic suppression of Gradio audio stack under Python 3.13 ---
import os
import sys
import types
os.environ["GRADIO_DISABLE_PYDUB"] = "1"
# Provide stubs so that pydub cannot fail on audioop / pyaudioop
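# (audioop was removed from the standard library in Python 3.13 per PEP 594,
# so pydub's "import audioop" would otherwise raise ModuleNotFoundError.)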
if "audioop" not in sys.modules:
sys.modules["audioop"] = types.ModuleType("audioop")
if "pyaudioop" not in sys.modules:
sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")
# Now it is safe to import Gradio and the rest of the stack
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ----------------------------------------------------------------------
# Model configuration
# ----------------------------------------------------------------------
MODEL_NAME = "distilbert-base-multilingual-cased" # replace with your fine-tuned HATA checkpoint if available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)
model.eval()
LABELS = ["Human-written", "AI-generated"]
# ----------------------------------------------------------------------
# Inference routine
# ----------------------------------------------------------------------
@torch.no_grad()
def hata_predict(text: str):
    """Return class probabilities for a single input passage."""
    if not text or not text.strip():
        return {label: 0.0 for label in LABELS}
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(DEVICE)
    outputs = model(**inputs)
    logits = outputs.logits.squeeze(0)
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
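# Illustrative usage (values are made up and depend on the loaded checkpoint):
#   hata_predict("Sannu, yaya kake?")
#   -> {"Human-written": 0.87, "AI-generated": 0.13}
# The returned dict maps class names to probabilities, which the gr.Label
# component below renders as a ranked confidence list.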
# ----------------------------------------------------------------------
# Gradio interface
# ----------------------------------------------------------------------
with gr.Blocks(title="Multilingual HATA System") as demo:
    gr.Markdown(
        """
        # Multilingual Human–AI Text Attribution (HATA)

        This system estimates whether an input passage is **human-written** or
        **AI-generated**, with a focus on multilingual and African-language use
        cases (e.g., Hausa, Yoruba, Igbo, Pidgin).

        The backend is a Transformer-based classifier fine-tuned for attribution.
        """
    )
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
                lines=8,
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            output = gr.Label(label="Attribution Probabilities")

    submit_btn.click(
        fn=hata_predict,
        inputs=text_input,
        outputs=output,
    )
# ----------------------------------------------------------------------
# Entry point
# ----------------------------------------------------------------------
if __name__ == "__main__":
    demo.launch()