# NOTE(review): the lines below were Hugging Face web-page chrome captured
# along with the file during extraction; they are not program text and are
# commented out so the module parses.
#   msmaje's picture — Update app.py — adf71b3 verified — raw — history blame — 3.3 kB
"""
Gradio Space for Human-AI Text Attribution (HATA) Model
Detects whether text is human-written or AI-generated
Supports multiple African languages
"""
# --- Deterministic suppression of Gradio's audio stack under Python 3.13 ---
# Python 3.13 removed the stdlib ``audioop`` module, which pydub (pulled in
# by Gradio) imports at load time.  Pre-registering empty placeholder
# modules keeps that import from raising before Gradio itself is loaded.
import os
import sys
import types

os.environ["GRADIO_DISABLE_PYDUB"] = "1"

# Register empty stand-ins so ``import audioop`` / ``import pyaudioop``
# inside pydub resolve to harmless no-op modules instead of failing.
for _stub_name in ("audioop", "pyaudioop"):
    if _stub_name not in sys.modules:
        sys.modules[_stub_name] = types.ModuleType(_stub_name)
# Now it is safe to import Gradio and the rest of the stack
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ----------------------------------------------------------------------
# Model configuration
# ----------------------------------------------------------------------
# NOTE(review): this is a *base* multilingual checkpoint, not an
# attribution-fine-tuned one.  AutoModelForSequenceClassification will
# attach a freshly (randomly) initialized 2-way classification head to it,
# so predictions are not meaningful until a fine-tuned HATA checkpoint is
# substituted here — confirm before deploying.
MODEL_NAME = "distilbert-base-multilingual-cased"  # replace with your fine-tuned HATA checkpoint if available
# Run on GPU when available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)
model.eval()  # inference only — disables dropout etc.
# Output label names; index order must match the classifier head's classes.
LABELS = ["Human-written", "AI-generated"]
# ----------------------------------------------------------------------
# Inference routine
# ----------------------------------------------------------------------
@torch.no_grad()  # no gradients needed for inference
def hata_predict(text: str) -> dict[str, float]:
    """Classify *text* as human-written vs AI-generated.

    Args:
        text: Input passage (any supported language).

    Returns:
        Mapping of label name to probability.  For empty or
        whitespace-only input, all-zero probabilities are returned so the
        Gradio ``Label`` widget shows an empty state instead of a
        spurious prediction.
    """
    if not text or not text.strip():
        # Build the zero dict from LABELS rather than hard-coding the label
        # strings, so this branch stays consistent with the module constant.
        return {label: 0.0 for label in LABELS}
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,  # DistilBERT's maximum sequence length
    ).to(DEVICE)
    outputs = model(**inputs)
    logits = outputs.logits.squeeze(0)  # drop the batch dimension
    probs = torch.softmax(logits, dim=-1).cpu().numpy()
    return {label: float(p) for label, p in zip(LABELS, probs)}
# ----------------------------------------------------------------------
# Gradio interface
# ----------------------------------------------------------------------
# Build the UI: an input textbox plus analyze button on the left, and a
# probability-label display on the right.  ``demo`` is launched by the
# entry-point guard at the bottom of the file.
with gr.Blocks(title="Multilingual HATA System") as demo:
    gr.Markdown(
        """
# Multilingual Human–AI Text Attribution (HATA)
This system estimates whether an input passage is **human-written** or
**AI-generated**, with a focus on multilingual and African-language use
cases (e.g., Hausa, Yoruba, Igbo, Pidgin).
The backend is a Transformer-based classifier fine-tuned for attribution.
"""
    )
    with gr.Row():
        with gr.Column(scale=3):
            # Free-text input; 8 visible lines for pasting a paragraph.
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
                lines=8,
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            # gr.Label renders a {label: probability} dict as a bar display.
            output = gr.Label(label="Attribution Probabilities")
    # Wire the button to the inference routine.
    submit_btn.click(
        fn=hata_predict,
        inputs=text_input,
        outputs=output,
    )
# ----------------------------------------------------------------------
# Entry point
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Start the Gradio server (blocking call).
    demo.launch()