File size: 3,295 Bytes
a1f7a6b
 
 
 
 
 
adf71b3
31b076e
adf71b3
 
 
31b076e
 
adf71b3
 
 
 
 
 
 
a1f7a6b
 
 
adf71b3
a1f7a6b
adf71b3
 
 
 
 
a1f7a6b
 
adf71b3
 
a1f7a6b
 
adf71b3
 
 
 
 
 
 
 
 
a1f7a6b
 
 
 
 
adf71b3
 
 
a1f7a6b
adf71b3
 
 
a1f7a6b
adf71b3
a1f7a6b
adf71b3
 
 
 
 
 
 
a1f7a6b
adf71b3
 
 
a1f7a6b
adf71b3
 
a1f7a6b
adf71b3
 
 
 
 
 
 
a1f7a6b
adf71b3
 
 
 
 
 
 
 
 
a1f7a6b
adf71b3
 
 
a1f7a6b
adf71b3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Gradio Space for Human-AI Text Attribution (HATA) Model
Detects whether text is human-written or AI-generated
Supports multiple African languages
"""

# --- Deterministic suppression of Gradio audio stack under Python 3.13 ---
# The stdlib `audioop` module was removed in Python 3.13 (PEP 594); pydub,
# pulled in by Gradio's audio components, would fail importing it, so we
# neutralize that path before importing Gradio below.
import os
import sys
import types

# NOTE(review): confirm current Gradio versions honor this env var.
os.environ["GRADIO_DISABLE_PYDUB"] = "1"

# Provide stubs so that pydub cannot fail on audioop / pyaudioop.
# The stubs are empty placeholder modules: they satisfy the `import`
# statement itself; attribute access on them would still fail, but this
# app never uses audio features.
if "audioop" not in sys.modules:
    sys.modules["audioop"] = types.ModuleType("audioop")
if "pyaudioop" not in sys.modules:
    sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")

# Now it is safe to import Gradio and the rest of the stack
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ----------------------------------------------------------------------
# Model configuration
# ----------------------------------------------------------------------
MODEL_NAME = "distilbert-base-multilingual-cased"  # replace with your fine-tuned HATA checkpoint if available
# Prefer GPU when available; inputs and model weights both live on DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and a 2-way sequence-classification model.
# NOTE(review): with the base (non-fine-tuned) checkpoint, the
# classification head is freshly initialized, so predictions are not
# meaningful until a fine-tuned HATA checkpoint replaces MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)
model.eval()  # inference mode (disables dropout, etc.)

# assumes label id 0 == human, 1 == AI — TODO confirm against training config
LABELS = ["Human-written", "AI-generated"]

# ----------------------------------------------------------------------
# Inference routine
# ----------------------------------------------------------------------
@torch.no_grad()
def hata_predict(text: str):
    """Estimate attribution probabilities for *text*.

    Returns a dict mapping each label ("Human-written", "AI-generated")
    to its softmax probability, in the shape expected by ``gr.Label``.
    Blank or empty input short-circuits to zero probabilities without
    touching the model.
    """
    # Guard clause: nothing to classify.
    if not (text and text.strip()):
        return {"Human-written": 0.0, "AI-generated": 0.0}

    # Tokenize and move the batch to the model's device.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(DEVICE)

    # Single-example batch -> drop the batch dimension before softmax.
    logits = model(**encoded).logits.squeeze(0)
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()

    return dict(zip(LABELS, (float(p) for p in probabilities)))

# ----------------------------------------------------------------------
# Gradio interface
# ----------------------------------------------------------------------
# Build the UI: a text box + button on the left, probability label on the
# right; the button wires the input straight into hata_predict.
with gr.Blocks(title="Multilingual HATA System") as demo:
    gr.Markdown(
        """
        # Multilingual Human–AI Text Attribution (HATA)

        This system estimates whether an input passage is **human-written** or
        **AI-generated**, with a focus on multilingual and African-language use
        cases (e.g., Hausa, Yoruba, Igbo, Pidgin).

        The backend is a Transformer-based classifier fine-tuned for attribution.
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
                lines=8,
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            # gr.Label renders the dict returned by hata_predict as
            # per-class probability bars.
            output = gr.Label(label="Attribution Probabilities")

    submit_btn.click(
        fn=hata_predict,
        inputs=text_input,
        outputs=output,
    )

# ----------------------------------------------------------------------
# Entry point
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Start the Gradio server (blocking call; serves the web UI).
    demo.launch()