Spaces:

msmaje
/

PhDComputerScienceMultilingualHATASystem

Running

App Files Files Community

msmaje commited on Jan 16

Commit

adf71b3

verified ·

1 Parent(s): 31b076e

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -284

app.py CHANGED Viewed

@@ -4,312 +4,95 @@ Detects whether text is human-written or AI-generated
 Supports multiple African languages
 """
 import os
 os.environ["GRADIO_DISABLE_PYDUB"] = "1"
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import numpy as np
-# Load model and tokenizer
-MODEL_NAME = "msmaje/phdhatamodel"
-print("Loading model...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
 model.eval()
-print("Model loaded successfully!")
-# Language examples
-EXAMPLES = [
-    ["Ìwé yìí jẹ́ ìwé tó dára púpọ̀ fún àwọn akẹ́kọ̀ọ́.", "Yoruba"],
-    ["Wannan littafi mai kyau ne ga ɗalibai.", "Hausa"],
-    ["Akwụkwọ a dị mma maka ụmụ akwụkwọ.", "Igbo"],
-    ["Dis book dey very good for students wey wan learn.", "Nigerian Pidgin"],
-]
-def classify_text(text, show_probabilities=True):
-    """
-    Classify text as human-written or AI-generated
-    Args:
-        text: Input text to classify
-        show_probabilities: Whether to show probability scores
-    Returns:
-        Classification result with confidence scores
-    """
-    if not text or len(text.strip()) == 0:
-        return "⚠️ Please enter some text to classify.", None
-    # Tokenize
     inputs = tokenizer(
         text,
         return_tensors="pt",
         truncation=True,
-        max_length=128,
-        padding=True
-    )
-    # Get prediction
-    with torch.no_grad():
-        outputs = model(**inputs)
-        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
-        predicted_class = torch.argmax(probabilities, dim=-1).item()
-        confidence = probabilities[0][predicted_class].item()
-    # Labels
-    labels = {0: "👤 Human-written", 1: "🤖 AI-generated"}
-    # Create result text
-    result = f"## Prediction: {labels[predicted_class]}\n"
-    result += f"**Confidence:** {confidence:.2%}\n\n"
-    # Add interpretation
-    if confidence > 0.9:
-        result += "✅ **High confidence** - The model is very certain about this prediction."
-    elif confidence > 0.7:
-        result += "⚠️ **Moderate confidence** - The model is fairly certain, but there's some uncertainty."
-    else:
-        result += "❓ **Low confidence** - The model is uncertain. The text may have mixed characteristics."
-    # Probability chart data
-    prob_data = {
-        "Human-written": float(probabilities[0][0].item()),
-        "AI-generated": float(probabilities[0][1].item())
-    }
-    if show_probabilities:
-        return result, prob_data
-    else:
-        return result, None
-def batch_classify(file):
-    """
-    Classify multiple texts from uploaded file
-    """
-    if file is None:
-        return "⚠️ Please upload a text file."
-    # Read file
-    try:
-        with open(file.name, 'r', encoding='utf-8') as f:
-            texts = f.readlines()
-    except Exception as e:
-        return f"❌ Error reading file: {e}"
-    # Process each text
-    results = []
-    for i, text in enumerate(texts, 1):
-        text = text.strip()
-        if not text:
-            continue
-        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
-        with torch.no_grad():
-            outputs = model(**inputs)
-            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
-            predicted_class = torch.argmax(probabilities, dim=-1).item()
-            confidence = probabilities[0][predicted_class].item()
-        label = "Human" if predicted_class == 0 else "AI"
-        results.append(f"{i}. [{label} - {confidence:.2%}] {text[:100]}...")
-    return "\n".join(results)
-# Custom CSS
-custom_css = """
-#title {
-    text-align: center;
-    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: transparent;
-    font-size: 2.5em;
-    font-weight: bold;
-    margin-bottom: 0.5em;
-}
-#subtitle {
-    text-align: center;
-    color: #666;
-    font-size: 1.2em;
-    margin-bottom: 1em;
-}
-.output-box {
-    border: 2px solid #667eea;
-    border-radius: 10px;
-    padding: 15px;
-}
-.gradio-container {
-    max-width: 900px;
-    margin: auto;
-}
-"""
-# Create Gradio interface
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-    # Header
-    gr.Markdown("<h1 id='title'>🔍 Human vs AI Text Detector</h1>")
-    gr.Markdown(
-        "<p id='subtitle'>Detect whether text is human-written or AI-generated | "
-        "Supports African Languages 🌍</p>"
     )
-    # Main interface
-    with gr.Tabs():
-        # Tab 1: Single text classification
-        with gr.Tab("📝 Single Text"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    text_input = gr.Textbox(
-                        label="Enter text to classify",
-                        placeholder="Type or paste your text here...",
-                        lines=6,
-                        max_lines=10
-                    )
-                    show_probs = gr.Checkbox(
-                        label="Show probability distribution",
-                        value=True
-                    )
-                    with gr.Row():
-                        classify_btn = gr.Button("🔍 Classify Text", variant="primary")
-                        clear_btn = gr.ClearButton([text_input])
-                with gr.Column(scale=2):
-                    result_output = gr.Markdown(label="Result")
-                    prob_plot = gr.BarPlot(
-                        x="label",
-                        y="probability",
-                        title="Probability Distribution",
-                        y_lim=[0, 1],
-                        height=300,
-                        visible=True
-                    )
-            # Examples
-            gr.Markdown("### 📚 Try these examples:")
-            gr.Examples(
-                examples=EXAMPLES,
-                inputs=[text_input],
-                label="Example texts in different languages"
-            )
-            # Connect classification function
-            classify_btn.click(
-                fn=classify_text,
-                inputs=[text_input, show_probs],
-                outputs=[result_output, prob_plot]
-            )
-        # Tab 2: Batch classification
-        with gr.Tab("📄 Batch Processing"):
-            gr.Markdown("""
-            ### Upload a text file for batch classification
-            Upload a `.txt` file with one text sample per line.
-            The app will classify each line and show the results.
-            """)
-            with gr.Row():
-                with gr.Column():
-                    file_input = gr.File(
-                        label="Upload text file (.txt)",
-                        file_types=[".txt"]
-                    )
-                    batch_btn = gr.Button("🔍 Classify All", variant="primary")
-                with gr.Column():
-                    batch_output = gr.Textbox(
-                        label="Batch Results",
-                        lines=15,
-                        max_lines=20
-                    )
-            batch_btn.click(
-                fn=batch_classify,
-                inputs=file_input,
-                outputs=batch_output
             )
-        # Tab 3: About
-        with gr.Tab("ℹ️ About"):
-            gr.Markdown("""
-            # About This Model
-            ## 🎯 Purpose
-            This model detects whether text is **human-written** or **AI-generated**.
-            It has been specifically trained on African languages to ensure fair and
-            accurate detection across diverse linguistic contexts.
-            ## 🌍 Supported Languages
-            - **English**
-            - **Yoruba** (yo)
-            - **Hausa** (ha)
-            - **Igbo** (ig)
-            - **Swahili** (sw)
-            - **Amharic** (am)
-            - **Nigerian Pidgin** (pcm)
-            ## 📊 Performance
-            - **Accuracy:** 100%
-            - **F1 Score:** 100%
-            - **Fairness Metrics:** EOD = 0.0, AAOD = 0.0 (Perfect fairness)
-            ## 🔬 Model Details
-            - **Base Model:** [AfroXLMR-base](https://huggingface.co/davlan/afro-xlmr-base)
-            - **Parameters:** ~270M (0.3B)
-            - **Max Sequence Length:** 128 tokens
-            - **Training Dataset:** PhD HATA African Dataset
-            ## ⚖️ Fairness & Ethics
-            This model has been trained with explicit fairness constraints to ensure:
-            - Equal performance across all supported languages
-            - No bias toward high-resource languages
-            - Fair treatment of diverse linguistic communities
-            ## ⚠️ Limitations
-            - Performance may vary on languages outside the training distribution
-            - AI detection capabilities are tied to the AI systems present in training data
-            - Should be used as one component in content verification, not sole determinant
-            - Text length and domain may affect accuracy
-            ## 📚 Citation
-            ```bibtex
-            @misc{msmaje2025hata,
-              author = {Maje, M.S.},
-              title = {AfroXLMR for Human-AI Text Attribution},
-              year = {2025},
-              publisher = {HuggingFace},
-              url = {https://huggingface.co/msmaje/phdhatamodel}
-            }
-            ```
-            ## 🔗 Links
-            - [Model on HuggingFace](https://huggingface.co/msmaje/phdhatamodel)
-            - [Training Visualizations](https://huggingface.co/msmaje/phdhatamodel/tree/main/visualizations)
-            - [Dataset](https://huggingface.co/datasets/msmaje/phd-hata-african-dataset)
-            ## 👤 Contact
-            For questions or feedback, please open an issue on the model repository.
-            """)
-    # Footer
-    gr.Markdown("""
-    ---
-    <div style='text-align: center; color: #666; padding: 20px;'>
-        <p>Built with 💜 for African Language NLP | Powered by AfroXLMR</p>
-        <p>Model: <a href='https://huggingface.co/msmaje/phdhatamodel'>msmaje/phdhatamodel</a></p>
-    </div>
-    """)
-# Launch
 if __name__ == "__main__":
-    demo.launch()

 Supports multiple African languages
 """
+# --- Deterministic suppression of Gradio audio stack under Python 3.13 ---
 import os
+import sys
+import types
 os.environ["GRADIO_DISABLE_PYDUB"] = "1"
+# Provide stubs so that pydub cannot fail on audioop / pyaudioop
+if "audioop" not in sys.modules:
+    sys.modules["audioop"] = types.ModuleType("audioop")
+if "pyaudioop" not in sys.modules:
+    sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")
+# Now it is safe to import Gradio and the rest of the stack
 import gradio as gr
 import torch
 import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+# ----------------------------------------------------------------------
+# Model configuration
+# ----------------------------------------------------------------------
+MODEL_NAME = "distilbert-base-multilingual-cased"  # replace with your fine-tuned HATA checkpoint if available
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
+model.to(DEVICE)
 model.eval()
+LABELS = ["Human-written", "AI-generated"]
+# ----------------------------------------------------------------------
+# Inference routine
+# ----------------------------------------------------------------------
+@torch.no_grad()
+def hata_predict(text: str):
+    if not text or not text.strip():
+        return {"Human-written": 0.0, "AI-generated": 0.0}
     inputs = tokenizer(
         text,
         return_tensors="pt",
         truncation=True,
+        padding=True,
+        max_length=512,
+    ).to(DEVICE)
+    outputs = model(**inputs)
+    logits = outputs.logits.squeeze(0)
+    probs = torch.softmax(logits, dim=-1).cpu().numpy()
+    return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
+# ----------------------------------------------------------------------
+# Gradio interface
+# ----------------------------------------------------------------------
+with gr.Blocks(title="Multilingual HATA System") as demo:
+    gr.Markdown(
+        """
+        # Multilingual Human–AI Text Attribution (HATA)
+        This system estimates whether an input passage is **human-written** or
+        **AI-generated**, with a focus on multilingual and African-language use
+        cases (e.g., Hausa, Yoruba, Igbo, Pidgin).
+        The backend is a Transformer-based classifier fine-tuned for attribution.
+        """
     )
+    with gr.Row():
+        with gr.Column(scale=3):
+            text_input = gr.Textbox(
+                label="Input Text",
+                placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
+                lines=8,
             )
+            submit_btn = gr.Button("Analyze")
+        with gr.Column(scale=2):
+            output = gr.Label(label="Attribution Probabilities")
+    submit_btn.click(
+        fn=hata_predict,
+        inputs=text_input,
+        outputs=output,
+    )
+# ----------------------------------------------------------------------
+# Entry point
+# ----------------------------------------------------------------------
 if __name__ == "__main__":
+    demo.launch()