msmaje committed on
Commit
a1f7a6b
·
verified ·
1 Parent(s): 47549c3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +312 -0
app.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Space for Human-AI Text Attribution (HATA) Model
3
+ Detects whether text is human-written or AI-generated
4
+ Supports multiple African languages
5
+ """
6
+
7
+ import gradio as gr
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+ import numpy as np
11
+
12
+ # Load model and tokenizer
13
+ MODEL_NAME = "msmaje/phdhatamodel"
14
+
15
+ print("Loading model...")
16
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
17
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
18
+ model.eval()
19
+ print("Model loaded successfully!")
20
+
21
# Language examples
# Each entry is [example_text, language_name].
# NOTE(review): entries have two columns but gr.Examples below is wired to a
# single input component (inputs=[text_input]) — confirm Gradio tolerates the
# extra column or drop the language column here.
EXAMPLES = [
    ["ÌwΓ© yìí jẹ́ Γ¬wΓ© tΓ³ dΓ‘ra pΓΊpọ̀ fΓΊn Γ wọn akẹ́kọ̀ọ́.", "Yoruba"],
    ["Wannan littafi mai kyau ne ga Ι—alibai.", "Hausa"],
    ["Akwα»₯kwọ a dα»‹ mma maka α»₯mα»₯ akwα»₯kwọ.", "Igbo"],
    ["Dis book dey very good for students wey wan learn.", "Nigerian Pidgin"],

]
29
+
30
def classify_text(text, show_probabilities=True):
    """
    Classify text as human-written or AI-generated.

    Args:
        text: Input text to classify.
        show_probabilities: Whether to also return data for the probability
            chart (second element of the returned tuple).

    Returns:
        Tuple of (markdown result string, chart data or None). The chart data
        is a pandas DataFrame with "label" and "probability" columns, which is
        what the gr.BarPlot(x="label", y="probability") output component
        expects — a plain dict keyed by class name does not match that schema
        and the chart never renders.
    """
    # Local import: pandas is only needed to build the chart payload.
    import pandas as pd

    # Guard: empty / whitespace-only input.
    if not text or len(text.strip()) == 0:
        return "⚠️ Please enter some text to classify.", None

    # Tokenize (truncate to the model's 128-token training length).
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True,
    )

    # Inference without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    # Class-index -> display label mapping (0 = human, 1 = AI).
    labels = {0: "πŸ‘€ Human-written", 1: "πŸ€– AI-generated"}

    # Create result text.
    result = f"## Prediction: {labels[predicted_class]}\n"
    result += f"**Confidence:** {confidence:.2%}\n\n"

    # Add a plain-language interpretation of the confidence score.
    if confidence > 0.9:
        result += "βœ… **High confidence** - The model is very certain about this prediction."
    elif confidence > 0.7:
        result += "⚠️ **Moderate confidence** - The model is fairly certain, but there's some uncertainty."
    else:
        result += "❓ **Low confidence** - The model is uncertain. The text may have mixed characteristics."

    if not show_probabilities:
        return result, None

    # Tabular payload matching the BarPlot's x="label" / y="probability" axes.
    prob_data = pd.DataFrame(
        {
            "label": ["Human-written", "AI-generated"],
            "probability": [
                float(probabilities[0][0].item()),
                float(probabilities[0][1].item()),
            ],
        }
    )
    return result, prob_data
85
+
86
def batch_classify(file):
    """
    Classify multiple texts from an uploaded file.

    Args:
        file: Gradio file object; a UTF-8 text file with one sample per line.

    Returns:
        A newline-joined report with one entry per non-empty input line, or a
        warning/error message string.
    """
    if file is None:
        return "⚠️ Please upload a text file."

    # Read file (Gradio hands us a temp-file wrapper; .name is its path).
    try:
        with open(file.name, 'r', encoding='utf-8') as f:
            texts = f.readlines()
    except Exception as e:
        return f"❌ Error reading file: {e}"

    # Process each text. Enumerate over the raw lines so the reported index
    # matches the file's line number even when blank lines are skipped.
    results = []
    for i, text in enumerate(texts, 1):
        text = text.strip()
        if not text:
            continue

        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][predicted_class].item()

        label = "Human" if predicted_class == 0 else "AI"
        # Only append an ellipsis when the preview was actually truncated.
        preview = text[:100] + ("..." if len(text) > 100 else "")
        results.append(f"{i}. [{label} - {confidence:.2%}] {preview}")

    # A file containing only blank lines previously produced an empty string;
    # return an explicit message instead.
    if not results:
        return "⚠️ The file contains no non-empty lines to classify."

    return "\n".join(results)
119
+
120
# Custom CSS
# Injected via gr.Blocks(css=...): gradient page title, centered subtitle,
# a bordered output box class, and a capped container width.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5em;
    font-weight: bold;
    margin-bottom: 0.5em;
}

#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 1em;
}

.output-box {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 15px;
}

.gradio-container {
    max-width: 900px;
    margin: auto;
}
"""
150
+
151
# Create Gradio interface
# Three tabs: single-text classification, batch file processing, and a static
# "About" model card. `demo` is launched at the bottom of the file.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    # Header
    gr.Markdown("<h1 id='title'>πŸ” Human vs AI Text Detector</h1>")
    gr.Markdown(
        "<p id='subtitle'>Detect whether text is human-written or AI-generated | "
        "Supports African Languages 🌍</p>"
    )

    # Main interface
    with gr.Tabs():
        # Tab 1: Single text classification
        with gr.Tab("πŸ“ Single Text"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Enter text to classify",
                        placeholder="Type or paste your text here...",
                        lines=6,
                        max_lines=10
                    )

                    # Toggle forwarded to classify_text's second argument.
                    show_probs = gr.Checkbox(
                        label="Show probability distribution",
                        value=True
                    )

                    with gr.Row():
                        classify_btn = gr.Button("πŸ” Classify Text", variant="primary")
                        clear_btn = gr.ClearButton([text_input])

                with gr.Column(scale=2):
                    result_output = gr.Markdown(label="Result")
                    # Receives the second return value of classify_text;
                    # expects tabular data with "label"/"probability" columns.
                    prob_plot = gr.BarPlot(
                        x="label",
                        y="probability",
                        title="Probability Distribution",
                        y_lim=[0, 1],
                        height=300,
                        visible=True
                    )

            # Examples
            gr.Markdown("### πŸ“š Try these examples:")
            gr.Examples(
                examples=EXAMPLES,
                inputs=[text_input],
                label="Example texts in different languages"
            )

            # Connect classification function
            classify_btn.click(
                fn=classify_text,
                inputs=[text_input, show_probs],
                outputs=[result_output, prob_plot]
            )

        # Tab 2: Batch classification
        with gr.Tab("πŸ“„ Batch Processing"):
            gr.Markdown("""
            ### Upload a text file for batch classification

            Upload a `.txt` file with one text sample per line.
            The app will classify each line and show the results.
            """)

            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload text file (.txt)",
                        file_types=[".txt"]
                    )
                    batch_btn = gr.Button("πŸ” Classify All", variant="primary")

                with gr.Column():
                    batch_output = gr.Textbox(
                        label="Batch Results",
                        lines=15,
                        max_lines=20
                    )

            batch_btn.click(
                fn=batch_classify,
                inputs=file_input,
                outputs=batch_output
            )

        # Tab 3: About — static model card content, no event handlers.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            # About This Model

            ## 🎯 Purpose
            This model detects whether text is **human-written** or **AI-generated**.
            It has been specifically trained on African languages to ensure fair and
            accurate detection across diverse linguistic contexts.

            ## 🌍 Supported Languages
            - **English**
            - **Yoruba** (yo)
            - **Hausa** (ha)
            - **Igbo** (ig)
            - **Swahili** (sw)
            - **Amharic** (am)
            - **Nigerian Pidgin** (pcm)

            ## πŸ“Š Performance
            - **Accuracy:** 100%
            - **F1 Score:** 100%
            - **Fairness Metrics:** EOD = 0.0, AAOD = 0.0 (Perfect fairness)

            ## πŸ”¬ Model Details
            - **Base Model:** [AfroXLMR-base](https://huggingface.co/davlan/afro-xlmr-base)
            - **Parameters:** ~270M (0.3B)
            - **Max Sequence Length:** 128 tokens
            - **Training Dataset:** PhD HATA African Dataset

            ## βš–οΈ Fairness & Ethics
            This model has been trained with explicit fairness constraints to ensure:
            - Equal performance across all supported languages
            - No bias toward high-resource languages
            - Fair treatment of diverse linguistic communities

            ## ⚠️ Limitations
            - Performance may vary on languages outside the training distribution
            - AI detection capabilities are tied to the AI systems present in training data
            - Should be used as one component in content verification, not sole determinant
            - Text length and domain may affect accuracy

            ## πŸ“š Citation
            ```bibtex
            @misc{msmaje2025hata,
                author = {Maje, M.S.},
                title = {AfroXLMR for Human-AI Text Attribution},
                year = {2025},
                publisher = {HuggingFace},
                url = {https://huggingface.co/msmaje/phdhatamodel}
            }
            ```

            ## πŸ”— Links
            - [Model on HuggingFace](https://huggingface.co/msmaje/phdhatamodel)
            - [Training Visualizations](https://huggingface.co/msmaje/phdhatamodel/tree/main/visualizations)
            - [Dataset](https://huggingface.co/datasets/msmaje/phd-hata-african-dataset)

            ## πŸ‘€ Contact
            For questions or feedback, please open an issue on the model repository.
            """)

    # Footer
    gr.Markdown("""
    ---
    <div style='text-align: center; color: #666; padding: 20px;'>
        <p>Built with πŸ’œ for African Language NLP | Powered by AfroXLMR</p>
        <p>Model: <a href='https://huggingface.co/msmaje/phdhatamodel'>msmaje/phdhatamodel</a></p>
    </div>
    """)
309
+
310
# Launch
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()