Spaces:

msmaje
/

PhDComputerScienceMultilingualHATASystem

Running

App Files Community

msmaje committed on Jan 16

Commit

676c241

verified ·

1 Parent(s): adf71b3

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -95

app.py CHANGED Viewed

@@ -1,98 +1,146 @@
-"""
-Gradio Space for Human-AI Text Attribution (HATA) Model
-Detects whether text is human-written or AI-generated
-Supports multiple African languages
-"""
-# --- Deterministic suppression of Gradio audio stack under Python 3.13 ---
 import os
-import sys
-import types
-os.environ["GRADIO_DISABLE_PYDUB"] = "1"
-# Provide stubs so that pydub cannot fail on audioop / pyaudioop
-if "audioop" not in sys.modules:
-    sys.modules["audioop"] = types.ModuleType("audioop")
-if "pyaudioop" not in sys.modules:
-    sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")
-# Now it is safe to import Gradio and the rest of the stack
-import gradio as gr
-import torch
-import numpy as np
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-# ----------------------------------------------------------------------
-# Model configuration
-# ----------------------------------------------------------------------
-MODEL_NAME = "distilbert-base-multilingual-cased"  # replace with your fine-tuned HATA checkpoint if available
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
-model.to(DEVICE)
-model.eval()
-LABELS = ["Human-written", "AI-generated"]
-# ----------------------------------------------------------------------
-# Inference routine
-# ----------------------------------------------------------------------
-@torch.no_grad()
-def hata_predict(text: str):
-    if not text or not text.strip():
-        return {"Human-written": 0.0, "AI-generated": 0.0}
-    inputs = tokenizer(
-        text,
-        return_tensors="pt",
-        truncation=True,
-        padding=True,
-        max_length=512,
-    ).to(DEVICE)
-    outputs = model(**inputs)
-    logits = outputs.logits.squeeze(0)
-    probs = torch.softmax(logits, dim=-1).cpu().numpy()
-    return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
-# ----------------------------------------------------------------------
-# Gradio interface
-# ----------------------------------------------------------------------
-with gr.Blocks(title="Multilingual HATA System") as demo:
-    gr.Markdown(
-        """
-        # Multilingual Human–AI Text Attribution (HATA)
-        This system estimates whether an input passage is **human-written** or
-        **AI-generated**, with a focus on multilingual and African-language use
-        cases (e.g., Hausa, Yoruba, Igbo, Pidgin).
-        The backend is a Transformer-based classifier fine-tuned for attribution.
-        """
-    )
-    with gr.Row():
-        with gr.Column(scale=3):
-            text_input = gr.Textbox(
-                label="Input Text",
-                placeholder="Paste a paragraph in Hausa, Yoruba, Igbo, Pidgin, or English...",
-                lines=8,
             )
-            submit_btn = gr.Button("Analyze")
-        with gr.Column(scale=2):
-            output = gr.Label(label="Attribution Probabilities")
-    submit_btn.click(
-        fn=hata_predict,
-        inputs=text_input,
-        outputs=output,
-    )
-# ----------------------------------------------------------------------
-# Entry point
-# ----------------------------------------------------------------------
 if __name__ == "__main__":
-    demo.launch()

+# app.py
 import os
+import math
+import requests
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from langdetect import detect
+# -----------------------------------------------------------------------------
+# Configuration
+# -----------------------------------------------------------------------------
+HF_API_URL = "https://api-inference.huggingface.co/models/YOUR_USERNAME/YOUR_MODEL"
+HF_TOKEN = os.getenv("HF_TOKEN")
+HEADERS = {
+    "Authorization": f"Bearer {HF_TOKEN}",
+    "Content-Type": "application/json"
+}
+app = Flask(__name__)
+CORS(app)
+# -----------------------------------------------------------------------------
+# Utility Functions
+# -----------------------------------------------------------------------------
+def entropy(probs):
+    """Shannon entropy as epistemic uncertainty indicator."""
+    return -sum(p * math.log2(p) for p in probs if p > 0)
+def normalize_labels(hf_output):
+    """
+    Normalize Hugging Face output into a stable schema.
+    Expected HF format:
+    [
+      {"label": "HUMAN", "score": 0.73},
+      {"label": "AI", "score": 0.27}
+    ]
+    """
+    result = {item["label"].lower(): float(item["score"]) for item in hf_output}
+    human_p = result.get("human", 0.0)
+    ai_p = result.get("ai", 0.0)
+    return human_p, ai_p
+def hf_inference(text):
+    payload = {"inputs": text}
+    r = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=30)
+    r.raise_for_status()
+    return r.json()
+# -----------------------------------------------------------------------------
+# Core Endpoint
+# -----------------------------------------------------------------------------
+@app.route("/analyze", methods=["POST"])
+def analyze():
+    data = request.get_json()
+    text = data.get("text", "").strip()
+    if not text:
+        return jsonify({"error": "Empty input"}), 400
+    # 1. Language detection (supports linguistic auditing)
+    try:
+        language = detect(text)
+    except Exception:
+        language = "unknown"
+    # 2. Hugging Face inference
+    hf_raw = hf_inference(text)
+    if not isinstance(hf_raw, list):
+        return jsonify({"error": "Unexpected model response", "raw": hf_raw}), 500
+    human_p, ai_p = normalize_labels(hf_raw)
+    # 3. Decision
+    label = "Human" if human_p >= ai_p else "Machine"
+    confidence = max(human_p, ai_p)
+    # 4. Epistemic uncertainty
+    H = entropy([human_p, ai_p])
+    # 5. Explainability placeholder (XAI-ready schema)
+    explainability_stub = {
+        "method": "pending",
+        "note": (
+            "This model endpoint does not natively expose SHAP/LIME. "
+            "Post-hoc explainability must be computed locally using a "
+            "replicated model or proxy explainer."
+        ),
+        "token_attributions": []
+    }
+    # 6. Fairness metadata (for downstream auditing)
+    fairness_context = {
+        "language": language,
+        "human_probability": human_p,
+        "ai_probability": ai_p,
+        "entropy": H
+    }
+    response = {
+        "prediction": {
+            "label": label,
+            "confidence": round(confidence, 4)
+        },
+        "probabilities": {
+            "human": round(human_p, 4),
+            "machine": round(ai_p, 4)
+        },
+        "uncertainty": {
+            "entropy": round(H, 4),
+            "interpretation": (
+                "High entropy indicates epistemic ambiguity; "
+                "classification should be treated cautiously."
             )
+        },
+        "linguistic_context": {
+            "detected_language": language
+        },
+        "explainability": explainability_stub,
+        "fairness_audit_fields": fairness_context
+    }
+    return jsonify(response)
+# -----------------------------------------------------------------------------
+# Health Check
+# -----------------------------------------------------------------------------
+@app.route("/", methods=["GET"])
+def index():
+    return jsonify({
+        "system": "HATA API",
+        "capabilities": [
+            "Human vs AI classification",
+            "Probability calibration",
+            "Uncertainty estimation",
+            "Language-aware auditing",
+            "Explainability-ready schema",
+            "Fairness instrumentation"
+        ]
+    })
+# -----------------------------------------------------------------------------
 if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=5000, debug=True)