Spaces:

tnp554
/

SQuAD

Sleeping

App Files Files Community

tnp554 commited on Apr 22

Commit

09daf0b

1 Parent(s): a4081df

feat: deploy SQuAD backend with all AI models

Browse files

Files changed (36) hide show

.env +21 -0
Dockerfile +21 -0
README.md +38 -10
__pycache__/auth.cpython-314.pyc +0 -0
__pycache__/qa_engine.cpython-314.pyc +0 -0
app.py +638 -0
auth.py +92 -0
data_loader/load_squad_json.py +25 -0
gunicorn.conf.py +10 -0
main.py +68 -0
models/__init__.py +1 -0
models/__pycache__/__init__.cpython-314.pyc +0 -0
models/__pycache__/bert_model.cpython-314.pyc +0 -0
models/__pycache__/model2.cpython-314.pyc +0 -0
models/__pycache__/model3.cpython-314.pyc +0 -0
models/__pycache__/qa_model.cpython-314.pyc +0 -0
models/bert_model.py +123 -0
models/model2.py +28 -0
models/model3.py +100 -0
models/qa_model.py +27 -0
qa_engine.py +115 -0
qa_model.pth +3 -0
requirements.txt +16 -0
train.py +100 -0
utils/__init__.py +1 -0
utils/__pycache__/__init__.cpython-314.pyc +0 -0
utils/__pycache__/db.cpython-314.pyc +0 -0
utils/__pycache__/pdf_parser.cpython-314.pyc +0 -0
utils/__pycache__/preprocess.cpython-314.pyc +0 -0
utils/__pycache__/vocab.cpython-314.pyc +0 -0
utils/db.py +82 -0
utils/file_loader.py +19 -0
utils/pdf_parser.py +49 -0
utils/preprocess.py +6 -0
utils/squad_preprocess.py +26 -0
utils/vocab.py +13 -0

.env ADDED Viewed

	@@ -0,0 +1,21 @@

+# ─── Database ───────────────────────────────────────────────────────────────
+MONGO_URI=mongodb+srv://tnp554:ibmtnp@ibmcluster.swumgnp.mongodb.net/squad_qa?appName=IBMCLUSTER
+# ─── Auth ────────────────────────────────────────────────────────────────────
+JWT_SECRET=905d93e5bf632330aee5075046c4b8cc7d1d2c28d575918c9dbf7be33536badd
+JWT_EXPIRY_HOURS=24
+# ─── Admin Seed ──────────────────────────────────────────────────────────────
+ADMIN_EMAIL=admin@squad.ai
+ADMIN_PASSWORD=Admin@123
+# ─── App Config ──────────────────────────────────────────────────────────────
+FLASK_ENV=production
+# Comma-separated list of allowed origins (no trailing slash)
+ALLOWED_ORIGINS=http://localhost:5173,http://localhost:5174,http://localhost:3000
+# ─── Feature Flags ───────────────────────────────────────────────────────────
+PDF_MAX_PAGES=15
+EMAIL_USER=otp.squad.ai@gmail.com
+EMAIL_PASS=yfqkqjtzlbljgpww

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+FROM python:3.11-slim
+WORKDIR /app
+# Install system deps for PyPDF2, python-docx, torch, and file security (libmagic)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    libgomp1 \
+    libmagic1 \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy source
+COPY . .
+EXPOSE 7860
+CMD ["gunicorn", "-c", "gunicorn.conf.py", "app:app"]

README.md CHANGED Viewed

@@ -1,10 +1,38 @@
----
-title: SQuAD
-emoji: 🏆
-colorFrom: indigo
-colorTo: green
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# 🐍 Backend Architecture (Flask + PyTorch)
+The core engine responsible for MongoDB persistence, authentication routing, and running heavy machine-learning inference locally on your own server inside a Python virtual environment.
+## 🔑 Environment Variables
+The root of this folder requires a `.env` file to function:
+```env
+MONGODB_URI=mongodb+srv://<your-creds>.mongodb.net
+JWT_SECRET=super_secure_hash_string_here
+ADMIN_EMAIL=admin@squad.ai
+ADMIN_PASSWORD=Admin@123
+EMAIL_USER=your_gmail@gmail.com
+EMAIL_PASS=your_16_char_gmail_app_password
+FLASK_ENV=development
+```
+## 🧠 AI Inference Matrix (`/models`)
+Each question is routed to one of the loaded models according to the `model_id` sent in the request payload.
+1. **Model 1: `bert_model.py` (BERT)**
+   * Leverages HuggingFace `transformers` for `deepset/bert-base-cased-squad2`.
+2. **Model 3: `model3.py` (BiLSTM)**
+   * Native PyTorch integration running isolated weights mapped precisely off a local `qa_model.pth` tensor dictionary array.
+## 📜 Database Collections
+All queries are funneled cleanly into MongoDB:
+- `users`: Standard user tracking, OTP storage, password hashing tracking.
+- `chats`: Detailed inference payloads, system diagnostics, user-soft deletion patterns (`user_deleted: True`).
+- `settings`: Central singleton objects storing administrative configurations.
+## 🚀 Running Locally
+```bash
+# 1. Activate Virtual Env
+.\.venv\Scripts\activate
+# 2. Install Dependencies
+pip install -r requirements.txt
+# 3. Boot Server
+python app.py
+```

__pycache__/auth.cpython-314.pyc ADDED Viewed

Binary file (5.05 kB). View file

__pycache__/qa_engine.cpython-314.pyc ADDED Viewed

Binary file (4.33 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,638 @@

+"""
+app.py — Main Flask application for the SQuAD QA System.
+Endpoints:
+  Public:
+    POST /api/auth/register
+    POST /api/auth/verify
+    POST /api/auth/resend-otp
+    POST /api/auth/login
+    GET  /api/health
+  Authenticated (any user):
+    GET  /api/auth/me
+    GET  /api/models
+    POST /api/ask
+    GET  /api/history
+    DELETE /api/history/<chat_id>
+    DELETE /api/history
+  Admin only:
+    GET    /api/admin/users
+    PUT    /api/admin/users/<user_id>
+    DELETE /api/admin/users/<user_id>
+    GET    /api/admin/stats
+    GET    /api/admin/settings
+    PUT    /api/admin/settings
+    PUT    /api/admin/models/<model_id>
+"""
+import os
+import sys
+import logging
+import re
+from datetime import datetime, timezone, timedelta
+from flask import Flask, request, jsonify, g
+from flask_cors import CORS
+from flask_bcrypt import Bcrypt
+from flask_limiter import Limiter
+from flask_limiter.util import get_remote_address
+from bson import ObjectId
+from dotenv import load_dotenv
+# ─── Load environment ─────────────────────────────────────────────────────────
+load_dotenv()
+# ─── Logging ─────────────────────────────────────────────────────────────────
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__name__)
+# ─── App init ─────────────────────────────────────────────────────────────────
+# The Flask app object; Bcrypt and the rate limiter are bound to it here.
+app = Flask(__name__)
+bcrypt = Bcrypt(app)
+# Default limits apply to every route; individual routes add tighter ones
+# with @limiter.limit(...).
+# NOTE(review): "memory://" keeps counters inside the process, so with more
+# than one gunicorn worker each worker presumably counts separately — confirm
+# whether a shared storage backend is wanted.
+limiter = Limiter(
+    get_remote_address,
+    app=app,
+    default_limits=["1000 per day", "100 per hour"],
+    storage_uri="memory://"
+)
+app.config['MAX_CONTENT_LENGTH'] = 5 * 1024 * 1024  # 5 MB max constraint
+# ─── CORS (reads from env for cloud safety) ───────────────────────────────────
+# ALLOWED_ORIGINS is a comma-separated list; blank entries are dropped.
+raw_origins = os.getenv("ALLOWED_ORIGINS", "http://localhost:5173,http://localhost:3000")
+allowed_origins = [o.strip() for o in raw_origins.split(",") if o.strip()]
+CORS(app, origins=allowed_origins, supports_credentials=True)
+# ─── Internal imports (after app init) ───────────────────────────────────────
+from auth import generate_token, require_auth, require_admin
+from utils.db import users_col, chats_col, settings_col, is_using_mock
+from utils.pdf_parser import extract_text
+import qa_engine
+# ─── Helpers ─────────────────────────────────────────────────────────────────
+def _serialize(doc: dict) -> dict:
+    """Convert MongoDB ObjectId fields to strings for JSON serialization."""
+    if doc is None:
+        return None
+    doc = dict(doc)
+    if "_id" in doc:
+        doc["id"] = str(doc.pop("_id"))
+    return doc
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+def _future_iso(seconds: int) -> str:
+    return (datetime.now(timezone.utc) + timedelta(seconds=seconds)).isoformat()
+def safe_str(val) -> str:
+    """Ensure the input is strictly a string, preventing NoSQL injection dicts."""
+    if not isinstance(val, str):
+        return ""
+    return val.strip()
+def send_otp_email(to_email, otp):
+    """Sends OTP via real Gmail SMTP if ENV vars exist."""
+    email_user = os.getenv("EMAIL_USER")
+    email_pass = os.getenv("EMAIL_PASS")
+    if not email_user or not email_pass:
+        # Fallback to mock logging if user hasn't put in valid app passwords yet
+        logger.warning("=" * 60)
+        logger.warning(f"  [MOCK EMAIL OTP] Verification code for {to_email}: {otp}")
+        logger.warning("=" * 60)
+        return False
+    try:
+        import smtplib
+        from email.mime.text import MIMEText
+        from email.mime.multipart import MIMEMultipart
+        msg = MIMEMultipart()
+        msg['From'] = email_user
+        msg['To'] = to_email
+        msg['Subject'] = "SQuAD QA - Your Verification Code"
+        body = f"Welcome to SQuAD QA!!!\n\nYour 6-digit registration verification code is: {otp}\n\nPlease enter this code to complete your registration.\n\nThank you!!!"
+        msg.attach(MIMEText(body, 'plain'))
+        server = smtplib.SMTP_SSL('smtp.gmail.com', 465)
+        server.login(email_user, email_pass)
+        server.send_message(msg)
+        server.quit()
+        logger.info(f"[SMTP] Successfully dispatched OTP to {to_email}")
+        return True
+    except Exception as e:
+        logger.error(f"[SMTP ERROR] Failed to send actual email to {to_email}: {e}")
+        return False
+# ─── Admin Seed ──────────────────────────────────────────────────────────────
+def _seed_admin():
+    """Create the default admin user if it doesn't exist."""
+    admin_email = os.getenv("ADMIN_EMAIL", "admin@squad.ai")
+    admin_password = os.getenv("ADMIN_PASSWORD", "Admin@123")
+    col = users_col()
+    if col.find_one({"email": admin_email}):
+        logger.info(f"[Seed] Admin user '{admin_email}' already exists.")
+        return
+    hashed = bcrypt.generate_password_hash(admin_password).decode("utf-8")
+    col.insert_one({
+        "name": "Administrator",
+        "email": admin_email,
+        "password": hashed,
+        "role": "admin",
+        "is_active": True,
+        "created_at": _now_iso(),
+        "last_login": None,
+    })
+    logger.info(f"[Seed] Admin user '{admin_email}' created.")
+# ─── Health ───────────────────────────────────────────────────────────────────
+@app.route("/api/health", methods=["GET"])
+def health():
+    return jsonify({
+        "status": "ok",
+        "db_mode": "mock" if is_using_mock() else "atlas",
+        "timestamp": _now_iso(),
+    })
+# ─── Auth Routes ──────────────────────────────────────────────────────────────
+@app.route("/api/auth/register", methods=["POST"])
+@limiter.limit("10 per hour")
+def register():
+    data = request.get_json(silent=True) or {}
+    name = safe_str(data.get("name"))
+    email = safe_str(data.get("email")).lower()
+    password = safe_str(data.get("password"))
+    if not name or not email or not password:
+        return jsonify({"error": "Name, email, and password are required."}), 400
+    password_regex = r"^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&#^])[A-Za-z\d@$!%*?&#^]{8,}$"
+    if not re.match(password_regex, password):
+        return jsonify({"error": "Password must be at least 8 characters and include uppercase, lowercase, number, and a special character."}), 400
+    col = users_col()
+    sys_col = settings_col()
+    sys_conf = sys_col.find_one({"_id": "system_config"}) or {}
+    if sys_conf.get("disable_registrations", False):
+        return jsonify({"error": "New user registrations are currently disabled by the administrator."}), 403
+    if col.find_one({"email": email}):
+        return jsonify({"error": "An account with this email already exists."}), 409
+    hashed = bcrypt.generate_password_hash(password).decode("utf-8")
+    import random
+    otp = str(random.randint(100000, 999999))
+    send_otp_email(email, otp)
+    result = col.insert_one({
+        "name": name,
+        "email": email,
+        "password": hashed,
+        "role": "user",
+        "is_active": False,
+        "is_verified": False,
+        "otp": otp,
+        "otp_expires_at": _future_iso(60),
+        "created_at": _now_iso(),
+        "last_login": None,
+    })
+    return jsonify({
+        "message": "OTP sent to email. Please verify your account.",
+        "requires_otp": True
+    }), 201
+@app.route("/api/auth/verify", methods=["POST"])
+@limiter.limit("5 per minute")
+def verify_otp():
+    data = request.get_json(silent=True) or {}
+    email = safe_str(data.get("email")).lower()
+    otp = safe_str(data.get("otp"))
+    if not email or not otp:
+        return jsonify({"error": "Email and OTP are required."}), 400
+    col = users_col()
+    user = col.find_one({"email": email})
+    if not user:
+        return jsonify({"error": "User not found."}), 404
+    if user.get("is_verified", False):
+        return jsonify({"error": "Account already verified."}), 400
+    expires_at = user.get("otp_expires_at")
+    if expires_at and _now_iso() > expires_at:
+        return jsonify({"error": "OTP has expired. Please request a new one."}), 400
+    if str(user.get("otp")) != str(otp):
+        return jsonify({"error": "Invalid verification code."}), 400
+    col.update_one({"_id": user["_id"]}, {"$set": {"is_verified": True, "is_active": True, "otp": None}})
+    user_id = str(user["_id"])
+    from auth import generate_token
+    role = user.get("role", "user")
+    token = generate_token(user_id, role)
+    col.update_one({"_id": user["_id"]}, {"$set": {"last_login": _now_iso()}})
+    return jsonify({
+        "message": "Account verified successfully.",
+        "token": token,
+        "user": {"id": user_id, "name": user["name"], "email": user["email"], "role": role},
+    }), 200
+@app.route("/api/auth/resend-otp", methods=["POST"])
+@limiter.limit("3 per minute")
+def resend_otp():
+    data = request.get_json(silent=True) or {}
+    email = safe_str(data.get("email")).lower()
+    if not email:
+        return jsonify({"error": "Email is required."}), 400
+    col = users_col()
+    user = col.find_one({"email": email})
+    if not user:
+        return jsonify({"error": "User not found."}), 404
+    if user.get("is_verified", False):
+        return jsonify({"error": "Account is already verified."}), 400
+    import random
+    new_otp = str(random.randint(100000, 999999))
+    col.update_one({"_id": user["_id"]}, {"$set": {"otp": new_otp, "otp_expires_at": _future_iso(60)}})
+    send_otp_email(email, new_otp)
+    return jsonify({"message": "A new OTP has been sent to your email."}), 200
+@app.route("/api/auth/login", methods=["POST"])
+@limiter.limit("15 per minute")
+def login():
+    data = request.get_json(silent=True) or {}
+    email = safe_str(data.get("email")).lower()
+    password = safe_str(data.get("password"))
+    if not email or not password:
+        return jsonify({"error": "Email and password are required."}), 400
+    col = users_col()
+    user = col.find_one({"email": email})
+    if not user or not bcrypt.check_password_hash(user["password"], password):
+        return jsonify({"error": "Invalid email or password."}), 401
+    if not user.get("is_verified", True):
+        # We can trigger verify if they try to login while unverified, but for simplicity:
+        return jsonify({"error": "Your account is not verified. Please check your email for the OTP."}), 403
+    if not user.get("is_active", True):
+        return jsonify({"error": "Your account has been deactivated. Contact admin."}), 403
+    user_id = str(user["_id"])
+    role = user.get("role", "user")
+    token = generate_token(user_id, role)
+    # Update last_login
+    col.update_one({"_id": user["_id"]}, {"$set": {"last_login": _now_iso()}})
+    return jsonify({
+        "message": "Login successful.",
+        "token": token,
+        "user": {
+            "id": user_id,
+            "name": user["name"],
+            "email": user["email"],
+            "role": role,
+        },
+    })
+@app.route("/api/auth/me", methods=["GET"])
+@require_auth
+def me():
+    from bson import ObjectId as ObjId
+    col = users_col()
+    try:
+        user = col.find_one({"_id": ObjId(g.current_user["id"])})
+    except Exception:
+        user = col.find_one({"_id": g.current_user["id"]})
+    if not user:
+        return jsonify({"error": "User not found."}), 404
+    user = _serialize(user)
+    user.pop("password", None)
+    return jsonify({"user": user})
+# ─── Models ───────────────────────────────────────────────────────────────────
+@app.route("/api/models", methods=["GET"])
+@require_auth
+def get_models():
+    models_info = qa_engine.get_models_info()
+    ready_ids = [m["id"] for m in models_info if m.get("status") == "ready"]
+    pipeline = [
+        {"$match": {"model_id": {"$in": ready_ids}, "error": False}},
+        {"$group": {"_id": "$model_id", "avg_score": {"$avg": "$score"}, "count": {"$sum": 1}}}
+    ]
+    try:
+        from utils.db import chats_col
+        stats = {doc["_id"]: doc for doc in chats_col().aggregate(pipeline)}
+        total_queries = sum(d["count"] for d in stats.values())
+        total_score = sum(d["avg_score"] * d["count"] for d in stats.values())
+        global_avg = (total_score / total_queries) if total_queries > 0 else 0
+    except Exception:
+        stats = {}
+        global_avg = 0
+        total_queries = 0
+    for m in models_info:
+        model_stat = stats.get(m["id"], {})
+        m["avg_score"] = model_stat.get("avg_score", 0.0)
+        m["query_count"] = model_stat.get("count", 0)
+    return jsonify({
+        "models": models_info,
+        "global_avg": global_avg,
+        "total_queries": total_queries
+    })
+# ─── Ask (QA Inference) ───────────────────────────────────────────────────────
+@app.route("/api/ask", methods=["POST"])
+@require_auth
+@limiter.limit("30 per minute")
+def ask():
+    """Answer a question against a context and store the exchange.
+
+    Accepts either a multipart form (``model_id``, ``question`` and a ``file``
+    or a ``context`` field) or a JSON body (``model_id``, ``context``,
+    ``question``). ``model_id`` defaults to "bert".
+
+    Returns:
+        The inference result with an added ``chat_id``; 400 when the context
+        or question is missing, the upload's MIME type is not allowed, or
+        text extraction raises ValueError.
+    """
+    model_id = "bert"
+    context = ""
+    question = ""
+    # ── File upload (multipart form) ──
+    if request.content_type and "multipart/form-data" in request.content_type:
+        model_id = safe_str(request.form.get("model_id")) or "bert"
+        question = safe_str(request.form.get("question"))
+        file = request.files.get("file")
+        if file:
+            try:
+                import magic
+                buffer = file.read()
+                # Type is sniffed from the bytes, not from the file name.
+                mime = magic.from_buffer(buffer, mime=True)
+                # NOTE(review): text/plain is allowed here although the error
+                # message below mentions only PDF/DOCX — confirm the wording.
+                allowed_mimes = ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "text/plain"]
+                if mime not in allowed_mimes:
+                    return jsonify({"error": f"Security system rejected {mime}. Only true PDF/DOCX files permitted."}), 400
+                from utils.pdf_parser import extract_text
+                context = extract_text(buffer, file.filename)
+            except ValueError as exc:
+                # Only ValueError is mapped to 400; other errors propagate.
+                return jsonify({"error": str(exc)}), 400
+        else:
+            # No file part: fall back to a plain-text context form field.
+            context = safe_str(request.form.get("context"))
+    else:
+        # ── JSON body ──
+        data = request.get_json(silent=True) or {}
+        model_id = safe_str(data.get("model_id")) or "bert"
+        context = safe_str(data.get("context"))
+        question = safe_str(data.get("question"))
+    if not context:
+        return jsonify({"error": "Context (text or file) is required."}), 400
+    if not question:
+        return jsonify({"error": "Question is required."}), 400
+    # ── Run inference ──
+    result = qa_engine.run_inference(model_id, context, question)
+    # ── Persist to DB ──
+    chat_doc = {
+        "user_id": g.current_user["id"],
+        "model_id": model_id,
+        "model_name": result.get("model", model_id),
+        "context": context[:2000],           # truncate for storage
+        "question": question,
+        "answer": result.get("answer", ""),
+        "score": result.get("score", 0.0),
+        "error": result.get("error", False),
+        "created_at": _now_iso(),
+    }
+    insert_result = chats_col().insert_one(chat_doc)
+    # Return the stored chat's id so the client can reference it later.
+    result["chat_id"] = str(insert_result.inserted_id)
+    return jsonify(result)
+# ─── History ──────────────────────────────────────────────────────────────────
+@app.route("/api/history", methods=["GET"])
+@require_auth
+def get_history():
+    col = chats_col()
+    docs = list(col.find(
+        {"user_id": g.current_user["id"], "user_deleted": {"$ne": True}},
+        sort=[("created_at", -1)],
+        limit=50,
+    ))
+    return jsonify({"history": [_serialize(d) for d in docs]})
+@app.route("/api/history/<chat_id>", methods=["DELETE"])
+@require_auth
+def delete_chat(chat_id):
+    from bson import ObjectId as ObjId
+    col = chats_col()
+    try:
+        res = col.update_one(
+            {"_id": ObjId(chat_id), "user_id": g.current_user["id"]},
+            {"$set": {"user_deleted": True}}
+        )
+    except Exception:
+        return jsonify({"error": "Invalid chat ID."}), 400
+    if res.matched_count == 0:
+        return jsonify({"error": "Chat not found or not owned by you."}), 404
+    return jsonify({"message": "Chat deleted."})
+@app.route("/api/history", methods=["DELETE"])
+@require_auth
+def clear_history():
+    col = chats_col()
+    res = col.update_many(
+        {"user_id": g.current_user["id"]},
+        {"$set": {"user_deleted": True}}
+    )
+    return jsonify({"message": f"Cleared {res.modified_count} chat(s)."})
+# ─── Admin Routes ─────────────────────────────────────────────────────────────
+@app.route("/api/admin/users", methods=["GET"])
+@require_admin
+def admin_list_users():
+    col = users_col()
+    users = list(col.find({}, sort=[("created_at", -1)]))
+    result = []
+    for u in users:
+        u = _serialize(u)
+        u.pop("password", None)
+        result.append(u)
+    return jsonify({"users": result, "total": len(result)})
+@app.route("/api/admin/users/<user_id>", methods=["PUT"])
+@require_admin
+def admin_update_user(user_id):
+    from bson import ObjectId as ObjId
+    data = request.get_json(silent=True) or {}
+    allowed_fields = {"name", "role", "is_active"}
+    update = {k: v for k, v in data.items() if k in allowed_fields}
+    if not update:
+        return jsonify({"error": "No valid fields to update."}), 400
+    col = users_col()
+    try:
+        res = col.update_one({"_id": ObjId(user_id)}, {"$set": update})
+    except Exception:
+        return jsonify({"error": "Invalid user ID."}), 400
+    if res.matched_count == 0:
+        return jsonify({"error": "User not found."}), 404
+    return jsonify({"message": "User updated successfully."})
+@app.route("/api/admin/users/<user_id>", methods=["DELETE"])
+@require_admin
+def admin_delete_user(user_id):
+    from bson import ObjectId as ObjId
+    # Prevent self-deletion
+    if user_id == g.current_user["id"]:
+        return jsonify({"error": "You cannot delete your own account."}), 400
+    col = users_col()
+    try:
+        res = col.delete_one({"_id": ObjId(user_id)})
+    except Exception:
+        return jsonify({"error": "Invalid user ID."}), 400
+    if res.deleted_count == 0:
+        return jsonify({"error": "User not found."}), 404
+    # Also logically remove their chat history
+    chats_col().update_many(
+        {"user_id": user_id},
+        {"$set": {"user_deleted": True, "admin_deleted_user": True}}
+    )
+    return jsonify({"message": "User and their history deleted."})
+@app.route("/api/admin/stats", methods=["GET"])
+@require_admin
+def admin_stats():
+    users = users_col()
+    chats = chats_col()
+    total_users = users.count_documents({})
+    total_queries = chats.count_documents({})
+    # Model usage breakdown
+    pipeline = [
+        {"$group": {"_id": "$model_id", "count": {"$sum": 1}}}
+    ]
+    try:
+        model_usage = {doc["_id"]: doc["count"] for doc in chats.aggregate(pipeline)}
+    except Exception:
+        model_usage = {}
+    # Timeseries data for graphs
+    ts_pipeline = [
+        {"$project": {"date": {"$substr": ["$created_at", 0, 10]}}},
+        {"$group": {"_id": "$date", "queries": {"$sum": 1}}},
+        {"$sort": {"_id": 1}},
+        {"$limit": 30}
+    ]
+    try:
+        timeseries = [{"date": doc["_id"], "queries": doc["queries"]} for doc in chats.aggregate(ts_pipeline)]
+    except Exception:
+        timeseries = []
+    return jsonify({
+        "total_users": total_users,
+        "total_queries": total_queries,
+        "model_usage": model_usage,
+        "timeseries": timeseries,
+        "db_mode": "mock" if is_using_mock() else "atlas",
+    })
+@app.route("/api/admin/settings", methods=["GET"])
+@require_admin
+def get_settings():
+    col = settings_col()
+    doc = col.find_one({"_id": "system_config"})
+    if not doc:
+        doc = {"_id": "system_config", "disable_registrations": False, "maintenance_mode": False}
+        col.insert_one(doc)
+    return jsonify({"settings": _serialize(doc)})
+@app.route("/api/admin/settings", methods=["PUT"])
+@require_admin
+def update_settings():
+    data = request.get_json(silent=True) or {}
+    allowed = {"disable_registrations", "maintenance_mode"}
+    update = {k: v for k, v in data.items() if k in allowed}
+    if not update:
+        return jsonify({"error": "No valid settings provided."}), 400
+    col = settings_col()
+    col.update_one({"_id": "system_config"}, {"$set": update}, upsert=True)
+    return jsonify({"message": "Settings updated."})
+@app.route("/api/admin/models/<model_id>", methods=["PUT"])
+@require_admin
+def toggle_model_status(model_id):
+    if model_id not in qa_engine.MODELS:
+        return jsonify({"error": "Invalid model ID."}), 404
+    data = request.get_json(silent=True) or {}
+    target_status = data.get("status")
+    if target_status not in ["ready", "maintenance"]:
+        return jsonify({"error": "Invalid status."}), 400
+    col = settings_col()
+    col.update_one({"_id": "system_config"}, {"$set": {f"model_status.{model_id}": target_status}}, upsert=True)
+    return jsonify({"message": f"Model {model_id} status updated to {target_status}."})
+# ─── Entry Point ──────────────────────────────────────────────────────────────
+# Runs only when this file is executed directly (`python app.py`). A WSGI
+# server that imports the module (e.g. `gunicorn app:app`) skips this block.
+# NOTE(review): model initialisation and admin seeding are only triggered
+# here — confirm they also happen on the import path used in production.
+if __name__ == "__main__":
+    logger.info("=" * 60)
+    logger.info("  SQuAD QA System — Backend Starting")
+    logger.info("=" * 60)
+    # Initialise AI models
+    qa_engine.init_all_models()
+    # Seed admin user
+    _seed_admin()
+    # Flask debug mode is enabled only when FLASK_ENV is "development",
+    # which is also the default when the variable is unset.
+    flask_env = os.getenv("FLASK_ENV", "development")
+    debug = flask_env == "development"
+    app.run(host="0.0.0.0", port=5000, debug=debug)

auth.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""
+auth.py — JWT-based authentication helpers.
+Provides:
+  - generate_token(user_id, role)  → signed JWT string
+  - @require_auth                  → validates JWT, injects g.current_user
+  - @require_admin                 → same as @require_auth + checks admin role
+"""
+import os
+import jwt
+import logging
+from functools import wraps
+from datetime import datetime, timedelta, timezone
+from flask import request, jsonify, g
+from dotenv import load_dotenv
+# Load variables from a .env file before the settings below are read.
+load_dotenv()
+logger = logging.getLogger(__name__)
+# Key used to sign and verify tokens.
+# NOTE(review): when JWT_SECRET is unset the app silently signs with this
+# publicly known fallback — consider failing fast in production.
+JWT_SECRET = os.getenv("JWT_SECRET", "default-insecure-secret-change-me")
+# Token lifetime in hours; defaults to 24.
+JWT_EXPIRY_HOURS = int(os.getenv("JWT_EXPIRY_HOURS", "24"))
+# ─── Token Generation ─────────────────────────────────────────────────────────
+def generate_token(user_id: str, role: str) -> str:
+    """Create a signed JWT valid for JWT_EXPIRY_HOURS hours."""
+    payload = {
+        "sub": str(user_id),
+        "role": role,
+        "iat": datetime.now(timezone.utc),
+        "exp": datetime.now(timezone.utc) + timedelta(hours=JWT_EXPIRY_HOURS),
+    }
+    return jwt.encode(payload, JWT_SECRET, algorithm="HS256")
+def decode_token(token: str) -> dict:
+    """Decode and verify a JWT. Raises jwt.exceptions on failure."""
+    return jwt.decode(token, JWT_SECRET, algorithms=["HS256"])
+# ─── Decorators ───────────────────────────────────────────────────────────────
+def require_auth(f):
+    """Decorator: validates Bearer JWT and populates g.current_user."""
+    @wraps(f)
+    def decorated(*args, **kwargs):
+        auth_header = request.headers.get("Authorization", "")
+        if not auth_header.startswith("Bearer "):
+            return jsonify({"error": "Authorization header missing or malformed."}), 401
+        token = auth_header.split(" ", 1)[1]
+        try:
+            payload = decode_token(token)
+            # Real-time suspension check
+            from utils.db import users_col
+            from bson import ObjectId as ObjId
+            col = users_col()
+            try:
+                user = col.find_one({"_id": ObjId(payload["sub"])})
+            except Exception:
+                user = col.find_one({"_id": payload["sub"]})
+            if not user or not user.get("is_active", True):
+                return jsonify({"error": "Your account has been suspended by an administrator."}), 403
+            g.current_user = {
+                "id": payload["sub"],
+                "role": payload["role"],
+            }
+        except jwt.ExpiredSignatureError:
+            return jsonify({"error": "Token expired. Please log in again."}), 401
+        except jwt.InvalidTokenError as exc:
+            return jsonify({"error": f"Invalid token: {exc}"}), 401
+        return f(*args, **kwargs)
+    return decorated
+def require_admin(f):
+    """Decorator: validates JWT AND checks for admin role."""
+    @wraps(f)
+    @require_auth
+    def decorated(*args, **kwargs):
+        if g.current_user.get("role") != "admin":
+            return jsonify({"error": "Admin access required."}), 403
+        return f(*args, **kwargs)
+    return decorated

data_loader/load_squad_json.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import json
+def load_squad_json(path):
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    samples = []
+    for article in data["data"]:
+        for para in article["paragraphs"]:
+            context = para["context"]
+            for qa in para["qas"]:
+                if not qa["answers"]:
+                    continue
+                ans = qa["answers"][0]
+                samples.append({
+                    "context": context,
+                    "question": qa["question"],
+                    "answer_text": ans["text"]
+                })
+    return samples

gunicorn.conf.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import os
+# Gunicorn configuration for production deployment
+# Listen on all interfaces at $PORT, falling back to 5000 when it is unset.
+# NOTE(review): confirm the hosting platform sets PORT, or that 5000 is the
+# port the container is expected to serve on.
+port = os.environ.get("PORT", "5000")
+bind = f"0.0.0.0:{port}"
+workers = 2          # Keep low — each worker loads BERT (~400MB RAM)
+timeout = 120        # BERT inference can take a few seconds
+accesslog = "-"      # stdout
+errorlog = "-"       # stdout
+loglevel = "info"
+# NOTE(review): preloading shares only what is loaded at import time —
+# confirm the models are actually initialised when the app module is imported.
+preload_app = True   # Load model once, share across workers

main.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import torch
+from utils.file_loader import load_txt, load_pdf, load_docx
+from models.qa_model import QAModel
+from utils.vocab import encode
+from utils.preprocess import tokenize
+# Load the trained checkpoint at import time. It must contain a "vocab"
+# entry and a "model_state" state dict.
+# NOTE(review): torch.load unpickles the file — only load checkpoints from a
+# trusted source.
+checkpoint = torch.load("qa_model.pth", map_location="cpu")
+vocab = checkpoint["vocab"]
+# The model is sized from the vocabulary stored in the checkpoint.
+model = QAModel(len(vocab))
+model.load_state_dict(checkpoint["model_state"])
+model.eval()  # inference mode
+def load_context(path):
+    if path.endswith(".txt"):
+        return load_txt(path)
+    elif path.endswith(".pdf"):
+        return load_pdf(path)
+    elif path.endswith(".docx"):
+        return load_docx(path)
+    else:
+        raise ValueError("Unsupported file format")
+def extract_answer(question, context):
+    q_tokens = tokenize(question)
+    c_tokens = tokenize(context)
+    tokens = q_tokens + ["[SEP]"] + c_tokens
+    encoded = encode(tokens, vocab)
+    max_len = 300
+    if len(encoded) < max_len:
+        encoded += [0] * (max_len - len(encoded))
+    else:
+        encoded = encoded[:max_len]
+    x = torch.tensor(encoded).unsqueeze(0)
+    with torch.no_grad():
+        start_logits, end_logits = model(x)
+    start = torch.argmax(start_logits, dim=1).item()
+    end = torch.argmax(end_logits, dim=1).item()
+    if start > end or start >= len(tokens):
+        return "No answer found"
+    return " ".join(tokens[start:end+1])
def main():
    """Run one interactive question-answering round on the console.

    Prompts for a document path, loads it, prompts for a question and prints
    the extracted answer.
    """
    print("===== BiLSTM QA (Fixed) =====\n")
    document_path = input("Enter file path: ")
    document_text = load_context(document_path)
    user_question = input("Enter question: ")
    print("\nAnswer:", extract_answer(user_question, document_text))


if __name__ == "__main__":
    main()

models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # models package

models/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (144 Bytes). View file

models/__pycache__/bert_model.cpython-314.pyc ADDED Viewed

Binary file (4.78 kB). View file

models/__pycache__/model2.cpython-314.pyc ADDED Viewed

Binary file (1.31 kB). View file

models/__pycache__/model3.cpython-314.pyc ADDED Viewed

Binary file (4.6 kB). View file

models/__pycache__/qa_model.cpython-314.pyc ADDED Viewed

Binary file (1.7 kB). View file

models/bert_model.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""
+bert_model.py — HuggingFace BERT Question Answering Model.
+Model: deepset/bert-base-cased-squad2
+Uses direct PyTorch inference (compatible with transformers 5.x).
+"""
+import logging
+logger = logging.getLogger(__name__)
+_tokenizer = None
+_model = None
+MODEL_NAME = "deepset/bert-base-cased-squad2"
def init_bert_model():
    """Load the tokenizer and QA model named by MODEL_NAME into module state.

    Intended to be called once at application start-up. Any failure
    (including a failed ``transformers`` import) is logged and leaves both
    ``_tokenizer`` and ``_model`` set to None.
    """
    global _tokenizer, _model
    try:
        from transformers import AutoModelForQuestionAnswering, AutoTokenizer

        logger.info(f"[BERT] Loading model '{MODEL_NAME}' ...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        qa_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
        qa_model.eval()
        _tokenizer, _model = tokenizer, qa_model
        logger.info("[BERT] Model loaded and ready.")
    except Exception as exc:
        logger.error(f"[BERT] Failed to load model: {exc}")
        _tokenizer = None
        _model = None
def _run_qa_inference(context: str, question: str) -> dict:
    """Run the loaded QA model on one (question, context) pair.

    The pair is tokenized together (truncated to 512 tokens), the model is
    run without gradients, and the answer is decoded from the input ids
    between the chosen start and end positions.

    Span selection: the start is the arg-max of the start logits. The end is
    the arg-max of the end logits; if that falls before the start, the best
    end at or after the start is used instead (searching at most
    ``max_answer_tokens`` positions), so the span is never inverted.

    Args:
        context: Passage to search.
        question: Question to answer.

    Returns:
        ``{"answer": str, "score": float}`` where ``score`` is the mean of
        the softmax probabilities of the chosen start and end positions,
        rounded to 4 decimals.
    """
    import torch
    import torch.nn.functional as F

    max_answer_tokens = 30  # heuristic cap, used only by the fallback search

    inputs = _tokenizer(
        question, context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = _model(**inputs)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    start_idx = int(torch.argmax(start_logits))
    end_idx = int(torch.argmax(end_logits))
    if end_idx < start_idx:
        # Independent arg-maxes produced an inverted span: keep the start and
        # take the most likely end among the positions that follow it.
        window = end_logits[start_idx:start_idx + max_answer_tokens]
        end_idx = start_idx + int(torch.argmax(window))

    answer_ids = inputs["input_ids"][0][start_idx:end_idx + 1]
    answer = _tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    # Confidence approximation via softmax over the whole sequence.
    start_prob = float(F.softmax(start_logits, dim=0)[start_idx])
    end_prob = float(F.softmax(end_logits, dim=0)[end_idx])
    score = round((start_prob + end_prob) / 2, 4)
    return {"answer": answer, "score": score}
def _bert_response(answer: str, score: float, error: bool) -> dict:
    """Build the response payload shared by every BERT code path."""
    return {
        "answer": answer,
        "score": score,
        "model": "BERT",
        "model_id": "bert",
        "error": error,
    }


def predict(context: str, question: str) -> dict:
    """Answer ``question`` from ``context`` with the loaded BERT model.

    Args:
        context: Passage to search.
        question: Question to answer.

    Returns:
        A dict with keys ``answer`` (str), ``score`` (float), ``model``
        ("BERT"), ``model_id`` ("bert") and ``error`` (bool). ``error`` is
        True when the model is not loaded, when either input is empty or
        whitespace-only, or when inference raises. A result with a score
        below 0.05 or an empty answer is replaced by a "not found" message
        with score 0.0 and ``error`` False.
    """
    if _model is None or _tokenizer is None:
        return _bert_response(
            "BERT model is not loaded. Please check server logs.", 0.0, True
        )

    # Whitespace-only text carries no content either, so reject it up front.
    if (
        not context
        or not question
        or not str(context).strip()
        or not str(question).strip()
    ):
        return _bert_response("Context and question must not be empty.", 0.0, True)

    try:
        result = _run_qa_inference(context=context, question=question)
    except Exception:
        # Keep the details (with traceback) in the server log only; the raw
        # exception text is not sent back to the client.
        logger.exception("[BERT] Inference error")
        return _bert_response("Inference error occurred.", 0.0, True)

    answer = result["answer"]
    score = result["score"]
    if score < 0.05 or "[CLS]" in answer or not answer:
        answer = "Answer not found with sufficient confidence. Try rephrasing your question or providing more context."
        score = 0.0
    return _bert_response(answer, score, False)

models/model2.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""
+model2.py — Placeholder for Model 2.
+Replace this file with your actual model implementation.
+The predict() function signature must match:
+    predict(context: str, question: str) -> dict
+"""
+import logging
+logger = logging.getLogger(__name__)
def init_model2():
    """Start-up hook for Model 2; it only logs that the model is a placeholder."""
    placeholder_notice = "[Model2] Placeholder — not yet integrated."
    logger.info(placeholder_notice)
def predict(context: str, question: str) -> dict:
    """Return the fixed placeholder reply for Model 2.

    Both arguments are accepted for signature compatibility and ignored.
    The reply has score 0.0, ``error`` False and ``stub`` True.
    """
    stub_reply = {}
    stub_reply["answer"] = "Model 2 is not yet integrated. Please use BERT for now."
    stub_reply["score"] = 0.0
    stub_reply["model"] = "Model 2"
    stub_reply["model_id"] = "model2"
    stub_reply["error"] = False
    stub_reply["stub"] = True
    return stub_reply

models/model3.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+model3.py — Integration for BiLSTM Model.
+"""
+import logging
+import torch
+import os
+from models.qa_model import QAModel
+# Import vocab utilities and preprocess utilities
+from utils.preprocess import tokenize
+from utils.vocab import encode
+logger = logging.getLogger(__name__)
+model = None
+vocab = None
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def init_model3():
    """Load the BiLSTM checkpoint ``qa_model.pth`` into module state.

    The checkpoint is looked up one directory above this module's directory.
    The module-level ``model`` and ``vocab`` are assigned only after the
    whole load (vocabulary, weights, device move, eval mode) has succeeded,
    so a failed load never leaves an untrained model available for
    inference. A missing file or a load error is logged and leaves the
    module state untouched.
    """
    global model, vocab
    logger.info("[Model3] Initialising BiLSTM from qa_model.pth...")

    # Assumes qa_model.pth is at the root of the backend directory
    model_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "qa_model.pth")
    if not os.path.exists(model_path):
        logger.warning(f"[Model3] qa_model.pth not found at {model_path}! Model 3 inference will fail.")
        return

    try:
        checkpoint = torch.load(model_path, map_location=device)
        loaded_vocab = checkpoint["vocab"]
        loaded_model = QAModel(len(loaded_vocab))
        loaded_model.load_state_dict(checkpoint["model_state"])
        loaded_model.to(device)
        loaded_model.eval()
    except Exception as e:
        logger.error(f"[Model3] Failed to load BiLSTM model: {e}", exc_info=True)
        return

    # Publish only a fully initialised model together with its vocabulary.
    model, vocab = loaded_model, loaded_vocab
    logger.info("[Model3] BiLSTM successfully loaded.")
def predict(context: str, question: str) -> dict:
    """Answer ``question`` from ``context`` with the loaded BiLSTM.

    The input is ``question tokens + ["[SEP]"] + context tokens``, encoded
    with the checkpoint vocabulary and padded/truncated to 300 ids. Start and
    end positions are chosen only among real context tokens, so the answer
    cannot consist of question words, the separator or padding.

    Args:
        context: Passage to search.
        question: Question to answer.

    Returns:
        A dict with keys ``answer``, ``score``, ``model`` ("BiLSTM"),
        ``model_id`` ("model3") and ``error``; error responses also carry
        ``stub``. ``score`` is the mean of the softmax probabilities (taken
        over the context positions) of the chosen start and end, rounded to
        4 decimals, or 0.0 when no answer is found.
    """
    if model is None or vocab is None:
        return {
            "answer": "BiLSTM model weights (qa_model.pth) not found or failed to load. Please make sure the trained model is placed in the backend folder.",
            "score": 0.0,
            "model": "BiLSTM",
            "model_id": "model3",
            "error": True,
            "stub": False,
        }

    try:
        q_tokens = tokenize(question)
        c_tokens = tokenize(context)
        tokens = q_tokens + ["[SEP]"] + c_tokens

        max_len = 300
        encoded = encode(tokens, vocab)[:max_len]
        encoded += [0] * (max_len - len(encoded))  # pad with id 0

        # Positions [context_start, valid_end) hold context tokens.
        context_start = len(q_tokens) + 1
        valid_end = min(len(tokens), max_len)

        answer = "No answer found"
        score = 0.0
        if context_start < valid_end:
            x = torch.tensor(encoded).unsqueeze(0).to(device)
            with torch.no_grad():
                start_logits, end_logits = model(x)

            start_probs = torch.softmax(start_logits[0, context_start:valid_end], dim=0)
            end_probs = torch.softmax(end_logits[0, context_start:valid_end], dim=0)
            start_offset = int(torch.argmax(start_probs))
            end_offset = int(torch.argmax(end_probs))

            if start_offset <= end_offset:
                start = context_start + start_offset
                end = context_start + end_offset
                answer = " ".join(tokens[start:end + 1])
                # Real confidence instead of a fixed placeholder value.
                score = round(
                    (float(start_probs[start_offset]) + float(end_probs[end_offset])) / 2,
                    4,
                )

        return {
            "answer": answer,
            "score": score,
            "model": "BiLSTM",
            "model_id": "model3",
            "error": False,
        }
    except Exception as e:
        logger.error(f"[Model3] Inference error: {e}", exc_info=True)
        return {
            "answer": "Inference error occurred.",
            "score": 0.0,
            "model": "BiLSTM",
            "model_id": "model3",
            "error": True,
            "stub": False,
        }

models/qa_model.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import torch
+import torch.nn as nn
class QAModel(nn.Module):
    """Bidirectional-LSTM span predictor.

    Token ids are embedded, passed through a batch-first bidirectional LSTM,
    and two linear heads map every position to one start score and one end
    score.
    """

    def __init__(self, vocab_size, embed_dim=200, hidden_dim=256):
        super().__init__()
        # The LSTM output concatenates the forward and backward states.
        lstm_out_dim = 2 * hidden_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.fc_start = nn.Linear(lstm_out_dim, 1)
        self.fc_end = nn.Linear(lstm_out_dim, 1)

    def forward(self, x):
        """Return ``(start_scores, end_scores)``, one score per input position."""
        embedded = self.embedding(x)
        hidden_states, _ = self.lstm(embedded)
        start_scores = self.fc_start(hidden_states).squeeze(-1)
        end_scores = self.fc_end(hidden_states).squeeze(-1)
        return start_scores, end_scores

qa_engine.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""
+qa_engine.py — Model router.
+Routes inference requests to the correct model module based on model_id.
+Initialises all models at startup.
+"""
+import logging
+from models import bert_model, model2, model3
+from utils.db import settings_col
+logger = logging.getLogger(__name__)
# ─── Registry ────────────────────────────────────────────────────────────────
# One entry per model id: display name, default status and the module whose
# predict() serves the model. Descriptions are currently empty strings.
MODELS = {
    model_id: {
        "id": model_id,
        "name": display_name,
        "description": "",
        "status": default_status,
        "module": module,
    }
    for model_id, display_name, default_status, module in (
        ("bert", "BERT", "ready", bert_model),
        ("model2", "DistilBERT", "coming_soon", model2),
        ("model3", "BiLSTM", "ready", model3),
    )
}
def init_all_models():
    """Run every model's start-up initialiser, in order: BERT, Model 2, BiLSTM."""
    logger.info("[QAEngine] Initialising models...")
    initialisers = (
        bert_model.init_bert_model,
        model2.init_model2,
        model3.init_model3,
    )
    for initialise in initialisers:
        initialise()
    logger.info("[QAEngine] All models initialised.")
def get_models_info() -> list:
    """Return metadata for all registered models (used by /api/models).

    The status of each model is the override stored under ``model_status`` in
    the ``system_config`` settings document when one is set, otherwise the
    default from ``MODELS``. If the settings cannot be read, the failure is
    logged and the defaults are used.

    Returns:
        A list of dicts with keys ``id``, ``name``, ``description`` and
        ``status``, one per entry in ``MODELS``.
    """
    model_status_overrides = {}
    try:
        sys_conf = settings_col().find_one({"_id": "system_config"}) or {}
        stored = sys_conf.get("model_status")
        # A missing, null or malformed value means "no overrides".
        if isinstance(stored, dict):
            model_status_overrides = stored
    except Exception as exc:
        logger.warning(
            "[QAEngine] Could not read model status overrides; using defaults: %s",
            exc,
        )

    return [
        {
            "id": m["id"],
            "name": m["name"],
            "description": m["description"],
            # Empty overrides fall back to the default, as in run_inference.
            "status": model_status_overrides.get(m["id"]) or m["status"],
        }
        for m in MODELS.values()
    ]
def run_inference(model_id: str, context: str, question: str) -> dict:
    """Route a QA request to the appropriate model.

    Before dispatching, the ``system_config`` settings document is consulted
    for ``maintenance_mode`` and for a per-model status override under
    ``model_status``. If the settings cannot be read, the failure is logged
    and the request is evaluated against the default statuses in ``MODELS``
    (no maintenance mode, no overrides).

    Args:
        model_id: One of the keys of ``MODELS`` ("bert", "model2", "model3").
        context:  The passage/document text.
        question: The question to answer.

    Returns:
        dict with keys: answer, score, model, model_id, error. ``error`` is
        True for an unknown model, during maintenance, and when the model's
        effective status is not "ready"; otherwise the selected module's
        ``predict`` result is returned unchanged.
    """
    if model_id not in MODELS:
        return {
            "answer": f"Unknown model '{model_id}'. Available: {list(MODELS.keys())}",
            "score": 0.0,
            "model": "Unknown",
            "model_id": model_id,
            "error": True,
        }

    entry = MODELS[model_id]

    sys_conf = {}
    try:
        sys_conf = settings_col().find_one({"_id": "system_config"}) or {}
    except Exception as exc:
        # Best effort: a settings outage must not take inference down, but
        # it should be visible in the logs.
        logger.warning(
            "[QAEngine] Could not read system settings; using defaults: %s", exc
        )

    if sys_conf.get("maintenance_mode", False):
        return {
            "answer": "System is currently under maintenance. Please try again later.",
            "score": 0.0,
            "model": "System",
            "model_id": model_id,
            "error": True,
        }

    overrides = sys_conf.get("model_status")
    if not isinstance(overrides, dict):
        overrides = {}
    current_status = overrides.get(model_id) or entry["status"]
    if current_status != "ready":
        return {
            "answer": "This model is currently disabled by an administrator.",
            "score": 0.0,
            "model": entry["name"],
            "model_id": model_id,
            "error": True,
        }

    return entry["module"].predict(context=context, question=question)

qa_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5ff35d1b92957d46df75fa375df83cf39c8998e51d4098cdb061a8b7fa7d028
+size 43858657

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+flask==3.0.3
+flask-cors==4.0.1
+flask-bcrypt==1.0.1
+pymongo==4.7.3
+dnspython==2.6.1
+pyjwt==2.8.0
+python-dotenv==1.0.1
+transformers>=4.40.0
+torch>=2.0.0
+PyPDF2==3.0.1
+python-docx==1.1.2
+gunicorn==22.0.0
+Werkzeug==3.0.3
+mongomock==4.1.2
+python-magic==0.4.27
+flask-limiter==3.7.0

train.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+from data_loader.load_squad_json import load_squad_json
+from utils.squad_preprocess import process_sample
+from utils.vocab import build_vocab, encode
+from models.qa_model import QAModel
class QADataset(Dataset):
    """Dataset of ``(token ids, start index, end index)`` training triples.

    Each raw sample goes through ``process_sample``; samples it rejects, and
    samples whose answer span does not fit inside ``max_len`` positions, are
    dropped. Token ids are padded with 0 or truncated to exactly ``max_len``.
    """

    def __init__(self, samples, vocab, max_len=300):
        self.data = []
        for sample in samples:
            processed = process_sample(sample)
            if not processed:
                continue

            token_ids = encode(processed["tokens"], vocab)
            padding = [0] * max(0, max_len - len(token_ids))
            token_ids = token_ids[:max_len] + padding

            span_start = processed["start"]
            span_end = processed["end"]
            if span_start < max_len and span_end < max_len:
                self.data.append((token_ids, span_start, span_end))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        token_ids, span_start, span_end = self.data[idx]
        return (
            torch.tensor(token_ids),
            torch.tensor(span_start),
            torch.tensor(span_end),
        )
def train():
    """Train the BiLSTM on SQuAD samples and save the checkpoint.

    Reads the first 30000 samples of ``data/train-v2.0.json``, builds the
    vocabulary from every sample ``process_sample`` accepts, trains for 5
    epochs with Adam (lr 0.001, batch size 32) on the summed start/end
    cross-entropy loss, and writes the weights and vocabulary to
    ``qa_model.pth``.
    """
    print("Loading data...")
    raw = load_squad_json("data/train-v2.0.json")[:30000]

    print("Building vocab...")
    all_tokens = []
    for processed in map(process_sample, raw):
        if processed:
            all_tokens.extend(processed["tokens"])
    vocab = build_vocab(all_tokens)

    print("Preparing dataset...")
    loader = DataLoader(QADataset(raw, vocab), batch_size=32, shuffle=True)

    print("Initializing model...")
    model = QAModel(len(vocab))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    print("Training...\n")
    for epoch_index in range(5):
        epoch_loss = 0
        for batch in loader:
            token_ids, gold_start, gold_end = (part.to(device) for part in batch)
            start_scores, end_scores = model(token_ids)
            loss = loss_fn(start_scores, gold_start) + loss_fn(end_scores, gold_end)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch_index+1} Loss: {epoch_loss:.2f}")

    checkpoint = {
        "model_state": model.state_dict(),
        "vocab": vocab,
    }
    torch.save(checkpoint, "qa_model.pth")
    print("\n✅ Model trained and saved!")


if __name__ == "__main__":
    train()

utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # utils package

utils/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (143 Bytes). View file

utils/__pycache__/db.cpython-314.pyc ADDED Viewed

Binary file (3.57 kB). View file

utils/__pycache__/pdf_parser.cpython-314.pyc ADDED Viewed

Binary file (3.67 kB). View file

utils/__pycache__/preprocess.cpython-314.pyc ADDED Viewed

Binary file (448 Bytes). View file

utils/__pycache__/vocab.cpython-314.pyc ADDED Viewed

Binary file (713 Bytes). View file

utils/db.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+db.py — MongoDB Atlas connection with mongomock fallback.
+If MONGO_URI is not set or the connection fails, the app runs on an
+in-memory mock store so development works without any database.
+"""
+import os
+import logging
+from dotenv import load_dotenv
+load_dotenv()
+logger = logging.getLogger(__name__)
+MONGO_URI = os.getenv("MONGO_URI") or os.getenv("MONGODB_URI") or ""
+DB_NAME = "squad_qa"
+_client = None
+_db = None
+_using_mock = False
def _connect_atlas():
    """Attempt to connect to MongoDB using MONGO_URI; fall back to mongomock.

    TLS certificates are verified by default. Verification can be switched
    off only by explicitly setting the environment variable
    ``MONGO_TLS_ALLOW_INVALID_CERTS`` to ``1``/``true``/``yes`` (for example
    on a development machine with a broken CA store); doing so is logged as
    a warning because it permits man-in-the-middle attacks.

    The module-level client and database handles are assigned only after a
    successful ``ping``. Any failure (missing/placeholder URI, import error,
    connection error) is logged and ``_connect_mock()`` is used instead.
    """
    global _client, _db, _using_mock
    try:
        from pymongo import MongoClient

        if not MONGO_URI or "username:password" in MONGO_URI:
            raise ValueError("MONGO_URI not configured — falling back to mock.")

        client_options = {"serverSelectionTimeoutMS": 5000, "tls": True}
        allow_invalid = os.getenv("MONGO_TLS_ALLOW_INVALID_CERTS", "")
        if allow_invalid.strip().lower() in ("1", "true", "yes"):
            logger.warning(
                "[DB] TLS certificate verification is DISABLED "
                "(MONGO_TLS_ALLOW_INVALID_CERTS is set)."
            )
            client_options["tlsAllowInvalidCertificates"] = True

        client = MongoClient(MONGO_URI, **client_options)
        # Trigger actual connection check
        client.admin.command("ping")

        _client = client
        _db = client[DB_NAME]
        _using_mock = False
        logger.info("[DB] Connected to MongoDB Atlas successfully.")
    except Exception as exc:
        logger.warning(f"[DB] MongoDB connection failed: {exc}")
        logger.warning("[DB] Falling back to in-memory mongomock.")
        _connect_mock()
def _connect_mock():
    """Switch the module to an in-memory mongomock database.

    Nothing stored this way survives a restart. If mongomock cannot be
    imported, an error is logged and the database handle is set to None.
    """
    global _client, _db, _using_mock
    try:
        import mongomock as mock_backend

        _client = mock_backend.MongoClient()
        _db = _client[DB_NAME]
        _using_mock = True
        logger.warning("[DB] Running on mongomock — data will NOT persist across restarts.")
    except ImportError:
        logger.error("[DB] mongomock not installed. Database unavailable.")
        _db = None
def get_db():
    """Return the active database handle, connecting first if there is none.

    The handle comes from either the real connection or the mock fallback.
    """
    if _db is not None:
        return _db
    _connect_atlas()
    return _db
def is_using_mock():
    """Return True when the in-memory mock database is in use."""
    mock_active = _using_mock
    return mock_active
+# Initialise on import
+_connect_atlas()
+# Convenience collection accessors
def users_col():
    """Return the ``users`` collection of the active database."""
    database = get_db()
    return database["users"]
def chats_col():
    """Return the ``chats`` collection of the active database."""
    database = get_db()
    return database["chats"]
def settings_col():
    """Return the ``settings`` collection of the active database."""
    database = get_db()
    return database["settings"]

utils/file_loader.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import PyPDF2
+import docx
def load_txt(file_path):
    """Read a UTF-8 text file and return its whole content as one string."""
    with open(file_path, mode="r", encoding="utf-8") as text_file:
        contents = text_file.read()
    return contents
def load_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Each page's text is extracted exactly once; pages for which extraction
    yields nothing are skipped. Page texts are joined without a separator.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Extract while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(text for text in page_texts if text)
def load_docx(file_path):
    """Return the text of a DOCX file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

utils/pdf_parser.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""
+pdf_parser.py — Extract plain text from PDF, DOCX, and TXT files.
+"""
+import os
+import logging
+from io import BytesIO
+logger = logging.getLogger(__name__)
+PDF_MAX_PAGES = int(os.getenv("PDF_MAX_PAGES", "15"))
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of the first PDF_MAX_PAGES pages of a PDF byte string.

    Page texts are joined with newlines and the result is stripped. Any
    failure is logged and an empty string is returned.
    """
    try:
        import PyPDF2

        pdf = PyPDF2.PdfReader(BytesIO(file_bytes))
        page_texts = []
        for page in pdf.pages[:PDF_MAX_PAGES]:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts).strip()
    except Exception as exc:
        logger.error(f"[PDF] Extraction failed: {exc}")
        return ""
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the paragraph text of a DOCX byte string, one paragraph per line.

    The joined text is stripped. Any failure is logged and an empty string
    is returned.
    """
    try:
        import docx

        document = docx.Document(BytesIO(file_bytes))
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(paragraph_texts).strip()
    except Exception as exc:
        logger.error(f"[DOCX] Extraction failed: {exc}")
        return ""
def extract_text(file_bytes: bytes, filename: str) -> str:
    """Extract plain text from an uploaded file, chosen by its extension.

    The extension is compared in lower case. ``.txt`` content is decoded as
    UTF-8 (undecodable bytes are dropped) and stripped; ``.pdf`` goes to
    ``extract_text_from_pdf``; ``.docx`` and ``.doc`` go to
    ``extract_text_from_docx``.

    Raises:
        ValueError: For any other extension.
    """
    _, ext = os.path.splitext(filename.lower())
    if ext == ".txt":
        return file_bytes.decode("utf-8", errors="ignore").strip()
    if ext == ".pdf":
        return extract_text_from_pdf(file_bytes)
    if ext in {".docx", ".doc"}:
        return extract_text_from_docx(file_bytes)
    raise ValueError(f"Unsupported file type: {ext}. Allowed: PDF, DOCX, TXT.")

utils/preprocess.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import re
def tokenize(text):
    """Lower-case ``text``, delete punctuation and split on whitespace.

    Every character that is neither a word character nor whitespace is
    removed (not replaced by a space), so ``"it's"`` becomes ``"its"``.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
    return without_punctuation.split()

utils/squad_preprocess.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from utils.preprocess import tokenize
def process_sample(sample):
    """Turn one SQuAD sample into model tokens plus answer span indices.

    The model input is ``question tokens + ["[SEP]"] + context tokens``. The
    answer is located as the first occurrence of its token sequence inside
    the context tokens, and the span indices are expressed in the combined
    token list.

    Args:
        sample: Mapping with the keys ``"context"``, ``"question"`` and
            ``"answer_text"``.

    Returns:
        ``{"tokens": list, "start": int, "end": int}`` (``end`` inclusive),
        or ``None`` when the answer has no tokens after tokenization or does
        not occur in the context tokens.
    """
    context_tokens = tokenize(sample["context"])
    question_tokens = tokenize(sample["question"])
    answer_tokens = tokenize(sample["answer_text"])

    # An answer that tokenizes to nothing (e.g. only punctuation) would
    # "match" at position 0 and yield a span with end < start.
    if not answer_tokens:
        return None

    # Combine question + context
    tokens = question_tokens + ["[SEP]"] + context_tokens

    span_len = len(answer_tokens)
    context_offset = len(question_tokens) + 1  # question tokens + "[SEP]"
    for i in range(len(context_tokens) - span_len + 1):
        if context_tokens[i:i + span_len] == answer_tokens:
            start = i + context_offset
            return {
                "tokens": tokens,
                "start": start,
                "end": start + span_len - 1,
            }
    return None

utils/vocab.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from collections import Counter
def build_vocab(tokens):
    """Build a token -> id mapping from an iterable of tokens.

    Ids 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``. Every other
    distinct token receives the next free id in order of first appearance.
    A token that is already in the mapping (including the two reserved
    names) keeps its id, so reserved ids are never overwritten and no two
    tokens share an id.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word in tokens:
        if word not in vocab:
            vocab[word] = len(vocab)
    return vocab
def encode(tokens, vocab):
    """Map each token to its id in ``vocab``; unknown tokens get the ``"<UNK>"`` id."""
    token_ids = []
    for token in tokens:
        unknown_id = vocab["<UNK>"]
        token_ids.append(vocab.get(token, unknown_id))
    return token_ids