Spaces:

sahanwickramasinghe
/

pseudoscorex-backend

Sleeping

File size: 4,249 Bytes

219ee1a

"""
Flask HTTP API for the Criterion-Wise Neural-LLM Hybrid Grading System.

Endpoints
---------
GET  /health   → liveness probe
GET  /info     → model / pipeline metadata
POST /predict  → score an (answer, criteria) payload against a question

POST /predict expects JSON (sent as the request body, or as an uploaded
file in a multipart field named "file") of shape:
{
  "question": "…",
  "answer":   "…",
  "criteria": [
    {"name": "Input handling", "max_score": 2, "description": "full rubric text"},
    {"name": "Logic",          "max_score": 3, "description": "…"}
  ]
}

It returns JSON of shape:
{
  "total_score": 3.4,
  "max_total_score": 5,
  "percentage": 68.0,
  "results": [ {criterion, score, max_score, pred_norm, signals, explanation}, ... ],
  "overall_explanation": "…"
}
"""
import logging
import os
import time
import uuid
from flask import Flask, request, jsonify, g

# Configure logging BEFORE importing anything that uses it
from model.logging_config import setup_logging
setup_logging()

from model import load_pipeline, predict

# Module-level logger; records from this file are emitted under the fixed name "app".
logger = logging.getLogger("app")


# The Flask application object; the route and hook decorators below register on it.
app = Flask(__name__)


# ── Load model once at process start ───────────────────────────────────────
# For production, prefer gunicorn with --preload so this happens pre-fork.
# NOTE: this runs at import time, so merely importing this module calls
# load_pipeline() (inside an application context).
with app.app_context():
    load_pipeline()


# ── Per-request logging hooks ──────────────────────────────────────────────
@app.before_request
def _log_request_start():
    """Tag the incoming request and log its arrival.

    Stores two attributes on ``flask.g`` for the rest of the request:
    ``req_id`` (the first 8 hex characters of a random UUID4) and
    ``req_start`` (a ``time.perf_counter()`` reading).
    """
    request_id = uuid.uuid4().hex[:8]
    started_at = time.perf_counter()
    g.req_id, g.req_start = request_id, started_at
    logger.info("→ %s %s [req=%s]", request.method, request.path, request_id)


@app.after_request
def _log_request_end(response):
    """Log method, path, status and elapsed time, then pass the response through.

    Nothing is logged when ``g.req_start`` was never set. The response object
    is returned unmodified in every case.
    """
    if not hasattr(g, "req_start"):
        return response

    duration_ms = 1000 * (time.perf_counter() - g.req_start)
    # Fall back to "?" if the id is missing even though the start time exists.
    request_id = getattr(g, "req_id", "?")
    logger.info(
        "← %s %s [req=%s] status=%d in %.1fms",
        request.method,
        request.path,
        request_id,
        response.status_code,
        duration_ms,
    )
    return response


@app.route("/health", methods=["GET"])
def health():
    """Liveness probe: always responds 200 with ``{"status": "ok"}``."""
    body = {"status": "ok"}
    return jsonify(body), 200


@app.route("/info", methods=["GET"])
def info():
    """Return model / pipeline metadata as JSON with status 200.

    The values are read from environment variables (with the defaults shown
    below) and from torch's CUDA availability check.
    NOTE(review): these are the values visible to this module; whether the
    loaded pipeline uses the same settings is not shown here.
    """
    # torch is imported inside the handler rather than at module top.
    import torch

    has_cuda = torch.cuda.is_available()
    device = "cuda" if has_cuda else "cpu"
    gpu_name = torch.cuda.get_device_name(0) if has_cuda else None
    # The flag is true only when the variable is exactly "1".
    llm_disabled = os.getenv("DISABLE_LLM_EXPLAINER", "0") == "1"

    metadata = {
        "encoder_space": os.getenv("ENCODER_SPACE_URL", "(unset)"),
        "llm_model": os.getenv("LLM_MODEL_NAME", "Qwen/Qwen2.5-1.5B-Instruct"),
        "llm_disabled": llm_disabled,
        "checkpoint": os.getenv("CHECKPOINT_PATH", "best_model_v5.pt"),
        "device": device,
        "gpu_name": gpu_name,
    }
    return jsonify(metadata), 200


@app.route("/predict", methods=["POST"])
def predict_endpoint():
    """Score an answer against rubric criteria and return the result as JSON.

    The payload may arrive in one of three ways:
      * a request whose Content-Type indicates JSON,
      * a multipart upload whose file field "file" contains JSON,
      * a raw JSON body sent with any other (or no) Content-Type.

    The payload must be a JSON object; its "question", "answer" and
    "criteria" entries are passed to ``predict`` (missing keys become None).

    Returns:
        200 with the value returned by ``predict``.
        400 with ``{"error": ...}`` when the payload is missing, is not valid
            JSON, is not a JSON object, or ``predict`` raises ``ValueError``.
        500 with ``{"error": "Internal error", "detail": ...}`` for any other
            exception raised by ``predict``.
    """
    payload = None
    if request.is_json:
        payload = request.get_json(silent=True)
    elif "file" in request.files:
        import json as _json
        try:
            payload = _json.load(request.files["file"])
        except Exception as e:
            # Request boundary: any failure reading/parsing the upload is a
            # client error, reported with the parser's message.
            return jsonify({"error": f"Invalid JSON file: {e}"}), 400
    else:
        # Try raw body as JSON anyway. force=True is required: without it
        # get_json() refuses to parse when the Content-Type is not JSON and
        # (with silent=True) simply returns None.
        payload = request.get_json(force=True, silent=True)

    # Covers both "nothing parseable" (None) and valid JSON that is not an
    # object (list, string, number), which would otherwise crash on .get().
    if not isinstance(payload, dict):
        return jsonify({
            "error": "Request body must be JSON with keys: question, answer, criteria"
        }), 400

    question = payload.get("question")
    answer = payload.get("answer")
    criteria = payload.get("criteria")

    try:
        result = predict(question=question, answer=answer, criteria=criteria)
    except ValueError as ve:
        # Validation errors → 400
        logger.warning("Validation error: %s", ve)
        return jsonify({"error": str(ve)}), 400
    except Exception as e:
        logger.exception("Unhandled error in /predict")
        # NOTE(review): "detail" exposes the raw exception text to the client;
        # kept for backward compatibility — consider dropping it in production.
        return jsonify({"error": "Internal error", "detail": str(e)}), 500
    return jsonify(result), 200


if __name__ == "__main__":
    # Development entry point: bind address and port come from HOST / PORT.
    # debug stays off because we don't want the reloader to load the encoder twice.
    app.run(
        host=os.getenv("HOST", "0.0.0.0"),
        port=int(os.getenv("PORT", "8000")),
        debug=False,
        threaded=False,
    )