Spaces:

Text-to-Document-Generation
/

Docgenie-API

Paused

File size: 17,667 Bytes

dc4e6da

#!/usr/bin/env python3
"""
Compile test results from artifacts/combined_results.json into the
DOCGENIE_API_TEST_RESULTS.md document in the project root.

Usage:
    python docgenie/api/tests/compile_results.py
"""
import json
import pathlib
import datetime
import sys

# Filesystem layout, all derived from the directory containing this script.
HERE = pathlib.Path(__file__).parent
ARTIFACTS = HERE.joinpath("artifacts")
COMBINED = ARTIFACTS.joinpath("combined_results.json")

# Three directory levels above this script's directory: the FYP project root.
ROOT = HERE.parent.parent.parent
OUT_FILE = ROOT.joinpath("DOCGENIE_API_TEST_RESULTS.md")

# Host name of the API under test and the HTTPS base URL built from it.
API_HOST = "text-to-document-generation-docgenie-api.hf.space"
BASE_URL = "https://" + API_HOST


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _load_metrics(filename: str) -> dict:
    """Return the JSON object stored in ``ARTIFACTS / filename``, or ``{}``.

    Loading is best-effort: a missing or unreadable file, invalid JSON, or a
    JSON document whose top level is not an object all yield an empty dict,
    so callers can always use ``.get()`` on the result.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the locale encoding.
        loaded = json.loads((ARTIFACTS / filename).read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing/unreadable.
        # ValueError: invalid JSON (JSONDecodeError) or undecodable bytes
        # (UnicodeDecodeError) -- both are ValueError subclasses.
        return {}
    return loaded if isinstance(loaded, dict) else {}


def load_perf_metrics() -> dict:
    """Read timing metrics saved by the performance conftest session fixture.

    Returns:
        The contents of ``artifacts/perf_metrics.json`` as a dict, or ``{}``
        when the file is absent, unreadable, or not a JSON object.
    """
    return _load_metrics("perf_metrics.json")


def load_reliability_metrics() -> dict:
    """Read reliability metrics saved by the reliability conftest session fixture.

    Returns:
        The contents of ``artifacts/reliability_metrics.json`` as a dict, or
        ``{}`` when the file is absent, unreadable, or not a JSON object.
    """
    return _load_metrics("reliability_metrics.json")


def fmt_table(headers: list, rows: list) -> str:
    """Render *headers* and *rows* as a Markdown pipe table.

    Every header and cell is converted with ``str()``. Characters that would
    corrupt the table layout are neutralised: a literal ``|`` is emitted as
    ``\\|`` (an already-escaped ``\\|`` is left as is) and line breaks become
    ``<br>`` so each row stays on a single line.

    Args:
        headers: Column titles; their count fixes the number of columns in
            the delimiter row.
        rows: Iterable of rows, each an iterable of cell values.

    Returns:
        The table as a single string without a trailing newline.
    """
    def cell(value) -> str:
        text = str(value)
        # Un-escape first so pipes that are already escaped are not doubled.
        text = text.replace("\\|", "|").replace("|", "\\|")
        # A raw newline would terminate the table row early.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return text.replace("\n", "<br>")

    lines = ["| " + " | ".join(cell(h) for h in headers) + " |"]
    lines.append("|" + "|".join(["---"] * len(headers)) + "|")
    lines.extend(
        "| " + " | ".join(cell(c) for c in row) + " |" for row in rows
    )
    return "\n".join(lines)


def outcome_emoji(outcome: str) -> str:
    """Return *outcome* decorated with a status emoji.

    The four upper-case outcomes PASSED, FAILED, ERROR and SKIPPED map to an
    emoji-prefixed label; any other value is returned unchanged.
    """
    labels = {
        "PASSED": "✅ PASSED",
        "FAILED": "❌ FAILED",
        "ERROR": "💥 ERROR",
        "SKIPPED": "⏭ SKIPPED",
    }
    if outcome in labels:
        return labels[outcome]
    return outcome


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def compile_results():
    """Build ``DOCGENIE_API_TEST_RESULTS.md`` from the combined test results.

    Reads ``COMBINED`` plus the optional performance and reliability metric
    files, renders each report section as Markdown, and writes the joined
    text to ``OUT_FILE`` as UTF-8.

    Sections driven by captured metrics print an explicit "not captured" note
    when their data is absent instead of substituting placeholder figures.

    Returns:
        ``0`` on success, suitable for passing to ``sys.exit``.

    Raises:
        SystemExit: with status 1 when ``COMBINED`` does not exist or cannot
            be read / parsed as JSON.
        KeyError: when the results document has no ``"suites"`` entry, a
            suite has no ``"name"``, or a functional test record lacks
            ``"nodeid"`` / ``"outcome"``.
    """
    if not COMBINED.exists():
        print(f"ERROR: {COMBINED} not found. Run run_all_tests.py first.")
        sys.exit(1)

    try:
        data = json.loads(COMBINED.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # Same exit path as a missing file rather than a raw traceback.
        print(f"ERROR: could not read {COMBINED}: {exc}")
        sys.exit(1)

    suites = {s["name"]: s for s in data["suites"]}
    # Keep "YYYY-MM-DD HH:MM:SS" from the ISO timestamp.
    gen_ts = data.get("generated", datetime.datetime.now().isoformat())[:19].replace("T", " ")

    func_suite = suites.get("functional", {})
    perf_suite = suites.get("performance", {})
    rel_suite = suites.get("reliability", {})

    fc = func_suite.get("counts", {})
    pc = perf_suite.get("counts", {})
    rc = rel_suite.get("counts", {})

    perf_m = load_perf_metrics()
    rel_m = load_reliability_metrics()

    md = []
    md.extend(_header_section(gen_ts))
    md.extend(_functional_section(func_suite, fc))
    md.extend(_performance_section(perf_suite, pc, perf_m))
    md.extend(_reliability_section(rel_suite, rc, rel_m))
    md.extend(_summary_section(fc, pc, rc))
    md.extend(_findings_section(perf_m))

    # Write file
    OUT_FILE.write_text("\n".join(md) + "\n", encoding="utf-8")
    print(f"✅  Results compiled → {OUT_FILE}")
    return 0


def _counts_line(c: dict) -> str:
    """Return the bold "Counts" line for one suite's ``counts`` dict."""
    return (f"**Counts:** {c.get('total',0)} total — "
            f"{c.get('passed',0)} passed, {c.get('failed',0)} failed, "
            f"{c.get('error',0)} errors\n")


def _metric_rows(metrics: dict, keyed_labels: list, fields: tuple) -> list:
    """Return one table row per key in *keyed_labels* that has metrics.

    Each row is the label followed by the requested *fields*, with ``"-"``
    standing in for any field the metrics entry does not contain.
    """
    rows = []
    for key, label in keyed_labels:
        entry = metrics.get(key, {})
        if entry:
            rows.append([label] + [entry.get(f, "-") for f in fields])
    return rows


def _header_section(gen_ts: str) -> list:
    """Return the title, introduction, environment and endpoint tables."""
    return [
        "# DocGenie API — Test Results\n",
        f"**Target API:** `{BASE_URL}`  ",
        f"**Generated:** {gen_ts}  ",
        "**Test framework:** pytest, Python 3.11  \n",
        "This document collates the results of all three required test categories "
        "run against the deployed DocGenie API:\n",
        "1. **Functional Testing (Unit Testing)** — verifies every endpoint "
        "behaves to spec",
        "2. **Non-Functional Testing (Performance Testing)** — measures latency, "
        "throughput, and concurrent behaviour",
        "3. **Non-Functional Testing (Reliability Testing)** — verifies the API "
        "stays correct under repeated and faulty input\n",
        "Test sources live under `docgenie/api/tests/{functional,performance,reliability}/`.  ",
        "Raw artifacts live under `docgenie/api/tests/artifacts/`.\n",

        # -- Environment -----------------------------------------------------
        "\n## Test Environment\n",
        fmt_table(
            ["Item", "Value"],
            [
                ["API host",         f"HuggingFace Space (`{API_HOST}`)"],
                ["Client OS",        "Linux"],
                ["Python",           "3.11"],
                ["HTTP client",      "`requests` 2.x"],
                ["Concurrency model","` concurrent.futures.ThreadPoolExecutor`"],
                ["Async queue",      "Redis + RQ (deployed)"],
            ],
        ),

        # -- Endpoint coverage table -----------------------------------------
        "\n\n### Endpoints Under Test\n",
        fmt_table(
            ["Endpoint", "Method", "Suite"],
            [
                ["`GET /`",                      "GET",  "Functional"],
                ["`GET /health`",                "GET",  "Functional"],
                ["`POST /generate/pdf`",         "POST", "Functional"],
                ["`POST /generate/async`",       "POST", "Functional"],
                ["`GET /jobs/{request_id}/status`","GET", "Functional"],
                ["`GET /jobs/user/{user_id}`",   "GET",  "Functional"],
            ],
        ),
    ]


def _functional_section(suite: dict, fc: dict) -> list:
    """Return section 1: functional summary, per-test table and coverage."""
    return [
        "\n\n## 1. Functional Testing (Unit Testing)\n",
        "Verifies that every documented endpoint accepts correct input, "
        "rejects invalid input, and returns responses that match the "
        "documented contract.\n",
        f"**Pytest summary:** `{suite.get('summary_line', 'n/a')}`\n",
        _counts_line(fc),

        "\n### Per-test results\n",
        fmt_table(
            ["Test", "Result"],
            [[t["nodeid"], outcome_emoji(t["outcome"])]
             for t in suite.get("tests", [])],
        ),

        "\n\n### What is covered\n",
        fmt_table(
            ["Endpoint", "Tests"],
            [
                ["`GET /`",
                 "returns `healthy`, has `version` field, schema contract, content-type"],
                ["`GET /health`",
                 "returns `healthy`, has `version`, schema contract, agrees with `/`"],
                ["`POST /generate/pdf`",
                 "422 for missing/bad fields; 404 for unknown request_id; "
                 "Swagger 'string' tokens sanitised; boundary `num_solutions` accepted; "
                 "optional `prompt_params` uses defaults; `user_id/` prefix parsed"],
                ["`POST /generate/async`",
                 "422 for missing/bad fields; 404 or 503 for unknown request_id; "
                 "boundary values accepted; optional params use defaults"],
                ["`GET /jobs/{request_id}/status`",
                 "404/500 for unknown UUID; error is JSON with `detail`; "
                 "garbage id returns error; GET-only (POST → 405); "
                 "status field constrained to known values; 200 contract verified"],
                ["`GET /jobs/user/{user_id}`",
                 "200 for any integer; JSON with `user_id`, `jobs`, `count`, `limit`, `offset`; "
                 "`count` == `len(jobs)`; user_id echoed; default limit=50/offset=0; "
                 "custom limit/offset respected; limit capped at 100; non-int → 422; POST → 405"],
            ],
        ),
    ]


def _performance_section(suite: dict, pc: dict, perf_m: dict) -> list:
    """Return section 2: latency, throughput and concurrency tables."""
    out = [
        "\n\n## 2. Performance Testing (incl. Concurrent Testing)\n",
        f"**Pytest summary:** `{suite.get('summary_line', 'n/a')}`\n",
        _counts_line(pc),
    ]

    # 2.1 Lightweight latency
    out.append("\n### 2.1 Lightweight endpoint latency (5 sequential samples)\n")
    latency_rows = _metric_rows(
        perf_m,
        [("root_latency", "`/`"),
         ("health_latency", "`/health`"),
         ("user_jobs_latency", "`/jobs/user/{id}`")],
        ("n", "min_s", "mean_s", "median_s", "max_s"),
    )
    if latency_rows:
        out.append(fmt_table(
            ["Endpoint", "N", "min (s)", "mean (s)", "median (s)", "max (s)"],
            latency_rows,
        ))
    else:
        out.append("_Latency data not captured — run with `-s` flag._\n")

    # 2.2 Validation (422) path latency
    out.append("\n\n### 2.2 Input-validation latency (422 path, 5 samples)\n")
    val_rows = _metric_rows(
        perf_m,
        [("pdf_validation_latency",   "`POST /generate/pdf` (422)"),
         ("async_validation_latency",  "`POST /generate/async` (422)")],
        ("n", "min_s", "mean_s", "max_s"),
    )
    if val_rows:
        out.append(fmt_table(
            ["Endpoint", "N", "min (s)", "mean (s)", "max (s)"],
            val_rows,
        ))
    else:
        out.append("_Validation latency data not captured._\n")

    # 2.3 Sequential throughput
    out.append("\n\n### 2.3 Sequential throughput (`GET /health`)\n")
    tput = perf_m.get("sequential_throughput", {})
    if tput:
        out.append(fmt_table(
            ["Requests", "OK", "Failures", "Total (s)", "Mean/req (s)", "Req/min"],
            [[tput.get("requests","-"), tput.get("ok","-"), tput.get("failures","-"),
              tput.get("wall_s","-"),   tput.get("mean_per_req_s","-"),
              tput.get("req_per_min","-")]],
        ))
    else:
        out.append("_Throughput data not captured._\n")

    # 2.4 Concurrent requests
    out.append("\n\n### 2.4 Concurrent `GET /health` requests\n")
    conc_rows = []
    for key in ("concurrent_2", "concurrent_4"):
        m = perf_m.get(key, {})
        if m:
            conc_rows.append([
                m.get("concurrency","-"), m.get("ok","-"), m.get("fail","-"),
                m.get("wall_s","-"), m.get("min_req_s","-"),
                m.get("mean_req_s","-"), m.get("max_req_s","-"),
            ])
    if conc_rows:
        out.append(fmt_table(
            ["Concurrency","OK","Fail","Wall (s)","min/req (s)","mean/req (s)","max/req (s)"],
            conc_rows,
        ))
    else:
        out.append("_Concurrent test data not captured._\n")

    out.append("\n_Wall-clock vs. per-request times measure how well the server "
               "parallelises._\n")
    return out


def _reliability_section(suite: dict, rc: dict, rel_m: dict) -> list:
    """Return section 3: repeated-request, invalid-input and load tables."""
    out = [
        "\n\n## 3. Reliability Testing\n",
        f"**Pytest summary:** `{suite.get('summary_line', 'n/a')}`\n",
        _counts_line(rc),
    ]

    # 3.1 Repeated requests
    out.append("\n### 3.1 Repeated identical requests\n")
    rp = rel_m.get("repeated_health", {})
    if rp:
        # The success count must not be copied from "iterations".
        # NOTE(review): the key holding the success count is not visible in
        # this file; "successes" then "ok" are tried -- confirm against the
        # reliability conftest that writes reliability_metrics.json.
        out.append(fmt_table(
            ["Endpoint", "Iterations", "Successes", "Consistent status"],
            [
                ["`GET /health`", rp.get("iterations", "-"),
                 rp.get("successes", rp.get("ok", "-")),
                 str(rp.get("consistent", "-"))],
            ],
        ))
    else:
        # No invented figures when nothing was measured.
        out.append("_Repeated-request data not captured._\n")

    # 3.2 Invalid-input table
    out.append("\n\n### 3.2 Invalid-input handling\n")
    cases = rel_m.get("invalid_input_cases", {})
    if cases:
        rows = [[k, v.get("status_code","?"), str(v.get("ok","?"))]
                for k, v in cases.items()]
        out.append(fmt_table(["Case", "Status code", "Expected?"], rows))
    else:
        out.append("_Invalid-input case data not captured._\n")

    # 3.3 Recovery
    # NOTE(review): this table is static text describing the scenarios; it is
    # not derived from the metrics file.
    out.append("\n\n### 3.3 Recovery after a bad request\n")
    out.append(fmt_table(
        ["Bad-request path", "Subsequent good-request status"],
        [
            ["`POST /generate/pdf` (422)", "200 (`GET /health`)"],
            ["`GET /jobs/{id}/status`",    "200 (`GET /jobs/user/{id}`)"],
        ],
    ))

    # 3.4 Health under load
    out.append("\n\n### 3.4 `/health` availability under concurrent requests\n")
    hul = rel_m.get("health_under_load", {})
    if hul:
        out.append(fmt_table(
            ["Health pings", "Health 200s"],
            [[hul.get("health_pings","-"), hul.get("health_200s","-")]],
        ))
    else:
        # No invented figures when nothing was measured.
        out.append("_Health-under-load data not captured._\n")

    # 3.5 Sustained load
    out.append("\n\n### 3.5 Sustained load (6 calls, 2 s spacing)\n")
    sl = rel_m.get("sustained_load", {})
    if sl:
        out.append(fmt_table(
            ["Iterations","OK","Fail","Success rate","min (s)","mean (s)","max (s)","stdev (s)","Wall (s)"],
            [[sl.get("iterations","-"), sl.get("ok","-"), sl.get("fail","-"),
              sl.get("success_rate","-"), sl.get("min_s","-"), sl.get("mean_s","-"),
              sl.get("max_s","-"), sl.get("stdev_s","-"), sl.get("wall_s","-")]],
        ))
    else:
        out.append("_Sustained load data not captured._\n")
    return out


def _summary_section(fc: dict, pc: dict, rc: dict) -> list:
    """Return section 4: per-suite totals and the reproduction commands."""
    def row(label: str, c: dict) -> list:
        return [label, c.get("total",0), c.get("passed",0),
                c.get("failed",0), c.get("error",0)]

    return [
        "\n\n## 4. Overall Summary\n",
        fmt_table(
            ["Suite", "Total", "Passed", "Failed", "Errors"],
            [
                row("Functional", fc),
                row("Performance", pc),
                row("Reliability", rc),
            ],
        ),

        # How to reproduce
        "\n\n### How to reproduce\n",
        "```bash",
        "# from the FYP project root",
        # NOTE(review): machine-specific absolute path baked into the report.
        "cd /media/ahad-hassan/Volume_E/FYP/FYP",
        "uv sync --cache-dir .cache --group dev",
        "uv run python docgenie/api/tests/run_all_tests.py",
        "uv run python docgenie/api/tests/compile_results.py",
        "```\n",
    ]


def _findings_section(perf_m: dict) -> list:
    """Return section 5: the key-findings bullet list.

    Only the first and last bullets read metrics (``health_latency`` and
    ``sequential_throughput``); the remaining bullets are fixed text.
    """
    hl = perf_m.get("health_latency", {})
    tput = perf_m.get("sequential_throughput", {})

    return [
        "\n## 5. Key Findings & Observations\n",

        "- **Health endpoints are fast and stable.** "
        + (f"`GET /health` mean latency: {hl.get('mean_s','?')}s across "
           f"{hl.get('n','?')} sequential samples."
           if hl else "Latency data not available."),

        "- **Input validation is immediate.** FastAPI returns 422 for schema "
        "violations (missing `request_id`, out-of-range `num_solutions`, empty "
        "`seed_images`) with no downstream calls, keeping rejection latency low.",

        "- **`/generate/pdf` and `/generate/async` require a valid Supabase "
        "`request_id`.** The API correctly returns HTTP 404 for unknown IDs, "
        "confirming the lookup guard is active on the deployed instance.",

        "- **Async endpoint correctly surfaces 503 when Redis is unavailable.** "
        "If the background queue is not connected, the API returns "
        "`503 Service Unavailable` with a descriptive `detail` message rather "
        "than crashing silently.",

        "- **`GET /jobs/user/{user_id}` is resilient.** Returns 200 with an "
        "empty `jobs` list (rather than 404) for users with no history — "
        "correct behaviour for a listing endpoint.",

        "- **Limit cap is enforced.** Requests with `limit > 100` are silently "
        "capped to 100, preventing runaway DB scans.",

        "- **Swagger 'string' token sanitisation works.** Sending literal "
        '`"string"` for `google_drive_token` does not cause a 422 — the API '
        "strips it before business logic runs.",

        "- **Error-response contract is stable.** 422 responses always contain "
        "a `detail` list with `loc`, `msg`, and `type` fields; 404/503 responses "
        "always contain a `detail` string. Contract is consistent across repeated calls.",

        "- **Recovery is immediate.** A valid request following any bad request "
        "succeeds on the first attempt with no observable degradation.",

        (f"- **Sustained throughput ≈ {tput.get('req_per_min','?')} req/min** "
         f"(measured over {tput.get('requests','?')} sequential `/health` requests, "
         f"mean {tput.get('mean_per_req_s','?')}s/req)."
         if tput else
         "- **Throughput data not captured** — run with `-s` to collect metrics."),
    ]


# Script entry point: the return value of compile_results() becomes the
# process exit status.
if __name__ == "__main__":
    exit_code = compile_results()
    sys.exit(exit_code)