atkiya110 commited on
Commit
34ca45d
Β·
verified Β·
1 Parent(s): 5a3b8ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +247 -640
app.py CHANGED
@@ -1,59 +1,15 @@
1
- """
2
- EWU RAG Server β€” v2.1 (Fast-Restart Edition)
3
- ══════════════════════════════════════════════════════════════════════
4
- Key fixes over v2
5
- ─────────────────
6
- 1. DISK CACHE β€” chunks, FAISS index, BM25, KG, entity_index and doc
7
- embeddings are all saved to ./cache/ on first boot. Subsequent
8
- restarts load from disk in ~5-10s instead of ~5 minutes.
9
-
10
- 2. PRIORITY BOOT ORDER
11
- Phase 1 (blocking) : load embedder + cross-encoder β†’ build/load
12
- all indexes β†’ server returns 200 immediately.
13
- Phase 2 (background): load TinyLlama in a background task.
14
- /rag returns context-only answer until
15
- the generator is ready, then full answer.
16
-
17
- 3. DETAIL-ENDPOINT CIRCUIT BREAKER
18
- The API detail pages (/faculty/<id>, /programs/<id>, /documents/<slug>)
19
- all returned HTTP 500 in the logs. We now track failures per URL
20
- and skip detail fetches after 3 consecutive 500s to avoid wasting
21
- 30+ seconds on guaranteed failures every boot.
22
-
23
- 4. HyDE DISABLED ON CPU
24
- HyDE costs one full TinyLlama forward pass per query. On CPU that
25
- is ~15-30 seconds of extra latency with minimal accuracy gain.
26
- Set ENABLE_HYDE = True if you have a GPU.
27
-
28
- 5. GRACEFUL DEGRADATION
29
- Every component (embedder, reranker, generator, FAISS, BM25, KG)
30
- is independent. The server works β€” at reduced quality β€” if any
31
- single component fails to load.
32
- """
33
-
34
- import asyncio
35
- import json
36
- import logging
37
  import os
38
- import pickle
39
- import re
40
- import string
41
- import time
42
- from contextlib import asynccontextmanager
43
- from typing import Any, Dict, List, Optional
44
-
45
- import httpx
46
  import numpy as np
47
  import uvicorn
 
 
 
48
  from fastapi import FastAPI, HTTPException
49
  from fastapi.responses import JSONResponse
50
  from pydantic import BaseModel
51
 
52
- logging.basicConfig(level=logging.INFO,
53
- format="%(asctime)s %(levelname)s %(message)s")
54
- logger = logging.getLogger(__name__)
55
-
56
- # ── optional heavy deps ───────────────────────────────────────────────────────
57
  try:
58
  import faiss
59
  FAISS_OK = True
@@ -61,7 +17,7 @@ except ImportError:
61
  FAISS_OK = False
62
 
63
  try:
64
- from sentence_transformers import SentenceTransformer, CrossEncoder
65
  ST_OK = True
66
  except ImportError:
67
  ST_OK = False
@@ -77,49 +33,29 @@ try:
77
  HF_OK = True
78
  except ImportError:
79
  HF_OK = False
 
80
 
81
- try:
82
- import networkx as nx
83
- NX_OK = True
84
- except ImportError:
85
- NX_OK = False
86
-
87
- DEVICE = "cpu"
88
- try:
89
- import torch
90
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
91
- except ImportError:
92
- pass
93
-
94
- # ═════════════════════════════════════════════════════════════════════════════
95
  # CONFIG
96
- # ═════════════════════════════════════════════════════════════════════════════
97
 
98
  API_BASE = "https://ewu-server.onrender.com/api"
99
  API_KEY = "i6EDytaX4E2jI6GvZQc0b1RSZHTI5_wVRa2rfL7rLpk"
100
  API_HEADERS = {"x-api-key": API_KEY}
101
- GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"
102
-
103
- EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
104
- RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
105
- GEN_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
106
 
107
- CHUNK_SIZE = 512
108
- CHUNK_OVERLAP = 100
109
- TOP_K_RETRIEVE = 20
110
- TOP_K_FINAL = 5
111
- RERANK_THRESH = -5.0
112
- MMR_LAMBDA = 0.65
113
- COMPRESS_THRESH= 0.25
114
 
115
- # ── Performance switches ──────────────────────────────────────────────────────
116
- ENABLE_HYDE = (DEVICE == "cuda") # disabled on CPU β€” too slow
117
- CACHE_DIR = "./cache"
118
- CACHE_TTL_H = 24 # rebuild disk cache every 24 h
119
 
120
- # ── API circuit-breaker ───────────────────────────────────────────────────────
121
- _api_fail_count: Dict[str, int] = {}
122
- API_FAIL_LIMIT = 3
 
 
123
 
124
  API_LIST_ENDPOINTS = [
125
  "admission-deadlines", "academic-calendar", "grade-scale",
@@ -127,11 +63,13 @@ API_LIST_ENDPOINTS = [
127
  "governance", "alumni", "helpdesk", "policies", "proctor-schedule",
128
  "documents", "newsletters", "programs", "faculty", "departments",
129
  ]
 
130
  API_DETAIL_ENDPOINTS = [
131
  {"list": "programs", "id_field": "id"},
132
  {"list": "faculty", "id_field": "id"},
133
  {"list": "documents", "id_field": "slug"},
134
  ]
 
135
  GITHUB_FILES = [
136
  "admission_deadlines.json", "dynamic_admission_process.json",
137
  "dynamic_admission_requirements.json", "dynamic_tution_fees.json",
@@ -141,8 +79,7 @@ GITHUB_FILES = [
141
  "static_aboutEWU.json", "static_Admin.json",
142
  "static_AllAvailablePrograms.json", "static_alumni.json",
143
  "static_campus_life.json", "static_Career_Counseling_Center.json",
144
- "static_clubs.json", "static_depts.json",
145
- # "static_facilities.json", <- 404, skipped
146
  "static_helpdesk.json", "static_payment_procedure.json",
147
  "static_Policy.json", "static_Programs.json", "static_Rules.json",
148
  "static_Sexual_harassment.json", "static_Tuition_fees.json",
@@ -156,307 +93,158 @@ GITHUB_FILES = [
156
  "syndicate.json", "tesol.json", "ewu_board_of_trustees.json",
157
  ]
158
 
159
- # ═════════════════════════════════════════════════════════════════════════════
160
  # APP STATE
161
- # ═════════════════════════════════════════════════════════════════════════════
162
 
163
  class AppState:
164
- embedder = None
165
- reranker = None
166
- generator = None
167
- documents : List[Dict] = []
168
- faiss_index = None
169
- doc_embeddings: Optional[np.ndarray] = None
170
- bm25 = None
171
- kg = None
172
- entity_index : Dict[str, List[int]] = {}
173
- ready : bool = False
174
- gen_ready : bool = False
175
- error : str = ""
176
 
177
  state = AppState()
178
 
179
- # ═════════════════════════════════════════════════════════════════════════════
180
- # DISK CACHE HELPERS
181
- # ═════════════════════════════════════════════════════════════════════════════
182
-
183
- os.makedirs(CACHE_DIR, exist_ok=True)
184
-
185
- def _cp(name: str) -> str:
186
- return os.path.join(CACHE_DIR, name)
187
-
188
- def _cache_fresh(name: str) -> bool:
189
- p = _cp(name)
190
- if not os.path.exists(p):
191
- return False
192
- return (time.time() - os.path.getmtime(p)) / 3600 < CACHE_TTL_H
193
-
194
- def _save(name: str, obj: Any) -> None:
195
- try:
196
- with open(_cp(name), "wb") as f:
197
- pickle.dump(obj, f, protocol=5)
198
- logger.info(f"[cache] saved {name}")
199
- except Exception as e:
200
- logger.warning(f"[cache] save {name} failed: {e}")
201
-
202
- def _load(name: str) -> Optional[Any]:
203
- try:
204
- with open(_cp(name), "rb") as f:
205
- return pickle.load(f)
206
- except Exception as e:
207
- logger.warning(f"[cache] load {name} failed: {e}")
208
- return None
209
-
210
- def _save_faiss(idx) -> None:
211
- try:
212
- faiss.write_index(idx, _cp("faiss.index"))
213
- logger.info("[cache] saved faiss.index")
214
- except Exception as e:
215
- logger.warning(f"[cache] faiss save failed: {e}")
216
-
217
- def _load_faiss():
218
- p = _cp("faiss.index")
219
- if not os.path.exists(p):
220
- return None
221
- try:
222
- idx = faiss.read_index(p)
223
- logger.info(f"[cache] loaded faiss.index ({idx.ntotal} vectors)")
224
- return idx
225
- except Exception as e:
226
- logger.warning(f"[cache] faiss load failed: {e}")
227
- return None
228
-
229
- # ═════════════════════════════════════════════════════════════════════════════
230
  # DATA LOADING
231
- # ═════════════════════════════════════════════════════════════════════════════
232
 
233
- async def _fetch(url: str, headers: dict = None, timeout: int = 60) -> Optional[Any]:
234
- key = url.split("?")[0]
235
- if _api_fail_count.get(key, 0) >= API_FAIL_LIMIT:
236
- return None
237
  try:
238
  async with httpx.AsyncClient(timeout=timeout) as client:
239
  r = await client.get(url, headers=headers or {})
240
  if r.status_code == 200:
241
- _api_fail_count[key] = 0
242
  return r.json()
243
- if r.status_code in (404, 500, 502, 503):
244
- _api_fail_count[key] = _api_fail_count.get(key, 0) + 1
245
- logger.warning(f"[WARN] {url} β†’ HTTP {r.status_code} "
246
- f"(fail #{_api_fail_count[key]})")
247
  except Exception as e:
248
- _api_fail_count[key] = _api_fail_count.get(key, 0) + 1
249
- logger.warning(f"[WARN] {url} β†’ {e}")
250
  return None
251
 
252
 
253
- def _unwrap(data: Any) -> list:
254
- if isinstance(data, list): return data
 
255
  if isinstance(data, dict):
256
- for k in ("data", "results", "items"):
257
- if k in data and isinstance(data[k], list):
258
- return data[k]
259
  return [data]
260
  return []
261
 
262
 
263
- async def _wake_api() -> bool:
264
- logger.info("[API] Waking render.com server…")
 
 
 
 
265
  for attempt in range(3):
266
- if await _fetch(f"{API_BASE}/grade-scale", API_HEADERS, timeout=60):
267
- logger.info("[API] Server awake.")
 
268
  return True
269
- logger.info(f"[API] Wake attempt {attempt+1}/3 failed…")
270
  await asyncio.sleep(10)
271
- logger.warning("[API] Server did not wake β€” skipping API data.")
272
  return False
273
 
274
 
275
- async def load_api() -> List[Dict]:
276
- if not await _wake_api():
 
277
  return []
278
 
279
  list_results = await asyncio.gather(
280
- *[_fetch(f"{API_BASE}/{ep}", API_HEADERS) for ep in API_LIST_ENDPOINTS],
281
  return_exceptions=True,
282
  )
283
  docs, list_cache = [], {}
284
  for ep, data in zip(API_LIST_ENDPOINTS, list_results):
285
- if not data or isinstance(data, Exception): continue
 
286
  items = _unwrap(data)
287
  list_cache[ep] = items
288
  for item in items:
289
  text = json.dumps(item, ensure_ascii=False)
290
  if text.strip():
291
  docs.append({"content": text, "source": f"api:{ep}"})
292
- logger.info(f"[API lists] {len(docs)} docs")
293
 
294
- # Detail pages β€” bounded concurrency, circuit-breaker aware
295
  detail_tasks = []
296
  for cfg in API_DETAIL_ENDPOINTS:
297
  for item in list_cache.get(cfg["list"], []):
298
  item_id = item.get(cfg["id_field"]) if isinstance(item, dict) else None
299
- if item_id is None: continue
300
- url = f"{API_BASE}/{cfg['list']}/{item_id}"
301
- if _api_fail_count.get(url, 0) < API_FAIL_LIMIT:
302
  detail_tasks.append((url, f"api:{cfg['list']}/{item_id}"))
303
 
304
  if detail_tasks:
305
- sem = asyncio.Semaphore(5)
306
- async def _bounded(url, source):
307
- async with sem:
308
- return await _fetch(url, API_HEADERS), source
309
-
310
- results = await asyncio.gather(
311
- *[_bounded(u, s) for u, s in detail_tasks],
312
  return_exceptions=True,
313
  )
314
  n = 0
315
- for res in results:
316
- if isinstance(res, Exception): continue
317
- data, source = res
318
- if not data: continue
319
  for item in _unwrap(data):
320
  text = json.dumps(item, ensure_ascii=False)
321
  if text.strip():
322
  docs.append({"content": text, "source": source})
323
  n += 1
324
- logger.info(f"[API details] {n} docs")
325
 
326
- logger.info(f"[API total] {len(docs)} raw docs")
327
  return docs
328
 
329
 
330
- async def load_github() -> List[Dict]:
331
  responses = await asyncio.gather(
332
- *[_fetch(GITHUB_BASE + f) for f in GITHUB_FILES],
333
  return_exceptions=True,
334
  )
335
  docs = []
336
  for fname, data in zip(GITHUB_FILES, responses):
337
- if not data or isinstance(data, Exception): continue
 
338
  for item in (data if isinstance(data, list) else [data]):
339
  text = json.dumps(item, ensure_ascii=False)
340
  if text.strip():
341
  docs.append({"content": text, "source": f"github:{fname}"})
342
- logger.info(f"[GitHub] {len(docs)} raw docs")
343
  return docs
344
 
345
- # ═════════════════════════════════════════════════════════════════════════════
346
  # CHUNKING
347
- # ═════════════════════════════════════════════════════════════════════════════
348
-
349
- def _flatten_json(obj, path="", sep=" > ") -> List[str]:
350
- lines = []
351
- if isinstance(obj, dict):
352
- for k, v in obj.items():
353
- np_ = f"{path}{sep}{k}" if path else k
354
- if isinstance(v, (dict, list)):
355
- lines.extend(_flatten_json(v, np_, sep))
356
- else:
357
- val = str(v).strip()
358
- if val and val.lower() not in ("null", "none", "", "[]", "{}"):
359
- lines.append(f"{np_}: {val}")
360
- elif isinstance(obj, list):
361
- for i, item in enumerate(obj):
362
- if isinstance(item, (dict, list)):
363
- lines.extend(_flatten_json(item, f"{path}[{i}]", sep))
364
- else:
365
- val = str(item).strip()
366
- if val: lines.append(f"{path}[{i}]: {val}")
367
- return lines
368
-
369
-
370
- def _json_chunks(text: str, source: str) -> List[Dict]:
371
- try:
372
- obj = json.loads(text)
373
- lines = _flatten_json(obj)
374
- if not lines: return []
375
- chunks, buf, length = [], [], 0
376
- for line in lines:
377
- if length + len(line) + 1 > CHUNK_SIZE and buf:
378
- chunks.append(" | ".join(buf))
379
- keep = max(1, len(buf) // 5)
380
- buf = buf[-keep:]
381
- length = sum(len(l) + 1 for l in buf)
382
- buf.append(line); length += len(line) + 1
383
- if buf: chunks.append(" | ".join(buf))
384
- return [{"content": c, "source": source} for c in chunks if c.strip()]
385
- except Exception:
386
- return []
387
 
388
-
389
- def chunk_documents(docs: List[Dict]) -> List[Dict]:
390
- step, out = max(1, CHUNK_SIZE - CHUNK_OVERLAP), []
391
  for d in docs:
392
- text, source = d["content"], d["source"]
393
- if not text.strip(): continue
394
- jc = _json_chunks(text, source)
395
- if jc: out.extend(jc); continue
396
- if len(text) <= CHUNK_SIZE: out.append(d); continue
 
397
  start = 0
398
  while start < len(text):
399
- chunk = text[start:start + CHUNK_SIZE]
400
- if chunk.strip(): out.append({"content": chunk, "source": source})
 
401
  start += step
402
  return out
403
 
404
- # ═════════════════════════════════════════════════════════════════════════════
405
- # KNOWLEDGE GRAPH
406
- # ═════════════════════════════════════════════════════════════════════════════
407
-
408
- _STOP = set(string.punctuation) | {
409
- "the","a","an","is","are","was","were","of","in","at","to","for",
410
- "and","or","not","this","that","it","its","with","as","by","on",
411
- "from","all","be","been","has","have","had","will","would","can",
412
- "could","do","does","did","he","she","they","we","you","i","me",
413
- }
414
-
415
-
416
- def build_knowledge_graph(docs: List[Dict]):
417
- if not NX_OK: return None, {}
418
- G = nx.DiGraph()
419
- entity_index: Dict[str, List[int]] = {}
420
- for ci, doc in enumerate(docs):
421
- for line in doc["content"].split(" | "):
422
- parts = line.split(": ", 1)
423
- if len(parts) != 2: continue
424
- key, val = parts[0].strip().lower(), parts[1].strip().lower()
425
- if not G.has_node(key): G.add_node(key, type="field")
426
- if not G.has_node(val): G.add_node(val, type="value")
427
- G.add_edge(key, val, chunk=ci)
428
- for tok in val.split():
429
- tok = tok.strip(string.punctuation).lower()
430
- if tok and tok not in _STOP and len(tok) > 2:
431
- entity_index.setdefault(tok, []).append(ci)
432
- logger.info(f"[KG] nodes={G.number_of_nodes()}, edges={G.number_of_edges()}, "
433
- f"tokens={len(entity_index)}")
434
- return G, entity_index
435
-
436
-
437
- def kg_search(query: str, k: int = 5) -> List[int]:
438
- if not state.kg or not state.entity_index: return []
439
- tokens = [t.strip(string.punctuation).lower()
440
- for t in query.split() if t.lower() not in _STOP]
441
- scores: Dict[int, int] = {}
442
- for tok in tokens:
443
- for idx in state.entity_index.get(tok, []):
444
- scores[idx] = scores.get(idx, 0) + 1
445
- if state.kg.has_node(tok):
446
- for nbr in state.kg.successors(tok):
447
- ed = state.kg[tok].get(nbr, {})
448
- ci = ed.get("chunk") if isinstance(ed, dict) else None
449
- if ci is not None:
450
- scores[ci] = scores.get(ci, 0) + 1
451
- return sorted(scores, key=scores.get, reverse=True)[:k]
452
-
453
- # ═════════════════════════════════════════════════════════════════════════════
454
- # INDEX BUILDING + DISK CACHE
455
- # ═════════════════════════════════════════════════════════════════════════════
456
-
457
- def build_indexes_from_scratch() -> bool:
458
  if not state.documents:
459
- logger.warning("[WARN] No documents to index.")
460
  return False
461
  texts = [d["content"] for d in state.documents]
462
 
@@ -464,446 +252,265 @@ def build_indexes_from_scratch() -> bool:
464
  try:
465
  emb = state.embedder.encode(
466
  texts, normalize_embeddings=True,
467
- show_progress_bar=True, batch_size=64,
468
  )
469
  emb = np.array(emb, dtype="float32")
470
  if emb.ndim == 2 and emb.shape[0] > 0:
471
- idx = faiss.IndexFlatIP(emb.shape[1])
472
- idx.add(emb)
473
- state.faiss_index = idx
474
- state.doc_embeddings = emb
475
- _save_faiss(idx)
476
- _save("doc_embeddings.pkl", emb)
477
- logger.info(f"[FAISS] {idx.ntotal} vectors (dim={emb.shape[1]})")
478
  except Exception as e:
479
- logger.error(f"[ERROR] FAISS: {e}")
 
480
 
481
  if BM25_OK:
482
  try:
483
  tok = [t.lower().split() for t in texts if t.strip()]
484
  if tok:
485
- b = BM25Okapi(tok)
486
- state.bm25 = b
487
- _save("bm25.pkl", b)
488
- logger.info(f"[BM25] {len(tok)} docs")
489
  except Exception as e:
490
- logger.error(f"[ERROR] BM25: {e}")
491
-
492
- kg, ei = build_knowledge_graph(state.documents)
493
- state.kg = kg
494
- state.entity_index = ei
495
- if kg:
496
- _save("kg.pkl", kg)
497
- _save("entity_index.pkl", ei)
498
-
499
- _save("documents.pkl", state.documents)
500
  return True
501
 
502
-
503
- def load_indexes_from_cache() -> bool:
504
- docs = _load("documents.pkl")
505
- if not docs: return False
506
- state.documents = docs
507
-
508
- if FAISS_OK:
509
- idx = _load_faiss()
510
- if idx: state.faiss_index = idx
511
- emb = _load("doc_embeddings.pkl")
512
- if emb is not None: state.doc_embeddings = emb
513
-
514
- bm25 = _load("bm25.pkl")
515
- if bm25: state.bm25 = bm25
516
-
517
- kg = _load("kg.pkl")
518
- ei = _load("entity_index.pkl")
519
- if kg:
520
- state.kg = kg
521
- state.entity_index = ei or {}
522
-
523
- return bool(state.documents) and (
524
- state.faiss_index is not None or state.bm25 is not None
525
- )
526
-
527
- # ═════════════════════════════════════════════════════════════════════════════
528
  # RETRIEVAL
529
- # ═════════════════════════════════════════════════════════════════════════════
530
 
531
- def _encode_query(query: str, hyde_text: str = "") -> np.ndarray:
532
- q_emb = state.embedder.encode([query], normalize_embeddings=True)
533
- if hyde_text and ENABLE_HYDE:
534
- h_emb = state.embedder.encode([hyde_text], normalize_embeddings=True)
535
- blended = 0.6 * q_emb + 0.4 * h_emb
536
- blended = blended / (np.linalg.norm(blended, axis=1, keepdims=True) + 1e-9)
537
- return np.array(blended, dtype="float32")
538
- return np.array(q_emb, dtype="float32")
539
-
540
-
541
- def _dense(q_vec: np.ndarray, k: int = TOP_K_RETRIEVE) -> List[Dict]:
542
- if not state.faiss_index: return []
543
  try:
 
 
 
544
  k_a = min(k, state.faiss_index.ntotal)
545
- scores, ids = state.faiss_index.search(q_vec, k_a)
 
 
546
  return [{**state.documents[i], "score": float(s)}
547
  for s, i in zip(scores[0], ids[0]) if i >= 0]
548
  except Exception as e:
549
- logger.error(f"[ERROR] dense: {e}"); return []
 
550
 
551
 
552
- def _sparse(query: str, k: int = TOP_K_RETRIEVE) -> List[Dict]:
553
- if not state.bm25: return []
 
554
  try:
555
  tokens = query.lower().split()
556
- if not tokens: return []
 
557
  scores = np.array(state.bm25.get_scores(tokens), dtype="float32")
558
- idx = np.argsort(scores)[::-1][:min(k, len(scores))]
559
  return [{**state.documents[i], "score": float(scores[i])}
560
  for i in idx if scores[i] > 0]
561
  except Exception as e:
562
- logger.error(f"[ERROR] sparse: {e}"); return []
 
563
 
564
 
565
- def rrf_fuse(lists: List[List[Dict]], weights: List[float], rrf_k=60) -> List[Dict]:
566
- merged, doc_map = {}, {}
567
- for lst, w in zip(lists, weights):
568
- for rank, d in enumerate(lst):
569
- key = d["content"]
570
- merged[key] = merged.get(key, 0.0) + w / (rrf_k + rank + 1)
571
- doc_map[key] = d
 
 
 
 
 
 
 
572
  return [{**doc_map[c], "rrf_score": round(s, 6)}
573
- for c, s in sorted(merged.items(), key=lambda x: x[1], reverse=True)]
574
-
575
-
576
- def expand_queries(query: str) -> List[str]:
577
- variants = [query]
578
- ACRONYMS = {
579
- "ewu": "East West University",
580
- "cse": "Computer Science Engineering",
581
- "eee": "Electrical Electronic Engineering",
582
- "ece": "Electronic Communication Engineering",
583
- "mba": "Master of Business Administration",
584
- "gpa": "grade point average",
585
- "cgpa": "cumulative grade point average",
586
- "vc": "Vice Chancellor",
587
- "dept": "department",
588
- }
589
- q_low = query.lower()
590
- expanded = q_low
591
- for abbr, full in ACRONYMS.items():
592
- expanded = re.sub(r"\b" + abbr + r"\b", full, expanded)
593
- if expanded != q_low:
594
- variants.append(expanded)
595
- _QW = {"what","who","when","where","how","why","is","are","does",
596
- "do","the","a","an","tell","me","about"}
597
- kw = [w for w in re.findall(r"\w+", q_low) if w not in _QW and len(w) > 2]
598
- if kw and " ".join(kw) != q_low:
599
- variants.append(" ".join(kw))
600
- return list(dict.fromkeys(variants))[:3]
601
-
602
-
603
- def rerank(query: str, candidates: List[Dict], top_n: int) -> List[Dict]:
604
- if not state.reranker or not candidates:
605
- return candidates[:top_n]
606
- try:
607
- pairs = [(query, d["content"]) for d in candidates]
608
- scores = state.reranker.predict(pairs, batch_size=32, show_progress_bar=False)
609
- scored = sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)
610
- out = [{**doc, "rerank_score": float(sc)}
611
- for sc, doc in scored[:top_n] if sc >= RERANK_THRESH]
612
- return out or [{**doc, "rerank_score": float(sc)} for sc, doc in scored[:top_n]]
613
- except Exception as e:
614
- logger.error(f"[ERROR] rerank: {e}")
615
- return candidates[:top_n]
616
-
617
-
618
- def mmr_select(q_vec: np.ndarray, candidates: List[Dict], k: int) -> List[Dict]:
619
- if state.doc_embeddings is None or not candidates:
620
- return candidates[:k]
621
- c2i = {d["content"]: i for i, d in enumerate(state.documents)}
622
- idxs = [c2i[d["content"]] for d in candidates if d["content"] in c2i]
623
- if not idxs: return candidates[:k]
624
- ce = state.doc_embeddings[idxs]
625
- q = q_vec[0]
626
- rel = ce @ q
627
- selected, sel_embs, remaining = [], [], list(range(len(idxs)))
628
- for _ in range(min(k, len(remaining))):
629
- if not remaining: break
630
- if not sel_embs:
631
- best = max(remaining, key=lambda i: rel[i])
632
- else:
633
- S = np.array(sel_embs)
634
- best, bs = remaining[0], -1e9
635
- for i in remaining:
636
- score = MMR_LAMBDA * rel[i] - (1 - MMR_LAMBDA) * float(np.max(S @ ce[i]))
637
- if score > bs: bs, best = score, i
638
- selected.append(best); sel_embs.append(ce[best]); remaining.remove(best)
639
- return [candidates[i] for i in selected]
640
-
641
-
642
- def compress_chunk(q_vec: np.ndarray, text: str) -> str:
643
- if state.embedder is None: return text
644
- lines = [l.strip() for l in re.split(r"[|\n]|(?<=[.!?])\s+", text) if l.strip()]
645
- if len(lines) <= 2: return text
646
- try:
647
- embs = state.embedder.encode(lines, normalize_embeddings=True)
648
- sims = embs @ q_vec[0]
649
- kept = [l for l, s in zip(lines, sims) if s >= COMPRESS_THRESH]
650
- return " | ".join(kept) if kept else text
651
- except Exception:
652
- return text
653
 
654
 
655
- async def full_retrieval(query: str, k: int = TOP_K_FINAL) -> List[Dict]:
656
- variants = await asyncio.to_thread(expand_queries, query)
657
 
658
- all_dense, all_sparse = [], []
659
- for v in variants:
660
- if state.embedder:
661
- vec = await asyncio.to_thread(_encode_query, v)
662
- all_dense.append(await asyncio.to_thread(_dense, vec, TOP_K_RETRIEVE))
663
- all_sparse.append(await asyncio.to_thread(_sparse, v, TOP_K_RETRIEVE))
664
-
665
- weights = [1.0 / (i + 1) for i in range(len(variants))]
666
- fused = rrf_fuse(all_dense + all_sparse, weights + weights)
667
-
668
- kg_idxs = await asyncio.to_thread(kg_search, query, k * 2)
669
- existing = {d["content"] for d in fused}
670
- for i in kg_idxs:
671
- if 0 <= i < len(state.documents):
672
- d = state.documents[i]
673
- if d["content"] not in existing:
674
- fused.append({**d, "rrf_score": 0.0, "kg_injected": True})
675
-
676
- reranked = await asyncio.to_thread(rerank, query, fused, top_n=k * 3)
677
-
678
- if state.embedder:
679
- q_vec = await asyncio.to_thread(_encode_query, query)
680
- final_set = await asyncio.to_thread(mmr_select, q_vec, reranked, k)
681
- compressed = []
682
- for doc in final_set:
683
- ct = await asyncio.to_thread(compress_chunk, q_vec, doc["content"])
684
- compressed.append({**doc, "content": ct})
685
- return compressed
686
-
687
- return reranked[:k]
688
-
689
- # ═════════════════════════════════════════════════════════════════════════════
690
- # GENERATION
691
- # ═════════════════════════════════════════════════════��═══════════════════════
692
-
693
- SYSTEM_PROMPT = """You are EWU Assistant for East West University.
694
- RULES:
695
- 1. Answer ONLY from the provided context.
696
- 2. If the context lacks the answer, say "I don't have that information."
697
- 3. Be specific β€” include numbers, names, dates when present.
698
- 4. Do NOT repeat context verbatim. Summarise clearly.
699
- 5. Never hallucinate facts not in the context."""
700
 
 
 
 
 
 
 
701
 
702
  def _run_tinyllama(query: str, context: str) -> str:
 
 
 
 
 
703
  if state.generator is None:
704
- return f"Based on available information:\n\n{context[:800]}"
705
- trimmed = context[:2000] + ("…" if len(context) > 2000 else "")
 
 
 
 
706
  messages = [
707
- {"role": "system", "content": SYSTEM_PROMPT},
708
- {"role": "user", "content": f"Context:\n{trimmed}\n\nQuestion: {query}\n\nAnswer:"},
709
  ]
 
710
  try:
711
- out = state.generator(messages, max_new_tokens=280,
712
- do_sample=True, temperature=0.25,
713
- top_p=0.90, repetition_penalty=1.15)
714
- generated = out[0]["generated_text"]
 
 
 
 
 
 
 
 
715
  if isinstance(generated, list):
716
  for turn in reversed(generated):
717
  if isinstance(turn, dict) and turn.get("role") == "assistant":
718
  return turn.get("content", "").strip()
 
 
719
  return str(generated).strip()
 
720
  except Exception as e:
721
- logger.error(f"[ERROR] TinyLlama: {e}")
722
  return f"[Generation error: {e}]"
723
 
724
 
725
  async def generate(query: str, context: str) -> str:
 
726
  return await asyncio.to_thread(_run_tinyllama, query, context)
727
 
728
- # ═════════════════════════════════════════════════════════════════════════════
729
- # BOOT β€” two-phase, cache-aware
730
- # ═════════════════════════════════════════════════════════════════════════════
731
-
732
- def _load_models():
733
- emb, ce = None, None
734
- if ST_OK:
735
- try:
736
- logger.info(f" Loading embedder ({EMBED_MODEL}) on {DEVICE}…")
737
- emb = SentenceTransformer(EMBED_MODEL, device=DEVICE)
738
- logger.info(" Embedder ready.")
739
- except Exception as e:
740
- logger.error(f"[ERROR] Embedder: {e}")
741
- try:
742
- logger.info(f" Loading cross-encoder ({RERANK_MODEL})…")
743
- ce = CrossEncoder(RERANK_MODEL, device=DEVICE, max_length=512)
744
- logger.info(" Cross-encoder ready.")
745
- except Exception as e:
746
- logger.warning(f"[WARN] Cross-encoder: {e}")
747
- return emb, ce
748
-
749
 
750
  def _load_generator():
751
- if not HF_OK: return None
 
 
 
752
  try:
753
- logger.info(f" Loading TinyLlama ({GEN_MODEL}) on {DEVICE}…")
754
- gen = hf_pipeline("text-generation", model=GEN_MODEL,
755
- device=0 if DEVICE == "cuda" else -1, dtype="auto")
756
- logger.info(" TinyLlama ready.")
 
 
 
 
757
  return gen
758
  except Exception as e:
759
- logger.error(f"[ERROR] TinyLlama: {e}")
760
  return None
761
 
762
 
763
- async def _boot_phase1():
764
- """Load models + indexes. Sets state.ready = True when done."""
765
- logger.info(f"=== PHASE 1: Models + Indexes (device={DEVICE}) ===")
766
- emb, ce = await asyncio.to_thread(_load_models)
767
- state.embedder = emb
768
- state.reranker = ce
769
-
770
- # Try disk cache
771
- cache_ok = (
772
- _cache_fresh("documents.pkl")
773
- and _cache_fresh("faiss.index")
774
- and _cache_fresh("bm25.pkl")
775
- )
776
- if cache_ok:
777
- logger.info("[cache] Loading from disk…")
778
- if await asyncio.to_thread(load_indexes_from_cache):
779
- logger.info(f"[cache] {len(state.documents)} chunks loaded from disk.")
780
- state.ready = True
781
- logger.info("βœ“ Phase 1 complete (cache hit).")
782
- return
783
-
784
- logger.info("Fetching knowledge base (API + GitHub)…")
785
- api_docs, gh_docs = await asyncio.gather(load_api(), load_github())
786
- raw = api_docs + gh_docs
787
- logger.info(f" Raw docs combined: {len(raw)}")
788
-
789
- if not raw:
790
- logger.warning("[WARN] No documents fetched.")
791
- state.ready = True
792
- return
793
-
794
- logger.info("Chunking…")
795
- state.documents = await asyncio.to_thread(chunk_documents, raw)
796
- logger.info(f" Total chunks: {len(state.documents)}")
797
 
798
- logger.info("Building indexes…")
799
- await asyncio.to_thread(build_indexes_from_scratch)
 
 
 
 
 
800
 
801
- state.ready = True
802
- logger.info("βœ“ Phase 1 complete β€” server accepting queries.")
803
 
 
 
 
804
 
805
- async def _boot_phase2():
806
- """Load TinyLlama in the background β€” does not block /rag."""
807
- logger.info("=== PHASE 2: TinyLlama (background) ===")
808
- gen = await asyncio.to_thread(_load_generator)
809
- state.generator = gen
810
- state.gen_ready = gen is not None
811
- if gen:
812
- logger.info("βœ“ Phase 2 complete β€” full LLM answers active.")
813
- else:
814
- logger.warning("Phase 2: TinyLlama unavailable β€” context-only mode.")
815
 
 
 
816
 
817
- async def _boot():
818
- try:
819
- await _boot_phase1()
820
- asyncio.create_task(_boot_phase2()) # fire-and-forget
821
  except Exception as e:
822
  state.error = str(e)
823
  state.ready = False
824
- logger.error(f"[ERROR] Boot failed: {e}")
825
  import traceback; traceback.print_exc()
826
 
827
 
828
  @asynccontextmanager
829
  async def lifespan(app: FastAPI):
830
- task = asyncio.create_task(_boot())
831
  try:
832
  yield
833
  finally:
834
- task.cancel()
835
  try:
836
- await task
837
  except asyncio.CancelledError:
838
  pass
839
 
840
- # ═════════════════════════════════════════════════════════════════════════════
841
- # ENDPOINTS
842
- # ═════════════════════════════════════════════════════════════════════════════
843
 
844
- app = FastAPI(title="EWU RAG Server v2.1", lifespan=lifespan)
845
 
846
 
847
  class Query(BaseModel):
848
  query : str
849
- top_k : int = TOP_K_FINAL
850
 
851
 
852
  @app.post("/rag")
853
  async def rag_endpoint(q: Query):
854
  if not state.ready:
855
- raise HTTPException(503, detail=state.error or "Initialising β€” retry in 30s.")
856
  if not q.query.strip():
857
  raise HTTPException(400, detail="Query must not be empty.")
858
- results = await full_retrieval(q.query, k=q.top_k)
859
  if not results:
860
  return {"answer": "No relevant information found.", "sources": []}
861
  context = "\n\n---\n\n".join(r["content"] for r in results)
862
  answer = await generate(q.query, context)
863
  return {
864
- "answer" : answer,
865
- "gen_ready": state.gen_ready,
866
- "sources" : [
867
- {
868
- "source" : r.get("source"),
869
- "rerank_score": round(r.get("rerank_score", 0), 4),
870
- "rrf_score" : round(r.get("rrf_score", 0), 6),
871
- "kg_injected" : r.get("kg_injected", False),
872
- }
873
- for r in results
874
- ],
875
  }
876
 
877
 
878
  @app.get("/health")
879
  async def health():
880
  return JSONResponse(200, {
881
- "status" : "ready" if state.ready else ("error" if state.error else "loading"),
882
- "gen_ready" : state.gen_ready,
883
- "docs" : len(state.documents),
884
- "device" : DEVICE,
885
- "faiss" : state.faiss_index is not None,
886
- "bm25" : state.bm25 is not None,
887
- "reranker" : state.reranker is not None,
888
- "generator" : state.generator is not None,
889
- "hyde_enabled": ENABLE_HYDE,
890
- "kg_nodes" : state.kg.number_of_nodes() if state.kg else 0,
891
- "kg_edges" : state.kg.number_of_edges() if state.kg else 0,
892
- "error" : state.error or None,
893
  })
894
 
895
 
896
- @app.post("/cache/clear")
897
- async def clear_cache():
898
- """Delete disk cache β€” server will rebuild from scratch on next restart."""
899
- import shutil
900
- try:
901
- shutil.rmtree(CACHE_DIR, ignore_errors=True)
902
- os.makedirs(CACHE_DIR, exist_ok=True)
903
- return {"status": "cache cleared β€” restart the server to rebuild"}
904
- except Exception as e:
905
- raise HTTPException(500, detail=str(e))
906
-
907
-
908
  if __name__ == "__main__":
909
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import json
3
+ import asyncio
 
 
 
 
 
 
4
  import numpy as np
5
  import uvicorn
6
+ import httpx
7
+
8
+ from contextlib import asynccontextmanager
9
  from fastapi import FastAPI, HTTPException
10
  from fastapi.responses import JSONResponse
11
  from pydantic import BaseModel
12
 
 
 
 
 
 
13
  try:
14
  import faiss
15
  FAISS_OK = True
 
17
  FAISS_OK = False
18
 
19
  try:
20
+ from sentence_transformers import SentenceTransformer
21
  ST_OK = True
22
  except ImportError:
23
  ST_OK = False
 
33
  HF_OK = True
34
  except ImportError:
35
  HF_OK = False
36
+ print("[WARN] transformers not installed β€” generation disabled.")
37
 
38
+ # ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  # CONFIG
40
+ # ─────────────────────────────────────────────
41
 
42
  API_BASE = "https://ewu-server.onrender.com/api"
43
  API_KEY = "i6EDytaX4E2jI6GvZQc0b1RSZHTI5_wVRa2rfL7rLpk"
44
  API_HEADERS = {"x-api-key": API_KEY}
 
 
 
 
 
45
 
46
+ GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"
47
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
48
+ GEN_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
 
 
 
49
 
50
+ CHUNK_SIZE = 400
51
+ CHUNK_OVERLAP = 80
52
+ DEVICE = "cpu"
 
53
 
54
+ try:
55
+ import torch
56
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
57
+ except ImportError:
58
+ pass
59
 
60
  API_LIST_ENDPOINTS = [
61
  "admission-deadlines", "academic-calendar", "grade-scale",
 
63
  "governance", "alumni", "helpdesk", "policies", "proctor-schedule",
64
  "documents", "newsletters", "programs", "faculty", "departments",
65
  ]
66
+
67
  API_DETAIL_ENDPOINTS = [
68
  {"list": "programs", "id_field": "id"},
69
  {"list": "faculty", "id_field": "id"},
70
  {"list": "documents", "id_field": "slug"},
71
  ]
72
+
73
  GITHUB_FILES = [
74
  "admission_deadlines.json", "dynamic_admission_process.json",
75
  "dynamic_admission_requirements.json", "dynamic_tution_fees.json",
 
79
  "static_aboutEWU.json", "static_Admin.json",
80
  "static_AllAvailablePrograms.json", "static_alumni.json",
81
  "static_campus_life.json", "static_Career_Counseling_Center.json",
82
+ "static_clubs.json", "static_depts.json", "static_facilities.json",
 
83
  "static_helpdesk.json", "static_payment_procedure.json",
84
  "static_Policy.json", "static_Programs.json", "static_Rules.json",
85
  "static_Sexual_harassment.json", "static_Tuition_fees.json",
 
93
  "syndicate.json", "tesol.json", "ewu_board_of_trustees.json",
94
  ]
95
 
96
+ # ─────────────────────────────────────────────
97
  # APP STATE
98
+ # ─────────────────────────────────────────────
99
 
100
class AppState:
    """Mutable singleton holding every lazily-initialised RAG component.

    All fields start empty/None and are populated by the background
    ``_boot`` task; ``/health`` reports which ones actually came up.
    NOTE(review): the class-level mutable default for ``documents`` is
    shared across instances — acceptable only because exactly one
    ``AppState`` is ever created.
    """
    embedder = None          # SentenceTransformer, set during boot (or left None)
    generator = None         # TinyLlama text-generation pipeline
    documents : list = []    # chunked {"content", "source"} records
    faiss_index = None       # dense vector index over the chunk embeddings
    bm25 = None              # sparse keyword index over the same chunks
    ready : bool = False     # flips True once _boot finishes successfully
    error : str = ""         # last boot failure message, surfaced via /health


# Process-wide singleton used by every endpoint and boot helper.
state = AppState()
110
 
111
+ # ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  # DATA LOADING
113
+ # ─────────────────────────────────────────────
114
 
115
async def fetch_json(url: str, headers: dict = None, timeout: int = 60):
    """GET *url* and return its decoded JSON payload.

    Any failure — HTTP error status, network problem, or JSON decoding
    error — is logged as a warning and collapses to ``None``; callers
    treat ``None`` as "no data for this source".
    """
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.get(url, headers=headers or {})
            if resp.status_code == 200:
                return resp.json()
            print(f"[WARN] {url} β†’ HTTP {resp.status_code}")
    except Exception as e:
        print(f"[WARN] {url} β†’ {e}")
    return None
125
 
126
 
127
+ def _unwrap(data) -> list:
128
+ if isinstance(data, list):
129
+ return data
130
  if isinstance(data, dict):
131
+ for key in ("data", "results", "items"):
132
+ if key in data and isinstance(data[key], list):
133
+ return data[key]
134
  return [data]
135
  return []
136
 
137
 
138
async def _wake_api_server():
    """Poke a cheap endpoint until the render.com server responds.

    The free tier sleeps after inactivity and can take ~50s to wake.
    Returns True as soon as any attempt gets data back, False after
    three failures (API-backed loading is then skipped entirely).
    """
    print(" [API] Waking render.com server (free tier may be sleeping)…")
    attempt = 0
    while attempt < 3:
        probe = await fetch_json(f"{API_BASE}/grade-scale", API_HEADERS, timeout=60)
        if probe is not None:
            print(" [API] Server awake.")
            return True
        print(f" [API] Wake attempt {attempt+1}/3 failed, retrying…")
        await asyncio.sleep(10)
        attempt += 1
    print(" [API] Server did not wake β€” skipping API data.")
    return False
153
 
154
 
155
async def load_api() -> list:
    """Pull the EWU API into raw ``{"content", "source"}`` docs.

    Fetches every list endpoint concurrently, then every per-item
    detail page (programs/faculty/documents) discovered in those lists.
    Each record is serialised to a JSON string so downstream chunking
    and indexing can treat all sources uniformly.  Returns [] when the
    API server cannot be woken.
    """
    if not await _wake_api_server():
        return []

    docs: list = []
    cache: dict = {}

    # 1. List endpoints, all in parallel; failures are skipped silently
    #    (fetch_json already logged them).
    list_payloads = await asyncio.gather(
        *(fetch_json(f"{API_BASE}/{ep}", API_HEADERS) for ep in API_LIST_ENDPOINTS),
        return_exceptions=True,
    )
    for ep, payload in zip(API_LIST_ENDPOINTS, list_payloads):
        if isinstance(payload, Exception) or not payload:
            continue
        records = _unwrap(payload)
        cache[ep] = records  # kept so detail URLs can be derived below
        for record in records:
            body = json.dumps(record, ensure_ascii=False)
            if body.strip():
                docs.append({"content": body, "source": f"api:{ep}"})
    print(f" [API lists] {len(docs)} docs")

    # 2. Derive the per-item detail URLs from the cached list payloads.
    pending = []
    for cfg in API_DETAIL_ENDPOINTS:
        for rec in cache.get(cfg["list"], []):
            rec_id = rec.get(cfg["id_field"]) if isinstance(rec, dict) else None
            if rec_id is None:
                continue
            pending.append((f"{API_BASE}/{cfg['list']}/{rec_id}",
                            f"api:{cfg['list']}/{rec_id}"))

    # 3. Fetch all detail pages in parallel and fold them in.
    if pending:
        detail_payloads = await asyncio.gather(
            *(fetch_json(url, API_HEADERS) for url, _ in pending),
            return_exceptions=True,
        )
        added = 0
        for (_, source), payload in zip(pending, detail_payloads):
            if isinstance(payload, Exception) or not payload:
                continue
            for record in _unwrap(payload):
                body = json.dumps(record, ensure_ascii=False)
                if body.strip():
                    docs.append({"content": body, "source": source})
                    added += 1
        print(f" [API details] {added} docs from {len(pending)} pages")

    print(f" [API total] {len(docs)} raw docs")
    return docs
202
 
203
 
204
async def load_github() -> list:
    """Download every static JSON file from the GitHub raw mirror.

    Each file is fetched concurrently; every record (list element or
    single object) is serialised into a ``{"content", "source"}`` doc.
    Failed or empty downloads are skipped.
    """
    payloads = await asyncio.gather(
        *(fetch_json(GITHUB_BASE + name) for name in GITHUB_FILES),
        return_exceptions=True,
    )
    docs = []
    for name, payload in zip(GITHUB_FILES, payloads):
        if isinstance(payload, Exception) or not payload:
            continue
        records = payload if isinstance(payload, list) else [payload]
        for record in records:
            body = json.dumps(record, ensure_ascii=False)
            if body.strip():
                docs.append({"content": body, "source": f"github:{name}"})
    print(f" [GitHub] {len(docs)} raw docs")
    return docs
219
 
220
+ # ─────────────────────────────────────────────
221
  # CHUNKING
222
+ # ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
def chunk_documents(docs, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split each doc's content into overlapping character windows.

    Docs at or under *size* characters pass through untouched; longer
    ones become windows of *size* characters advancing by
    ``size - overlap`` (at least 1).  Whitespace-only docs and
    whitespace-only windows are dropped.
    """
    stride = max(1, size - overlap)
    chunks = []
    for doc in docs:
        body = doc["content"]
        if not body.strip():
            continue
        if len(body) <= size:
            chunks.append(doc)
            continue
        for offset in range(0, len(body), stride):
            window = body[offset:offset + size]
            if window.strip():
                chunks.append({"content": window, "source": doc["source"]})
    return chunks
240
 
241
+ # ─────────────────────────────────────────────
242
+ # INDEX BUILDING
243
+ # ─────────────────────────────────────────────
244
+
245
+ def build_indexes():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  if not state.documents:
247
+ print("[WARN] No documents to index.")
248
  return False
249
  texts = [d["content"] for d in state.documents]
250
 
 
252
  try:
253
  emb = state.embedder.encode(
254
  texts, normalize_embeddings=True,
255
+ show_progress_bar=False, batch_size=64,
256
  )
257
  emb = np.array(emb, dtype="float32")
258
  if emb.ndim == 2 and emb.shape[0] > 0:
259
+ state.faiss_index = faiss.IndexFlatIP(emb.shape[1])
260
+ state.faiss_index.add(emb)
261
+ print(f" [FAISS] {state.faiss_index.ntotal} vectors (dim={emb.shape[1]})")
 
 
 
 
262
  except Exception as e:
263
+ print(f"[ERROR] FAISS: {e}")
264
+ state.faiss_index = None
265
 
266
  if BM25_OK:
267
  try:
268
  tok = [t.lower().split() for t in texts if t.strip()]
269
  if tok:
270
+ state.bm25 = BM25Okapi(tok)
271
+ print(f" [BM25] {len(tok)} docs")
 
 
272
  except Exception as e:
273
+ print(f"[ERROR] BM25: {e}")
274
+ state.bm25 = None
 
 
 
 
 
 
 
 
275
  return True
276
 
277
+ # ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  # RETRIEVAL
279
+ # ─────────────────────────────────────────────
280
 
281
def search_dense(query, k=8):
    """Dense retrieval via FAISS inner-product search.

    Embeds *query* with the shared SentenceTransformer and returns up
    to *k* documents, each tagged with a float ``"score"``.  Returns []
    when the index or embedder is missing, the index is empty, or any
    step fails (failure is logged, never raised).
    """
    if not state.faiss_index or not state.embedder:
        return []
    try:
        query_vec = state.embedder.encode([query], normalize_embeddings=True)
        query_vec = np.array(query_vec, dtype="float32")
        limit = min(k, state.faiss_index.ntotal)
        if not limit:
            return []
        sims, doc_ids = state.faiss_index.search(query_vec, limit)
        hits = []
        for sim, doc_id in zip(sims[0], doc_ids[0]):
            if doc_id >= 0:  # FAISS pads missing results with -1
                hits.append({**state.documents[doc_id], "score": float(sim)})
        return hits
    except Exception as e:
        print(f"[ERROR] dense: {e}")
        return []
297
 
298
 
299
def search_sparse(query, k=8):
    """Sparse keyword retrieval via BM25.

    Tokenises *query* by lowercased whitespace split, scores every
    document, and returns up to *k* positive-scoring docs tagged with a
    float ``"score"``.  Returns [] when BM25 is unavailable, the query
    has no tokens, or scoring fails (failure is logged, never raised).
    """
    if not state.bm25 or not state.documents:
        return []
    try:
        terms = query.lower().split()
        if not terms:
            return []
        weights = np.array(state.bm25.get_scores(terms), dtype="float32")
        top = np.argsort(weights)[::-1][:min(k, len(weights))]
        return [
            {**state.documents[i], "score": float(weights[i])}
            for i in top
            if weights[i] > 0
        ]
    except Exception as e:
        print(f"[ERROR] sparse: {e}")
        return []
313
 
314
 
315
def hybrid_search(query, k=5, alpha=0.65):
    """Weighted reciprocal-rank fusion of dense and sparse retrieval.

    Both retrievers are over-fetched (3k each) and fused with RRF:
    each hit contributes ``weight / (60 + rank + 1)`` keyed by its
    content, where the dense list carries weight *alpha* and the
    sparse list ``1 - alpha``.  Returns the top-*k* fused docs, each
    tagged with a rounded ``"rrf_score"``.
    """
    dense_hits = search_dense(query, k * 3)
    sparse_hits = search_sparse(query, k * 3)
    if not dense_hits and not sparse_hits:
        return []

    RRF_K = 60
    fused: dict = {}
    by_content: dict = {}

    def _accumulate(hits, weight):
        # Fold one ranked list into the fusion tables.
        for rank, hit in enumerate(hits):
            content = hit["content"]
            fused[content] = fused.get(content, 0.0) + weight / (RRF_K + rank + 1)
            by_content[content] = hit

    _accumulate(dense_hits, alpha)
    _accumulate(sparse_hits, 1 - alpha)

    ranked = sorted(fused.items(), key=lambda pair: pair[1], reverse=True)[:k]
    return [{**by_content[content], "rrf_score": round(score, 6)}
            for content, score in ranked]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
 
333
async def async_hybrid_search(query, k=5):
    """Run the blocking hybrid search in a worker thread so the event
    loop stays responsive while FAISS/BM25 crunch."""
    return await asyncio.to_thread(hybrid_search, query, k)
335
 
336
+ # ─────────────────────────────────────────────
337
+ # GENERATION β€” TinyLlama (local, no API key)
338
+ # ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
# Grounding instructions prepended (as the system turn) to every
# TinyLlama chat request built in _run_tinyllama.
SYSTEM_PROMPT = (
    "You are a helpful assistant for East West University (EWU). "
    "Answer using ONLY the context provided. "
    "If the context does not contain enough information, say so honestly. "
    "Be concise and accurate. Do not repeat the context."
)
346
 
347
def _run_tinyllama(query: str, context: str) -> str:
    """
    Blocking TinyLlama inference using the chat-template message format
    the 1.1B-Chat model was trained on.  Must always be invoked through
    asyncio.to_thread — never directly from async code — so the event
    loop is not stalled by a CPU-bound forward pass.
    """
    if state.generator is None:
        # No model loaded: surface the retrieved context as-is.
        return f"[Generator not loaded]\n\nContext:\n{context}"

    # Keep the prompt inside TinyLlama's 2048-token window:
    # cap the context at ~1500 characters.
    if len(context) > 1500:
        context_snippet = context[:1500] + "…"
    else:
        context_snippet = context

    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Context:\n{context_snippet}\n\nQuestion: {query}"},
    ]

    try:
        result = state.generator(
            chat,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.3,        # low temperature keeps answers grounded
            top_p=0.9,
            repetition_penalty=1.1,
        )
        generated = result[0]["generated_text"]

        # The pipeline may echo the whole conversation back as a list of
        # role/content turns; if so, return the assistant's final reply.
        if isinstance(generated, list):
            for turn in reversed(generated):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    return turn.get("content", "").strip()

        # Otherwise fall back to the raw generated string.
        return str(generated).strip()

    except Exception as e:
        print(f"[ERROR] TinyLlama inference: {e}")
        return f"[Generation error: {e}]"
389
 
390
 
391
async def generate(query: str, context: str) -> str:
    """Non-blocking facade over _run_tinyllama: the synchronous model
    call runs in a worker thread so the event loop stays free."""
    return await asyncio.to_thread(_run_tinyllama, query, context)
394
 
395
+ # ─────────────────────────────────────────────
396
+ # BOOT
397
+ # ─────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
def _load_generator():
    """Build the TinyLlama text-generation pipeline.

    Blocking — intended to run inside a thread during boot.  Returns
    the pipeline, or None when transformers is unavailable or the
    model fails to load (the server then degrades to context-only
    answers instead of crashing).
    """
    if not HF_OK:
        print("[WARN] transformers unavailable β€” generation disabled.")
        return None
    try:
        print(f" Loading TinyLlama on {DEVICE}…")
        pipeline_obj = hf_pipeline(
            "text-generation",
            model=GEN_MODEL,
            device=0 if DEVICE == "cuda" else -1,  # transformers pipeline: -1 selects CPU
            dtype="auto",
        )
        print(" TinyLlama ready.")
        return pipeline_obj
    except Exception as e:
        print(f"[ERROR] Could not load TinyLlama: {e}")
        return None
417
 
418
 
419
async def _boot():
    """One-shot startup sequence run as a background task.

    Order: load both models concurrently → fetch API + GitHub sources
    concurrently → chunk → build FAISS/BM25 indexes → flip state.ready.
    Any failure is captured in state.error (and /health) rather than
    crashing the process.
    """
    try:
        # 1. Both models load concurrently in worker threads.
        print(f"Loading models on {DEVICE}…")
        embedder_job = (
            asyncio.to_thread(SentenceTransformer, EMBED_MODEL, device=DEVICE)
            if ST_OK
            else asyncio.to_thread(lambda: None)
        )
        state.embedder, state.generator = await asyncio.gather(
            embedder_job,
            asyncio.to_thread(_load_generator),
        )
        if state.embedder:
            print(" Embedder ready.")

        # 2. Knowledge base: API and GitHub fetched concurrently.
        print("Fetching knowledge base (API + GitHub)…")
        api_docs, gh_docs = await asyncio.gather(
            load_api(), load_github(), return_exceptions=False,
        )
        raw_docs = api_docs + gh_docs
        print(f" Combined raw docs: {len(raw_docs)}")

        if not raw_docs:
            print("[WARN] No documents fetched.")

        # 3. Chunking is CPU work — keep it off the event loop.
        state.documents = await asyncio.to_thread(chunk_documents, raw_docs)
        print(f" Total chunks: {len(state.documents)}")

        # 4. Dense + sparse indexes, also off the event loop.
        print("Building indexes…")
        await asyncio.to_thread(build_indexes)

        state.ready = True
        print("βœ“ RAG server ready.")

    except Exception as e:
        state.error = str(e)
        state.ready = False
        print(f"[ERROR] Boot failed: {e}")
        import traceback
        traceback.print_exc()
458
 
459
 
460
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: start _boot in the background so the
    server answers immediately, and cancel it cleanly at shutdown so a
    still-running boot never outlives the process."""
    startup = asyncio.create_task(_boot())
    try:
        yield
    finally:
        startup.cancel()
        try:
            await startup
        except asyncio.CancelledError:
            pass
471
 
472
+ # ─────────────────────────────────────────────
473
+ # APP + ENDPOINTS
474
+ # ─────────────────────────────────────────────
475
 
476
+ app = FastAPI(title="EWU RAG Server", lifespan=lifespan)
477
 
478
 
479
class Query(BaseModel):
    """Request body for POST /rag: the user question plus how many
    fused chunks to retrieve as generation context."""
    query : str
    top_k : int = 5
482
 
483
 
484
@app.post("/rag")
async def rag_endpoint(q: Query):
    """Answer a question over the EWU knowledge base.

    Hybrid-retrieves the top chunks, joins them into one context block,
    and asks the generator.  Responds 503 while the server is still
    booting and 400 on a blank query.
    """
    if not state.ready:
        raise HTTPException(503, detail=state.error or "Still initializing β€” retry shortly.")
    if not q.query.strip():
        raise HTTPException(400, detail="Query must not be empty.")

    hits = await async_hybrid_search(q.query, k=q.top_k)
    if not hits:
        return {"answer": "No relevant information found.", "sources": []}

    combined_context = "\n\n---\n\n".join(hit["content"] for hit in hits)
    reply = await generate(q.query, combined_context)
    source_info = [
        {"source": hit.get("source"), "rrf_score": hit.get("rrf_score", 0)}
        for hit in hits
    ]
    return {"answer": reply, "sources": source_info}
500
 
501
 
502
@app.get("/health")
async def health():
    """Liveness/readiness probe.

    Reports the boot status and which RAG components actually came up,
    so operators can tell a sleeping boot from a failed one.

    BUG FIX: the original called ``JSONResponse(200, {...})``.
    Starlette's signature is ``JSONResponse(content, status_code=200)``,
    so ``200`` became the response *content* and the payload dict the
    *status_code*, breaking response rendering.  Arguments are now
    passed by keyword.
    """
    return JSONResponse(
        status_code=200,
        content={
            "status"    : "ready" if state.ready else ("error" if state.error else "loading"),
            "docs"      : len(state.documents),
            "device"    : DEVICE,
            "faiss"     : state.faiss_index is not None,
            "bm25"      : state.bm25 is not None,
            "generator" : state.generator is not None,
            "error"     : state.error or None,
        },
    )
513
 
514
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  if __name__ == "__main__":
516
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)