ruslanmv committed
Commit 215df55 · Parent: 6338f31
.gitignore CHANGED
@@ -32,3 +32,5 @@ Thumbs.db
 # RAG index files
 .faiss/
 /backup
+copy *.*
+* copy.*
Makefile CHANGED
@@ -55,6 +55,10 @@ help:
 	@printf " $(BRIGHT_GREEN)%-22s$(RESET) $(DIM_GREEN)%s$(RESET)\n" "run" "Run uvicorn (PORT=$(PORT))"
 	@printf " $(BRIGHT_GREEN)%-22s$(RESET) $(DIM_GREEN)%s$(RESET)\n" "run-hot" "Run with --reload"
 	@echo
+	@echo "$(BRIGHT_GREEN)RAG / Knowledge Base$(RESET)"
+	@printf " $(BRIGHT_GREEN)%-22s$(RESET) $(DIM_GREEN)%s$(RESET)\n" "kb" "Build/refresh KB from GitHub + local docs (writes data/kb.jsonl)"
+	@printf " $(BRIGHT_GREEN)%-22s$(RESET) $(DIM_GREEN)%s$(RESET)\n" "kb-force" "Force rebuild KB (deletes existing data/kb.jsonl)"
+	@echo
 	@echo "$(BRIGHT_GREEN)Docker$(RESET)"
 	@printf " $(BRIGHT_GREEN)%-22s$(RESET) $(DIM_GREEN)%s$(RESET)\n" "docker-build" "Build local image ($(IMG_NAME))"
 	@printf " $(BRIGHT_GREEN)%-22s$(RESET) $(DIM_GREEN)%s$(RESET)\n" "docker-run" "Run local container (maps $(PORT))"
@@ -100,6 +104,15 @@ run: install
 run-hot: install
 	@PORT=$(PORT) $(VENV_DIR)/bin/uvicorn $(APP_MODULE) --host 0.0.0.0 --port $(PORT) --reload
 
+# ---------------------------------------------------------------------------
+# RAG / Knowledge Base
+# ---------------------------------------------------------------------------
+kb: install
+	@PYTHONPATH=. $(PYTHON) scripts/build_kb.py --config configs/rag_sources.yaml --out data/kb.jsonl
+
+kb-force: install
+	@rm -f data/kb.jsonl && PYTHONPATH=. $(PYTHON) scripts/build_kb.py --config configs/rag_sources.yaml --out data/kb.jsonl
+
 # ---------------------------------------------------------------------------
 # Docker
 # ---------------------------------------------------------------------------
@@ -121,4 +134,4 @@ space-url:
 clean:
 	@rm -rf .venv __pycache__ .pytest_cache .ruff_cache .mypy_cache dist build *.egg-info
 
-.PHONY: help venv install lint fmt test run run-hot docker-build docker-run space-url clean
+.PHONY: help venv install lint fmt test run run-hot kb kb-force docker-build docker-run space-url clean
app/core/rag/build.py ADDED
@@ -0,0 +1,300 @@
+from __future__ import annotations
+import json, os, re, time, math, logging
+from pathlib import Path
+from typing import Dict, List, Iterable, Tuple, Optional
+
+import yaml
+import requests
+
+log = logging.getLogger(__name__)
+
+# -------------------------
+# Text cleaning & chunking
+# -------------------------
+
+_MD_FRONTMATTER = re.compile(r"^---\s*\n.*?\n---\s*\n", re.DOTALL)
+
+def normalize_text(text: str) -> str:
+    lines = [ln.strip() for ln in text.splitlines()]
+    cleaned = []
+    for ln in lines:
+        if not ln:
+            continue
+        if sum(ch.isalnum() for ch in ln) < 3:
+            continue
+        cleaned.append(ln)
+    s = "\n".join(cleaned)
+    s = re.sub(r"\n{3,}", "\n\n", s)
+    return s.strip()
+
+def md_to_text(md: str) -> str:
+    md = re.sub(_MD_FRONTMATTER, "", md)
+    md = re.sub(r"```.*?```", "", md, flags=re.DOTALL)  # drop fenced code
+    md = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", md)  # drop images
+    md = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", md)  # links -> label
+    md = re.sub(r"^\s{0,3}#{1,6}\s*", "", md, flags=re.MULTILINE)
+    md = md.replace("`", "")
+    md = re.sub(r"^\s*[-*+]\s+", "• ", md, flags=re.MULTILINE)
+    md = re.sub(r"^\s*>\s?", "", md, flags=re.MULTILINE)
+    return normalize_text(md)
+
+def chunk_text(text: str, max_chars: int = 800, overlap: int = 120) -> List[str]:
+    paras = [p.strip() for p in text.split("\n\n") if p.strip()]
+    out: List[str] = []
+    buf = ""
+    for p in paras:
+        if len(p) > max_chars:
+            i = 0
+            while i < len(p):
+                j = min(i + max_chars, len(p))
+                out.append(p[i:j])
+                i = j - overlap if j - overlap > i else j
+            continue
+        if len(buf) + 2 + len(p) <= max_chars:
+            buf = (buf + "\n\n" + p) if buf else p
+        else:
+            if buf:
+                out.append(buf)
+            buf = p
+    if buf:
+        out.append(buf)
+    return out
+
+def write_jsonl(records: Iterable[Dict], out_path: Path) -> None:
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", encoding="utf-8") as f:
+        for rec in records:
+            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+
+# -------------------------
+# GitHub API helpers
+# -------------------------
+
+def gh_session() -> requests.Session:
+    s = requests.Session()
+    s.headers.update({
+        "Accept": "application/vnd.github+json",
+        "User-Agent": "matrix-ai-rag-builder/1.0",
+    })
+    tok = os.getenv("GITHUB_TOKEN")
+    if tok:
+        s.headers["Authorization"] = f"Bearer {tok}"
+    return s
+
+def gh_get_json(url: str, sess: requests.Session, max_retries: int = 3) -> Dict | List:
+    backoff = 1.0
+    for attempt in range(max_retries):
+        r = sess.get(url, timeout=25)
+        if r.status_code == 403 and "rate limit" in r.text.lower():
+            log.warning("GitHub rate-limited; sleeping %.1fs", backoff)
+            time.sleep(backoff)
+            backoff = min(backoff * 2, 30)
+            continue
+        r.raise_for_status()
+        return r.json()
+    r.raise_for_status()
+    return {}
+
+def gh_list_org_repos(org: str, sess: requests.Session) -> List[Dict]:
+    repos: List[Dict] = []
+    page = 1
+    while True:
+        url = f"https://api.github.com/orgs/{org}/repos?per_page=100&page={page}"
+        js = gh_get_json(url, sess)
+        if not js:
+            break
+        repos.extend(js)
+        if len(js) < 100:
+            break
+        page += 1
+    return repos
+
+def gh_list_tree(owner: str, repo: str, branch: str, sess: requests.Session) -> List[Dict]:
+    url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
+    js = gh_get_json(url, sess)
+    return js.get("tree", []) if isinstance(js, dict) else []
+
+def gh_fetch_raw(owner: str, repo: str, branch: str, path: str, sess: requests.Session) -> Optional[str]:
+    raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
+    r = sess.get(raw_url, timeout=25)
+    if r.status_code == 404 and branch == "main":  # try master fallback
+        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/master/{path}"
+        r = sess.get(raw_url, timeout=25)
+    if r.status_code == 200:
+        return r.text
+    return None
+
+# -------------------------
+# Builders
+# -------------------------
+
+def ingest_github_repo(owner: str, name: str, branch: str, docs_paths: List[str],
+                       include_readme: bool, exts: Tuple[str,...] = (".md",".mdx",".txt")) -> List[Tuple[str,str]]:
+    sess = gh_session()
+    out: List[Tuple[str,str]] = []
+
+    # README
+    if include_readme:
+        for candidate in ("README.md", "readme.md", "README.MD"):
+            t = gh_fetch_raw(owner, name, branch, candidate, sess)
+            if t:
+                out.append((f"github:{owner}/{name}/{candidate}", md_to_text(t)))
+                break
+
+    # Tree -> docs paths
+    tree = gh_list_tree(owner, name, branch, sess)
+    if not tree:
+        return out
+
+    wanted_dirs = [p.strip("/").lower() for p in docs_paths]
+    for entry in tree:
+        if entry.get("type") != "blob":
+            continue
+        path = entry.get("path", "")
+        lower = path.lower()
+        if not lower.endswith(exts):
+            continue
+        if any(lower.startswith(d + "/") for d in wanted_dirs):
+            t = gh_fetch_raw(owner, name, branch, path, sess)
+            if not t:
+                continue
+            txt = md_to_text(t) if lower.endswith((".md",".mdx")) else normalize_text(t)
+            if txt:
+                out.append((f"github:{owner}/{name}/{path}", txt))
+    return out
+
+def ingest_github_sources(cfg: Dict) -> List[Tuple[str,str]]:
+    out: List[Tuple[str,str]] = []
+    gh = cfg.get("github") or {}
+    sess = gh_session()
+
+    # explicit repos
+    for repo in (gh.get("repos") or []):
+        owner = repo["owner"]
+        name = repo["name"]
+        branch = repo.get("branch", "main")
+        docs_paths = repo.get("docs_paths", ["docs"])
+        include_readme = bool(repo.get("include_readme", True))
+        out.extend(ingest_github_repo(owner, name, branch, docs_paths, include_readme))
+
+    # whole org scan (README + docs/)
+    for org in (gh.get("orgs") or []):
+        try:
+            repos = gh_list_org_repos(org, sess)
+        except Exception as e:
+            log.warning("Failed to list org %s: %s", org, e)
+            continue
+        for r in repos:
+            owner = r["owner"]["login"]
+            name = r["name"]
+            default_branch = r.get("default_branch", "main")
+            # README + docs/
+            out.extend(ingest_github_repo(owner, name, default_branch, ["docs"], include_readme=True))
+    return out
+
+def ingest_local_sources(cfg: Dict) -> List[Tuple[str,str]]:
+    out: List[Tuple[str,str]] = []
+    local = cfg.get("local") or {}
+    paths = local.get("paths") or []
+    glob_pat = local.get("glob", "**/*.md")
+    for p in paths:
+        fp = Path(p)
+        if fp.is_file():
+            try:
+                raw = fp.read_text(encoding="utf-8", errors="ignore")
+                txt = md_to_text(raw) if fp.suffix.lower() in {".md",".mdx"} else normalize_text(raw)
+                if txt:
+                    out.append((str(fp), txt))
+            except Exception as e:
+                log.warning("Failed reading %s: %s", fp, e)
+        elif fp.is_dir():
+            for f in fp.rglob(glob_pat):
+                try:
+                    raw = f.read_text(encoding="utf-8", errors="ignore")
+                    txt = md_to_text(raw) if f.suffix.lower() in {".md",".mdx"} else normalize_text(raw)
+                    if txt:
+                        out.append((str(f), txt))
+                except Exception as e:
+                    log.warning("Failed reading %s: %s", f, e)
+    return out
+
+def build_kb_from_config(config_path: str = "configs/rag_sources.yaml",
+                         out_jsonl: str = "data/kb.jsonl",
+                         max_chars: int = 800,
+                         overlap: int = 120,
+                         minlen: int = 200,
+                         dedupe: bool = True) -> int:
+    cfg: Dict = {}
+    p = Path(config_path)
+    if p.exists():
+        cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
+    else:
+        log.warning("rag_sources.yaml not found at %s (using defaults)", p)
+
+    records: List[Dict] = []
+
+    # GitHub
+    try:
+        gh_docs = ingest_github_sources(cfg)
+        for src, text in gh_docs:
+            for chunk in chunk_text(text, max_chars, overlap):
+                if len(chunk) >= minlen:
+                    records.append({"text": chunk, "source": src})
+    except Exception as e:
+        log.warning("GitHub ingest failed: %s", e)
+
+    # Local
+    try:
+        loc_docs = ingest_local_sources(cfg)
+        for src, text in loc_docs:
+            for chunk in chunk_text(text, max_chars, overlap):
+                if len(chunk) >= minlen:
+                    records.append({"text": chunk, "source": src})
+    except Exception as e:
+        log.warning("Local ingest failed: %s", e)
+
+    # URLs (optional)
+    for url in (cfg.get("urls") or []):
+        try:
+            r = requests.get(url, timeout=25)
+            r.raise_for_status()
+            txt = normalize_text(r.text)
+            for chunk in chunk_text(txt, max_chars, overlap):
+                if len(chunk) >= minlen:
+                    records.append({"text": chunk, "source": url})
+        except Exception as e:
+            log.warning("URL ingest failed for %s: %s", url, e)
+
+    if dedupe:
+        seen = set()
+        deduped: List[Dict] = []
+        for rec in records:
+            h = hash(rec["text"])
+            if h in seen:
+                continue
+            seen.add(h)
+            deduped.append(rec)
+        records = deduped
+
+    if not records:
+        log.warning("No KB records produced.")
+        return 0
+
+    out_path = Path(out_jsonl)
+    write_jsonl(records, out_path)
+    log.info("Wrote %d chunks to %s", len(records), out_path)
+    return len(records)
+
+def ensure_kb(out_jsonl: str = "data/kb.jsonl",
+              config_path: str = "configs/rag_sources.yaml",
+              skip_if_exists: bool = True) -> bool:
+    """
+    If kb.jsonl exists -> return True.
+    Else -> build from config and return True on success.
+    """
+    out = Path(out_jsonl)
+    if skip_if_exists and out.exists() and out.stat().st_size > 0:
+        log.info("KB already present at %s (skipping build)", out)
+        return True
+    n = build_kb_from_config(config_path=config_path, out_jsonl=out_jsonl)
+    return n > 0
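For orientation, a small sketch (not part of the commit) of how the pieces above behave; it assumes this repo's layout and is run from the project root with PYTHONPATH=. set:

# Sketch: exercise chunk_text() and ensure_kb() directly.
from app.core.rag.build import chunk_text, ensure_kb

text = ("A" * 1000) + "\n\n" + "a short trailing paragraph"
chunks = chunk_text(text, max_chars=800, overlap=120)
# The 1000-char paragraph is windowed (an 800-char chunk, then overlapping
# tails); the short paragraph becomes its own chunk.
print([len(c) for c in chunks])

# ensure_kb() is idempotent: it skips the (network-bound) build whenever
# data/kb.jsonl already exists and is non-empty.
print(ensure_kb(out_jsonl="data/kb.jsonl", config_path="configs/rag_sources.yaml"))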
app/main.py CHANGED
@@ -1,3 +1,4 @@
+# app/main.py
 from __future__ import annotations
 
 import logging
@@ -9,19 +10,16 @@ from typing import Any, Dict
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
 
-# --- ADDED: Import dependencies needed for pre-loading ---
-from .deps import get_settings
-from .services.chat_service import get_retriever
-
-# -----------------------------------------------------------------------------
-# Early: load .env (so HF_TOKEN, ADMIN_TOKEN, etc. are available locally)
-# -----------------------------------------------------------------------------
+# ---- Early env load (HF_TOKEN, ADMIN_TOKEN, GITHUB_TOKEN, etc.) ----
 def _load_env_file(paths: list[str]) -> None:
-    """Load environment variables from the first existing path in `paths`.
-    Prefer python-dotenv if present; otherwise use a tiny fallback parser."""
+    """
+    Load environment variables from the first existing path in `paths`.
+    Prefer python-dotenv if present; otherwise use a tiny fallback parser.
+    Does not override pre-existing env vars (e.g., Space Secrets).
+    """
     logger = logging.getLogger("uvicorn.error")
 
-    # 1) Try python-dotenv (best)
+    # 1) Try python-dotenv
     try:
         from dotenv import load_dotenv  # type: ignore
         for p in paths:
@@ -32,7 +30,7 @@ def _load_env_file(paths: list[str]) -> None:
         logger.info("No .env file found in %s (skipping)", paths)
         return
     except Exception:
-        # 2) Fallback: simple parser
+        # 2) Fallback minimal parser
        for p in paths:
            if not os.path.exists(p):
                continue
@@ -53,7 +51,7 @@ def _load_env_file(paths: list[str]) -> None:
                     val.startswith("'") and val.endswith("'")
                 ):
                     val = val[1:-1]
-                # do not clobber existing env (Space Secrets)
+                # do not clobber existing env (e.g., HF Secrets)
                 os.environ.setdefault(key, val)
             logger.info("Loaded environment from %s (fallback parser)", p)
             return
@@ -62,13 +60,17 @@ def _load_env_file(paths: list[str]) -> None:
 
     logger.info("No .env loaded (none found / parsers failed)")
 
-# Try typical locations for local dev. HF Spaces will ignore this and use Secrets.
+# Try common local locations. HF Spaces will rely on Secrets instead.
 _load_env_file([".env", "configs/.env", ".env.local", "configs/.env.local"])
 
 
-# -----------------------------------------------------------------------------
-# Middlewares
-# -----------------------------------------------------------------------------
+# ---- RAG bootstrap & warm-up ----
+from .deps import get_settings
+from .services.chat_service import get_retriever
+from .core.rag.build import ensure_kb
+
+
+# ---- Middlewares ----
 try:
     from .middleware import attach_middlewares  # singular
 except Exception:
@@ -80,11 +82,11 @@ except Exception:
         "attach_middlewares not found; continuing without custom middlewares."
     )
 
-# -----------------------------------------------------------------------------
-# Routers
-# -----------------------------------------------------------------------------
+
+# ---- Routers ----
 from .routers import health, plan, chat
 
+# Optional UI bundle (/, /chat, /dev)
 try:
     from .ui import router as ui_router  # type: ignore
     HAS_UI = True
@@ -105,12 +107,27 @@ async def lifespan(app: FastAPI):
     app.state.started_at = time.time()
     app.state.version = os.getenv("APP_VERSION", "1.0.0")
 
-    # --- ADDED: Pre-load the RAG model and index on startup ---
     logger = logging.getLogger("uvicorn.error")
+
+    # 1) Build KB on first boot (skips if already present)
+    try:
+        if ensure_kb(
+            out_jsonl="data/kb.jsonl",
+            config_path="configs/rag_sources.yaml",
+            skip_if_exists=True,
+        ):
+            logger.info("KB ready at data/kb.jsonl")
+        else:
+            logger.warning("KB build produced no records; running LLM-only.")
+    except Exception as e:
+        logger.warning("KB build failed (%s); running LLM-only.", e)
+
+    # 2) Warm up RAG retriever (indexes data/kb.jsonl if present)
     logger.info("Warming up RAG retriever...")
     get_retriever(get_settings())
     logger.info("RAG retriever is ready.")
-
+
+    # 3) Boot log
     hf_token_present = bool(os.getenv("HF_TOKEN"))
     logger.info(
         "matrix-ai starting (version=%s, port=%s, hf_token_present=%s)",
@@ -118,13 +135,12 @@ async def lifespan(app: FastAPI):
         os.getenv("PORT", "7860"),
         "yes" if hf_token_present else "no",
     )
+
     try:
         yield
     finally:
         uptime = time.time() - getattr(app.state, "started_at", time.time())
-        logger.info(
-            "matrix-ai shutting down (uptime=%.2fs)", uptime
-        )
+        logger.info("matrix-ai shutting down (uptime=%.2fs)", uptime)
 
 
 def create_app() -> FastAPI:
@@ -138,14 +154,19 @@ def create_app() -> FastAPI:
         lifespan=lifespan,
     )
 
+    # Middlewares (gzip, CORS, rate-limit, req-logs, etc.)
     attach_middlewares(app)
+
+    # Core routers
     app.include_router(health.router, tags=["Health"])
     app.include_router(plan.router, prefix="/v1", tags=["Planning"])
     app.include_router(chat.router, prefix="/v1", tags=["Chat"])
 
+    # UI (/, /chat, /dev). Your ui.py already defines "/" → /chat
     if HAS_UI:
        app.include_router(ui_router, tags=["UI"])
     else:
+        # Minimal root so HF root probes pass even without UI
         @app.get("/", include_in_schema=False)
         async def root() -> Dict[str, Any]:
             return {
@@ -155,9 +176,12 @@ def create_app() -> FastAPI:
                 "docs": "/docs",
                 "endpoints": {"plan": "/v1/plan", "chat": "/v1/chat", "healthz": "/healthz"},
             }
+
         @app.get("/home", include_in_schema=False)
         async def home_redirect():
             return RedirectResponse(url="/docs", status_code=302)
+
     return app
 
-app = create_app()
+
+app = create_app()
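A quick way to verify the new startup sequence end to end is to boot the app under Starlette's TestClient, which runs the lifespan hook (KB build or skip, then retriever warm-up) before serving the first request. A minimal sketch (not part of the commit), assuming fastapi and httpx are installed and the health router exposes /healthz as the endpoints map above suggests:

# Sketch: entering the TestClient context runs the lifespan hook above,
# i.e. ensure_kb() followed by get_retriever(), before any request.
from fastapi.testclient import TestClient
from app.main import create_app

with TestClient(create_app()) as client:
    resp = client.get("/healthz")
    print(resp.status_code)  # expect 200 once warm-up has completed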
app/middleware.py CHANGED
@@ -1,30 +1,68 @@
 import time
 import logging
+import json
 from typing import Callable
 from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from starlette.middleware.gzip import GZipMiddleware
-from pythonjsonlogger import jsonlogger
+
+# Try to import python-json-logger; fall back to a tiny JSON formatter if missing.
+try:
+    from pythonjsonlogger import jsonlogger  # type: ignore[import-not-found]
+    _HAS_PY_JSON_LOGGER = True
+except Exception:
+    _HAS_PY_JSON_LOGGER = False
+
 from .deps import get_settings
 from .core.rate_limit import RateLimiter
 from .core.logging import add_trace_id
 
+# ---- Fallback JSON formatter (if python-json-logger isn't available) ----
+class _SimpleJsonFormatter(logging.Formatter):
+    def format(self, record: logging.LogRecord) -> str:
+        payload = {
+            "asctime": self.formatTime(record, "%Y-%m-%d %H:%M:%S"),
+            "name": record.name,
+            "levelname": record.levelname,
+            "message": record.getMessage(),
+            # We attach trace_id via logger.info(..., extra={"trace_id": "..."}).
+            "trace_id": getattr(record, "trace_id", None),
+        }
+        try:
+            return json.dumps(payload, ensure_ascii=False)
+        except Exception:
+            # Last-ditch plain log if JSON serialization ever fails
+            return (
+                f'{payload["asctime"]} {payload["name"]} {payload["levelname"]} '
+                f'{payload["message"]} trace_id={payload["trace_id"]}'
+            )
+
 # Setup structured logging
 logger = logging.getLogger("matrix-ai")
 if not logger.handlers:
     logger.setLevel(logging.INFO)
     handler = logging.StreamHandler()
-    formatter = jsonlogger.JsonFormatter(
-        '%(asctime)s %(name)s %(levelname)s %(message)s %(trace_id)s'
-    )
+    if _HAS_PY_JSON_LOGGER:
+        # Same fields you had; python-json-logger builds JSON from this format string
+        formatter = jsonlogger.JsonFormatter(
+            "%(asctime)s %(name)s %(levelname)s %(message)s %(trace_id)s"
+        )
+    else:
+        formatter = _SimpleJsonFormatter()
+        logging.getLogger("uvicorn.error").warning(
+            "python-json-logger not found; using a minimal JSON formatter."
+        )
     handler.setFormatter(formatter)
     logger.addHandler(handler)
 
 _rate_limiter = RateLimiter()
 
-def attach_middlewares(app: FastAPI):
+def attach_middlewares(app: FastAPI) -> None:
     """Attaches all required middlewares to the FastAPI app."""
+    # NOTE: We keep GZip, but your SSE endpoints already set `Content-Encoding: identity`
+    # so they won't be buffered/compressed.
     app.add_middleware(GZipMiddleware, minimum_size=512)
+
     app.add_middleware(
         CORSMiddleware,
         allow_origins=["*"],
@@ -35,20 +73,25 @@ def attach_middlewares(app: FastAPI):
 
     @app.middleware("http")
     async def rate_limit_and_log_middleware(request: Request, call_next: Callable):
+        # Attach per-request trace id
         add_trace_id(request)
+
         settings = get_settings()
         client_ip = request.client.host if request.client else "unknown"
 
-        if not _rate_limiter.allow(client_ip, request.url.path, settings.limits.rate_per_min):
+        # Simple fixed-window limiter
+        if not _rate_limiter.allow(
+            client_ip, request.url.path, settings.limits.rate_per_min
+        ):
             return Response(status_code=429, content="Rate limit exceeded")
 
         start_time = time.time()
         response = await call_next(request)
-        process_time = (time.time() - start_time) * 1000
+        process_time = (time.time() - start_time) * 1000.0
        response.headers["X-Process-Time-Ms"] = f"{process_time:.2f}"
 
         logger.info(
             f'"{request.method} {request.url.path}" {response.status_code}',
-            extra={'trace_id': getattr(request.state, 'trace_id', 'N/A')}
+            extra={"trace_id": getattr(request.state, "trace_id", "N/A")},
         )
         return response
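To see what the fallback path emits when python-json-logger is absent, a small sketch (not part of the commit) using the _SimpleJsonFormatter defined above; the printed timestamp is illustrative:

# Sketch: wire the fallback formatter to a logger and pass trace_id via
# `extra`, exactly as the middleware's logger.info call does.
import logging

demo = logging.getLogger("demo")
h = logging.StreamHandler()
h.setFormatter(_SimpleJsonFormatter())
demo.addHandler(h)
demo.warning('"GET /healthz" 200', extra={"trace_id": "abc-123"})
# -> {"asctime": "2025-01-01 12:00:00", "name": "demo", "levelname": "WARNING",
#     "message": "\"GET /healthz\" 200", "trace_id": "abc-123"}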
app/routers/chat.py CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import json
-from typing import Any, AsyncIterator, List, Optional
+from typing import Any, Iterator, List, Optional
 
 from fastapi import APIRouter, Depends, HTTPException, Query
 from pydantic import BaseModel, Field
-from starlette.concurrency import run_in_threadpool
+from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
 from starlette.responses import StreamingResponse
 
 from ..deps import get_settings
@@ -27,12 +27,16 @@ class ChatRequest(BaseModel):
     messages: Optional[List[ChatMessage]] = None
 
     def as_text(self) -> str:
-        if self.query: return self.query
-        if self.question: return self.question
-        if self.prompt: return self.prompt
+        if self.query:
+            return self.query
+        if self.question:
+            return self.question
+        if self.prompt:
+            return self.prompt
         if self.messages:
             for m in reversed(self.messages):
-                if m.role.lower() == "user": return m.content
+                if m.role.lower() == "user":
+                    return m.content
             return self.messages[-1].content
         raise ValueError("Body must include 'query'/'question'/'prompt' or 'messages'")
 
@@ -50,7 +54,7 @@ async def chat(req: ChatRequest, settings: Settings = Depends(get_settings)):
         raise HTTPException(status_code=422, detail=str(e))
     svc = ChatService(settings)
     try:
-        # Run the blocking call in a thread pool to avoid freezing the server
+        # run blocking client in a threadpool
         answer, sources = await run_in_threadpool(svc.answer_with_sources, text)
         return ChatResponse(answer=answer, sources=sources)
     except PermissionError as e:
@@ -63,7 +67,6 @@ async def chat(req: ChatRequest, settings: Settings = Depends(get_settings)):
 async def chat_get(query: str = Query(...), settings: Settings = Depends(get_settings)):
     svc = ChatService(settings)
     try:
-        # Run the blocking call in a thread pool
         answer, sources = await run_in_threadpool(svc.answer_with_sources, query)
         return ChatResponse(answer=answer, sources=sources)
     except PermissionError as e:
@@ -72,7 +75,6 @@ async def chat_get(query: str = Query(...), settings: Settings = Depends(get_set
         raise HTTPException(status_code=502, detail=f"Inference error: {e}")
 
 
-# ---------- Streaming (SSE) ----------
 def _sse_line(obj: Any) -> str:
     payload = obj if isinstance(obj, str) else json.dumps(obj, ensure_ascii=False)
     return f"data: {payload}\n\n"
@@ -80,25 +82,29 @@ def _sse_line(obj: Any) -> str:
 
 @router.get("/chat/stream")
 async def chat_stream(query: str = Query(...), settings: Settings = Depends(get_settings)):
+    """
+    SSE of token deltas. We iterate the sync streaming client in a threadpool
+    so the event loop stays free.
+    """
     svc = ChatService(settings)
 
-    async def gen() -> AsyncIterator[str]:
-        # Anti-buffer padding and initial ping
+    def sync_stream() -> Iterator[str]:
+        # send anti-buffer padding + ping immediately
         yield ":" + (" " * 2048) + "\n\n"
+        yield "retry: 1500\n\n"
         yield "event: ping\ndata: 0\n\n"
-
+
+        any_tokens = False
         try:
-            # Run the blocking retrieval part in a thread pool, then stream the results
-            stream_generator = await run_in_threadpool(svc.stream_answer, query)
-            any_tokens = False
-            for token in stream_generator:
+            for token in svc.stream_answer(query):
                 if token:
                     any_tokens = True
                     yield _sse_line({"delta": token})
-
             if not any_tokens:
                 yield _sse_line({"delta": ""})
             yield _sse_line("[DONE]")
+        except GeneratorExit:
+            return
         except Exception as e:
             yield _sse_line({"error": str(e)})
 
@@ -108,7 +114,12 @@ async def chat_stream(query: str = Query(...), settings: Settings = Depends(get_
         "Connection": "keep-alive",
         "Content-Encoding": "identity",
     }
-    return StreamingResponse(gen(), media_type="text/event-stream; charset=utf-8", headers=headers)
+    # iterate the sync generator in a threadpool (non-blocking for the loop)
+    return StreamingResponse(
+        iterate_in_threadpool(sync_stream()),
+        media_type="text/event-stream; charset=utf-8",
+        headers=headers,
+    )
 
 
 @router.post("/chat/stream")
@@ -117,4 +128,4 @@ async def chat_stream_post(req: ChatRequest, settings: Settings = Depends(get_se
         q = req.as_text()
     except ValueError as e:
         raise HTTPException(status_code=422, detail=str(e))
-    return await chat_stream(query=q, settings=settings)
+    return await chat_stream(query=q, settings=settings)
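For reference, a minimal SSE consumer for the GET endpoint (a sketch, not part of the commit; assumes the server is listening on localhost:7860, the default PORT used elsewhere in this commit):

# Sketch: stream token deltas from /v1/chat/stream. SSE comment lines (the
# ":" padding), the retry hint, and blank separators carry no "data: " payload,
# so they are skipped; the ping's "data: 0" is filtered by the isinstance check.
import json
import requests

url = "http://localhost:7860/v1/chat/stream"
with requests.get(url, params={"query": "What is MatrixHub?"}, stream=True, timeout=60) as r:
    for line in r.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        msg = json.loads(payload)
        if isinstance(msg, dict) and "delta" in msg:
            print(msg["delta"], end="", flush=True)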
app/services/chat_service.py CHANGED
@@ -1,9 +1,12 @@
+# app/services/chat_service.py
 from __future__ import annotations
 
 import logging
 import os
+import re
+import threading
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple, Dict, Optional
 
 from ..core.config import Settings
 from ..core.inference.client import RouterRequestsClient
@@ -11,33 +14,203 @@ from ..core.rag.retriever import Retriever
 
 logger = logging.getLogger(__name__)
 
+# --- Optional cross-encoder reranker (graceful fallback) ---
+try:
+    from sentence_transformers import CrossEncoder  # type: ignore
+except Exception:  # pragma: no cover
+    CrossEncoder = None  # type: ignore
+
 SYSTEM_PROMPT = (
-    "You are MATRIX-AI, a concise, helpful assistant for the Matrix EcoSystem. "
-    "Answer clearly and briefly. If unsure, say so."
+    "You are MATRIX-AI, a concise, helpful assistant for the Matrix EcoSystem.\n"
+    "You MUST use the provided CONTEXT. If an answer is not supported by the context, say you don't know.\n"
+    "Prefer short, clear sentences. Include product/feature names exactly as written in the context.\n"
 )
 
-# --- Singleton instance for the expensive Retriever class ---
-_retriever_instance: Retriever | None = None
+# Thread-safe singleton retriever
+_retriever_instance: Optional[Retriever] = None
+_retriever_lock = threading.Lock()
+
 
-def get_retriever(settings: Settings) -> Retriever | None:
-    """Initializes and returns a single instance of the Retriever."""
+def get_retriever(settings: Settings) -> Optional[Retriever]:
+    """Initialize and return a single Retriever instance (double-checked locking)."""
     global _retriever_instance
     if _retriever_instance is not None:
         return _retriever_instance
 
     kb_path = os.getenv("RAG_KB_PATH", "data/kb.jsonl")
-    try:
-        if Path(kb_path).exists():
-            _retriever_instance = Retriever(kb_path=kb_path, top_k=settings.rag.top_k)
-            logger.info("RAG enabled with KB at %s (top_k=%d)", kb_path, settings.rag.top_k)
-        else:
-            logger.info("RAG KB not found at %s — running LLM-only.", kb_path)
-    except Exception as e:
-        logger.warning("RAG disabled (failed to initialize Retriever: %s)", e)
-
-    return _retriever_instance
+    if not Path(kb_path).exists():
+        logger.info("RAG KB not found at %s — running LLM-only.", kb_path)
+        return None
+
+    with _retriever_lock:
+        if _retriever_instance is not None:
+            return _retriever_instance
+        try:
+            _retriever_instance = Retriever(kb_path=kb_path, top_k=settings.rag.top_k)
+            logger.info("RAG enabled with KB at %s (top_k=%d)", kb_path, settings.rag.top_k)
+        except Exception as e:
+            logger.warning("RAG disabled (failed to initialize Retriever: %s)", e)
+            _retriever_instance = None
+        return _retriever_instance
+
+
+# ----------------------------
+# RAG utilities (ranking & snippets)
+# ----------------------------
+
+_ALIAS_TABLE: Dict[str, List[str]] = {
+    # canonical -> aliases
+    "matrixhub": ["matrix hub", "matrixhub", "hub api", "catalog", "registry", "cas"],
+    "mcp": ["model context protocol", "mcp", "manifest", "server manifest", "admin api"],
+    "agent-matrix": ["agent-matrix", "matrix agents", "matrix ecosystem", "matrix toolkit"],
+}
+
+_WORD_RE = re.compile(r"[A-Za-z0-9_]+")
+
+
+def _normalize(text: str) -> List[str]:
+    return [t.lower() for t in _WORD_RE.findall(text)]
+
+
+def _expand_query(q: str) -> str:
+    """Add domain aliases to help the embedding retrieve the right docs."""
+    ql = q.lower()
+    extras: List[str] = []
+    for canon, variants in _ALIAS_TABLE.items():
+        if any(v in ql for v in variants):
+            extras.extend([canon] + variants)
+    if extras:
+        return q + " | " + " ".join(sorted(set(extras)))
+    return q
+
+
+def _keyword_overlap_score(query: str, text: str) -> float:
+    """Simple lexical grounding score: Jaccard over unique tokens (stopword-agnostic)."""
+    q_tokens = set(_normalize(query))
+    d_tokens = set(_normalize(text))
+    if not q_tokens or not d_tokens:
+        return 0.0
+    inter = len(q_tokens & d_tokens)
+    union = len(q_tokens | d_tokens)
+    return inter / max(1, union)
+
+
+def _domain_boost(text: str) -> float:
+    t = text.lower()
+    boost = 0.0
+    for term in ("matrixhub", "hub api", "catalog", "mcp", "server manifest", "cas"):
+        if term in t:
+            boost += 0.05
+    return min(boost, 0.25)
+
+
+def _best_paragraphs(text: str, query: str, max_chars: int = 700) -> str:
+    """
+    Split by blank lines and pick 1-2 best paragraphs by lexical overlap.
+    Keep it compact to avoid swamping the LLM.
+    """
+    paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+    if not paras:
+        return text[:max_chars]
+
+    scored = [(p, _keyword_overlap_score(query, p)) for p in paras]
+    scored.sort(key=lambda x: x[1], reverse=True)
+    picked: List[str] = []
+    used = 0
+    for p, _s in scored[:4]:
+        if used >= max_chars:
+            break
+        picked.append(p)
+        used += len(p) + 2
+        if used >= max_chars or len(picked) >= 2:
+            break
+    return "\n".join(picked)
+
+
+def _cross_encoder_scores(
+    model: Optional["CrossEncoder"],
+    query: str,
+    docs: List[Dict],
+    max_pairs: int = 50,
+) -> Optional[List[float]]:
+    if not model:
+        return None
+    try:
+        pairs = [(query, d["text"][:1200]) for d in docs[:max_pairs]]
+        return list(model.predict(pairs))
+    except Exception as e:
+        logger.warning("Cross-encoder scoring failed; continuing without it (%s)", e)
+        return None
+
+
+def _rerank_docs(
+    docs: List[Dict],
+    query: str,
+    k_final: int,
+    reranker: Optional["CrossEncoder"] = None,
+) -> List[Dict]:
+    """
+    Combine vector score, keyword overlap, domain boost, and optional cross-encoder.
+    Score = 0.55*vec + 0.35*lex + 0.10*boost (+ 0.20*ce if available, rescaled).
+    """
+    if not docs:
+        return []
+
+    # Normalize vector scores (cosine similarities 0..1-ish) to 0..1
+    vec_scores = [float(d.get("score", 0.0)) for d in docs]
+    if vec_scores:
+        vmin = min(vec_scores)
+        vmax = max(vec_scores)
+        rng = max(1e-6, (vmax - vmin))
+        vec_norm = [(v - vmin) / rng for v in vec_scores]
+    else:
+        vec_norm = [0.0] * len(docs)
+
+    lex_scores = [_keyword_overlap_score(query, d["text"]) for d in docs]
+    boosts = [_domain_boost(d["text"]) for d in docs]
+
+    ce_scores = _cross_encoder_scores(reranker, query, docs)
+    if ce_scores:
+        # Min-max normalize CE too
+        cmin, cmax = min(ce_scores), max(ce_scores)
+        crng = max(1e-6, (cmax - cmin))
+        ce_norm = [(c - cmin) / crng for c in ce_scores]
+    else:
+        ce_norm = None
+
+    merged: List[Tuple[float, Dict]] = []
+    for i, d in enumerate(docs):
+        score = 0.55 * vec_norm[i] + 0.35 * lex_scores[i] + 0.10 * boosts[i]
+        if ce_norm is not None:
+            score = 0.80 * score + 0.20 * ce_norm[i]
+        merged.append((score, d))
+
+    merged.sort(key=lambda x: x[0], reverse=True)
+    top = [d for _s, d in merged[:k_final]]
+    return top
 
 
+def _build_context_from_docs(docs: List[Dict], query: str, max_blocks: int = 4) -> Tuple[str, List[str]]:
+    """
+    Build a compact CONTEXT section using best paragraphs from top docs.
+    Return (context_text, sources).
+    """
+    blocks: List[str] = []
+    sources: List[str] = []
+    for i, d in enumerate(docs[:max_blocks]):
+        snip = _best_paragraphs(d["text"], query, max_chars=700)
+        src = d.get("source", f"kb:{i}")
+        blocks.append(f"[{i+1}] {snip}\n(source: {src})")
+        sources.append(src)
+    if not blocks:
+        return "", []
+    prelude = "CONTEXT (use only these facts; if missing, say you don't know):"
+    return prelude + "\n\n" + "\n\n".join(blocks), sources
+
+
+# ----------------------------
+# Service
+# ----------------------------
 class ChatService:
     def __init__(self, settings: Settings):
         self.settings = settings
@@ -46,51 +219,80 @@ class ChatService:
             fallback=settings.model.fallback,
             provider=getattr(settings.model, "provider", None),
             max_retries=2,
+            connect_timeout=10.0,
+            read_timeout=60.0,
         )
-        # Get the singleton retriever instance
+        # RAG (singleton)
         self.retriever = get_retriever(settings)
 
-    def _build_context(self, query: str) -> Tuple[str, List[str]]:
+        # Optional cross-encoder (large; disable via env RAG_RERANK=false)
+        self.reranker = None
+        use_rerank = os.getenv("RAG_RERANK", "true").lower() in ("1", "true", "yes")
+        if use_rerank and CrossEncoder is not None:
+            try:
+                self.reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-2-v2")
+                logger.info("RAG cross-encoder reranker enabled.")
+            except Exception as e:
+                logger.warning("Reranker disabled: %s", e)
+
+    # ---------- RAG core ----------
+    def _retrieve_best(self, query: str) -> Tuple[str, List[str]]:
+        """
+        Retrieve many, rerank, and build a compact, high-signal CONTEXT.
+        Returns (context_text, sources).
+        """
         if not self.retriever:
             return "", []
-        docs = self.retriever.retrieve(query, self.settings.rag.top_k)
-        if not docs:
+
+        expanded = _expand_query(query)
+        # Retrieve a wider candidate pool, then rerank.
+        k_base = max(4, int(self.settings.rag.top_k) * 5)
+        try:
+            cands = self.retriever.retrieve(expanded, k=k_base)  # [{'text','source','score'}]
+        except Exception as e:
+            logger.warning("Retriever failed (%s); falling back to LLM-only.", e)
+            return "", []
+
+        if not cands:
             return "", []
-        blocks = [f"[{i+1}] {d['text']} (source: {d['source']})" for i, d in enumerate(docs)]
-        context = "CONTEXT (use only these facts; if missing, say you don't know):\n" + "\n\n".join(blocks)
-        sources = [d["source"] for d in docs]
-        return context, sources
+
+        top = _rerank_docs(cands, query, k_final=max(3, self.settings.rag.top_k), reranker=self.reranker)
+        ctx, sources = _build_context_from_docs(top, query, max_blocks=max(3, self.settings.rag.top_k))
+        return ctx, sources
 
     def _augment(self, query: str) -> Tuple[str, List[str]]:
         """
         Build the final user message (with optional CONTEXT) and return sources.
         """
-        ctx, sources = self._build_context(query)
-
-        # --- THIS IS THE CORRECTED PROMPT ---
+        ctx, sources = self._retrieve_best(query)
         if ctx:
-            # New, clearer instruction format
-            augmented = f"{ctx}\n\nBased only on the context provided above, answer the following question.\nQuestion: {query}"
+            user_msg = (
+                f"{ctx}\n\n"
+                "Based only on the context above, answer the question succinctly.\n"
+                f"Question: {query}\nAnswer:"
+            )
         else:
-            # If no context, just pass the original query
-            augmented = query
-
-        return augmented, sources
+            user_msg = query  # LLM-only fallback
+        return user_msg, sources
 
-    # Note: These methods are now called from a thread pool in the router
+    # ---------- Non-stream ----------
     def answer_with_sources(self, query: str) -> Tuple[str, List[str]]:
         user_msg, sources = self._augment(query)
         text = self.client.chat_nonstream(
-            SYSTEM_PROMPT, user_msg,
+            SYSTEM_PROMPT,
+            user_msg,
            max_tokens=self.settings.model.max_new_tokens,
            temperature=self.settings.model.temperature,
         )
         return text, sources
 
+    # ---------- Stream ----------
     def stream_answer(self, query: str):
         user_msg, _ = self._augment(query)
+        # SYNC generator yielding token deltas; router wraps in SSE
         return self.client.chat_stream(
-            SYSTEM_PROMPT, user_msg,
+            SYSTEM_PROMPT,
+            user_msg,
             max_tokens=self.settings.model.max_new_tokens,
             temperature=self.settings.model.temperature,
-        )
+        )
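The blended score is easy to sanity-check by hand. With no cross-encoder (reranker=None) the formula in _rerank_docs reduces to 0.55*vec + 0.35*lex + 0.10*boost; a toy sketch (not part of the commit):

# Sketch: _rerank_docs with two toy candidates and no cross-encoder.
# Doc "a" wins on lexical overlap plus the domain boost for
# "matrixhub"/"hub api"/"catalog", on top of its slightly higher vector score.
docs = [
    {"text": "MatrixHub catalog and Hub API overview", "source": "a", "score": 0.42},
    {"text": "Unrelated release notes", "source": "b", "score": 0.40},
]
top = _rerank_docs(docs, "What is the MatrixHub catalog?", k_final=1, reranker=None)
print(top[0]["source"])  # -> "a"  (about 0.64 vs 0.0 after min-max normalization)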
configs/rag_sources.yaml ADDED
@@ -0,0 +1,41 @@
+# Where to pull documentation from when building the RAG knowledge base.
+# You can add/remove repos here; the builder will respect these sources.
+
+github:
+  # 1) Explicit repos (stable)
+  repos:
+    - owner: agent-matrix
+      name: matrix-cli
+      branch: master
+      docs_paths: ["docs"]          # folders to harvest (recursive)
+      include_readme: true
+    - owner: agent-matrix
+      name: matrix-python-sdk
+      branch: master
+      docs_paths: ["docs"]
+      include_readme: true
+    - owner: agent-matrix
+      name: matrixlink
+      branch: master
+      docs_paths: ["docs"]
+      include_readme: true
+    - owner: agent-matrix
+      name: matrix-hub
+      branch: master
+      docs_paths: ["docs"]
+      include_readme: true
+
+  # 2) Optionally scan an entire org for repos (README + docs/ if present)
+  #    Comment out if you want only the explicit list above.
+  orgs:
+    - agent-matrix
+
+# Local content in THIS repo (optional but recommended)
+local:
+  paths:
+    - docs        # everything under /docs
+    - README.md   # root readme
+  glob: "**/*.md" # or "**/*.{md,mdx,txt}"
+
+# Extra public URLs to pull (optional)
+urls: []
data/kb.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -18,3 +18,8 @@ pytest
 ruff
 mypy
 pytest-asyncio
+
+
+requests>=2.32.0
+beautifulsoup4>=4.12.3  # only used if you later add generic HTML URLs
+PyYAML>=6.0.1
scripts/build_kb.py ADDED
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""
+Builds/refreshes the local RAG KB (data/kb.jsonl) from GitHub + local docs.
+
+Usage:
+  python scripts/build_kb.py --config configs/rag_sources.yaml --out data/kb.jsonl
+  python scripts/build_kb.py --config ... --out ... --force
+"""
+
+from __future__ import annotations
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+
+# --- Ensure THIS repo is first on sys.path (avoid clashing 'app' packages) ---
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+
+logger = logging.getLogger("build_kb")
+logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+
+# Import the builder from this project
+try:
+    from app.core.rag.build import build_kb_from_config, ensure_kb  # type: ignore
+except Exception as e:  # pragma: no cover
+    logger.error("Failed importing KB builder from app.core.rag.build: %s", e)
+    logger.error("Make sure you're running from the project root and PYTHONPATH includes '.'.")
+    sys.exit(2)
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--config", required=True, help="Path to configs/rag_sources.yaml")
+    p.add_argument("--out", required=True, help="Output JSONL file, e.g., data/kb.jsonl")
+    p.add_argument("--force", action="store_true", help="Delete output file first, then rebuild")
+    args = p.parse_args()
+
+    out_path = Path(args.out)
+    if args.force and out_path.exists():
+        logger.info("Removing existing %s", out_path)
+        out_path.unlink()
+
+    # If you want a one-liner that skips if exists, use ensure_kb:
+    #   created = ensure_kb(out_jsonl=args.out, config_path=args.config, skip_if_exists=True)
+    #   logger.info("KB %s at %s", "ready" if created else "unchanged", args.out)
+
+    # Otherwise, always (re)build:
+    n = build_kb_from_config(config_path=args.config, out_jsonl=args.out)
+    logger.info("Wrote %d records to %s", n, args.out)
+    return 0
+
+if __name__ == "__main__":
+    raise SystemExit(main())