First version rag
Browse files- app/core/rag/__init__.py +0 -0
- app/core/rag/retriever.py +42 -0
- app/main.py +13 -14
- app/routers/chat.py +34 -26
- app/services/chat_service.py +70 -10
- data/kb.jsonl +70 -0
app/core/rag/__init__.py
ADDED
|
File without changes
|
app/core/rag/retriever.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app/core/rag/retriever.py
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
import json, logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
import numpy as np
|
| 7 |
+
import faiss
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
|
| 10 |
+
log = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
class Retriever:
|
| 13 |
+
def __init__(self, kb_path: str = "data/kb.jsonl",
|
| 14 |
+
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
|
| 15 |
+
top_k: int = 4):
|
| 16 |
+
self.kb_path = Path(kb_path)
|
| 17 |
+
self.top_k = top_k
|
| 18 |
+
if not self.kb_path.exists():
|
| 19 |
+
raise FileNotFoundError(f"KB file not found: {self.kb_path} (jsonl with {{text,source}})")
|
| 20 |
+
self.model = SentenceTransformer(model_name)
|
| 21 |
+
self.docs: List[Dict[str, str]] = []
|
| 22 |
+
with self.kb_path.open("r", encoding="utf-8") as f:
|
| 23 |
+
for line in f:
|
| 24 |
+
line = line.strip()
|
| 25 |
+
if not line: continue
|
| 26 |
+
self.docs.append(json.loads(line))
|
| 27 |
+
texts = [d["text"] for d in self.docs]
|
| 28 |
+
emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
|
| 29 |
+
self.dim = int(emb.shape[1])
|
| 30 |
+
self.index = faiss.IndexFlatIP(self.dim) # cosine via normalized vectors = dot product
|
| 31 |
+
self.index.add(emb.astype("float32"))
|
| 32 |
+
|
| 33 |
+
def retrieve(self, query: str, k: Optional[int] = None) -> List[Dict]:
|
| 34 |
+
k = k or self.top_k
|
| 35 |
+
vec = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
|
| 36 |
+
D, I = self.index.search(vec.astype("float32"), k)
|
| 37 |
+
out: List[Dict] = []
|
| 38 |
+
for idx, score in zip(I[0], D[0]):
|
| 39 |
+
if int(idx) < 0: continue
|
| 40 |
+
d = self.docs[int(idx)]
|
| 41 |
+
out.append({"text": d["text"], "source": d.get("source", f"kb:{idx}"), "score": float(score)})
|
| 42 |
+
return out
|
app/main.py
CHANGED
|
@@ -9,6 +9,10 @@ from typing import Any, Dict
|
|
| 9 |
from fastapi import FastAPI
|
| 10 |
from fastapi.responses import RedirectResponse
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# -----------------------------------------------------------------------------
|
| 13 |
# Early: load .env (so HF_TOKEN, ADMIN_TOKEN, etc. are available locally)
|
| 14 |
# -----------------------------------------------------------------------------
|
|
@@ -65,7 +69,6 @@ _load_env_file([".env", "configs/.env", ".env.local", "configs/.env.local"])
|
|
| 65 |
# -----------------------------------------------------------------------------
|
| 66 |
# Middlewares
|
| 67 |
# -----------------------------------------------------------------------------
|
| 68 |
-
# Prefer the canonical package name; if your repo uses "middlewares/", this tries both.
|
| 69 |
try:
|
| 70 |
from .middleware import attach_middlewares # singular
|
| 71 |
except Exception:
|
|
@@ -82,7 +85,6 @@ except Exception:
|
|
| 82 |
# -----------------------------------------------------------------------------
|
| 83 |
from .routers import health, plan, chat
|
| 84 |
|
| 85 |
-
# Optional UI (Home/Chat/Dev). If missing, we gracefully fall back to a JSON root.
|
| 86 |
try:
|
| 87 |
from .ui import router as ui_router # type: ignore
|
| 88 |
HAS_UI = True
|
|
@@ -103,9 +105,14 @@ async def lifespan(app: FastAPI):
|
|
| 103 |
app.state.started_at = time.time()
|
| 104 |
app.state.version = os.getenv("APP_VERSION", "1.0.0")
|
| 105 |
|
| 106 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
hf_token_present = bool(os.getenv("HF_TOKEN"))
|
| 108 |
-
|
| 109 |
"matrix-ai starting (version=%s, port=%s, hf_token_present=%s)",
|
| 110 |
app.state.version,
|
| 111 |
os.getenv("PORT", "7860"),
|
|
@@ -115,7 +122,7 @@ async def lifespan(app: FastAPI):
|
|
| 115 |
yield
|
| 116 |
finally:
|
| 117 |
uptime = time.time() - getattr(app.state, "started_at", time.time())
|
| 118 |
-
|
| 119 |
"matrix-ai shutting down (uptime=%.2fs)", uptime
|
| 120 |
)
|
| 121 |
|
|
@@ -131,19 +138,14 @@ def create_app() -> FastAPI:
|
|
| 131 |
lifespan=lifespan,
|
| 132 |
)
|
| 133 |
|
| 134 |
-
# Middlewares (request-id, gzip, rate-limit, etc.)
|
| 135 |
attach_middlewares(app)
|
| 136 |
-
|
| 137 |
-
# Core routers
|
| 138 |
app.include_router(health.router, tags=["Health"])
|
| 139 |
app.include_router(plan.router, prefix="/v1", tags=["Planning"])
|
| 140 |
app.include_router(chat.router, prefix="/v1", tags=["Chat"])
|
| 141 |
|
| 142 |
-
# Optional UI (adds '/', '/chat', '/dev')
|
| 143 |
if HAS_UI:
|
| 144 |
app.include_router(ui_router, tags=["UI"])
|
| 145 |
else:
|
| 146 |
-
# Minimal root so HF root probes pass even without UI
|
| 147 |
@app.get("/", include_in_schema=False)
|
| 148 |
async def root() -> Dict[str, Any]:
|
| 149 |
return {
|
|
@@ -153,12 +155,9 @@ def create_app() -> FastAPI:
|
|
| 153 |
"docs": "/docs",
|
| 154 |
"endpoints": {"plan": "/v1/plan", "chat": "/v1/chat", "healthz": "/healthz"},
|
| 155 |
}
|
| 156 |
-
|
| 157 |
@app.get("/home", include_in_schema=False)
|
| 158 |
async def home_redirect():
|
| 159 |
return RedirectResponse(url="/docs", status_code=302)
|
| 160 |
-
|
| 161 |
return app
|
| 162 |
|
| 163 |
-
|
| 164 |
-
app = create_app()
|
|
|
|
| 9 |
from fastapi import FastAPI
|
| 10 |
from fastapi.responses import RedirectResponse
|
| 11 |
|
| 12 |
+
# --- ADDED: Import dependencies needed for pre-loading ---
|
| 13 |
+
from .deps import get_settings
|
| 14 |
+
from .services.chat_service import get_retriever
|
| 15 |
+
|
| 16 |
# -----------------------------------------------------------------------------
|
| 17 |
# Early: load .env (so HF_TOKEN, ADMIN_TOKEN, etc. are available locally)
|
| 18 |
# -----------------------------------------------------------------------------
|
|
|
|
| 69 |
# -----------------------------------------------------------------------------
|
| 70 |
# Middlewares
|
| 71 |
# -----------------------------------------------------------------------------
|
|
|
|
| 72 |
try:
|
| 73 |
from .middleware import attach_middlewares # singular
|
| 74 |
except Exception:
|
|
|
|
| 85 |
# -----------------------------------------------------------------------------
|
| 86 |
from .routers import health, plan, chat
|
| 87 |
|
|
|
|
| 88 |
try:
|
| 89 |
from .ui import router as ui_router # type: ignore
|
| 90 |
HAS_UI = True
|
|
|
|
| 105 |
app.state.started_at = time.time()
|
| 106 |
app.state.version = os.getenv("APP_VERSION", "1.0.0")
|
| 107 |
|
| 108 |
+
# --- ADDED: Pre-load the RAG model and index on startup ---
|
| 109 |
+
logger = logging.getLogger("uvicorn.error")
|
| 110 |
+
logger.info("Warming up RAG retriever...")
|
| 111 |
+
get_retriever(get_settings())
|
| 112 |
+
logger.info("RAG retriever is ready.")
|
| 113 |
+
|
| 114 |
hf_token_present = bool(os.getenv("HF_TOKEN"))
|
| 115 |
+
logger.info(
|
| 116 |
"matrix-ai starting (version=%s, port=%s, hf_token_present=%s)",
|
| 117 |
app.state.version,
|
| 118 |
os.getenv("PORT", "7860"),
|
|
|
|
| 122 |
yield
|
| 123 |
finally:
|
| 124 |
uptime = time.time() - getattr(app.state, "started_at", time.time())
|
| 125 |
+
logger.info(
|
| 126 |
"matrix-ai shutting down (uptime=%.2fs)", uptime
|
| 127 |
)
|
| 128 |
|
|
|
|
| 138 |
lifespan=lifespan,
|
| 139 |
)
|
| 140 |
|
|
|
|
| 141 |
attach_middlewares(app)
|
|
|
|
|
|
|
| 142 |
app.include_router(health.router, tags=["Health"])
|
| 143 |
app.include_router(plan.router, prefix="/v1", tags=["Planning"])
|
| 144 |
app.include_router(chat.router, prefix="/v1", tags=["Chat"])
|
| 145 |
|
|
|
|
| 146 |
if HAS_UI:
|
| 147 |
app.include_router(ui_router, tags=["UI"])
|
| 148 |
else:
|
|
|
|
| 149 |
@app.get("/", include_in_schema=False)
|
| 150 |
async def root() -> Dict[str, Any]:
|
| 151 |
return {
|
|
|
|
| 155 |
"docs": "/docs",
|
| 156 |
"endpoints": {"plan": "/v1/plan", "chat": "/v1/chat", "healthz": "/healthz"},
|
| 157 |
}
|
|
|
|
| 158 |
@app.get("/home", include_in_schema=False)
|
| 159 |
async def home_redirect():
|
| 160 |
return RedirectResponse(url="/docs", status_code=302)
|
|
|
|
| 161 |
return app
|
| 162 |
|
| 163 |
+
app = create_app()
|
|
|
app/routers/chat.py
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from fastapi import APIRouter, Depends, HTTPException, Query
|
|
|
|
|
|
|
| 2 |
from starlette.responses import StreamingResponse
|
| 3 |
-
from pydantic import BaseModel
|
| 4 |
-
from typing import List, Optional, Any, Iterator
|
| 5 |
-
import json, time
|
| 6 |
|
| 7 |
from ..deps import get_settings
|
| 8 |
from ..core.config import Settings
|
|
@@ -10,10 +14,12 @@ from ..services.chat_service import ChatService
|
|
| 10 |
|
| 11 |
router = APIRouter()
|
| 12 |
|
|
|
|
| 13 |
class ChatMessage(BaseModel):
|
| 14 |
role: str
|
| 15 |
content: str
|
| 16 |
|
|
|
|
| 17 |
class ChatRequest(BaseModel):
|
| 18 |
query: Optional[str] = None
|
| 19 |
question: Optional[str] = None
|
|
@@ -26,13 +32,15 @@ class ChatRequest(BaseModel):
|
|
| 26 |
if self.prompt: return self.prompt
|
| 27 |
if self.messages:
|
| 28 |
for m in reversed(self.messages):
|
| 29 |
-
if m.role.lower() == "user":
|
| 30 |
-
return m.content
|
| 31 |
return self.messages[-1].content
|
| 32 |
raise ValueError("Body must include 'query'/'question'/'prompt' or 'messages'")
|
| 33 |
|
|
|
|
| 34 |
class ChatResponse(BaseModel):
|
| 35 |
answer: str
|
|
|
|
|
|
|
| 36 |
|
| 37 |
@router.post("/chat", response_model=ChatResponse)
|
| 38 |
async def chat(req: ChatRequest, settings: Settings = Depends(get_settings)):
|
|
@@ -42,71 +50,71 @@ async def chat(req: ChatRequest, settings: Settings = Depends(get_settings)):
|
|
| 42 |
raise HTTPException(status_code=422, detail=str(e))
|
| 43 |
svc = ChatService(settings)
|
| 44 |
try:
|
| 45 |
-
|
| 46 |
-
|
|
|
|
| 47 |
except PermissionError as e:
|
| 48 |
raise HTTPException(status_code=403, detail=str(e))
|
| 49 |
except Exception as e:
|
| 50 |
raise HTTPException(status_code=502, detail=f"Inference error: {e}")
|
| 51 |
|
|
|
|
| 52 |
@router.get("/chat", response_model=ChatResponse)
|
| 53 |
async def chat_get(query: str = Query(...), settings: Settings = Depends(get_settings)):
|
| 54 |
svc = ChatService(settings)
|
| 55 |
try:
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
except PermissionError as e:
|
| 59 |
raise HTTPException(status_code=403, detail=str(e))
|
| 60 |
except Exception as e:
|
| 61 |
raise HTTPException(status_code=502, detail=f"Inference error: {e}")
|
| 62 |
|
|
|
|
| 63 |
# ---------- Streaming (SSE) ----------
|
| 64 |
def _sse_line(obj: Any) -> str:
|
| 65 |
payload = obj if isinstance(obj, str) else json.dumps(obj, ensure_ascii=False)
|
| 66 |
return f"data: {payload}\n\n"
|
| 67 |
|
|
|
|
| 68 |
@router.get("/chat/stream")
|
| 69 |
async def chat_stream(query: str = Query(...), settings: Settings = Depends(get_settings)):
|
| 70 |
-
"""
|
| 71 |
-
SSE stream of token deltas: emits {"delta": "..."} chunks, then final [DONE].
|
| 72 |
-
"""
|
| 73 |
svc = ChatService(settings)
|
| 74 |
|
| 75 |
-
def gen() ->
|
| 76 |
-
# Anti-buffer padding
|
| 77 |
yield ":" + (" " * 2048) + "\n\n"
|
| 78 |
yield "event: ping\ndata: 0\n\n"
|
| 79 |
-
|
| 80 |
try:
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
if token:
|
| 83 |
any_tokens = True
|
| 84 |
yield _sse_line({"delta": token})
|
|
|
|
| 85 |
if not any_tokens:
|
| 86 |
yield _sse_line({"delta": ""})
|
| 87 |
yield _sse_line("[DONE]")
|
| 88 |
-
except GeneratorExit:
|
| 89 |
-
return
|
| 90 |
except Exception as e:
|
| 91 |
-
|
| 92 |
-
yield _sse_line({"error": str(e)})
|
| 93 |
-
except Exception:
|
| 94 |
-
return
|
| 95 |
|
| 96 |
headers = {
|
| 97 |
-
# Critical for proxies/browsers
|
| 98 |
"Cache-Control": "no-cache, no-transform",
|
| 99 |
-
"X-Accel-Buffering": "no",
|
| 100 |
"Connection": "keep-alive",
|
| 101 |
-
"Content-Encoding": "identity",
|
| 102 |
}
|
| 103 |
return StreamingResponse(gen(), media_type="text/event-stream; charset=utf-8", headers=headers)
|
| 104 |
|
|
|
|
| 105 |
@router.post("/chat/stream")
|
| 106 |
async def chat_stream_post(req: ChatRequest, settings: Settings = Depends(get_settings)):
|
| 107 |
try:
|
| 108 |
q = req.as_text()
|
| 109 |
except ValueError as e:
|
| 110 |
raise HTTPException(status_code=422, detail=str(e))
|
| 111 |
-
# Reuse GET logic to keep one code path
|
| 112 |
return await chat_stream(query=q, settings=settings)
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from typing import Any, AsyncIterator, List, Optional
|
| 5 |
+
|
| 6 |
from fastapi import APIRouter, Depends, HTTPException, Query
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from starlette.concurrency import run_in_threadpool
|
| 9 |
from starlette.responses import StreamingResponse
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from ..deps import get_settings
|
| 12 |
from ..core.config import Settings
|
|
|
|
| 14 |
|
| 15 |
router = APIRouter()
|
| 16 |
|
| 17 |
+
|
| 18 |
class ChatMessage(BaseModel):
|
| 19 |
role: str
|
| 20 |
content: str
|
| 21 |
|
| 22 |
+
|
| 23 |
class ChatRequest(BaseModel):
|
| 24 |
query: Optional[str] = None
|
| 25 |
question: Optional[str] = None
|
|
|
|
| 32 |
if self.prompt: return self.prompt
|
| 33 |
if self.messages:
|
| 34 |
for m in reversed(self.messages):
|
| 35 |
+
if m.role.lower() == "user": return m.content
|
|
|
|
| 36 |
return self.messages[-1].content
|
| 37 |
raise ValueError("Body must include 'query'/'question'/'prompt' or 'messages'")
|
| 38 |
|
| 39 |
+
|
| 40 |
class ChatResponse(BaseModel):
|
| 41 |
answer: str
|
| 42 |
+
sources: List[str] = Field(default_factory=list)
|
| 43 |
+
|
| 44 |
|
| 45 |
@router.post("/chat", response_model=ChatResponse)
|
| 46 |
async def chat(req: ChatRequest, settings: Settings = Depends(get_settings)):
|
|
|
|
| 50 |
raise HTTPException(status_code=422, detail=str(e))
|
| 51 |
svc = ChatService(settings)
|
| 52 |
try:
|
| 53 |
+
# Run the blocking call in a thread pool to avoid freezing the server
|
| 54 |
+
answer, sources = await run_in_threadpool(svc.answer_with_sources, text)
|
| 55 |
+
return ChatResponse(answer=answer, sources=sources)
|
| 56 |
except PermissionError as e:
|
| 57 |
raise HTTPException(status_code=403, detail=str(e))
|
| 58 |
except Exception as e:
|
| 59 |
raise HTTPException(status_code=502, detail=f"Inference error: {e}")
|
| 60 |
|
| 61 |
+
|
| 62 |
@router.get("/chat", response_model=ChatResponse)
|
| 63 |
async def chat_get(query: str = Query(...), settings: Settings = Depends(get_settings)):
|
| 64 |
svc = ChatService(settings)
|
| 65 |
try:
|
| 66 |
+
# Run the blocking call in a thread pool
|
| 67 |
+
answer, sources = await run_in_threadpool(svc.answer_with_sources, query)
|
| 68 |
+
return ChatResponse(answer=answer, sources=sources)
|
| 69 |
except PermissionError as e:
|
| 70 |
raise HTTPException(status_code=403, detail=str(e))
|
| 71 |
except Exception as e:
|
| 72 |
raise HTTPException(status_code=502, detail=f"Inference error: {e}")
|
| 73 |
|
| 74 |
+
|
| 75 |
# ---------- Streaming (SSE) ----------
|
| 76 |
def _sse_line(obj: Any) -> str:
|
| 77 |
payload = obj if isinstance(obj, str) else json.dumps(obj, ensure_ascii=False)
|
| 78 |
return f"data: {payload}\n\n"
|
| 79 |
|
| 80 |
+
|
| 81 |
@router.get("/chat/stream")
|
| 82 |
async def chat_stream(query: str = Query(...), settings: Settings = Depends(get_settings)):
|
|
|
|
|
|
|
|
|
|
| 83 |
svc = ChatService(settings)
|
| 84 |
|
| 85 |
+
async def gen() -> AsyncIterator[str]:
|
| 86 |
+
# Anti-buffer padding and initial ping
|
| 87 |
yield ":" + (" " * 2048) + "\n\n"
|
| 88 |
yield "event: ping\ndata: 0\n\n"
|
| 89 |
+
|
| 90 |
try:
|
| 91 |
+
# Run the blocking retrieval part in a thread pool, then stream the results
|
| 92 |
+
stream_generator = await run_in_threadpool(svc.stream_answer, query)
|
| 93 |
+
any_tokens = False
|
| 94 |
+
for token in stream_generator:
|
| 95 |
if token:
|
| 96 |
any_tokens = True
|
| 97 |
yield _sse_line({"delta": token})
|
| 98 |
+
|
| 99 |
if not any_tokens:
|
| 100 |
yield _sse_line({"delta": ""})
|
| 101 |
yield _sse_line("[DONE]")
|
|
|
|
|
|
|
| 102 |
except Exception as e:
|
| 103 |
+
yield _sse_line({"error": str(e)})
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
headers = {
|
|
|
|
| 106 |
"Cache-Control": "no-cache, no-transform",
|
| 107 |
+
"X-Accel-Buffering": "no",
|
| 108 |
"Connection": "keep-alive",
|
| 109 |
+
"Content-Encoding": "identity",
|
| 110 |
}
|
| 111 |
return StreamingResponse(gen(), media_type="text/event-stream; charset=utf-8", headers=headers)
|
| 112 |
|
| 113 |
+
|
| 114 |
@router.post("/chat/stream")
|
| 115 |
async def chat_stream_post(req: ChatRequest, settings: Settings = Depends(get_settings)):
|
| 116 |
try:
|
| 117 |
q = req.as_text()
|
| 118 |
except ValueError as e:
|
| 119 |
raise HTTPException(status_code=422, detail=str(e))
|
|
|
|
| 120 |
return await chat_stream(query=q, settings=settings)
|
app/services/chat_service.py
CHANGED
|
@@ -1,36 +1,96 @@
|
|
| 1 |
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from ..core.config import Settings
|
| 3 |
from ..core.inference.client import RouterRequestsClient
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
SYSTEM_PROMPT = (
|
| 6 |
"You are MATRIX-AI, a concise, helpful assistant for the Matrix EcoSystem. "
|
| 7 |
"Answer clearly and briefly. If unsure, say so."
|
| 8 |
)
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
class ChatService:
|
| 11 |
def __init__(self, settings: Settings):
|
| 12 |
self.settings = settings
|
| 13 |
self.client = RouterRequestsClient(
|
| 14 |
model=settings.model.name,
|
| 15 |
fallback=settings.model.fallback,
|
| 16 |
-
provider=settings.model
|
| 17 |
max_retries=2,
|
| 18 |
-
connect_timeout=10.0,
|
| 19 |
-
read_timeout=60.0,
|
| 20 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
return
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
max_tokens=self.settings.model.max_new_tokens,
|
| 27 |
temperature=self.settings.model.temperature,
|
| 28 |
)
|
|
|
|
| 29 |
|
| 30 |
-
# Expose a generator for streaming endpoints
|
| 31 |
def stream_answer(self, query: str):
|
|
|
|
| 32 |
return self.client.chat_stream(
|
| 33 |
-
SYSTEM_PROMPT,
|
| 34 |
max_tokens=self.settings.model.max_new_tokens,
|
| 35 |
temperature=self.settings.model.temperature,
|
| 36 |
-
)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import List, Tuple
|
| 7 |
+
|
| 8 |
from ..core.config import Settings
|
| 9 |
from ..core.inference.client import RouterRequestsClient
|
| 10 |
+
from ..core.rag.retriever import Retriever
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
|
| 14 |
SYSTEM_PROMPT = (
|
| 15 |
"You are MATRIX-AI, a concise, helpful assistant for the Matrix EcoSystem. "
|
| 16 |
"Answer clearly and briefly. If unsure, say so."
|
| 17 |
)
|
| 18 |
|
| 19 |
+
# --- Singleton instance for the expensive Retriever class ---
|
| 20 |
+
_retriever_instance: Retriever | None = None
|
| 21 |
+
|
| 22 |
+
def get_retriever(settings: Settings) -> Retriever | None:
|
| 23 |
+
"""Initializes and returns a single instance of the Retriever."""
|
| 24 |
+
global _retriever_instance
|
| 25 |
+
if _retriever_instance is not None:
|
| 26 |
+
return _retriever_instance
|
| 27 |
+
|
| 28 |
+
kb_path = os.getenv("RAG_KB_PATH", "data/kb.jsonl")
|
| 29 |
+
try:
|
| 30 |
+
if Path(kb_path).exists():
|
| 31 |
+
_retriever_instance = Retriever(kb_path=kb_path, top_k=settings.rag.top_k)
|
| 32 |
+
logger.info("RAG enabled with KB at %s (top_k=%d)", kb_path, settings.rag.top_k)
|
| 33 |
+
else:
|
| 34 |
+
logger.info("RAG KB not found at %s — running LLM-only.", kb_path)
|
| 35 |
+
except Exception as e:
|
| 36 |
+
logger.warning("RAG disabled (failed to initialize Retriever: %s)", e)
|
| 37 |
+
|
| 38 |
+
return _retriever_instance
|
| 39 |
+
|
| 40 |
+
|
| 41 |
class ChatService:
|
| 42 |
def __init__(self, settings: Settings):
|
| 43 |
self.settings = settings
|
| 44 |
self.client = RouterRequestsClient(
|
| 45 |
model=settings.model.name,
|
| 46 |
fallback=settings.model.fallback,
|
| 47 |
+
provider=getattr(settings.model, "provider", None),
|
| 48 |
max_retries=2,
|
|
|
|
|
|
|
| 49 |
)
|
| 50 |
+
# Get the singleton retriever instance
|
| 51 |
+
self.retriever = get_retriever(settings)
|
| 52 |
+
|
| 53 |
+
def _build_context(self, query: str) -> Tuple[str, List[str]]:
|
| 54 |
+
if not self.retriever:
|
| 55 |
+
return "", []
|
| 56 |
+
docs = self.retriever.retrieve(query, self.settings.rag.top_k)
|
| 57 |
+
if not docs:
|
| 58 |
+
return "", []
|
| 59 |
+
blocks = [f"[{i+1}] {d['text']} (source: {d['source']})" for i, d in enumerate(docs)]
|
| 60 |
+
context = "CONTEXT (use only these facts; if missing, say you don't know):\n" + "\n\n".join(blocks)
|
| 61 |
+
sources = [d["source"] for d in docs]
|
| 62 |
+
return context, sources
|
| 63 |
|
| 64 |
+
def _augment(self, query: str) -> Tuple[str, List[str]]:
|
| 65 |
+
"""
|
| 66 |
+
Build the final user message (with optional CONTEXT) and return sources.
|
| 67 |
+
"""
|
| 68 |
+
ctx, sources = self._build_context(query)
|
| 69 |
+
|
| 70 |
+
# --- THIS IS THE CORRECTED PROMPT ---
|
| 71 |
+
if ctx:
|
| 72 |
+
# New, clearer instruction format
|
| 73 |
+
augmented = f"{ctx}\n\nBased only on the context provided above, answer the following question.\nQuestion: {query}"
|
| 74 |
+
else:
|
| 75 |
+
# If no context, just pass the original query
|
| 76 |
+
augmented = query
|
| 77 |
+
|
| 78 |
+
return augmented, sources
|
| 79 |
+
|
| 80 |
+
# Note: These methods are now called from a thread pool in the router
|
| 81 |
+
def answer_with_sources(self, query: str) -> Tuple[str, List[str]]:
|
| 82 |
+
user_msg, sources = self._augment(query)
|
| 83 |
+
text = self.client.chat_nonstream(
|
| 84 |
+
SYSTEM_PROMPT, user_msg,
|
| 85 |
max_tokens=self.settings.model.max_new_tokens,
|
| 86 |
temperature=self.settings.model.temperature,
|
| 87 |
)
|
| 88 |
+
return text, sources
|
| 89 |
|
|
|
|
| 90 |
def stream_answer(self, query: str):
|
| 91 |
+
user_msg, _ = self._augment(query)
|
| 92 |
return self.client.chat_stream(
|
| 93 |
+
SYSTEM_PROMPT, user_msg,
|
| 94 |
max_tokens=self.settings.model.max_new_tokens,
|
| 95 |
temperature=self.settings.model.temperature,
|
| 96 |
+
)
|
data/kb.jsonl
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"text":"MatrixHub is the API and registry for Matrix apps, tracking health and LKG metadata.","source":"kb:matrixhub"}
|
| 2 |
+
{"text":"Matrix Guardian coordinates probes, writes /status to MatrixHub, and asks matrix-ai for plans.","source":"kb:guardian"}
|
| 3 |
+
{"text":"Matrix System 1.0 turns a static registry into an alive, policy-governed, self-healing platform (observe → plan → approve/execute → audit).","source":"kb:overview"}
|
| 4 |
+
{"text":"Core components: Matrix-Hub (API/registry), MatrixDB (Postgres schema), Matrix-Guardian (control plane), Matrix-AI (planning service on Hugging Face).","source":"kb:overview:components"}
|
| 5 |
+
{"text":"End-to-end loop: Guardian probes targets → POST /status to Hub → AI returns a low-risk JSON plan → policy gate decides HITL or Autopilot → actions recorded as events.","source":"kb:overview:loop"}
|
| 6 |
+
{"text":"MatrixHub is the API and registry for Matrix apps, tracking health and LKG metadata.","source":"kb:matrixhub"}
|
| 7 |
+
{"text":"Matrix-Hub Stage-1 routes: GET /apps, GET /apps/{id}, GET /apps/{id}/status, GET /apps/{id}/bundle, POST /apps, POST /status, POST /guardian/approve|reject.","source":"kb:hub:routes:stage1"}
|
| 8 |
+
{"text":"Matrix-Hub Stage-2 (flagged) routes: POST /bundles, GET /advisories, POST /patches/proposals, POST /patches/accept.","source":"kb:hub:routes:stage2"}
|
| 9 |
+
{"text":"New middlewares (additive): in-memory RateLimit (fixed window per IP/route) and Idempotency-Key propagation to request.state.","source":"kb:hub:middleware"}
|
| 10 |
+
{"text":"Security: optional JWT RS256 role checks via JWT_PUBLIC_KEY_PEM; falls back to API token dependency if configured; otherwise dev-mode allow.","source":"kb:hub:security"}
|
| 11 |
+
{"text":"ETag behavior: GET endpoints return weak ETags on stable payloads; If-None-Match yields 304 to save bandwidth and enable client caching.","source":"kb:http:etag"}
|
| 12 |
+
{"text":"Idempotency: POST endpoints accept Idempotency-Key; server persists request body hash and response to avoid duplicate side effects on retries.","source":"kb:http:idempotency"}
|
| 13 |
+
{"text":"Entity additive fields: health_score (float), health_status (text), health_last_checked (timestamptz), lkg_version (text), lkg_digest (text).","source":"kb:hub:models:entity"}
|
| 14 |
+
{"text":"Tables (Hub models): HealthCheck(id, entity_uid, check, result, latency_ms, reasons, ts), Event(id, type, entity_uid, payload, ts), IdempotencyKey(key, route, body_hash, status_code, response_json, created_at), Advisory(id, entity_uid, source, severity, title, description, cve, cvss, detected_at, status, created_at).","source":"kb:hub:models:tables"}
|
| 15 |
+
{"text":"CAS redirection: GET /apps/{id}/bundle issues 307 to CAS_BASE_URL/sha256/{lkg_digest}.","source":"kb:hub:cas"}
|
| 16 |
+
{"text":"Environment flags (Hub): ADVISORIES_ENABLED, PATCHES_FACADE_ENABLED, RATE_LIMIT_PER_MIN, IDEMPOTENCY_ENABLED, CAS_BASE_URL, JWT_PUBLIC_KEY_PEM.","source":"kb:hub:env"}
|
| 17 |
+
{"text":"Bundles push: POST /bundles accepts {entity_uid, digest, version, lkg?}; optionally updates Entity.lkg_* when lkg=true; records event=bundle.push; idempotent.","source":"kb:hub:bundles"}
|
| 18 |
+
{"text":"Patches façade: POST /patches/proposals and /patches/accept record audit events; /patches/accept may set new lkg_version/lkg_digest.","source":"kb:hub:patches:facade"}
|
| 19 |
+
{"text":"Advisories (optional): GET /advisories supports filter by q, entity_uid, severity; paginated with X-Total-Count and ETag.","source":"kb:hub:advisories"}
|
| 20 |
+
{"text":"Matrix Guardian coordinates probes, writes /status to MatrixHub, and asks matrix-ai for plans.","source":"kb:guardian"}
|
| 21 |
+
{"text":"Guardian probes: safe HTTP checks, optional MCP echo handshake; short timeouts, retries, and sandboxed execution by default.","source":"kb:guardian:probes"}
|
| 22 |
+
{"text":"Guardian policy gate: evaluates AI plan risk vs policy thresholds and allowlists; enforces HITL (human-in-the-loop) by default; supports Autopilot when enabled.","source":"kb:guardian:policy"}
|
| 23 |
+
{"text":"Autonomy levels: A0 Manual (observe/plan only), A1 Suggest (propose; auto re-probe/metadata), A2 Safe Autopilot (LKG pin/rollback, cache warm-ups), A3 Extended (sandboxed patchers).","source":"kb:guardian:autonomy"}
|
| 24 |
+
{"text":"Autopilot (LangGraph): multi-agent orchestration with nodes for sensing (probes), planning (matrix-ai), critic, and execution; always emits audit events.","source":"kb:guardian:autopilot"}
|
| 25 |
+
{"text":"Guardian REST (core): GET /healthz, GET /readyz; HITL resume endpoint POST /threads/{thread_id}/resume; optional Autopilot API when AUTOPILOT_API_ENABLED=true.","source":"kb:guardian:api"}
|
| 26 |
+
{"text":"Guardian configuration (key vars): DATABASE_URL, MATRIXHUB_API_BASE, MATRIX_AI_BASE, API_TOKEN, AUTOPILOT_ENABLED, AUTOPILOT_INTERVAL_SEC, AUTOPILOT_POLICY, AUTOPILOT_SAFE_MODE.","source":"kb:guardian:env"}
|
| 27 |
+
{"text":"Matrix-AI service (HF): POST /v1/plan returns short, low-risk JSON plan; POST /v1/chat reserved for RAG Q&A (optional).","source":"kb:ai:overview"}
|
| 28 |
+
{"text":"Matrix-AI safety: PII redaction on prompts, strict JSON schema validation, exponential backoff to HF, structured logging, in-memory rate limiting.","source":"kb:ai:safety"}
|
| 29 |
+
{"text":"Deployment Matrix-AI: recommend Hugging Face Spaces with HF_TOKEN secret; CPU for tests, GPU for larger models.","source":"kb:ai:deploy"}
|
| 30 |
+
{"text":"MatrixDB (Postgres) additive schema: Stage‑1 tables versions, artifacts, health, checks, events; Stage‑2 tables bundles, proposals, optional jobs.","source":"kb:db:tables"}
|
| 31 |
+
{"text":"Indexes (DB): time-series and lookup indexes such as idx_*_app_id_ts, idx_artifacts_sha256, idx_proposals_state; all created IF NOT EXISTS.","source":"kb:db:indexes"}
|
| 32 |
+
{"text":"Why additive patches: zero-downtime upgrades; no destructive ALTER/DROP; feature flags hide Stage‑2 until enabled.","source":"kb:upgrade:rationale"}
|
| 33 |
+
{"text":"Hub patch script: apply_matrixhub_patch_v2.sh creates new routers/middleware and appends guarded blocks to app.py, models.py, config.py, utils/jwt_helper.py, and optionally reqlog.","source":"kb:patch:hub"}
|
| 34 |
+
{"text":"DB patch script: apply_matrixhub_db_patch_v2a.sh drops SQL init files for Stage‑1/2 into db/docker-entrypoint-initdb.d; safe to apply to running clusters via psql.","source":"kb:patch:db"}
|
| 35 |
+
{"text":"Alembic migrations (Hub repo): stage1_health_lkg (entity health/LKG, tables health_check/event/idempotency_key), stage2_addons (advisory).","source":"kb:hub:migrations"}
|
| 36 |
+
{"text":"Events audit trail: every plan/proposal/approval is appended to event(type, entity_uid, payload, ts) for compliance and explainability.","source":"kb:audit:events"}
|
| 37 |
+
{"text":"Checks timeline: HealthCheck captures rolling probe history per entity; use for SLOs and anomaly detection.","source":"kb:observability:checks"}
|
| 38 |
+
{"text":"LKG semantics: last-known-good digest/version allow quick rollback or pin; GET /apps/{id}/bundle redirects to CAS for the pinned artifact.","source":"kb:lkg"}
|
| 39 |
+
{"text":"CAS integration: CAS_BASE_URL points to content-addressed storage; URL format /sha256/{digest}; digest is lowercased.","source":"kb:cas"}
|
| 40 |
+
{"text":"Security posture: JWT roles for write operations (when JWT_PUBLIC_KEY_PEM set); otherwise API tokens or dev-mode; minimal logging of sensitive data.","source":"kb:security:auth"}
|
| 41 |
+
{"text":"Rate limiting: simple in-memory fixed window per (client_ip, route) with default RATE_LIMIT_PER_MIN=600 (override via env).","source":"kb:security:ratelimit"}
|
| 42 |
+
{"text":"Reqlog trace propagation: optional adapter appends trace_id from request.state.request_id into logging context for correlation.","source":"kb:observability:trace"}
|
| 43 |
+
{"text":"Feature flags: Stage‑2 routes guarded by ADVISORIES_ENABLED and PATCHES_FACADE_ENABLED to allow gradual rollout.","source":"kb:featureflags"}
|
| 44 |
+
{"text":"HITL approvals: POST /guardian/approve or /guardian/reject record decision events (202 Accepted) and allow workflows to resume.","source":"kb:hitl"}
|
| 45 |
+
{"text":"Jobs table (optional): lightweight visibility/queue for background workers; not required for Stage‑1; helpful for Autopilot executors.","source":"kb:jobs"}
|
| 46 |
+
{"text":"Deployment topologies: local Docker Compose (Hub+DB+Guardian), cloud k8s for Hub/Guardian with DBaaS, Matrix-AI on HF Space.","source":"kb:deploy:topologies"}
|
| 47 |
+
{"text":"Rollout order: Patch DB → deploy Matrix-AI → patch & redeploy Hub → deploy Guardian → validate flow → enable Stage‑2 flags → optionally enable Autopilot.","source":"kb:deploy:rollout"}
|
| 48 |
+
{"text":"Client contracts: GETs support ETag and X-Total-Count; POSTs support Idempotency-Key; responses are JSON and append-only state updates.","source":"kb:contracts"}
|
| 49 |
+
{"text":"Guardian Autopilot policy file: YAML defining risk thresholds, allowed actions, protected entities, and autonomy level; referenced by AUTOPILOT_POLICY.","source":"kb:guardian:policy:file"}
|
| 50 |
+
{"text":"Safety rails for Autopilot: dry-run mode, global kill-switch, blast-radius caps, rate limits, and human override at any step.","source":"kb:guardian:autopilot:safety"}
|
| 51 |
+
{"text":"Adoption of orphaned apps: detect dead upstreams, pin LKG, mirror artifacts to CAS, open proposal of type 'orphan-adopt' based on policy.","source":"kb:ecosystem:orphanage"}
|
| 52 |
+
{"text":"Consulting value: blueprint for AI-assisted SRE with policy gates, auditability, and staged autonomy; reduces MTTR and operational toil.","source":"kb:value:consulting"}
|
| 53 |
+
{"text":"Community value: portable OSS pattern (HF + FastAPI + Postgres) that runs locally and scales to k8s; additive upgrades encourage safe contribution.","source":"kb:value:community"}
|
| 54 |
+
{"text":"Matrix-AI request contract (/v1/plan): input context summarizing health/last checks; output JSON with {plan_id, steps[], risk, rationale}; strictly validated.","source":"kb:ai:contract"}
|
| 55 |
+
{"text":"Guardian → AI interaction: Guardian sanitizes context, calls /v1/plan with retries/backoff, verifies schema, then emits guardian.plan event.","source":"kb:ai:interaction"}
|
| 56 |
+
{"text":"Hub advisories feed: list potential risks or CVEs associated with entities; consumers can filter by severity and entity_uid.","source":"kb:advisories:semantics"}
|
| 57 |
+
{"text":"Patch proposals workflow: create proposal with diff; on acceptance, optionally update Entity.lkg_* or create jobs; everything audited.","source":"kb:patches:workflow"}
|
| 58 |
+
{"text":"Performance notes: DB indexes optimized for app_uid+ts scans; API supports pagination and caching via ETag to scale reads.","source":"kb:performance"}
|
| 59 |
+
{"text":"Testing strategy: smoke tests for routers; schema migrations idempotent; Autopilot smoke test validates graph assembly and no-ops in safe mode.","source":"kb:testing"}
|
| 60 |
+
{"text":"Observability checklist: ensure health lag SLO, events/sec, 4xx/5xx on APIs, probe latency, plan generation timing, Autopilot actions count.","source":"kb:observability:slo"}
|
| 61 |
+
{"text":"Backup/DR: periodic backups of MatrixDB; versioned CAS artifacts; ability to roll back to previous Hub/Guardian images safely.","source":"kb:operations:dr"}
|
| 62 |
+
{"text":"Compliance: append-only events and idempotent writes help satisfy change-management and audit requirements in regulated environments.","source":"kb:compliance"}
|
| 63 |
+
{"text":"Security hardening: secrets in vault/Space Secrets, minimal scopes, locked egress for probes, signed artifacts, statement timeouts in DB.","source":"kb:security:hardening"}
|
| 64 |
+
{"text":"Known limitations: in-memory rate limiting is single-instance only; for multi-node deploys prefer Redis-based limiter.","source":"kb:limitations:ratelimit"}
|
| 65 |
+
{"text":"Known limitations: Autopilot should start in A1/A2 with conservative policies; sandboxed patchers require extra isolation and tests.","source":"kb:limitations:autopilot"}
|
| 66 |
+
{"text":"API compatibility: patches are additive and use guarded markers; re-running patch scripts is idempotent and safe.","source":"kb:compatibility"}
|
| 67 |
+
{"text":"Versioning: use semantic versioning for entities (versions table) and link artifacts by foreign key; LKG marks last-known-good release.","source":"kb:versioning"}
|
| 68 |
+
{"text":"Data retention suggestion: keep detailed checks for N days, aggregate beyond; events retained longer for audits; configurable per org.","source":"kb:data:retention"}
|
| 69 |
+
{"text":"CLI/SDK consumers: Matrix clients rely on stable Hub contracts, ETags, and LKG semantics to deliver reliable local installs.","source":"kb:clients"}
|
| 70 |
+
{"text":"SRE workflow: measure, propose, approve, execute safe steps, validate via re-probe; everything tracked in events and health.","source":"kb:sre:workflow"}
|