GitHub Actions committed on
Commit
1d47e3c
·
1 Parent(s): c1411e9

Deploy 555915a

Browse files
app/api/chat.py CHANGED
@@ -29,6 +29,28 @@ def _is_criticism(message: str) -> bool:
29
  return any(sig in lowered for sig in _CRITICISM_SIGNALS)
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  async def _generate_follow_ups(
33
  query: str,
34
  answer: str,
@@ -270,17 +292,9 @@ async def chat_endpoint(
270
 
271
  elapsed_ms = int((time.monotonic() - start_time) * 1000)
272
 
273
- # Citation-index filtering single serialisation-time safety net.
274
- # Applies to all paths (RAG, Gemini fast-path, enumeration).
275
- # If the answer cites only [3][5], only sources 3 and 5 are sent;
276
- # all other chunks retrieved but not cited are discarded here.
277
- if final_answer and final_sources:
278
- cited_nums = {int(m) for m in re.findall(r"\[(\d+)\]", final_answer)}
279
- if cited_nums:
280
- final_sources = [
281
- s for i, s in enumerate(final_sources, start=1)
282
- if i in cited_nums
283
- ]
284
 
285
  sources_list = [
286
  s.model_dump() if hasattr(s, "model_dump")
 
29
  return any(sig in lowered for sig in _CRITICISM_SIGNALS)
30
 
31
 
32
+ def _filter_sources_by_citations(answer: str, sources: list) -> list:
33
+ """
34
+ Keep only sources explicitly cited in answer text.
35
+
36
+ If sources are already pre-filtered upstream (e.g. generate node returned
37
+ only cited sources from original indices), citation numbers may no longer
38
+ match local list positions. In that case, keep the original list unchanged.
39
+ """
40
+ if not answer or not sources:
41
+ return sources
42
+
43
+ cited_nums = {int(m) for m in re.findall(r"\[(\d+)\]", answer)}
44
+ if not cited_nums:
45
+ return sources
46
+
47
+ max_cited = max(cited_nums)
48
+ if max_cited > len(sources):
49
+ return sources
50
+
51
+ return [s for i, s in enumerate(sources, start=1) if i in cited_nums]
52
+
53
+
54
  async def _generate_follow_ups(
55
  query: str,
56
  answer: str,
 
292
 
293
  elapsed_ms = int((time.monotonic() - start_time) * 1000)
294
 
295
+ # Citation-index filtering safety net for paths that return full
296
+ # source lists. No-op when sources are already citation-filtered.
297
+ final_sources = _filter_sources_by_citations(final_answer, final_sources)
 
 
 
 
 
 
 
 
298
 
299
  sources_list = [
300
  s.model_dump() if hasattr(s, "model_dump")
app/core/config.py CHANGED
@@ -17,6 +17,9 @@ class Settings(BaseSettings):
17
  QDRANT_URL: str
18
  QDRANT_API_KEY: Optional[str] = None
19
  QDRANT_COLLECTION: str = "knowledge_base"
 
 
 
20
 
21
  # In-memory semantic cache
22
  # Replaces Redis. No external service required.
 
17
  QDRANT_URL: str
18
  QDRANT_API_KEY: Optional[str] = None
19
  QDRANT_COLLECTION: str = "knowledge_base"
20
+ # Keepalive ping interval to touch Qdrant regularly and avoid idle expiry.
21
+ # Default is 6 days (< 1 week) so the database is contacted at least weekly.
22
+ QDRANT_KEEPALIVE_SECONDS: int = 518400
23
 
24
  # In-memory semantic cache
25
  # Replaces Redis. No external service required.
app/main.py CHANGED
@@ -1,10 +1,13 @@
 
1
  from contextlib import asynccontextmanager
2
  import os
3
  import sqlite3
 
4
 
5
  from fastapi import FastAPI, Request
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from fastapi.responses import JSONResponse
 
8
  from slowapi.errors import RateLimitExceeded
9
 
10
  from app.api.admin import router as admin_router
@@ -28,6 +31,40 @@ from qdrant_client import QdrantClient
28
  logger = get_logger(__name__)
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def _sqlite_row_count(db_path: str) -> int:
32
  """Return the current interactions row count, or 0 if the table doesn't exist."""
33
  try:
@@ -39,6 +76,33 @@ def _sqlite_row_count(db_path: str) -> int:
39
  return 0
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  @asynccontextmanager
43
  async def lifespan(app: FastAPI):
44
  settings = get_settings()
@@ -96,8 +160,9 @@ async def lifespan(app: FastAPI):
96
  from app.services.vector_store import VectorStore
97
  from app.security.guard_classifier import GuardClassifier
98
 
 
99
  qdrant = QdrantClient(
100
- url=settings.QDRANT_URL,
101
  api_key=settings.QDRANT_API_KEY,
102
  timeout=60,
103
  )
@@ -105,7 +170,26 @@ async def lifespan(app: FastAPI):
105
  vector_store = VectorStore(qdrant, settings.QDRANT_COLLECTION)
106
  # Idempotent: creates collection if absent so a cold-start before first
107
  # ingest run doesn't crash every search with "collection not found".
108
- vector_store.ensure_collection()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  # Issue 7: shared TPM bucket tracks token consumption across the current 60s
111
  # window. Injected into GroqClient so it can downgrade 70B → 8B automatically
@@ -130,10 +214,28 @@ async def lifespan(app: FastAPI):
130
  app.state.settings = settings
131
  app.state.qdrant = qdrant
132
 
 
 
 
 
 
 
 
 
 
 
 
133
  logger.info("Startup complete")
134
  yield
135
 
136
  logger.info("Shutting down")
 
 
 
 
 
 
 
137
  app.state.semantic_cache = None
138
  app.state.qdrant.close()
139
  # Only attempt to end an MLflow run when DagsHub tracking was enabled at startup.
 
1
+ import asyncio
2
  from contextlib import asynccontextmanager
3
  import os
4
  import sqlite3
5
+ from urllib.parse import urlsplit, urlunsplit
6
 
7
  from fastapi import FastAPI, Request
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from fastapi.responses import JSONResponse
10
+ from qdrant_client.http.exceptions import UnexpectedResponse
11
  from slowapi.errors import RateLimitExceeded
12
 
13
  from app.api.admin import router as admin_router
 
31
  logger = get_logger(__name__)
32
 
33
 
34
def _is_qdrant_not_found(exc: Exception) -> bool:
    """Return True when Qdrant responded with HTTP 404."""
    # A typed client error carries the status code directly.
    if isinstance(exc, UnexpectedResponse) and getattr(exc, "status_code", None) == 404:
        return True
    # Otherwise fall back to sniffing the rendered message for the
    # classic "404 page not found" text a misrouted URL produces.
    text = str(exc)
    return "404" in text and "page not found" in text.lower()
42
+
43
+
44
+ def _normalize_qdrant_url(url: str) -> str:
45
+ """
46
+ Normalize QDRANT_URL to an API base URL.
47
+
48
+ If the configured URL includes a non-root path (for example, a dashboard
49
+ URL), strip the path and keep scheme + host(+port) only.
50
+ """
51
+ raw = (url or "").strip().rstrip("/")
52
+ if not raw:
53
+ return raw
54
+
55
+ if "://" not in raw:
56
+ scheme = "http" if raw.startswith(("localhost", "127.0.0.1")) else "https"
57
+ raw = f"{scheme}://{raw}"
58
+
59
+ parsed = urlsplit(raw)
60
+ if not parsed.netloc:
61
+ return raw
62
+
63
+ if parsed.path and parsed.path != "/":
64
+ return urlunsplit((parsed.scheme, parsed.netloc, "", "", "")).rstrip("/")
65
+ return raw
66
+
67
+
68
  def _sqlite_row_count(db_path: str) -> int:
69
  """Return the current interactions row count, or 0 if the table doesn't exist."""
70
  try:
 
76
  return 0
77
 
78
 
79
async def _qdrant_keepalive_loop(
    qdrant: QdrantClient,
    interval_seconds: int,
    stop_event: asyncio.Event,
) -> None:
    """
    Periodically ping Qdrant so the deployment keeps an active connection.

    Waits ``interval_seconds`` between pings; setting ``stop_event`` ends
    the loop immediately. A non-positive interval disables the loop.
    Uses asyncio.to_thread because qdrant-client methods are synchronous.
    """
    if interval_seconds <= 0:
        return

    while not stop_event.is_set():
        try:
            await asyncio.wait_for(stop_event.wait(), timeout=interval_seconds)
            break  # stop requested while waiting
        except asyncio.TimeoutError:
            # Interval elapsed without a stop request: time to ping.
            # Catch asyncio.TimeoutError (not the builtin TimeoutError) so
            # the loop also works on Python < 3.11, where the two are
            # distinct classes; on 3.11+ they are the same alias.
            pass

        try:
            await asyncio.to_thread(qdrant.get_collections)
            logger.info("Qdrant keepalive ping succeeded")
        except Exception as exc:
            # Best-effort: a failed ping must never crash the app.
            logger.warning("Qdrant keepalive ping failed: %s", exc)
+
105
+
106
  @asynccontextmanager
107
  async def lifespan(app: FastAPI):
108
  settings = get_settings()
 
160
  from app.services.vector_store import VectorStore
161
  from app.security.guard_classifier import GuardClassifier
162
 
163
+ qdrant_url = (settings.QDRANT_URL or "").strip()
164
  qdrant = QdrantClient(
165
+ url=qdrant_url,
166
  api_key=settings.QDRANT_API_KEY,
167
  timeout=60,
168
  )
 
170
  vector_store = VectorStore(qdrant, settings.QDRANT_COLLECTION)
171
  # Idempotent: creates collection if absent so a cold-start before first
172
  # ingest run doesn't crash every search with "collection not found".
173
+ try:
174
+ vector_store.ensure_collection()
175
+ except UnexpectedResponse as exc:
176
+ fallback_url = _normalize_qdrant_url(qdrant_url)
177
+ if _is_qdrant_not_found(exc) and fallback_url and fallback_url != qdrant_url:
178
+ logger.warning(
179
+ "Qdrant URL returned 404, retrying with normalized root URL | original=%s normalized=%s",
180
+ qdrant_url,
181
+ fallback_url,
182
+ )
183
+ qdrant.close()
184
+ qdrant = QdrantClient(
185
+ url=fallback_url,
186
+ api_key=settings.QDRANT_API_KEY,
187
+ timeout=60,
188
+ )
189
+ vector_store = VectorStore(qdrant, settings.QDRANT_COLLECTION)
190
+ vector_store.ensure_collection()
191
+ else:
192
+ raise
193
 
194
  # Issue 7: shared TPM bucket tracks token consumption across the current 60s
195
  # window. Injected into GroqClient so it can downgrade 70B → 8B automatically
 
214
  app.state.settings = settings
215
  app.state.qdrant = qdrant
216
 
217
+ keepalive_stop = asyncio.Event()
218
+ keepalive_task = asyncio.create_task(
219
+ _qdrant_keepalive_loop(
220
+ qdrant=qdrant,
221
+ interval_seconds=settings.QDRANT_KEEPALIVE_SECONDS,
222
+ stop_event=keepalive_stop,
223
+ )
224
+ )
225
+ app.state.qdrant_keepalive_stop = keepalive_stop
226
+ app.state.qdrant_keepalive_task = keepalive_task
227
+
228
  logger.info("Startup complete")
229
  yield
230
 
231
  logger.info("Shutting down")
232
+ app.state.qdrant_keepalive_stop.set()
233
+ try:
234
+ await asyncio.wait_for(app.state.qdrant_keepalive_task, timeout=2)
235
+ except TimeoutError:
236
+ app.state.qdrant_keepalive_task.cancel()
237
+ except Exception:
238
+ pass
239
  app.state.semantic_cache = None
240
  app.state.qdrant.close()
241
  # Only attempt to end an MLflow run when DagsHub tracking was enabled at startup.
app/pipeline/nodes/generate.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import logging
2
  import re
3
  from typing import Callable
@@ -220,6 +221,26 @@ def _dedup_sources(source_refs: list[SourceRef], limit: int | None = None) -> li
220
  return result
221
 
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
224
  # Number of token chunks to buffer before deciding there is no CoT block.
225
  # Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
@@ -434,6 +455,8 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
434
  if reformatted:
435
  full_answer = reformatted
436
 
 
 
437
  # Only surface sources the LLM actually cited, deduplicated by URL so
438
  # multiple chunks from the same document show as one source card.
439
  cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
 
1
+ import asyncio
2
  import logging
3
  import re
4
  from typing import Callable
 
221
  return result
222
 
223
 
224
+ def _normalise_answer_text(answer: str, max_citation_index: int) -> str:
225
+ """
226
+ Clean up model output while preserving citation semantics.
227
+
228
+ - Drops out-of-range citation markers like [99] when only 5 passages exist.
229
+ - Collapses adjacent duplicate citations ([2][2] -> [2]).
230
+ - Normalizes punctuation spacing and excess blank lines.
231
+ """
232
+
233
+ def _keep_valid_citation(match: re.Match[str]) -> str:
234
+ idx = int(match.group(1))
235
+ return f"[{idx}]" if 1 <= idx <= max_citation_index else ""
236
+
237
+ cleaned = re.sub(r"\[(\d+)\]", _keep_valid_citation, answer)
238
+ cleaned = re.sub(r"(\[\d+\])(\1)+", r"\1", cleaned)
239
+ cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
240
+ cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
241
+ return cleaned.strip()
242
+
243
+
244
  def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[PipelineState], dict]: # noqa: ANN001
245
  # Number of token chunks to buffer before deciding there is no CoT block.
246
  # Llama 3.1 8B may omit <think> entirely; Llama 3.3 70B always starts with one.
 
455
  if reformatted:
456
  full_answer = reformatted
457
 
458
+ full_answer = _normalise_answer_text(full_answer, max_citation_index=len(source_refs))
459
+
460
  # Only surface sources the LLM actually cited, deduplicated by URL so
461
  # multiple chunks from the same document show as one source card.
462
  cited_indices = {int(m) for m in re.findall(r"\[(\d+)\]", full_answer)}
tests/test_chat_source_filtering.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from app.api.chat import _filter_sources_by_citations


def test_filter_sources_by_citations_keeps_matching_positions() -> None:
    """Cited positions [1] and [3] select the first and third source."""
    sources = [{"title": "A"}, {"title": "B"}, {"title": "C"}]

    filtered = _filter_sources_by_citations("Uses A [1] and C [3].", sources)

    assert [entry["title"] for entry in filtered] == ["A", "C"]


def test_filter_sources_by_citations_skips_reindex_mismatch() -> None:
    """Pre-filtered lists with out-of-range citation numbers stay intact.

    Upstream may already return only cited sources while the answer keeps
    original citation numbers (e.g. [3][5]); stripping again would be wrong.
    """
    sources = [{"title": "Third source"}, {"title": "Fifth source"}]

    filtered = _filter_sources_by_citations("Summary from [3] and [5].", sources)

    assert filtered == sources


def test_filter_sources_by_citations_no_citations_returns_input() -> None:
    """An answer with no [n] markers leaves the source list unchanged."""
    sources = [{"title": "A"}]

    filtered = _filter_sources_by_citations(
        "No explicit references in this sentence.", sources
    )

    assert filtered == sources
tests/test_qdrant_keepalive.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio

import pytest

from app.main import _qdrant_keepalive_loop


class _FakeQdrant:
    """Stand-in client that only counts get_collections calls."""

    def __init__(self) -> None:
        self.calls = 0

    def get_collections(self) -> None:
        self.calls += 1


@pytest.mark.asyncio
async def test_keepalive_loop_pings_qdrant() -> None:
    # A fractional interval keeps the test fast; the loop only feeds the
    # value to asyncio.wait_for as a timeout, so an int is not required.
    qdrant = _FakeQdrant()
    stop_event = asyncio.Event()

    task = asyncio.create_task(
        _qdrant_keepalive_loop(
            qdrant=qdrant, interval_seconds=0.01, stop_event=stop_event
        )
    )

    # Poll with a deadline instead of one long fixed sleep: the original
    # 1.2 s sleep slowed the suite and was sensitive to scheduler jitter.
    for _ in range(200):
        if qdrant.calls:
            break
        await asyncio.sleep(0.01)

    stop_event.set()
    await asyncio.wait_for(task, timeout=1)

    assert qdrant.calls >= 1


@pytest.mark.asyncio
async def test_keepalive_loop_disabled_when_interval_non_positive() -> None:
    qdrant = _FakeQdrant()
    stop_event = asyncio.Event()

    await _qdrant_keepalive_loop(
        qdrant=qdrant, interval_seconds=0, stop_event=stop_event
    )

    assert qdrant.calls == 0
tests/test_qdrant_url_normalization.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from app.main import _normalize_qdrant_url


def test_normalize_qdrant_url_strips_dashboard_path() -> None:
    """A dashboard URL is reduced to scheme + host only."""
    assert (
        _normalize_qdrant_url("https://example.qdrant.io/dashboard")
        == "https://example.qdrant.io"
    )


def test_normalize_qdrant_url_adds_scheme_for_cloud_host() -> None:
    """A bare cloud host gains an https:// prefix."""
    assert (
        _normalize_qdrant_url("cluster-id.aws.cloud.qdrant.io")
        == "https://cluster-id.aws.cloud.qdrant.io"
    )


def test_normalize_qdrant_url_uses_http_for_localhost() -> None:
    """Local addresses default to plain http."""
    assert _normalize_qdrant_url("localhost:6333") == "http://localhost:6333"