Spaces:

Param20h
/

PDF-Assit_RAG

Running

App Files Files Community

Paramjit Singh committed 4 days ago

Commit

3e08504

unverified ·

2 Parent(s): efb7f42 c752a7a

Merge pull request #255 from Kishalll/feature/graphrag-knowledge-graph

Browse files

Files changed (12) hide show

.env.example +10 -0
Dockerfile +3 -2
backend/app/config.py +16 -0
backend/app/rag/agent.py +33 -2
backend/app/rag/graph_builder.py +185 -0
backend/app/rag/graph_retriever.py +123 -0
backend/app/routes/documents.py +17 -0
backend/requirements.txt +3 -0
backend/tests/test_documents.py +79 -0
backend/tests/test_graph_builder.py +89 -0
backend/tests/test_graph_retriever.py +97 -0
backend/tests/test_graphrag_agent.py +92 -0

.env.example CHANGED Viewed

@@ -122,6 +122,16 @@ HF_TOKEN=your_huggingface_token_here
 # ── RAG Config (Optional — defaults shown) ───────────
 # ── ChromaDB (Vector Store) ─────────────────────────────────
 # Directory where ChromaDB persists its vector index to disk.

 # ── RAG Config (Optional — defaults shown) ───────────
+# ── Knowledge Graph / GraphRAG (Optional — defaults shown) ─────────────────
+# Directory where GraphRAG stores per-document knowledge graphs.
+# Optional — defaults to "./data/graphs"
+# GRAPH_PERSIST_DIR=./data/graphs
+# Maximum number of graph relationships appended to the RAG prompt.
+# Optional — defaults to 12
+# GRAPH_MAX_RELATIONSHIPS=12
 # ── ChromaDB (Vector Store) ─────────────────────────────────
 # Directory where ChromaDB persists its vector index to disk.

Dockerfile CHANGED Viewed

@@ -33,7 +33,8 @@ RUN python -m venv "$VIRTUAL_ENV"
 COPY backend/requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
-    pip install --no-cache-dir -r requirements.txt
 # --------------------------------------------------------
 # Stage 3: Runtime image with only app code and artifacts
@@ -68,7 +69,7 @@ COPY backend/__init__.py ./backend/__init__.py
 COPY --from=frontend-builder /app/frontend/out ./frontend/out
 # Create data directories with proper permissions
-RUN mkdir -p /app/data/uploads /app/data/chroma_db /app/data/huggingface && \
     chown -R appuser:appuser /app
 # Copy entrypoint

 COPY backend/requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    pip install --no-cache-dir -r requirements.txt && \
+    python -m spacy download en_core_web_sm
 # --------------------------------------------------------
 # Stage 3: Runtime image with only app code and artifacts
 COPY --from=frontend-builder /app/frontend/out ./frontend/out
 # Create data directories with proper permissions
+RUN mkdir -p /app/data/uploads /app/data/chroma_db /app/data/graphs /app/data/huggingface && \
     chown -R appuser:appuser /app
 # Copy entrypoint

backend/app/config.py CHANGED Viewed

@@ -45,6 +45,22 @@ class Settings(BaseSettings):
     TOP_K_RETRIEVAL: int = 10
     TOP_K_RERANK: int = 5
     # ── Embeddings (local HuggingFace model) ─────────────
     EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
     EMBEDDING_DIMENSION: int = 384

     TOP_K_RETRIEVAL: int = 10
     TOP_K_RERANK: int = 5
+    # ── Knowledge Graph (GraphRAG) ───────────────────────
+    GRAPH_PERSIST_DIR: str = "./data/graphs"
+    GRAPH_ENTITY_LABELS: set = {
+        "PERSON",
+        "ORG",
+        "GPE",
+        "LOC",
+        "PRODUCT",
+        "EVENT",
+        "WORK_OF_ART",
+        "LAW",
+        "NORP",
+        "FAC",
+    }
+    GRAPH_MAX_RELATIONSHIPS: int = 12
     # ── Embeddings (local HuggingFace model) ─────────────
     EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
     EMBEDDING_DIMENSION: int = 384

backend/app/rag/agent.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import List, Dict, Any, Optional, Generator
 from huggingface_hub import InferenceClient
 from app.config import get_settings
 from app.rag.retriever import retrieve
 from app.rag.prompts import SYSTEM_PROMPT, RAG_PROMPT_TEMPLATE, GREETING_PROMPT
 from app.rag.tracing import trace_function
@@ -48,6 +49,26 @@ def build_context(chunks: List[Dict[str, Any]]) -> str:
     return "\n\n---\n\n".join(context_parts)
 def _chat_messages(system: str, user_content: str) -> list:
     """Build messages list for chat completion API."""
     return [
@@ -108,7 +129,12 @@ def generate_answer(
     # ── Build prompt ─────────────────────────────────
     # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
-    context = build_context(chunks)
     user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
     messages = _chat_messages(SYSTEM_PROMPT, user_content)
@@ -222,7 +248,12 @@ def generate_answer_stream(
     # ── Build prompt ─────────────────────────────────
     # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
-    context = build_context(chunks)
     user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
     messages = _chat_messages(SYSTEM_PROMPT, user_content)

 from huggingface_hub import InferenceClient
 from app.config import get_settings
 from app.rag.retriever import retrieve
+from app.rag.graph_retriever import get_entity_context
 from app.rag.prompts import SYSTEM_PROMPT, RAG_PROMPT_TEMPLATE, GREETING_PROMPT
 from app.rag.tracing import trace_function
     return "\n\n---\n\n".join(context_parts)
+def build_augmented_context(
+    chunks: List[Dict[str, Any]],
+    question: str,
+    user_id: str,
+    document_id: Optional[str] = None,
+) -> str:
+    """Combine vector-retrieved excerpts with GraphRAG relationships."""
+    context = build_context(chunks)
+    graph_context = get_entity_context(
+        query=question,
+        user_id=user_id,
+        document_id=document_id,
+    )
+    if not graph_context:
+        return context
+    return f"{context}\n\n---\n\n{graph_context}"
 def _chat_messages(system: str, user_content: str) -> list:
     """Build messages list for chat completion API."""
     return [
     # ── Build prompt ─────────────────────────────────
     # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
+    context = build_augmented_context(
+        chunks=chunks,
+        question=question,
+        user_id=user_id,
+        document_id=document_id,
+    )
     user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
     messages = _chat_messages(SYSTEM_PROMPT, user_content)
     # ── Build prompt ─────────────────────────────────
     # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
+    context = build_augmented_context(
+        chunks=chunks,
+        question=question,
+        user_id=user_id,
+        document_id=document_id,
+    )
     user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
     messages = _chat_messages(SYSTEM_PROMPT, user_content)

backend/app/rag/graph_builder.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""
+Knowledge graph construction and persistence for GraphRAG.
+"""
+import json
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+import networkx as nx
+from app.config import get_settings
+logger = logging.getLogger(__name__)
+settings = get_settings()
+_nlp = None
+@dataclass(frozen=True)
+class Entity:
+    id: str
+    text: str
+    label: str
+def _safe_id(value: str) -> str:
+    safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("._")
+    return safe or "unknown"
+def get_graph_path(user_id: str, document_id: str) -> Path:
+    """Return the on-disk graph path for one user/document pair."""
+    filename = f"{_safe_id(user_id)}_{_safe_id(document_id)}.json"
+    return Path(settings.GRAPH_PERSIST_DIR) / filename
+def iter_graph_paths(user_id: str) -> Iterable[Path]:
+    """Yield every persisted graph path for a user."""
+    graph_dir = Path(settings.GRAPH_PERSIST_DIR)
+    if not graph_dir.exists():
+        return []
+    prefix = f"{_safe_id(user_id)}_"
+    return sorted(graph_dir.glob(f"{prefix}*.json"))
+def _get_nlp():
+    """Load the spaCy English NER model lazily."""
+    global _nlp
+    if _nlp is None:
+        import spacy
+        try:
+            _nlp = spacy.load("en_core_web_sm")
+        except OSError as exc:
+            raise RuntimeError(
+                "spaCy model 'en_core_web_sm' is required for GraphRAG entity extraction. "
+                "Install it with: python -m spacy download en_core_web_sm"
+            ) from exc
+    return _nlp
+def _entity_id(text: str, label: str) -> str:
+    normalized = " ".join(text.split()).casefold()
+    return f"{label}:{normalized}"
+def extract_entities(text: str) -> List[Entity]:
+    """Extract configured named entities from text."""
+    if not text or not text.strip():
+        return []
+    doc = _get_nlp()(text)
+    entities: Dict[str, Entity] = {}
+    for ent in doc.ents:
+        value = " ".join(ent.text.split()).strip()
+        if not value or ent.label_ not in settings.GRAPH_ENTITY_LABELS:
+            continue
+        entity_id = _entity_id(value, ent.label_)
+        entities.setdefault(
+            entity_id,
+            Entity(id=entity_id, text=value, label=ent.label_),
+        )
+    return list(entities.values())
+def build_graph(chunks: List[Dict[str, Any]]) -> nx.Graph:
+    """Build an entity co-occurrence graph from document chunks."""
+    graph = nx.Graph()
+    for chunk in chunks:
+        text = chunk.get("text", "")
+        page = chunk.get("page")
+        chunk_index = chunk.get("chunk_index")
+        entities = extract_entities(text)
+        for entity in entities:
+            if graph.has_node(entity.id):
+                graph.nodes[entity.id]["mentions"] += 1
+                graph.nodes[entity.id]["pages"].add(page)
+                graph.nodes[entity.id]["chunks"].add(chunk_index)
+            else:
+                graph.add_node(
+                    entity.id,
+                    name=entity.text,
+                    label=entity.label,
+                    mentions=1,
+                    pages={page},
+                    chunks={chunk_index},
+                )
+        for left_index, left in enumerate(entities):
+            for right in entities[left_index + 1:]:
+                if graph.has_edge(left.id, right.id):
+                    graph[left.id][right.id]["weight"] += 1
+                    graph[left.id][right.id]["pages"].add(page)
+                    graph[left.id][right.id]["chunks"].add(chunk_index)
+                else:
+                    graph.add_edge(
+                        left.id,
+                        right.id,
+                        weight=1,
+                        pages={page},
+                        chunks={chunk_index},
+                    )
+    _convert_sets_for_json(graph)
+    return graph
+def _convert_sets_for_json(graph: nx.Graph) -> None:
+    for _, data in graph.nodes(data=True):
+        data["pages"] = sorted(item for item in data.get("pages", []) if item is not None)
+        data["chunks"] = sorted(item for item in data.get("chunks", []) if item is not None)
+    for _, _, data in graph.edges(data=True):
+        data["pages"] = sorted(item for item in data.get("pages", []) if item is not None)
+        data["chunks"] = sorted(item for item in data.get("chunks", []) if item is not None)
+def save_graph(graph: nx.Graph, user_id: str, document_id: str) -> Path:
+    """Persist a graph to disk as node-link JSON."""
+    graph_path = get_graph_path(user_id, document_id)
+    graph_path.parent.mkdir(parents=True, exist_ok=True)
+    data = nx.node_link_data(graph)
+    data["metadata"] = {
+        "user_id": user_id,
+        "document_id": document_id,
+        "node_count": graph.number_of_nodes(),
+        "edge_count": graph.number_of_edges(),
+    }
+    graph_path.write_text(json.dumps(data, ensure_ascii=True, indent=2), encoding="utf-8")
+    logger.info(
+        "Saved knowledge graph for document %s with %s nodes and %s edges",
+        document_id,
+        graph.number_of_nodes(),
+        graph.number_of_edges(),
+    )
+    return graph_path
+def load_graph(user_id: str, document_id: str) -> Optional[nx.Graph]:
+    """Load a persisted graph for one user/document pair."""
+    return load_graph_path(get_graph_path(user_id, document_id))
+def load_graph_path(graph_path: Path) -> Optional[nx.Graph]:
+    """Load a graph from a concrete JSON path."""
+    if not graph_path.exists():
+        return None
+    data = json.loads(graph_path.read_text(encoding="utf-8"))
+    return nx.node_link_graph(data)
+def delete_graph(user_id: str, document_id: str) -> None:
+    """Delete a persisted graph file if it exists."""
+    get_graph_path(user_id, document_id).unlink(missing_ok=True)

backend/app/rag/graph_retriever.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""
+Knowledge graph retrieval for augmenting RAG context.
+"""
+import logging
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+import networkx as nx
+from app.config import get_settings
+from app.rag.graph_builder import (
+    extract_entities,
+    iter_graph_paths,
+    load_graph,
+    load_graph_path,
+)
+logger = logging.getLogger(__name__)
+settings = get_settings()
+def _candidate_graphs(user_id: str, document_id: Optional[str]) -> Iterable[nx.Graph]:
+    if document_id:
+        graph = load_graph(user_id, document_id)
+        return [graph] if graph is not None else []
+    graphs = []
+    for path in iter_graph_paths(user_id):
+        graph = load_graph_path(path)
+        if graph is not None:
+            graphs.append(graph)
+    return graphs
+def _node_name(graph: nx.Graph, node_id: str) -> str:
+    return graph.nodes[node_id].get("name", node_id.split(":", 1)[-1])
+def _match_query_nodes(graph: nx.Graph, query: str) -> Set[str]:
+    query_entities = extract_entities(query)
+    matched = {entity.id for entity in query_entities if graph.has_node(entity.id)}
+    if matched:
+        return matched
+    query_text = query.casefold()
+    for node_id, data in graph.nodes(data=True):
+        name = data.get("name", "").casefold()
+        if name and name in query_text:
+            matched.add(node_id)
+    return matched
+def _format_pages(pages: List[int]) -> str:
+    if not pages:
+        return "unknown pages"
+    if len(pages) == 1:
+        return f"page {pages[0]}"
+    return "pages " + ", ".join(str(page) for page in pages[:4])
+def _relationship_key(left: str, right: str) -> Tuple[str, str]:
+    return tuple(sorted((left, right)))
+def get_entity_context(
+    query: str,
+    user_id: str,
+    document_id: Optional[str] = None,
+) -> str:
+    """Return compact graph relationship context relevant to the query."""
+    relationships: Dict[Tuple[str, str], Dict[str, object]] = {}
+    try:
+        graphs = _candidate_graphs(user_id=user_id, document_id=document_id)
+        for graph in graphs:
+            matched_nodes = _match_query_nodes(graph, query)
+            for node_id in matched_nodes:
+                neighbors = sorted(
+                    graph.neighbors(node_id),
+                    key=lambda neighbor: graph[node_id][neighbor].get("weight", 0),
+                    reverse=True,
+                )
+                for neighbor_id in neighbors:
+                    edge = graph[node_id][neighbor_id]
+                    left = _node_name(graph, node_id)
+                    right = _node_name(graph, neighbor_id)
+                    key = _relationship_key(left.casefold(), right.casefold())
+                    existing = relationships.setdefault(
+                        key,
+                        {
+                            "left": left,
+                            "right": right,
+                            "weight": 0,
+                            "pages": set(),
+                        },
+                    )
+                    existing["weight"] = int(existing["weight"]) + int(edge.get("weight", 1))
+                    existing["pages"].update(edge.get("pages", []))
+    except Exception as exc:
+        logger.warning("GraphRAG context retrieval failed: %s", exc)
+        return ""
+    if not relationships:
+        return ""
+    ranked = sorted(
+        relationships.values(),
+        key=lambda item: int(item["weight"]),
+        reverse=True,
+    )[: settings.GRAPH_MAX_RELATIONSHIPS]
+    lines = ["## Knowledge Graph Context"]
+    for item in ranked:
+        pages = sorted(item["pages"])
+        lines.append(
+            f"- {item['left']} is related to {item['right']} "
+            f"through document co-occurrence on {_format_pages(pages)} "
+            f"(strength: {item['weight']})."
+        )
+    return "\n".join(lines)

backend/app/routes/documents.py CHANGED Viewed

@@ -172,6 +172,15 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
             db.commit()
             return
         # Store embeddings in ChromaDB
         chunk_count = store_chunks(
             chunks=chunks,
@@ -629,6 +638,14 @@ def delete_document(
     except Exception as e:
         logger.warning(f"Error deleting vectors: {e}")
     # Delete from database (cascades to chat messages)
     db.delete(doc)
     db.commit()

             db.commit()
             return
+        # Build and persist a lightweight entity co-occurrence graph for GraphRAG.
+        try:
+            from app.rag.graph_builder import build_graph, save_graph
+            graph = build_graph(chunks)
+            save_graph(graph, user_id=user_id, document_id=document_id)
+        except Exception as e:
+            logger.warning(f"Could not build knowledge graph for document {document_id}: {e}")
         # Store embeddings in ChromaDB
         chunk_count = store_chunks(
             chunks=chunks,
     except Exception as e:
         logger.warning(f"Error deleting vectors: {e}")
+    # Delete persisted knowledge graph
+    try:
+        from app.rag.graph_builder import delete_graph
+        delete_graph(user_id=user.id, document_id=document_id)
+    except Exception as e:
+        logger.warning(f"Error deleting knowledge graph: {e}")
     # Delete from database (cascades to chat messages)
     db.delete(doc)
     db.commit()

backend/requirements.txt CHANGED Viewed

@@ -41,6 +41,9 @@ transformers
 # Vector Database
 chromadb
 # LLM Inference
 huggingface-hub

 # Vector Database
 chromadb
+networkx>=3.3
+spacy>=3.7
+neo4j>=5.0
 # LLM Inference
 huggingface-hub

backend/tests/test_documents.py CHANGED Viewed

@@ -1,3 +1,9 @@
 def test_api_health(client):
     response = client.get("/api/health")
@@ -32,3 +38,76 @@ def test_upload_rejects_unsupported_extension_before_deep_validation(client, aut
     assert response.status_code == 400
     assert "not supported" in response.json()["detail"]

+import types
+from app.models import Document
+from app.routes.documents import _ingest_document
 def test_api_health(client):
     response = client.get("/api/health")
     assert response.status_code == 400
     assert "not supported" in response.json()["detail"]
+def test_ingest_document_builds_and_saves_graph(db_session, monkeypatch, tmp_path, user):
+    document = Document(
+        user_id=user.id,
+        filename="graph.txt",
+        original_name="graph.txt",
+        file_size=128,
+        status="pending",
+    )
+    db_session.add(document)
+    db_session.commit()
+    db_session.refresh(document)
+    user_id = user.id
+    document_id = document.id
+    chunks = [{"text": "OpenAI works with Microsoft.", "page": 1, "chunk_index": 0}]
+    saved = {}
+    monkeypatch.setattr("app.routes.documents.get_page_count", lambda filepath: 1)
+    monkeypatch.setattr("app.routes.documents.chunk_document", lambda filepath: chunks)
+    monkeypatch.setattr("app.routes.documents.store_chunks", lambda **kwargs: len(chunks))
+    monkeypatch.setattr("app.database.SessionLocal", lambda: db_session)
+    fake_summary = types.ModuleType("app.rag.summarizer")
+    fake_summary.generate_document_summary = lambda filepath, max_sentences=2: "Summary"
+    monkeypatch.setitem(__import__("sys").modules, "app.rag.summarizer", fake_summary)
+    monkeypatch.setattr(
+        "app.rag.graph_builder.build_graph",
+        lambda received_chunks: {"chunks": received_chunks},
+    )
+    monkeypatch.setattr(
+        "app.rag.graph_builder.save_graph",
+        lambda graph, user_id, document_id: saved.update(
+            {"graph": graph, "user_id": user_id, "document_id": document_id}
+        ),
+    )
+    _ingest_document(
+        document_id=document_id,
+        filepath=str(tmp_path / "graph.txt"),
+        original_name=document.original_name,
+        user_id=user_id,
+    )
+    assert saved == {
+        "graph": {"chunks": chunks},
+        "user_id": user_id,
+        "document_id": document_id,
+    }
+    refreshed = db_session.get(Document, document_id)
+    assert refreshed.status == "ready"
+    assert refreshed.chunk_count == 1
+def test_delete_document_removes_knowledge_graph(client, auth_headers, ready_document, monkeypatch):
+    deleted = {}
+    monkeypatch.setattr("app.routes.documents.delete_document_chunks", lambda **kwargs: None)
+    monkeypatch.setattr(
+        "app.rag.graph_builder.delete_graph",
+        lambda user_id, document_id: deleted.update(
+            {"user_id": user_id, "document_id": document_id}
+        ),
+    )
+    response = client.delete(
+        f"/api/v1/documents/{ready_document.id}",
+        headers=auth_headers,
+    )
+    assert response.status_code == 200
+    assert deleted["document_id"] == ready_document.id

backend/tests/test_graph_builder.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import json
+from app.rag import graph_builder
+class FakeEntity:
+    def __init__(self, text, label):
+        self.text = text
+        self.label_ = label
+class FakeDoc:
+    def __init__(self, entities):
+        self.ents = entities
+class FakeNlp:
+    def __call__(self, text):
+        entities = []
+        for value, label in (
+            ("OpenAI", "ORG"),
+            ("Microsoft", "ORG"),
+            ("Azure", "PRODUCT"),
+            ("Ignored Date", "DATE"),
+        ):
+            if value in text:
+                entities.append(FakeEntity(value, label))
+        return FakeDoc(entities)
+def test_extract_entities_filters_configured_labels(monkeypatch):
+    monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
+    entities = graph_builder.extract_entities("OpenAI works with Microsoft on Ignored Date")
+    assert {entity.text for entity in entities} == {"OpenAI", "Microsoft"}
+    assert {entity.label for entity in entities} == {"ORG"}
+def test_build_graph_tracks_entity_edges_and_weights(monkeypatch):
+    monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
+    chunks = [
+        {
+            "text": "OpenAI works with Microsoft.",
+            "page": 1,
+            "chunk_index": 0,
+        },
+        {
+            "text": "OpenAI and Microsoft use Azure.",
+            "page": 2,
+            "chunk_index": 1,
+        },
+    ]
+    graph = graph_builder.build_graph(chunks)
+    openai_id = "ORG:openai"
+    microsoft_id = "ORG:microsoft"
+    azure_id = "PRODUCT:azure"
+    assert graph.nodes[openai_id]["name"] == "OpenAI"
+    assert graph.nodes[openai_id]["pages"] == [1, 2]
+    assert graph[openai_id][microsoft_id]["weight"] == 2
+    assert graph[openai_id][microsoft_id]["pages"] == [1, 2]
+    assert graph.has_edge(microsoft_id, azure_id)
+def test_save_load_and_delete_graph_roundtrip(tmp_path, monkeypatch):
+    monkeypatch.setattr(graph_builder.settings, "GRAPH_PERSIST_DIR", str(tmp_path))
+    graph = graph_builder.build_graph([])
+    graph.add_node("ORG:openai", name="OpenAI", label="ORG", mentions=1, pages=[1], chunks=[0])
+    path = graph_builder.save_graph(graph, user_id="user-1", document_id="doc-1")
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    loaded = graph_builder.load_graph(user_id="user-1", document_id="doc-1")
+    assert payload["metadata"]["document_id"] == "doc-1"
+    assert loaded.nodes["ORG:openai"]["name"] == "OpenAI"
+    graph_builder.delete_graph(user_id="user-1", document_id="doc-1")
+    assert not path.exists()
+def test_empty_chunks_produce_empty_graph(monkeypatch):
+    monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
+    graph = graph_builder.build_graph([])
+    assert graph.number_of_nodes() == 0
+    assert graph.number_of_edges() == 0

backend/tests/test_graph_retriever.py ADDED Viewed

	@@ -0,0 +1,97 @@

+from app.rag import graph_builder, graph_retriever
+class FakeEntity:
+    def __init__(self, text, label):
+        self.text = text
+        self.label_ = label
+class FakeDoc:
+    def __init__(self, entities):
+        self.ents = entities
+class FakeNlp:
+    def __call__(self, text):
+        entities = []
+        for value, label in (
+            ("OpenAI", "ORG"),
+            ("Microsoft", "ORG"),
+            ("Azure", "PRODUCT"),
+        ):
+            if value in text:
+                entities.append(FakeEntity(value, label))
+        return FakeDoc(entities)
+def _save_sample_graph(tmp_path, monkeypatch, user_id="user-1", document_id="doc-1"):
+    monkeypatch.setattr(graph_builder.settings, "GRAPH_PERSIST_DIR", str(tmp_path))
+    monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
+    graph = graph_builder.build_graph(
+        [
+            {
+                "text": "OpenAI works with Microsoft.",
+                "page": 1,
+                "chunk_index": 0,
+            },
+            {
+                "text": "Microsoft deploys Azure.",
+                "page": 2,
+                "chunk_index": 1,
+            },
+        ]
+    )
+    graph_builder.save_graph(graph, user_id=user_id, document_id=document_id)
+def test_get_entity_context_returns_one_hop_relationships(tmp_path, monkeypatch):
+    _save_sample_graph(tmp_path, monkeypatch)
+    context = graph_retriever.get_entity_context(
+        query="How is OpenAI related to Microsoft?",
+        user_id="user-1",
+        document_id="doc-1",
+    )
+    assert "## Knowledge Graph Context" in context
+    assert "OpenAI" in context
+    assert "Microsoft" in context
+    assert "page 1" in context
+def test_get_entity_context_returns_empty_for_no_match(tmp_path, monkeypatch):
+    _save_sample_graph(tmp_path, monkeypatch)
+    context = graph_retriever.get_entity_context(
+        query="What about Google?",
+        user_id="user-1",
+        document_id="doc-1",
+    )
+    assert context == ""
+def test_get_entity_context_returns_empty_for_missing_graph(tmp_path, monkeypatch):
+    monkeypatch.setattr(graph_builder.settings, "GRAPH_PERSIST_DIR", str(tmp_path))
+    monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
+    context = graph_retriever.get_entity_context(
+        query="OpenAI",
+        user_id="user-1",
+        document_id="missing",
+    )
+    assert context == ""
+def test_get_entity_context_isolates_users(tmp_path, monkeypatch):
+    _save_sample_graph(tmp_path, monkeypatch, user_id="user-1", document_id="doc-1")
+    context = graph_retriever.get_entity_context(
+        query="OpenAI",
+        user_id="user-2",
+        document_id="doc-1",
+    )
+    assert context == ""

backend/tests/test_graphrag_agent.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from app.rag import agent
+class FakeMessage:
+    content = "Graph answer"
+class FakeChoice:
+    message = FakeMessage()
+class FakeResponse:
+    choices = [FakeChoice()]
+class FakeClient:
+    def __init__(self):
+        self.messages = None
+    def chat_completion(self, messages, **kwargs):
+        self.messages = messages
+        return FakeResponse()
+def test_generate_answer_appends_graph_context_without_changing_sources(monkeypatch):
+    client = FakeClient()
+    chunks = [
+        {
+            "text": "Vector context",
+            "filename": "doc.pdf",
+            "page": 1,
+            "score": 0.9,
+            "confidence": 100.0,
+        }
+    ]
+    monkeypatch.setattr(agent, "get_llm_client", lambda: client)
+    monkeypatch.setattr(agent, "retrieve", lambda **kwargs: chunks)
+    monkeypatch.setattr(
+        agent,
+        "get_entity_context",
+        lambda **kwargs: "## Knowledge Graph Context\n- OpenAI is related to Microsoft on page 1.",
+    )
+    result = agent.generate_answer("How are OpenAI and Microsoft related?", "user-1", "doc-1")
+    prompt = client.messages[1]["content"]
+    assert "Vector context" in prompt
+    assert "Knowledge Graph Context" in prompt
+    assert result["sources"] == [
+        {
+            "text": "Vector context",
+            "filename": "doc.pdf",
+            "page": 1,
+            "score": 0.9,
+            "confidence": 100.0,
+        }
+    ]
+def test_generate_answer_stream_appends_graph_context(monkeypatch):
+    captured = {}
+    class StreamingClient:
+        def chat_completion(self, messages, **kwargs):
+            captured["messages"] = messages
+            return iter([])
+    monkeypatch.setattr(agent, "get_llm_client", lambda: StreamingClient())
+    monkeypatch.setattr(
+        agent,
+        "retrieve",
+        lambda **kwargs: [
+            {
+                "text": "Vector stream context",
+                "filename": "doc.pdf",
+                "page": 1,
+                "score": 0.9,
+                "confidence": 100.0,
+            }
+        ],
+    )
+    monkeypatch.setattr(
+        agent,
+        "get_entity_context",
+        lambda **kwargs: "## Knowledge Graph Context\n- OpenAI is related to Microsoft on page 1.",
+    )
+    events = list(agent.generate_answer_stream("OpenAI Microsoft", "user-1", "doc-1"))
+    assert events[0].startswith("data:")
+    assert "Knowledge Graph Context" in captured["messages"][1]["content"]