Spaces:

cmd0160
/

abalone_chat_application

Sleeping

App Files Files Community

cmd0160 committed on Dec 8, 2025

Commit

ee749be

1 Parent(s): 67fb4c0

Adding files

Browse files

Files changed (16) hide show

.DS_Store +0 -0
.gitignore +17 -0
.idea/.gitignore +0 -8
.idea/material_theme_project_new.xml +0 -12
app.py +53 -28
src/__pycache__/ingest.cpython-310.pyc +0 -0
src/__pycache__/qa_chain.cpython-310.pyc +0 -0
src/__pycache__/vectorstore.cpython-310.pyc +0 -0
src/ingest.py +96 -0
src/kg/extract.py +75 -0
src/kg/retriever.py +28 -0
src/kg/store.py +91 -0
src/utils/rag_runtime.py +78 -0
src/vectorstore.py +4 -0
vectorstore/chroma-collections.parquet +0 -3
vectorstore/chroma-embeddings.parquet +0 -3

.DS_Store DELETED Viewed

Binary file (8.2 kB)

.gitignore ADDED Viewed

	@@ -0,0 +1,17 @@

+# ignore generated vectorstore data and caches
+/vectorstore/
+# python cache
+__pycache__/
+*.pyc
+# macOS finder
+.DS_Store
+# IDE
+.idea/
+# optional: ignore local sample data if you don't want it in repo
+# /data/
+# tests / validation artifacts
+# /validation/
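+# EOF                 (shell heredoc terminator pasted by mistake; not an ignore pattern)
+# git add .gitignore  (shell command pasted by mistake; not an ignore pattern)
+# do NOT commit this yet if you prefer to review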

.idea/.gitignore DELETED Viewed

@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml

.idea/material_theme_project_new.xml DELETED Viewed

@@ -1,12 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="MaterialThemeProjectNewConfig">
-    <option name="metadata">
-      <MTProjectMetadataState>
-        <option name="migrated" value="true" />
-        <option name="pristineConfig" value="false" />
-        <option name="userId" value="-3a906995:19986b060ad:-7ffc" />
-      </MTProjectMetadataState>
-    </option>
-  </component>
-</project>

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import List, Dict, Tuple, Optional
 # Disable telemetry for LangChain and Chroma by default
 os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
@@ -12,6 +12,7 @@ from src.utils.rag_runtime import (
     run_ingest_cli,
     build_or_load_retriever_cached,
     get_chain_cached,
 )
 from src.utils.metrics import compute_quality_scores
 from src.utils.formatting import format_source_label
@@ -25,12 +26,22 @@ class AbaloneRAGApp:
         """Initialize the Streamlit page and application state."""
         st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
-        st.title("Abalone RAG Chatbot")
-        st.write(
-            "Ask natural-language questions about abalone biology, ecology, "
-            "and research datasets. The app uses a local Chroma vectorstore "
-            "and OpenAI to retrieve and answer questions accurately."
-        )
         # Data and vectorstore locations
         self.data_dir = "./data"
@@ -49,17 +60,22 @@ class AbaloneRAGApp:
             self.temperature,
             self.answer_length,
             self.style_instruction,
-            self.rebuild_clicked,
         ) = self._build_sidebar()
         # QA chain instance (loaded lazily)
-        self.chain: Optional[object] = None
     # ------------------------------------------------------------------
     # Sidebar configuration
     # ------------------------------------------------------------------
-    def _build_sidebar(self) -> Tuple[str, int, str, float, str, str, bool]:
         """Render all sidebar controls and return model configuration.
         Returns:
@@ -95,7 +111,7 @@ class AbaloneRAGApp:
         retrieval_mode_label = st.sidebar.selectbox(
             "Retrieval mode",
             ["MMR (diverse)", "Similarity", "Hybrid (dense + MMR)"],
-            index=0,
         )
         retrieval_mode_map = {
             "MMR (diverse)": "mmr",
@@ -104,6 +120,12 @@ class AbaloneRAGApp:
         }
         retrieval_mode = retrieval_mode_map[retrieval_mode_label]
         st.sidebar.markdown("---")
         # Answer style
@@ -123,20 +145,9 @@ class AbaloneRAGApp:
             index=1,
         )
         st.sidebar.markdown("---")
-        # Vectorstore controls
-        st.sidebar.header("Vectorstore Controls")
-        rebuild_clicked = st.sidebar.button(
-            "Rebuild vectorstore",
-            use_container_width=True,
-        )
-        st.sidebar.markdown(
-            "<small>Use this when you add or modify files in <code>./data</code>.</small>",
-            unsafe_allow_html=True,
-        )
         # Build style instruction for the LLM
         length_instruction_map = {
@@ -158,7 +169,8 @@ class AbaloneRAGApp:
             temperature,
             answer_length,
             style_instruction,
-            rebuild_clicked,
         )
     # ------------------------------------------------------------------
@@ -238,6 +250,7 @@ class AbaloneRAGApp:
             st.session_state["rebuild_pending"] = False
             st.info("Rebuild canceled.")
     # ------------------------------------------------------------------
     # Chain loading
     # ------------------------------------------------------------------
@@ -260,6 +273,7 @@ class AbaloneRAGApp:
         else:
             st.success("Knowledge base and model are ready.")
     # ------------------------------------------------------------------
     # Chat UI
     # ------------------------------------------------------------------
@@ -296,9 +310,20 @@ class AbaloneRAGApp:
             styled_question = self.style_instruction + "\n\nQuestion: " + user_input
-            result = self.chain(
-                {"question": styled_question, "chat_history": prior_history}
-            )
             answer = (
                     result.get("answer")

 import os
+from typing import List, Dict, Tuple, Optional, Any
 # Disable telemetry for LangChain and Chroma by default
 os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
     run_ingest_cli,
     build_or_load_retriever_cached,
     get_chain_cached,
+    answer_with_kg,
 )
 from src.utils.metrics import compute_quality_scores
 from src.utils.formatting import format_source_label
         """Initialize the Streamlit page and application state."""
         st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
+        # Header row: title/subtitle on the left, rebuild action on the right
+        header_col, action_col = st.columns([5, 1])
+        with header_col:
+            st.title("Abalone RAG Chatbot")
+            st.write(
+                "Ask natural-language questions about abalone biology, ecology, "
+                "and research datasets. The app uses a local Chroma vectorstore "
+                "and OpenAI to retrieve and answer questions accurately."
+            )
+        with action_col:
+            # A compact, prominent rebuild control placed in the header
+            self._top_rebuild_clicked = st.button(
+                "Rebuild vectorstore",
+                key="top_rebuild",
+                use_container_width=True,
+            )
         # Data and vectorstore locations
         self.data_dir = "./data"
             self.temperature,
             self.answer_length,
             self.style_instruction,
+            self.use_kg,
+            self.kg_hops,
         ) = self._build_sidebar()
+        # Ensure rebuild_clicked reflects the top-right control
+        self.rebuild_clicked = bool(getattr(self, "_top_rebuild_clicked", False))
         # QA chain instance (loaded lazily)
+        # typing as Any avoids static warnings when calling the chain object
+        self.chain: Optional[Any] = None
     # ------------------------------------------------------------------
     # Sidebar configuration
     # ------------------------------------------------------------------
+    def _build_sidebar(self) -> Tuple[str, int, str, float, str, str, bool, int]:
         """Render all sidebar controls and return model configuration.
         Returns:
         retrieval_mode_label = st.sidebar.selectbox(
             "Retrieval mode",
             ["MMR (diverse)", "Similarity", "Hybrid (dense + MMR)"],
+            index=2,
         )
         retrieval_mode_map = {
             "MMR (diverse)": "mmr",
         }
         retrieval_mode = retrieval_mode_map[retrieval_mode_label]
+        # Knowledge graph toggle (placed under Retrieval Configuration)
+        st.sidebar.markdown("---")
+        st.sidebar.header("Knowledge Graph")
+        use_kg = st.sidebar.checkbox("Use knowledge graph for retrieval", value=False)
+        kg_hops = st.sidebar.slider("KG hops", min_value=1, max_value=3, value=1)
         st.sidebar.markdown("---")
         # Answer style
             index=1,
         )
+        # (Vectorstore rebuild moved to top-right action button)
         st.sidebar.markdown("---")
+        st.sidebar.markdown("<small>To rebuild the vectorstore use the top-right \"Rebuild vectorstore\" button.</small>", unsafe_allow_html=True)
         # Build style instruction for the LLM
         length_instruction_map = {
             temperature,
             answer_length,
             style_instruction,
+            use_kg,
+            kg_hops,
         )
     # ------------------------------------------------------------------
             st.session_state["rebuild_pending"] = False
             st.info("Rebuild canceled.")
     # ------------------------------------------------------------------
     # Chain loading
     # ------------------------------------------------------------------
         else:
             st.success("Knowledge base and model are ready.")
     # ------------------------------------------------------------------
     # Chat UI
     # ------------------------------------------------------------------
             styled_question = self.style_instruction + "\n\nQuestion: " + user_input
+            if self.chain is None:
+                st.error("Model not initialized. Please wait for the knowledge base and model to be ready or rebuild the vectorstore.")
+                return
+            if getattr(self, 'use_kg', False):
+                result = answer_with_kg(
+                    self.chain,
+                    styled_question,
+                    prior_history,
+                    persist_dir=self.persist_dir,
+                    kg_hops=self.kg_hops,
+                )
+            else:
+                result = self.chain({"question": styled_question, "chat_history": prior_history})
             answer = (
                     result.get("answer")

src/__pycache__/ingest.cpython-310.pyc DELETED Viewed

Binary file (2.18 kB)

src/__pycache__/qa_chain.cpython-310.pyc CHANGED Viewed

Binary files a/src/__pycache__/qa_chain.cpython-310.pyc and b/src/__pycache__/qa_chain.cpython-310.pyc differ

src/__pycache__/vectorstore.cpython-310.pyc DELETED Viewed

Binary file (2.1 kB)

src/ingest.py CHANGED Viewed

@@ -1,10 +1,28 @@
 import argparse
 import os
 from langchain_community.document_loaders import DirectoryLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import OpenAIEmbeddings
 def load_documents(data_dir: str):
@@ -54,20 +72,98 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
     os.makedirs(persist_dir, exist_ok=True)
     Chroma.from_documents(
         split_docs,
         embedding=embeddings,
         persist_directory=persist_dir,
     )
     print(f"Vectorstore built and persisted to {persist_dir}")
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--data-dir", type=str, default="./data")
     parser.add_argument("--persist-dir", type=str, default="./vectorstore")
     parser.add_argument("--chunk-size", type=int, default=800)
     parser.add_argument("--chunk-overlap", type=int, default=200)
     args = parser.parse_args()
     ingest(

 import argparse
 import os
+<<<<<<< HEAD
 from langchain_community.document_loaders import DirectoryLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import OpenAIEmbeddings
+=======
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import OpenAIEmbeddings
+# New: KG integration imports
+import uuid
+import json
+try:
+    from src.kg.extract import extract_triples_with_llm
+    from src.kg.store import KGStore
+    from src.kg.retriever import KGRetriever
+    _HAS_KG = True
+except Exception:
+    _HAS_KG = False
+>>>>>>> ba5a1f4 (Adding kg to deployment)
 def load_documents(data_dir: str):
     os.makedirs(persist_dir, exist_ok=True)
+<<<<<<< HEAD
+=======
+    # Prepare KG store and local chunk index
+    chunks_index = {}
+    kg_path = os.path.join(persist_dir, "kg_store.ttl")
+    if _HAS_KG:
+        try:
+            kg = KGStore(path=kg_path)
+        except Exception:
+            kg = None
+    else:
+        kg = None
+    # Annotate chunks with stable chunk_id and optionally extract/link KG triples
+    for d in split_docs:
+        meta = d.metadata or {}
+        chunk_id = meta.get("chunk_id") or str(uuid.uuid4())
+        if not meta:
+            d.metadata = {}
+        d.metadata["chunk_id"] = chunk_id
+        # Save minimal chunk index for runtime retrieval (text and source metadata)
+        chunks_index[chunk_id] = {
+            "text": getattr(d, "page_content", "") or getattr(d, "content", ""),
+            "metadata": d.metadata,
+        }
+        # If KG is available, attempt to extract triples and link the chunk
+        if kg is not None:
+            try:
+                triples = extract_triples_with_llm(chunks_index[chunk_id]["text"], max_triples=4)
+                for t in triples:
+                    try:
+                        kg.add_triple(
+                            t.get("subject"),
+                            t.get("predicate"),
+                            t.get("object"),
+                            provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
+                        )
+                        kg.link_chunk_to_entity(
+                            chunk_id,
+                            t.get("subject"),
+                            sentence=t.get("sentence"),
+                            confidence=t.get("confidence"),
+                        )
+                    except Exception:
+                        # non-fatal: continue
+                        continue
+            except Exception:
+                # LLM extraction failed or not configured; skip KG extraction
+                pass
+    # Persist Chroma vectorstore
+>>>>>>> ba5a1f4 (Adding kg to deployment)
     Chroma.from_documents(
         split_docs,
         embedding=embeddings,
         persist_directory=persist_dir,
     )
     print(f"Vectorstore built and persisted to {persist_dir}")
+<<<<<<< HEAD
+=======
+    # Persist chunks index for runtime (simple json mapping)
+    try:
+        idx_path = os.path.join(persist_dir, "chunks_index.json")
+        with open(idx_path, "w", encoding="utf-8") as fh:
+            json.dump(chunks_index, fh)
+    except Exception:
+        pass
+    # Persist KG if available
+    if kg is not None:
+        try:
+            kg.save()
+            print(f"KG persisted to {kg_path}")
+        except Exception:
+            pass
+>>>>>>> ba5a1f4 (Adding kg to deployment)
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--data-dir", type=str, default="./data")
     parser.add_argument("--persist-dir", type=str, default="./vectorstore")
+<<<<<<< HEAD
     parser.add_argument("--chunk-size", type=int, default=800)
     parser.add_argument("--chunk-overlap", type=int, default=200)
+=======
+    parser.add_argument("--chunk-size", type=int, default=200)
+    parser.add_argument("--chunk-overlap", type=int, default=50)
+>>>>>>> ba5a1f4 (Adding kg to deployment)
     args = parser.parse_args()
     ingest(

src/kg/extract.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""LLM-backed triple/entity extractor for PoC.
+This module provides a small wrapper that asks the LLM (via LangChain ChatOpenAI)
+to extract a small set of triples from a text chunk. It returns a list of dicts:
+    {"subject": ..., "predicate": ..., "object": ..., "sentence": ..., "confidence": float}
+The implementation is intentionally conservative and small for a Spaces-compatible PoC.
+"""
+from typing import List, Dict
+import json
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import HumanMessage, SystemMessage
def extract_triples_with_llm(text: str, max_triples: int = 6, model_name: str = "gpt-3.5-turbo") -> List[Dict]:
    """Extract factual triples from ``text`` using a chat LLM.

    Note: requires OPENAI_API_KEY in env for ChatOpenAI to work.

    Args:
        text: The text chunk to analyse.
        max_triples: Upper bound on the number of triples requested from the
            model and returned to the caller.
        model_name: Chat model name passed to ``ChatOpenAI``.

    Returns:
        A list of dicts with keys ``subject``, ``predicate``, ``object``,
        ``sentence`` (all ``str``) and ``confidence`` (``float`` in [0.0, 1.0]).
        Returns ``[]`` for empty text, a non-positive ``max_triples``, or a
        model reply that contains no usable JSON.
    """
    # Nothing to extract: skip the (billable) LLM round trip entirely.
    if not text or not text.strip() or max_triples <= 0:
        return []

    prompt = (
        "You are an assistant that extracts factual triples from a short text.\n"
        "Return a JSON array where each element is an object with keys: subject, predicate, object, sentence, confidence.\n"
        "Be concise and only return JSON. Confidence should be a float between 0.0 and 1.0.\n"
        f"Limit results to at most {max_triples} triples.\n\n"
        "Text:\n<<<TEXT_START>>>\n"
        + text
        + "\n<<<TEXT_END>>>\n"
    )
    # system message to instruct format strictly
    system = SystemMessage(content="You output only JSON arrays. Do not add any extra text.")
    human = HumanMessage(content=prompt)
    llm = ChatOpenAI(model_name=model_name, temperature=0.0)
    resp = llm([system, human])
    raw = resp.content.strip()
    return _clean_triples(_parse_json_array(raw), max_triples)


def _parse_json_array(raw: str) -> list:
    """Best-effort decode of ``raw`` into a list; ``[]`` when nothing usable is found."""
    try:
        data = json.loads(raw)
    except ValueError:
        # The reply may wrap the array in extra text; retry on the outermost
        # "[ ... ]" span.
        start = raw.find("[")
        end = raw.rfind("]")
        if start == -1 or end <= start:
            return []
        try:
            data = json.loads(raw[start:end + 1])
        except ValueError:
            return []
    if isinstance(data, dict):
        # A lone triple object instead of an array: treat it as a one-item list.
        return [data]
    # Scalars (numbers, strings, null) are valid JSON but carry no triples.
    return data if isinstance(data, list) else []


def _clean_triples(items: list, max_triples: int) -> List[Dict]:
    """Normalise decoded items into well-formed triple dicts, at most ``max_triples``."""
    cleaned: List[Dict] = []
    for item in items:
        if len(cleaned) >= max_triples:
            # The model is asked to respect the limit but may not; enforce it here.
            break
        if not isinstance(item, dict):
            continue
        subj = item.get("subject") or item.get("s")
        pred = item.get("predicate") or item.get("p")
        obj = item.get("object") or item.get("o")
        if not (subj and pred and obj):
            continue
        try:
            conf = float(item.get("confidence"))
        except (TypeError, ValueError):
            # Missing or non-numeric confidence falls back to a neutral value.
            conf = 0.5
        cleaned.append({
            "subject": str(subj),
            "predicate": str(pred),
            "object": str(obj),
            "sentence": str(item.get("sentence") or ""),
            "confidence": min(1.0, max(0.0, conf)),
        })
    return cleaned

src/kg/retriever.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""KG retriever that returns chunk IDs and short node summaries for a question."""
+from typing import List, Tuple
+from .store import KGStore
class KGRetriever:
    """Look up KG entities mentioned in a question and the chunks linked to them."""

    def __init__(self, kg_store: "KGStore"):
        """Keep a reference to the store used for entity and chunk lookups.

        Args:
            kg_store: Object providing ``query_entities(text)`` and
                ``find_chunks_for_entity(label)``.
        """
        self.kg = kg_store

    def get_context_for_question(self, question: str, hops: int = 1) -> Tuple[List[str], List[str]]:
        """Return ``(chunk_ids, node_summaries)`` for ``question``.

        This simple implementation finds entities whose labels appear in the
        question and returns linked chunk ids.

        Args:
            question: Text to match entity labels against.
            hops: Accepted for interface compatibility; this implementation
                only uses directly matched entities and does not expand to
                related entities.

        Returns:
            A tuple of de-duplicated chunk ids and de-duplicated entity labels,
            both in first-seen order.
        """
        labels = {}  # used as an insertion-ordered set
        chunk_ids: List[str] = []
        for uri in self.kg.query_entities(question):
            # uri like http://.../entity/<label>
            label = uri.split("/entity/", 1)[-1].replace("_", " ")
            if label in labels:
                # The same entity can be reported more than once; look it up
                # and summarise it only once.
                continue
            labels[label] = None
            chunk_ids.extend(self.kg.find_chunks_for_entity(label))
        # dedupe chunk ids while preserving order
        return list(dict.fromkeys(chunk_ids)), list(labels)

src/kg/store.py ADDED Viewed

	@@ -0,0 +1,91 @@

+"""RDFLib-backed KG store for PoC.
+Stores nodes and simple edges; links DocumentChunk IDs to KG entities using
+`mentions` predicate. Persists to a TTL file.
+"""
+try:
+    from rdflib import Graph, URIRef, Literal, Namespace
+    from rdflib.namespace import RDF, RDFS
+    _HAS_RDFLIB = True
+except Exception:
+    _HAS_RDFLIB = False
+from typing import List, Dict, Optional
+import uuid
+import os
+NS_URI = "http://example.org/abalone/"
if _HAS_RDFLIB:
    NS = Namespace(NS_URI)

    class KGStore:
        """In-memory rdflib graph of entities, relations and chunk links, persisted as Turtle."""

        def __init__(self, path: str = "./kg_store.ttl"):
            """Load the graph from ``path`` if the file exists, otherwise start empty.

            Args:
                path: Location of the Turtle file used by ``save`` and for the
                    initial load.
            """
            self.path = path
            self.graph = Graph()
            if os.path.exists(self.path):
                try:
                    self.graph.parse(self.path, format="turtle")
                except Exception:
                    # start empty if parse fails
                    self.graph = Graph()

        def _entity_uri(self, label: str) -> URIRef:
            """Build the entity URI for ``label`` (trimmed, lower-cased, spaces to underscores)."""
            safe = label.strip().lower().replace(" ", "_")
            return URIRef(f"{NS_URI}entity/{safe}")

        def _chunk_uri(self, chunk_id: str) -> URIRef:
            """Build the chunk URI for ``chunk_id``."""
            return URIRef(f"{NS_URI}chunk/{chunk_id}")

        def add_entity(self, label: str, description: Optional[str] = None) -> URIRef:
            """Add an ``rdfs:label`` (and optional description) for an entity; return its URI."""
            u = self._entity_uri(label)
            self.graph.add((u, RDFS.label, Literal(label)))
            if description:
                self.graph.add((u, NS.description, Literal(description)))
            return u

        def link_chunk_to_entity(self, chunk_id: str, entity_label: str, sentence: str = "", confidence: float = 0.5):
            """Record that chunk ``chunk_id`` mentions the entity ``entity_label``.

            The sentence and confidence are attached to the chunk node itself.
            """
            e = self.add_entity(entity_label)
            c = self._chunk_uri(chunk_id)
            self.graph.add((c, NS.mentions, e))
            # add provenance as reified data on the chunk node
            self.graph.add((c, NS.sentence, Literal(sentence)))
            self.graph.add((c, NS.confidence, Literal(str(confidence))))

        def add_triple(self, subj_label: str, pred_label: str, obj_label: str, provenance: Optional[Dict] = None):
            """Add ``subject --predicate--> object`` between two entities.

            Both entities are created via ``add_entity``; the predicate becomes
            a ``relation/<name>`` URI normalised like entity labels.
            """
            s = self.add_entity(subj_label)
            o = self.add_entity(obj_label)
            p = URIRef(f"{NS_URI}relation/{pred_label.strip().lower().replace(' ', '_')}")
            self.graph.add((s, p, o))
            if provenance:
                # store provenance on subject node for simplicity
                self.graph.add((s, NS.provenance, Literal(str(provenance))))

        def save(self):
            """Serialize the graph to ``self.path`` in Turtle format."""
            parent = os.path.dirname(self.path)
            if parent:
                # Serialization fails if the target directory does not exist yet.
                os.makedirs(parent, exist_ok=True)
            self.graph.serialize(destination=self.path, format="turtle")

        def find_chunks_for_entity(self, entity_label: str) -> List[str]:
            """Return the ids of chunks linked to ``entity_label`` via ``mentions``."""
            e = self._entity_uri(entity_label)
            prefix = NS_URI + "chunk/"
            # A triple-pattern lookup avoids interpolating the label-derived
            # URI into a SPARQL string, which breaks on labels containing
            # characters such as ">".
            out = []
            for subject in self.graph.subjects(NS.mentions, e):
                uri = str(subject)
                if uri.startswith(prefix):
                    out.append(uri[len(prefix):])
            return out

        def query_entities(self, text: str) -> List[str]:
            """Return URIs of entities whose label occurs in ``text`` (case-insensitive).

            Each URI is returned once, in first-seen order.
            """
            # naive: find entities whose label appears in text
            text_l = text.lower()
            matches = {}  # used as an insertion-ordered set
            for s, _p, o in self.graph.triples((None, RDFS.label, None)):
                label = str(o).strip().lower()
                # An empty label is a substring of every text; never match it.
                if label and label in text_l:
                    matches[str(s)] = None
            return list(matches)
else:
    class KGStore:
        """Placeholder that fails on construction when rdflib is not installed."""

        def __init__(self, *args, **kwargs):
            raise RuntimeError("rdflib is required for KGStore. Install with `pip install rdflib`")

src/utils/rag_runtime.py CHANGED Viewed

@@ -6,6 +6,19 @@ import streamlit as st
 from src.vectorstore import get_retriever
 from src.qa_chain import make_conversational_chain
 def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
@@ -18,6 +31,10 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
     Raises:
         CalledProcessError: If the underlying subprocess fails.
     """
     cmd = [
         sys.executable,
         "-m",
@@ -30,6 +47,67 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
     subprocess.run(cmd, check=True)
 @st.cache_resource(show_spinner=False)
 def build_or_load_retriever_cached(
         data_dir: str,

 from src.vectorstore import get_retriever
 from src.qa_chain import make_conversational_chain
+<<<<<<< HEAD
+=======
+import os
+import json
+from typing import Dict, List, Tuple
+try:
+    from src.kg.store import KGStore
+    from src.kg.retriever import KGRetriever
+    _HAS_KG = True
+except Exception:
+    _HAS_KG = False
+>>>>>>> ba5a1f4 (Adding kg to deployment)
 def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
     Raises:
         CalledProcessError: If the underlying subprocess fails.
     """
+<<<<<<< HEAD
+=======
+    # Updated to point to the CLI module inside the ingest package
+>>>>>>> ba5a1f4 (Adding kg to deployment)
     cmd = [
         sys.executable,
         "-m",
     subprocess.run(cmd, check=True)
+<<<<<<< HEAD
+=======
+def _load_chunks_index(persist_dir: str) -> Dict[str, Dict]:
+    idx_path = os.path.join(persist_dir, "chunks_index.json")
+    if not os.path.exists(idx_path):
+        return {}
+    try:
+        with open(idx_path, "r", encoding="utf-8") as fh:
+            return json.load(fh)
+    except Exception:
+        return {}
def answer_with_kg(
        chain,
        question: str,
        chat_history: List[Tuple[str, str]],
        persist_dir: str,
        kg_hops: int = 1,
        kg_context_max_chars: int = 1000,
    ) -> "Any":
    """Augment question with KG context (if available) and run the chain.

    This is a low-risk integration: we build a short textual summary from the KG
    (node labels and short chunk snippets from chunks_index.json) and prepend it to
    the question. The chain is then called with the augmented question.

    The return annotation is written as a string so that defining this function
    does not require ``Any`` to be imported at module level.

    Args:
        chain: Callable invoked with a dict holding ``question`` and
            ``chat_history``.
        question: The question text; also used to match KG entity labels.
        chat_history: Prior turns, passed through to ``chain`` unchanged.
        persist_dir: Directory containing ``kg_store.ttl`` and
            ``chunks_index.json``.
        kg_hops: Forwarded to ``KGRetriever.get_context_for_question``.
        kg_context_max_chars: Maximum length of each individual chunk snippet.

    Returns:
        Whatever ``chain`` returns.
    """
    import logging  # local import: used only on the failure path below

    kg_text_parts: List[str] = []
    if _HAS_KG:
        kg_path = os.path.join(persist_dir, "kg_store.ttl")
        try:
            # Only read the chunk index when KG support is actually available.
            chunks_index = _load_chunks_index(persist_dir)
            retr = KGRetriever(KGStore(path=kg_path))
            chunk_ids, summaries = retr.get_context_for_question(question, hops=kg_hops)
            if summaries:
                kg_text_parts.append("KG entities: " + ", ".join(summaries))
            # add chunk snippets
            for cid in chunk_ids:
                info = chunks_index.get(cid)
                if not info:
                    continue
                txt = info.get("text", "")
                if not txt:
                    continue
                snippet = txt.strip().replace("\n", " ")[:kg_context_max_chars]
                if snippet:
                    # Whitespace-only chunks would otherwise add an empty entry.
                    kg_text_parts.append(f"[KG chunk {cid}]: {snippet}")
        except Exception:
            # If KG load fails, skip KG augmentation but leave a trace of why.
            logging.getLogger(__name__).warning(
                "KG augmentation failed; answering without KG context", exc_info=True
            )
            kg_text_parts = []
    if kg_text_parts:
        kg_context = "\n\n".join(kg_text_parts)
        augmented_question = f"KG CONTEXT:\n{kg_context}\n\nUser Question:\n{question}"
    else:
        augmented_question = question
    return chain({"question": augmented_question, "chat_history": chat_history})
+>>>>>>> ba5a1f4 (Adding kg to deployment)
 @st.cache_resource(show_spinner=False)
 def build_or_load_retriever_cached(
         data_dir: str,

src/vectorstore.py CHANGED Viewed

@@ -51,7 +51,11 @@ class HybridRetriever(BaseRetriever):
 def get_retriever(
         persist_dir: str,
         top_k: int,
         retrieval_mode: RetrievalMode = "mmr",
 ):
     db = get_vectorstore(persist_dir=persist_dir)
     mode = retrieval_mode.lower()

 def get_retriever(
         persist_dir: str,
         top_k: int,
+<<<<<<< HEAD
         retrieval_mode: RetrievalMode = "mmr",
+=======
+        retrieval_mode: RetrievalMode = "hybrid",
+>>>>>>> ba5a1f4 (Adding kg to deployment)
 ):
     db = get_vectorstore(persist_dir=persist_dir)
     mode = retrieval_mode.lower()

vectorstore/chroma-collections.parquet DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d2f2f346a015c1ffec6f8a3c535ac4ea2a99fe14f441a424e373b42248ac0fbe
-size 601

vectorstore/chroma-embeddings.parquet DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1e572394a2cfa7976f286fa157be4e5eaf287d0721581969eed4c4df7874f04a
-size 3380376