cmd0160 commited on
Commit
ee749be
·
1 Parent(s): 67fb4c0

Adding files

Browse files
.DS_Store DELETED
Binary file (8.2 kB)
 
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore generated vectorstore data and caches
2
+ /vectorstore/
3
+ # python cache
4
+ __pycache__/
5
+ *.pyc
6
+ # macOS finder
7
+ .DS_Store
8
+ # IDE
9
+ .idea/
10
+ # optional: ignore local sample data if you don't want it in repo
11
+ # /data/
12
+ # tests / validation artifacts
13
+ # /validation/
14
+ # NOTE: the lines "EOF" and "git add .gitignore" were stray shell
15
+ # heredoc/terminal text pasted into this file by mistake; removed,
16
+ # since .gitignore would treat them as ignore patterns.
17
+
.idea/.gitignore DELETED
@@ -1,8 +0,0 @@
1
- # Default ignored files
2
- /shelf/
3
- /workspace.xml
4
- # Editor-based HTTP Client requests
5
- /httpRequests/
6
- # Datasource local storage ignored files
7
- /dataSources/
8
- /dataSources.local.xml
 
 
 
 
 
 
 
 
 
.idea/material_theme_project_new.xml DELETED
@@ -1,12 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="MaterialThemeProjectNewConfig">
4
- <option name="metadata">
5
- <MTProjectMetadataState>
6
- <option name="migrated" value="true" />
7
- <option name="pristineConfig" value="false" />
8
- <option name="userId" value="-3a906995:19986b060ad:-7ffc" />
9
- </MTProjectMetadataState>
10
- </option>
11
- </component>
12
- </project>
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import os
2
- from typing import List, Dict, Tuple, Optional
3
 
4
  # Disable telemetry for LangChain and Chroma by default
5
  os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
@@ -12,6 +12,7 @@ from src.utils.rag_runtime import (
12
  run_ingest_cli,
13
  build_or_load_retriever_cached,
14
  get_chain_cached,
 
15
  )
16
  from src.utils.metrics import compute_quality_scores
17
  from src.utils.formatting import format_source_label
@@ -25,12 +26,22 @@ class AbaloneRAGApp:
25
  """Initialize the Streamlit page and application state."""
26
  st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
27
 
28
- st.title("Abalone RAG Chatbot")
29
- st.write(
30
- "Ask natural-language questions about abalone biology, ecology, "
31
- "and research datasets. The app uses a local Chroma vectorstore "
32
- "and OpenAI to retrieve and answer questions accurately."
33
- )
 
 
 
 
 
 
 
 
 
 
34
 
35
  # Data and vectorstore locations
36
  self.data_dir = "./data"
@@ -49,17 +60,22 @@ class AbaloneRAGApp:
49
  self.temperature,
50
  self.answer_length,
51
  self.style_instruction,
52
- self.rebuild_clicked,
 
53
  ) = self._build_sidebar()
54
 
 
 
 
55
  # QA chain instance (loaded lazily)
56
- self.chain: Optional[object] = None
 
57
 
58
  # ------------------------------------------------------------------
59
  # Sidebar configuration
60
  # ------------------------------------------------------------------
61
 
62
- def _build_sidebar(self) -> Tuple[str, int, str, float, str, str, bool]:
63
  """Render all sidebar controls and return model configuration.
64
 
65
  Returns:
@@ -95,7 +111,7 @@ class AbaloneRAGApp:
95
  retrieval_mode_label = st.sidebar.selectbox(
96
  "Retrieval mode",
97
  ["MMR (diverse)", "Similarity", "Hybrid (dense + MMR)"],
98
- index=0,
99
  )
100
  retrieval_mode_map = {
101
  "MMR (diverse)": "mmr",
@@ -104,6 +120,12 @@ class AbaloneRAGApp:
104
  }
105
  retrieval_mode = retrieval_mode_map[retrieval_mode_label]
106
 
 
 
 
 
 
 
107
  st.sidebar.markdown("---")
108
 
109
  # Answer style
@@ -123,20 +145,9 @@ class AbaloneRAGApp:
123
  index=1,
124
  )
125
 
 
126
  st.sidebar.markdown("---")
127
-
128
- # Vectorstore controls
129
- st.sidebar.header("Vectorstore Controls")
130
-
131
- rebuild_clicked = st.sidebar.button(
132
- "Rebuild vectorstore",
133
- use_container_width=True,
134
- )
135
-
136
- st.sidebar.markdown(
137
- "<small>Use this when you add or modify files in <code>./data</code>.</small>",
138
- unsafe_allow_html=True,
139
- )
140
 
141
  # Build style instruction for the LLM
142
  length_instruction_map = {
@@ -158,7 +169,8 @@ class AbaloneRAGApp:
158
  temperature,
159
  answer_length,
160
  style_instruction,
161
- rebuild_clicked,
 
162
  )
163
 
164
  # ------------------------------------------------------------------
@@ -238,6 +250,7 @@ class AbaloneRAGApp:
238
  st.session_state["rebuild_pending"] = False
239
  st.info("Rebuild canceled.")
240
 
 
241
  # ------------------------------------------------------------------
242
  # Chain loading
243
  # ------------------------------------------------------------------
@@ -260,6 +273,7 @@ class AbaloneRAGApp:
260
  else:
261
  st.success("Knowledge base and model are ready.")
262
 
 
263
  # ------------------------------------------------------------------
264
  # Chat UI
265
  # ------------------------------------------------------------------
@@ -296,9 +310,20 @@ class AbaloneRAGApp:
296
 
297
  styled_question = self.style_instruction + "\n\nQuestion: " + user_input
298
 
299
- result = self.chain(
300
- {"question": styled_question, "chat_history": prior_history}
301
- )
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  answer = (
304
  result.get("answer")
 
1
  import os
2
+ from typing import List, Dict, Tuple, Optional, Any
3
 
4
  # Disable telemetry for LangChain and Chroma by default
5
  os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
 
12
  run_ingest_cli,
13
  build_or_load_retriever_cached,
14
  get_chain_cached,
15
+ answer_with_kg,
16
  )
17
  from src.utils.metrics import compute_quality_scores
18
  from src.utils.formatting import format_source_label
 
26
  """Initialize the Streamlit page and application state."""
27
  st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
28
 
29
+ # Header row: title/subtitle on the left, rebuild action on the right
30
+ header_col, action_col = st.columns([5, 1])
31
+ with header_col:
32
+ st.title("Abalone RAG Chatbot")
33
+ st.write(
34
+ "Ask natural-language questions about abalone biology, ecology, "
35
+ "and research datasets. The app uses a local Chroma vectorstore "
36
+ "and OpenAI to retrieve and answer questions accurately."
37
+ )
38
+ with action_col:
39
+ # A compact, prominent rebuild control placed in the header
40
+ self._top_rebuild_clicked = st.button(
41
+ "Rebuild vectorstore",
42
+ key="top_rebuild",
43
+ use_container_width=True,
44
+ )
45
 
46
  # Data and vectorstore locations
47
  self.data_dir = "./data"
 
60
  self.temperature,
61
  self.answer_length,
62
  self.style_instruction,
63
+ self.use_kg,
64
+ self.kg_hops,
65
  ) = self._build_sidebar()
66
 
67
+ # Ensure rebuild_clicked reflects the top-right control
68
+ self.rebuild_clicked = bool(getattr(self, "_top_rebuild_clicked", False))
69
+
70
  # QA chain instance (loaded lazily)
71
+ # typing as Any avoids static warnings when calling the chain object
72
+ self.chain: Optional[Any] = None
73
 
74
  # ------------------------------------------------------------------
75
  # Sidebar configuration
76
  # ------------------------------------------------------------------
77
 
78
+ def _build_sidebar(self) -> Tuple[str, int, str, float, str, str, bool, int]:
79
  """Render all sidebar controls and return model configuration.
80
 
81
  Returns:
 
111
  retrieval_mode_label = st.sidebar.selectbox(
112
  "Retrieval mode",
113
  ["MMR (diverse)", "Similarity", "Hybrid (dense + MMR)"],
114
+ index=2,
115
  )
116
  retrieval_mode_map = {
117
  "MMR (diverse)": "mmr",
 
120
  }
121
  retrieval_mode = retrieval_mode_map[retrieval_mode_label]
122
 
123
+ # Knowledge graph toggle (placed under Retrieval Configuration)
124
+ st.sidebar.markdown("---")
125
+ st.sidebar.header("Knowledge Graph")
126
+ use_kg = st.sidebar.checkbox("Use knowledge graph for retrieval", value=False)
127
+ kg_hops = st.sidebar.slider("KG hops", min_value=1, max_value=3, value=1)
128
+
129
  st.sidebar.markdown("---")
130
 
131
  # Answer style
 
145
  index=1,
146
  )
147
 
148
+ # (Vectorstore rebuild moved to top-right action button)
149
  st.sidebar.markdown("---")
150
+ st.sidebar.markdown("<small>To rebuild the vectorstore use the top-right \"Rebuild vectorstore\" button.</small>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # Build style instruction for the LLM
153
  length_instruction_map = {
 
169
  temperature,
170
  answer_length,
171
  style_instruction,
172
+ use_kg,
173
+ kg_hops,
174
  )
175
 
176
  # ------------------------------------------------------------------
 
250
  st.session_state["rebuild_pending"] = False
251
  st.info("Rebuild canceled.")
252
 
253
+
254
  # ------------------------------------------------------------------
255
  # Chain loading
256
  # ------------------------------------------------------------------
 
273
  else:
274
  st.success("Knowledge base and model are ready.")
275
 
276
+
277
  # ------------------------------------------------------------------
278
  # Chat UI
279
  # ------------------------------------------------------------------
 
310
 
311
  styled_question = self.style_instruction + "\n\nQuestion: " + user_input
312
 
313
+ if self.chain is None:
314
+ st.error("Model not initialized. Please wait for the knowledge base and model to be ready or rebuild the vectorstore.")
315
+ return
316
+
317
+ if getattr(self, 'use_kg', False):
318
+ result = answer_with_kg(
319
+ self.chain,
320
+ styled_question,
321
+ prior_history,
322
+ persist_dir=self.persist_dir,
323
+ kg_hops=self.kg_hops,
324
+ )
325
+ else:
326
+ result = self.chain({"question": styled_question, "chat_history": prior_history})
327
 
328
  answer = (
329
  result.get("answer")
src/__pycache__/ingest.cpython-310.pyc DELETED
Binary file (2.18 kB)
 
src/__pycache__/qa_chain.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/qa_chain.cpython-310.pyc and b/src/__pycache__/qa_chain.cpython-310.pyc differ
 
src/__pycache__/vectorstore.cpython-310.pyc DELETED
Binary file (2.1 kB)
 
src/ingest.py CHANGED
@@ -1,10 +1,28 @@
1
  import argparse
2
  import os
3
 
 
4
  from langchain_community.document_loaders import DirectoryLoader, TextLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.vectorstores import Chroma
7
  from langchain_community.embeddings import OpenAIEmbeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def load_documents(data_dir: str):
@@ -54,20 +72,98 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
54
 
55
  os.makedirs(persist_dir, exist_ok=True)
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  Chroma.from_documents(
58
  split_docs,
59
  embedding=embeddings,
60
  persist_directory=persist_dir,
61
  )
62
  print(f"Vectorstore built and persisted to {persist_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  def main():
66
  parser = argparse.ArgumentParser()
67
  parser.add_argument("--data-dir", type=str, default="./data")
68
  parser.add_argument("--persist-dir", type=str, default="./vectorstore")
 
69
  parser.add_argument("--chunk-size", type=int, default=800)
70
  parser.add_argument("--chunk-overlap", type=int, default=200)
 
 
 
 
71
  args = parser.parse_args()
72
 
73
  ingest(
 
1
  import argparse
2
  import os
3
 
4
  from langchain_community.document_loaders import DirectoryLoader, TextLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.vectorstores import Chroma
7
  from langchain_community.embeddings import OpenAIEmbeddings
8
+
9
+ # New: KG integration imports (merge resolved: kept HEAD's loader import,
10
+ # which load_documents() requires, plus the KG-branch additions)
11
+ import uuid
12
+ import json
13
+
14
+ try:
15
+ from src.kg.extract import extract_triples_with_llm
16
+ from src.kg.store import KGStore
17
+ from src.kg.retriever import KGRetriever
18
+ _HAS_KG = True
19
+ except Exception:
20
+ _HAS_KG = False
26
 
27
 
28
  def load_documents(data_dir: str):
 
72
 
73
  os.makedirs(persist_dir, exist_ok=True)
74
 
75
+ # (merge conflict markers removed; the KG-branch code below is kept)
77
+ # Prepare KG store and local chunk index
78
+ chunks_index = {}
79
+ kg_path = os.path.join(persist_dir, "kg_store.ttl")
80
+ if _HAS_KG:
81
+ try:
82
+ kg = KGStore(path=kg_path)
83
+ except Exception:
84
+ kg = None
85
+ else:
86
+ kg = None
87
+
88
+ # Annotate chunks with stable chunk_id and optionally extract/link KG triples
89
+ for d in split_docs:
90
+ meta = d.metadata or {}
91
+ chunk_id = meta.get("chunk_id") or str(uuid.uuid4())
92
+ if not meta:
93
+ d.metadata = {}
94
+ d.metadata["chunk_id"] = chunk_id
95
+
96
+ # Save minimal chunk index for runtime retrieval (text and source metadata)
97
+ chunks_index[chunk_id] = {
98
+ "text": getattr(d, "page_content", "") or getattr(d, "content", ""),
99
+ "metadata": d.metadata,
100
+ }
101
+
102
+ # If KG is available, attempt to extract triples and link the chunk
103
+ if kg is not None:
104
+ try:
105
+ triples = extract_triples_with_llm(chunks_index[chunk_id]["text"], max_triples=4)
106
+ for t in triples:
107
+ try:
108
+ kg.add_triple(
109
+ t.get("subject"),
110
+ t.get("predicate"),
111
+ t.get("object"),
112
+ provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
113
+ )
114
+ kg.link_chunk_to_entity(
115
+ chunk_id,
116
+ t.get("subject"),
117
+ sentence=t.get("sentence"),
118
+ confidence=t.get("confidence"),
119
+ )
120
+ except Exception:
121
+ # non-fatal: continue
122
+ continue
123
+ except Exception:
124
+ # LLM extraction failed or not configured; skip KG extraction
125
+ pass
126
+
127
+ # Persist Chroma vectorstore
129
  Chroma.from_documents(
130
  split_docs,
131
  embedding=embeddings,
132
  persist_directory=persist_dir,
133
  )
134
  print(f"Vectorstore built and persisted to {persist_dir}")
135
+
137
+
138
+ # Persist chunks index for runtime (simple json mapping)
139
+ try:
140
+ idx_path = os.path.join(persist_dir, "chunks_index.json")
141
+ with open(idx_path, "w", encoding="utf-8") as fh:
142
+ json.dump(chunks_index, fh)
143
+ except Exception:
144
+ pass
145
+
146
+ # Persist KG if available
147
+ if kg is not None:
148
+ try:
149
+ kg.save()
150
+ print(f"KG persisted to {kg_path}")
151
+ except Exception:
152
+ pass
153
+
154
 
155
 
156
  def main():
157
  parser = argparse.ArgumentParser()
158
  parser.add_argument("--data-dir", type=str, default="./data")
159
  parser.add_argument("--persist-dir", type=str, default="./vectorstore")
160
  parser.add_argument("--chunk-size", type=int, default=800)
161
  parser.add_argument("--chunk-overlap", type=int, default=200)
162
+ # NOTE(review): the KG branch proposed defaults 200/50; kept the existing
163
+ # 800/200 to preserve prior ingest behavior — revisit if triple extraction
164
+ # performs better on smaller chunks.
  args = parser.parse_args()
168
 
169
  ingest(
src/kg/extract.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM-backed triple/entity extractor for PoC.
2
+
3
+ This module provides a small wrapper that asks the LLM (via LangChain ChatOpenAI)
4
+ to extract a small set of triples from a text chunk. It returns a list of dicts:
5
+ {"subject": ..., "predicate": ..., "object": ..., "sentence": ..., "confidence": float}
6
+
7
+ The implementation is intentionally conservative and small for a Spaces-compatible PoC.
8
+ """
9
+ from typing import List, Dict
10
+ import json
11
+
12
+ from langchain.chat_models import ChatOpenAI
13
+ from langchain.schema import HumanMessage, SystemMessage
14
+
15
+
16
+ def extract_triples_with_llm(text: str, max_triples: int = 6, model_name: str = "gpt-3.5-turbo") -> List[Dict]:
17
+ """Extract triples from text using a Chat LLM. Returns parsed JSON triples.
18
+
19
+ Note: requires OPENAI_API_KEY in env for ChatOpenAI to work.
20
+ """
21
+ prompt = (
22
+ "You are an assistant that extracts factual triples from a short text.\n"
23
+ "Return a JSON array where each element is an object with keys: subject, predicate, object, sentence, confidence.\n"
24
+ "Be concise and only return JSON. Confidence should be a float between 0.0 and 1.0.\n"
25
+ f"Limit results to at most {max_triples} triples.\n\n"
26
+ "Text:\n<<<TEXT_START>>>\n"
27
+ + text
28
+ + "\n<<<TEXT_END>>>\n"
29
+ )
30
+
31
+ # system message to instruct format strictly
32
+ system = SystemMessage(content="You output only JSON arrays. Do not add any extra text.")
33
+ human = HumanMessage(content=prompt)
34
+
35
+ llm = ChatOpenAI(model_name=model_name, temperature=0.0)
36
+ resp = llm([system, human])
37
+ raw = resp.content.strip()
38
+
39
+ # Attempt to find JSON in the output
40
+ try:
41
+ data = json.loads(raw)
42
+ except Exception:
43
+ # try to find first JSON substring
44
+ start = raw.find("[")
45
+ end = raw.rfind("]")
46
+ if start != -1 and end != -1:
47
+ try:
48
+ data = json.loads(raw[start:end+1])
49
+ except Exception:
50
+ data = []
51
+ else:
52
+ data = []
53
+
54
+ cleaned: List[Dict] = []
55
+ for item in data:
56
+ if not isinstance(item, dict):
57
+ continue
58
+ subj = item.get("subject") or item.get("s")
59
+ pred = item.get("predicate") or item.get("p")
60
+ obj = item.get("object") or item.get("o")
61
+ sent = item.get("sentence") or ""
62
+ conf = item.get("confidence")
63
+ try:
64
+ conf = float(conf) if conf is not None else 0.5
65
+ except Exception:
66
+ conf = 0.5
67
+ if subj and pred and obj:
68
+ cleaned.append({
69
+ "subject": str(subj),
70
+ "predicate": str(pred),
71
+ "object": str(obj),
72
+ "sentence": str(sent),
73
+ "confidence": conf,
74
+ })
75
+ return cleaned
src/kg/retriever.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """KG retriever that returns chunk IDs and short node summaries for a question."""
2
+ from typing import List, Tuple
3
+ from .store import KGStore
4
+
5
+
6
+ class KGRetriever:
7
+ def __init__(self, kg_store: KGStore):
8
+ self.kg = kg_store
9
+
10
+ def get_context_for_question(self, question: str, hops: int = 1) -> Tuple[List[str], List[str]]:
11
+ """Return (chunk_ids, node_summaries).
12
+
13
+ This simple implementation finds entities whose labels appear in the question
14
+ and returns linked chunk ids. For hops >1 you could expand to related entities.
15
+ """
16
+ entity_uris = self.kg.query_entities(question)
17
+ chunk_ids = []
18
+ summaries = []
19
+ for e in entity_uris:
20
+ # uri like http://.../entity/<label>
21
+ label = e.split("/entity/", 1)[-1].replace("_", " ")
22
+ chunks = self.kg.find_chunks_for_entity(label)
23
+ chunk_ids.extend(chunks)
24
+ summaries.append(label)
25
+ # dedupe
26
+ chunk_ids = list(dict.fromkeys(chunk_ids))
27
+ return chunk_ids, summaries
28
+
src/kg/store.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RDFLib-backed KG store for PoC.
2
+
3
+ Stores nodes and simple edges; links DocumentChunk IDs to KG entities using
4
+ `mentions` predicate. Persists to a TTL file.
5
+ """
6
+ try:
7
+ from rdflib import Graph, URIRef, Literal, Namespace
8
+ from rdflib.namespace import RDF, RDFS
9
+ _HAS_RDFLIB = True
10
+ except Exception:
11
+ _HAS_RDFLIB = False
12
+
13
+ from typing import List, Dict, Optional
14
+ import uuid
15
+ import os
16
+
17
+ NS_URI = "http://example.org/abalone/"
18
+
19
+ if _HAS_RDFLIB:
20
+ NS = Namespace(NS_URI)
21
+
22
+ class KGStore:
23
+ def __init__(self, path: str = "./kg_store.ttl"):
24
+ self.path = path
25
+ self.graph = Graph()
26
+ if os.path.exists(self.path):
27
+ try:
28
+ self.graph.parse(self.path, format="turtle")
29
+ except Exception:
30
+ # start empty if parse fails
31
+ self.graph = Graph()
32
+
33
+ def _entity_uri(self, label: str) -> URIRef:
34
+ safe = label.strip().lower().replace(" ", "_")
35
+ return URIRef(f"{NS_URI}entity/{safe}")
36
+
37
+ def _chunk_uri(self, chunk_id: str) -> URIRef:
38
+ return URIRef(f"{NS_URI}chunk/{chunk_id}")
39
+
40
+ def add_entity(self, label: str, description: Optional[str] = None) -> URIRef:
41
+ u = self._entity_uri(label)
42
+ self.graph.add((u, RDFS.label, Literal(label)))
43
+ if description:
44
+ self.graph.add((u, NS.description, Literal(description)))
45
+ return u
46
+
47
+ def link_chunk_to_entity(self, chunk_id: str, entity_label: str, sentence: str = "", confidence: float = 0.5):
48
+ e = self.add_entity(entity_label)
49
+ c = self._chunk_uri(chunk_id)
50
+ self.graph.add((c, NS.mentions, e))
51
+ # add provenance as reified data on the chunk node
52
+ self.graph.add((c, NS.sentence, Literal(sentence)))
53
+ self.graph.add((c, NS.confidence, Literal(str(confidence))))
54
+
55
+ def add_triple(self, subj_label: str, pred_label: str, obj_label: str, provenance: Optional[Dict] = None):
56
+ s = self.add_entity(subj_label)
57
+ o = self.add_entity(obj_label)
58
+ p = URIRef(f"{NS_URI}relation/{pred_label.strip().lower().replace(' ', '_')}")
59
+ self.graph.add((s, p, o))
60
+ if provenance:
61
+ # store provenance on subject node for simplicity
62
+ self.graph.add((s, NS.provenance, Literal(str(provenance))))
63
+
64
+ def save(self):
65
+ self.graph.serialize(destination=self.path, format="turtle")
66
+
67
+ def find_chunks_for_entity(self, entity_label: str) -> List[str]:
68
+ e = self._entity_uri(entity_label)
69
+ q = f"SELECT ?chunk WHERE {{ ?chunk <{NS_URI}mentions> <{e}> . }}"
70
+ res = self.graph.query(q)
71
+ out = []
72
+ for r in res:
73
+ uri = str(r[0])
74
+ if uri.startswith(NS_URI + "chunk/"):
75
+ out.append(uri.split("chunk/", 1)[1])
76
+ return out
77
+
78
+ def query_entities(self, text: str) -> List[str]:
79
+ # naive: find entities whose label appears in text
80
+ text_l = text.lower()
81
+ out = []
82
+ for s, p, o in self.graph.triples((None, RDFS.label, None)):
83
+ label = str(o).lower()
84
+ if label in text_l:
85
+ out.append(str(s))
86
+ return out
87
+
88
+ else:
89
+ class KGStore:
90
+ def __init__(self, *args, **kwargs):
91
+ raise RuntimeError("rdflib is required for KGStore. Install with `pip install rdflib`")
src/utils/rag_runtime.py CHANGED
@@ -6,6 +6,19 @@ import streamlit as st
6
 
7
  from src.vectorstore import get_retriever
8
  from src.qa_chain import make_conversational_chain
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
@@ -18,6 +31,10 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
18
  Raises:
19
  CalledProcessError: If the underlying subprocess fails.
20
  """
 
 
 
 
21
  cmd = [
22
  sys.executable,
23
  "-m",
@@ -30,6 +47,67 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
30
  subprocess.run(cmd, check=True)
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  @st.cache_resource(show_spinner=False)
34
  def build_or_load_retriever_cached(
35
  data_dir: str,
 
6
 
7
  from src.vectorstore import get_retriever
8
  from src.qa_chain import make_conversational_chain
9
+ import os
10
+ import json
11
+ from typing import Any, Dict, List, Tuple
12
+
13
+ try:
14
+ from src.kg.store import KGStore
15
+ from src.kg.retriever import KGRetriever
16
+ _HAS_KG = True
17
+ except Exception:
18
+ _HAS_KG = False
22
 
23
 
24
  def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
 
31
  Raises:
32
  CalledProcessError: If the underlying subprocess fails.
33
  """
34
+ # Updated to point to the CLI module inside the ingest package
38
  cmd = [
39
  sys.executable,
40
  "-m",
 
47
  subprocess.run(cmd, check=True)
48
 
49
 
50
+
52
+ def _load_chunks_index(persist_dir: str) -> Dict[str, Dict]:
53
+ idx_path = os.path.join(persist_dir, "chunks_index.json")
54
+ if not os.path.exists(idx_path):
55
+ return {}
56
+ try:
57
+ with open(idx_path, "r", encoding="utf-8") as fh:
58
+ return json.load(fh)
59
+ except Exception:
60
+ return {}
61
+
62
+
63
+ def answer_with_kg(
64
+ chain,
65
+ question: str,
66
+ chat_history: List[Tuple[str, str]],
67
+ persist_dir: str,
68
+ kg_hops: int = 1,
69
+ kg_context_max_chars: int = 1000,
70
+ ) -> Any:
71
+ """Augment question with KG context (if available) and run the chain.
72
+
73
+ This is a low-risk integration: we build a short textual summary from the KG
74
+ (node labels and short chunk snippets from chunks_index.json) and prepend it to
75
+ the question. The chain's retriever still runs; KG context is additional grounding.
76
+ """
77
+ kg_text_parts: List[str] = []
78
+ # Load chunks index mapping
79
+ chunks_index = _load_chunks_index(persist_dir)
80
+
81
+ if _HAS_KG:
82
+ kg_path = os.path.join(persist_dir, "kg_store.ttl")
83
+ try:
84
+ kg = KGStore(path=kg_path)
85
+ retr = KGRetriever(kg)
86
+ chunk_ids, summaries = retr.get_context_for_question(question, hops=kg_hops)
87
+ if summaries:
88
+ kg_text_parts.append("KG entities: " + ", ".join(summaries))
89
+ # add chunk snippets
90
+ for cid in chunk_ids:
91
+ info = chunks_index.get(cid)
92
+ if info:
93
+ txt = info.get("text", "")
94
+ if txt:
95
+ snippet = txt.strip().replace("\n", " ")[:min(len(txt), kg_context_max_chars)]
96
+ kg_text_parts.append(f"[KG chunk {cid}]: {snippet}")
97
+ except Exception:
98
+ # If KG load fails, skip KG augmentation
99
+ kg_text_parts = []
100
+
101
+ kg_context = "\n\n".join(kg_text_parts) if kg_text_parts else ""
102
+ if kg_context:
103
+ augmented_question = f"KG CONTEXT:\n{kg_context}\n\nUser Question:\n{question}"
104
+ else:
105
+ augmented_question = question
106
+
107
+ return chain({"question": augmented_question, "chat_history": chat_history})
108
+
109
+
110
+
111
  @st.cache_resource(show_spinner=False)
112
  def build_or_load_retriever_cached(
113
  data_dir: str,
src/vectorstore.py CHANGED
@@ -51,7 +51,11 @@ class HybridRetriever(BaseRetriever):
51
  def get_retriever(
52
  persist_dir: str,
53
  top_k: int,
 
54
  retrieval_mode: RetrievalMode = "mmr",
 
 
 
55
  ):
56
  db = get_vectorstore(persist_dir=persist_dir)
57
  mode = retrieval_mode.lower()
 
51
  def get_retriever(
52
  persist_dir: str,
53
  top_k: int,
54
+ retrieval_mode: RetrievalMode = "hybrid",
59
  ):
60
  db = get_vectorstore(persist_dir=persist_dir)
61
  mode = retrieval_mode.lower()
vectorstore/chroma-collections.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2f2f346a015c1ffec6f8a3c535ac4ea2a99fe14f441a424e373b42248ac0fbe
3
- size 601
 
 
 
 
vectorstore/chroma-embeddings.parquet DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e572394a2cfa7976f286fa157be4e5eaf287d0721581969eed4c4df7874f04a
3
- size 3380376