cmd0160 committed · Commit 70de36c · 1 Parent(s): 5b73300

Adding kg updates

app.py CHANGED
@@ -1,13 +1,14 @@
 import os
 from typing import List, Dict, Tuple, Optional, Any
+import streamlit as st
+import logging
+from datetime import datetime

 # Disable telemetry for LangChain and Chroma by default
 os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
 os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
 os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")

-import streamlit as st
-
 from src.utils.rag_runtime import (
     run_ingest_cli,
     build_or_load_retriever_cached,
@@ -229,11 +230,46 @@ class AbaloneRAGApp:
             unsafe_allow_html=True,
         )

+        # Add a small UI log for rebuild actions
+        def _ui_log(msg: str):
+            try:
+                os.makedirs(self.persist_dir, exist_ok=True)
+                with open(os.path.join(self.persist_dir, "ui_rebuild.log"), "a", encoding="utf-8") as fh:
+                    fh.write(f"{msg}\n")
+            except Exception:
+                pass
+
         if confirm:
+            _ui_log(f"{datetime.utcnow().isoformat()} - Confirm rebuild clicked by user")
             with st.spinner("Rebuilding vectorstore..."):
-                run_ingest_cli(data_dir=self.data_dir, persist_dir=self.persist_dir)
-                build_or_load_retriever_cached.clear()
-                get_chain_cached.clear()
+                try:
+                    out = run_ingest_cli(data_dir=self.data_dir, persist_dir=self.persist_dir)
+                    _ui_log(f"{datetime.utcnow().isoformat()} - Rebuild succeeded")
+                except Exception as e:
+                    import subprocess as _sp
+                    _ui_log(f"{datetime.utcnow().isoformat()} - Rebuild failed: {e}")
+                    if isinstance(e, _sp.CalledProcessError):
+                        stderr = getattr(e, 'stderr', None)
+                        stdout = getattr(e, 'output', None) or getattr(e, 'stdout', None)
+                        st.error("Rebuild failed. See logs below.")
+                        if stdout:
+                            st.markdown("**ingest stdout:**")
+                            st.code(stdout)
+                        if stderr:
+                            st.markdown("**ingest stderr:**")
+                            st.code(stderr)
+                    else:
+                        st.error(f"Rebuild failed: {e}")
+                    st.session_state["rebuild_pending"] = False
+                    return
+
+            # On success, clear cached retriever/chain and reload
+            try:
+                build_or_load_retriever_cached.clear()
+                get_chain_cached.clear()
+            except Exception:
+                # If clearing the cache fails, just record it in the UI log
+                _ui_log(f"{datetime.utcnow().isoformat()} - Warning: failed to clear cached functions")

         self.chain = get_chain_cached(
             model_name=self.model_name,
@@ -444,10 +480,14 @@ def main() -> None:
     """Main entry point for running the Abalone RAG Chatbot app."""
     app = AbaloneRAGApp()

+    # Allow rebuild actions before enforcing the OpenAI key so users can inspect
+    # logs and trigger rebuild operations even when the key isn't set. Chain init
+    # requires the key, so enforce it after handling rebuild requests.
+    app.handle_rebuild()
+
     if not ensure_openai_key():
         st.stop()

-    app.handle_rebuild()
     app.ensure_chain_ready()
     app.render_chat_history()
     app.handle_user_input()
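
A side note on the timestamps written by _ui_log above: datetime.utcnow() is deprecated as of Python 3.12 in favor of timezone-aware datetimes. A minimal sketch of the equivalent call, matching the datetime.now(timezone.utc) style that src/ingest.py below already uses:

    from datetime import datetime, timezone

    # Timezone-aware UTC timestamp; drop-in for datetime.utcnow().isoformat()
    ts = datetime.now(timezone.utc).isoformat()
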
requirements.txt CHANGED
@@ -9,3 +9,4 @@ numpy==1.24.4
 streamlit>=1.25.0
 python-dotenv==1.0.0
 pytest==7.2.0
+rdflib
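
Note that rdflib is left unpinned while the surrounding packages carry exact versions; it backs the Turtle graph (kg_store.ttl) that src/ingest.py below persists. A minimal sketch of the rdflib round-trip a store like KGStore presumably builds on (the namespace and triple are illustrative, not from this repo):

    from rdflib import Graph, Literal, Namespace

    EX = Namespace("http://example.org/")  # hypothetical namespace for illustration
    g = Graph()
    g.add((EX.abalone, EX.habitat, Literal("coastal waters")))  # one illustrative triple
    g.serialize(destination="kg_store.ttl", format="turtle")    # persist as Turtle

    g2 = Graph()
    g2.parse("kg_store.ttl", format="turtle")                   # reload on a later run
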
src/ingest.py CHANGED
@@ -1,26 +1,25 @@
 import argparse
 import os
+import logging
+from datetime import datetime, timezone
+
+# Disable Chroma telemetry to avoid opentelemetry compatibility errors during ingestion
+os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")

-from langchain_community.document_loaders import DirectoryLoader, TextLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Chroma
-from langchain_community.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import OpenAIEmbeddings

-# New: KG integration imports
+# KG integration: import unconditionally so errors propagate if dependencies are missing
+from src.kg.extract import extract_triples_with_llm
+from src.kg.store import KGStore
+
 import uuid
 import json

-try:
-    from src.kg.extract import extract_triples_with_llm
-    from src.kg.store import KGStore
-    from src.kg.retriever import KGRetriever
-    _HAS_KG = True
-except Exception:
-    _HAS_KG = False
-
+# Module logger
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)

 def load_documents(data_dir: str):
     from pathlib import Path
@@ -40,22 +39,22 @@ def load_documents(data_dir: str):
         loaded = loader.load()
         docs.extend(loaded)

-    print(f"Loaded {len(docs)} documents from {data_dir}")
-    print("Documents ingested:")
-    for d in docs:
-        meta = d.metadata or {}
-        path = meta.get("source") or meta.get("file_path") or meta.get("path")
-        print(f" - {os.path.abspath(path) if path else 'Unknown file'}")
+    logger.info(f"Loaded {len(docs)} documents from {data_dir}")
+    logger.debug("Documents ingested: %s", [(d.metadata or {}).get('source') for d in docs])
     return docs


-def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int):
+def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int, openai_api_key: str = None):
+    logger.info("Starting ingest: data_dir=%s persist_dir=%s chunk_size=%s chunk_overlap=%s", data_dir, persist_dir, chunk_size, chunk_overlap)
+
     if not os.path.exists(data_dir):
+        logger.error("Data directory does not exist: %s", data_dir)
         raise ValueError(f"Data directory does not exist: {data_dir}")

     docs = load_documents(data_dir)

     if not docs:
+        logger.error("No documents found in %s", data_dir)
         raise ValueError(f"No .txt documents found in {data_dir}")

     splitter = RecursiveCharacterTextSplitter(
@@ -63,25 +62,53 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
         chunk_overlap=chunk_overlap,
     )
     split_docs = splitter.split_documents(docs)
-    print(f"Split into {len(split_docs)} chunks")
-
-    embeddings = OpenAIEmbeddings()
+    logger.info("Split into %d chunks", len(split_docs))

+    # Ensure persist dir exists and add file handler to logger
     os.makedirs(persist_dir, exist_ok=True)
+    # Add file handler for detailed logs in persist_dir/ingest.log
+    try:
+        fh = logging.FileHandler(os.path.join(persist_dir, 'ingest.log'))
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+        # Avoid adding multiple file handlers on repeated calls
+        if not any(isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', None) == fh.baseFilename for h in logger.handlers):
+            logger.addHandler(fh)
+    except Exception:
+        logger.exception('Failed to add file handler for ingest log')

     # Prepare KG store and local chunk index
     chunks_index = {}
     kg_path = os.path.join(persist_dir, "kg_store.ttl")
-    if _HAS_KG:
-        try:
-            kg = KGStore(path=kg_path)
-        except Exception:
-            kg = None
-    else:
-        kg = None
+
+    # Initialize embeddings; provide a clear error if the OpenAI API key is missing
+    try:
+        logger.info('Initializing embeddings')
+        # If an API key was provided on the CLI, inject it into the environment
+        if openai_api_key:
+            os.environ['OPENAI_API_KEY'] = openai_api_key
+            logger.debug('Set OPENAI_API_KEY from CLI flag')
+        embeddings = OpenAIEmbeddings()
+        logger.info('Embeddings initialized')
+    except Exception as e:
+        logger.exception("Failed to initialize OpenAI embeddings. Ensure OPENAI_API_KEY is set in the environment or pass --openai-api-key.")
+        raise
+
+    # Initialize KG store unconditionally so errors are visible
+    try:
+        logger.info('Initializing KG store at %s', kg_path)
+        kg = KGStore(path=kg_path)
+        logger.info('KG store initialized')
+    except Exception:
+        logger.exception('Failed to initialize KGStore')
+        # re-raise so caller sees the failure
+        raise

     # Annotate chunks with stable chunk_id and optionally extract/link KG triples
-    for d in split_docs:
+    start_time = datetime.now(timezone.utc)
+    logger.info('Beginning per-chunk processing at %s UTC', start_time.isoformat())
+    for i, d in enumerate(split_docs, start=1):
+        print(i, d)
         meta = d.metadata or {}
         chunk_id = meta.get("chunk_id") or str(uuid.uuid4())
         if not meta:
@@ -94,54 +121,83 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
             "metadata": d.metadata,
         }

-        # If KG is available, attempt to extract triples and link the chunk
-        if kg is not None:
+        # Log progress at intervals
+        if i % 50 == 0 or i <= 5:
+            logger.debug('Processing chunk %d/%d (id=%s)', i, len(split_docs), chunk_id)
+
+        # Attempt to extract triples and link the chunk (errors during extraction are non-fatal)
+        try:
+            triples = extract_triples_with_llm(chunks_index[chunk_id]["text"], max_triples=4)
+            if triples:
+                logger.debug('Extracted %d triples for chunk %s', len(triples), chunk_id)
+            for t in triples:
+                try:
+                    kg.add_triple(
+                        t.get("subject"),
+                        t.get("predicate"),
+                        t.get("object"),
+                        provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
+                    )
+                    kg.link_chunk_to_entity(
+                        chunk_id,
+                        t.get("subject"),
+                        sentence=t.get("sentence"),
+                        confidence=t.get("confidence"),
+                    )
+                except Exception:
+                    logger.exception('Non-fatal error while adding triple or linking chunk %s', chunk_id)
+                    continue
+        except Exception:
+            # LLM extraction failed or not configured; skip KG extraction for this chunk
+            logger.exception('KG extraction failed for chunk %s (continuing)', chunk_id)
+            pass
+
+    end_time = datetime.now(timezone.utc)
+    logger.info('Finished per-chunk processing at %s UTC (duration %s)', end_time.isoformat(), end_time - start_time)
+
+    # Persist Chroma vectorstore
+    try:
+        logger.info('Persisting Chroma vectorstore to %s', persist_dir)
+        Chroma.from_documents(
+            split_docs,
+            embedding=embeddings,
+            persist_directory=persist_dir,
+        )
+        logger.info('Vectorstore built and persisted to %s', persist_dir)
+    except Exception as e:
+        import traceback, sys
+        logger.exception('Chroma.from_documents failed to write the vectorstore:')
+        # ensure the log is flushed to file
+        for h in logger.handlers:
             try:
-                triples = extract_triples_with_llm(chunks_index[chunk_id]["text"], max_triples=4)
-                for t in triples:
-                    try:
-                        kg.add_triple(
-                            t.get("subject"),
-                            t.get("predicate"),
-                            t.get("object"),
-                            provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
-                        )
-                        kg.link_chunk_to_entity(
-                            chunk_id,
-                            t.get("subject"),
-                            sentence=t.get("sentence"),
-                            confidence=t.get("confidence"),
-                        )
-                    except Exception:
-                        # non-fatal: continue
-                        continue
+                h.flush()
             except Exception:
-                # LLM extraction failed or not configured; skip KG extraction
                 pass
-
-    # Persist Chroma vectorstore
-    Chroma.from_documents(
-        split_docs,
-        embedding=embeddings,
-        persist_directory=persist_dir,
-    )
-    print(f"Vectorstore built and persisted to {persist_dir}")
+        sys.exit(1)

     # Persist chunks index for runtime (simple json mapping)
     try:
         idx_path = os.path.join(persist_dir, "chunks_index.json")
         with open(idx_path, "w", encoding="utf-8") as fh:
             json.dump(chunks_index, fh)
+        logger.info('Wrote chunks_index.json (%d entries)', len(chunks_index))
     except Exception:
-        pass
+        logger.exception('Failed to write chunks_index.json')

-    # Persist KG if available
-    if kg is not None:
-        try:
-            kg.save()
-            print(f"KG persisted to {kg_path}")
-        except Exception:
-            pass
+    # Persist KG
+    try:
+        kg.save()
+        logger.info('KG persisted to %s', kg_path)
+    except Exception:
+        import traceback, sys
+        logger.exception('Failed to persist KG to disk:')
+        # ensure the log is flushed to file
+        for h in logger.handlers:
+            try:
+                h.flush()
+            except Exception:
+                pass
+        sys.exit(1)


 def main():
@@ -150,6 +206,7 @@ def main():
     parser.add_argument("--persist-dir", type=str, default="./vectorstore")
     parser.add_argument("--chunk-size", type=int, default=200)
     parser.add_argument("--chunk-overlap", type=int, default=50)
+    parser.add_argument("--openai-api-key", type=str, default=None, help="Optional OpenAI API key to use for embeddings (overrides env var)")
     args = parser.parse_args()

     ingest(
@@ -157,6 +214,7 @@ def main():
         persist_dir=args.persist_dir,
         chunk_size=args.chunk_size,
         chunk_overlap=args.chunk_overlap,
+        openai_api_key=args.openai_api_key,
     )
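
With the new flag, a rebuild can be run standalone from the shell; a sketch, assuming the module path src.ingest (which matches this file's location) and placeholder paths and key:

    python -m src.ingest \
        --data-dir ./data \
        --persist-dir ./vectorstore \
        --chunk-size 200 \
        --chunk-overlap 50 \
        --openai-api-key "sk-..."

Detailed output lands in ./vectorstore/ingest.log via the file handler added above. Note that print(i, d) inside the chunk loop still echoes every chunk to stdout, independent of the interval-based logger.debug calls.
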
src/utils/rag_runtime.py CHANGED
@@ -8,27 +8,20 @@ from src.vectorstore import get_retriever
 from src.qa_chain import make_conversational_chain
 import os
 import json
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, cast

-try:
-    from src.kg.store import KGStore
-    from src.kg.retriever import KGRetriever
-    _HAS_KG = True
-except Exception:
-    _HAS_KG = False
+# Unconditionally import KG modules; let import errors propagate so failures are visible
+from src.kg.store import KGStore
+from src.kg.retriever import KGRetriever


-def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
+def run_ingest_cli(data_dir: str, persist_dir: str) -> str:
     """Run the ingestion module to rebuild the vectorstore.

-    Args:
-        data_dir: Directory containing the raw text files.
-        persist_dir: Directory where embeddings and Chroma DB should be stored.
-
-    Raises:
-        CalledProcessError: If the underlying subprocess fails.
+    Runs the ingest CLI as a subprocess and returns stdout on success.
+    On failure raises subprocess.CalledProcessError with captured stdout/stderr so callers
+    (for example the Streamlit UI) can display a helpful error message.
     """
-
     cmd = [
         sys.executable,
         "-m",
@@ -38,7 +31,29 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
         "--persist-dir",
         persist_dir,
     ]
-    subprocess.run(cmd, check=True)
+    try:
+        # Add a timeout to avoid indefinite hanging; 600s (10 minutes) is generous for large ingests
+        completed = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+    except subprocess.TimeoutExpired as te:
+        # Provide a helpful error including partial output
+        raise subprocess.CalledProcessError(
+            returncode=124,
+            cmd=cmd,
+            output=getattr(te, 'output', '') or '',
+            stderr=f"Ingest process timed out after {te.timeout} seconds",
+        )
+
+    # Check the return code and raise with captured output on failure
+    if completed.returncode != 0:
+        # Raise with captured output to make it easy to present to the user
+        raise subprocess.CalledProcessError(
+            returncode=completed.returncode,
+            cmd=cmd,
+            output=completed.stdout,
+            stderr=completed.stderr,
+        )
+    return completed.stdout
+

 def _load_chunks_index(persist_dir: str) -> Dict[str, Dict]:
     idx_path = os.path.join(persist_dir, "chunks_index.json")
@@ -69,25 +84,25 @@ def answer_with_kg(
     # Load chunks index mapping
     chunks_index = _load_chunks_index(persist_dir)

-    if _HAS_KG:
-        kg_path = os.path.join(persist_dir, "kg_store.ttl")
-        try:
-            kg = KGStore(path=kg_path)
-            retr = KGRetriever(kg)
-            chunk_ids, summaries = retr.get_context_for_question(question, hops=kg_hops)
-            if summaries:
-                kg_text_parts.append("KG entities: " + ", ".join(summaries))
-            # add chunk snippets
-            for cid in chunk_ids:
-                info = chunks_index.get(cid)
-                if info:
-                    txt = info.get("text", "")
-                    if txt:
-                        snippet = txt.strip().replace("\n", " ")[:min(len(txt), kg_context_max_chars)]
-                        kg_text_parts.append(f"[KG chunk {cid}]: {snippet}")
-        except Exception:
-            # If KG load fails, skip KG augmentation
-            kg_text_parts = []
+    # Load KG unconditionally; let import or parse errors raise so callers can see them.
+    kg_path = os.path.join(persist_dir, "kg_store.ttl")
+    try:
+        kg = KGStore(path=kg_path)
+        retr = KGRetriever(kg)
+        chunk_ids, summaries = retr.get_context_for_question(question, hops=kg_hops)
+        if summaries:
+            kg_text_parts.append("KG entities: " + ", ".join(summaries))
+        # add chunk snippets
+        for cid in chunk_ids:
+            info = chunks_index.get(cid)
+            if info:
+                txt = info.get("text", "")
+                if txt:
+                    snippet = txt.strip().replace("\n", " ")[:min(len(txt), kg_context_max_chars)]
+                    kg_text_parts.append(f"[KG chunk {cid}]: {snippet}")
+    except Exception:
+        # If KG load or query fails, skip KG augmentation (allow the exception to surface in logs)
+        kg_text_parts = []

     kg_context = "\n\n".join(kg_text_parts) if kg_text_parts else ""
     if kg_context:
@@ -113,24 +128,31 @@ def build_or_load_retriever_cached(
     Args:
         data_dir: Directory containing input documents.
         persist_dir: Directory where the Chroma vectorstore is stored.
-        top_k: Number of chunks to retrieve for queries.
+        top_k: Number of chunks to retrieve.
         retrieval_mode: Retrieval strategy (mmr, similarity, hybrid).

     Returns:
         An initialized retriever instance.
     """
     try:
+        # Cast retrieval_mode to the expected literal type to satisfy type checkers
+        from typing import Literal
+        RetrievalMode = Literal["mmr", "similarity", "hybrid"]
+        mode = cast(RetrievalMode, retrieval_mode)
         return get_retriever(
             persist_dir=persist_dir,
             top_k=top_k,
-            retrieval_mode=retrieval_mode,
+            retrieval_mode=mode,
         )
     except Exception:
         run_ingest_cli(data_dir=data_dir, persist_dir=persist_dir)
+        from typing import Literal
+        RetrievalMode = Literal["mmr", "similarity", "hybrid"]
+        mode = cast(RetrievalMode, retrieval_mode)
         return get_retriever(
             persist_dir=persist_dir,
             top_k=top_k,
-            retrieval_mode=retrieval_mode,
+            retrieval_mode=mode,
        )
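
Because run_ingest_cli now captures output and raises subprocess.CalledProcessError itself (instead of letting check=True raise with the streams unset), any caller can surface the same diagnostics the Streamlit UI shows. A minimal sketch with illustrative paths:

    import subprocess
    from src.utils.rag_runtime import run_ingest_cli

    try:
        out = run_ingest_cli(data_dir="./data", persist_dir="./vectorstore")
        print(out)  # stdout of the ingest subprocess
    except subprocess.CalledProcessError as e:
        # output and stderr are populated by run_ingest_cli above
        print("ingest failed with return code", e.returncode)
        print(e.output)
        print(e.stderr)
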
vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/data_level0.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f4f2354cfac4766af62b3206edeadc037befeda51170642759cfde636719b6d0
-size 6284000

vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/header.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d683b2d00a21eda899c7d7b067fa8afd0eadec8a6a2f4bbcf835c3a028bf30ed
-size 100

vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/index_metadata.pickle DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e246e950cff89753a1014bf89d814426f6d22f1c0f0fa673dc8d3b11986afb4d
-size 55974

vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/length.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c10f6f8590ed8a62b4b5da7c3c431202130fc835d35f8318945b0c1fcfd1bb56
-size 4000

vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/link_lists.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5f4382f14c7a7b600756f34e3493f67f810192c9a1d4afef2cd3d3d335fb092c
-size 8148

vectorstore/chroma-embeddings.parquet DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1e572394a2cfa7976f286fa157be4e5eaf287d0721581969eed4c4df7874f04a
-size 3380376

vectorstore/chroma.sqlite3 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f9fe3c3f3cddec7b92e068aa41dab33d8c0c831fab65768f7567a457be81fad
-size 12951552

vectorstore/chunks_index.json DELETED
The diff for this file is too large to render.

vectorstore/index/id_to_uuid_6855eddb-ade5-445b-9e7f-a8293769c768.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5fee2dec9920bfc53613b6de9edf335643f12002c9aa363756e403115419c097
-size 8714

vectorstore/index/index_6855eddb-ade5-445b-9e7f-a8293769c768.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ba0cc06f9ba2330e6768388c4467d1b6a5cb39e7421ef240df400fb89baf2b92
-size 1730044

vectorstore/index/index_metadata_6855eddb-ade5-445b-9e7f-a8293769c768.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c0bc6676e35c07eb122909628dcc80dd749794bdfb45c099a06ef1bcb59be763
-size 105

vectorstore/index/uuid_to_id_6855eddb-ade5-445b-9e7f-a8293769c768.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c41d12045aa07c6d3cfa93a06b0b0e9414c7f9adab57e9f22f1bb3b7328dee1
-size 10211