Spaces:
Sleeping
Sleeping
Adding kg updates
Browse files- app.py +46 -6
- requirements.txt +1 -0
- src/ingest.py +126 -68
- src/utils/rag_runtime.py +60 -38
- vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/data_level0.bin +0 -3
- vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/header.bin +0 -3
- vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/index_metadata.pickle +0 -3
- vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/length.bin +0 -3
- vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/link_lists.bin +0 -3
- vectorstore/chroma-embeddings.parquet +0 -3
- vectorstore/chroma.sqlite3 +0 -3
- vectorstore/chunks_index.json +0 -0
- vectorstore/index/id_to_uuid_6855eddb-ade5-445b-9e7f-a8293769c768.pkl +0 -3
- vectorstore/index/index_6855eddb-ade5-445b-9e7f-a8293769c768.bin +0 -3
- vectorstore/index/index_metadata_6855eddb-ade5-445b-9e7f-a8293769c768.pkl +0 -3
- vectorstore/index/uuid_to_id_6855eddb-ade5-445b-9e7f-a8293769c768.pkl +0 -3
app.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
import os
|
| 2 |
from typing import List, Dict, Tuple, Optional, Any
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
# Disable telemetry for LangChain and Chroma by default
|
| 5 |
os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
|
| 6 |
os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
|
| 7 |
os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
|
| 8 |
|
| 9 |
-
import streamlit as st
|
| 10 |
-
|
| 11 |
from src.utils.rag_runtime import (
|
| 12 |
run_ingest_cli,
|
| 13 |
build_or_load_retriever_cached,
|
|
@@ -229,11 +230,46 @@ class AbaloneRAGApp:
|
|
| 229 |
unsafe_allow_html=True,
|
| 230 |
)
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
if confirm:
|
|
|
|
| 233 |
with st.spinner("Rebuilding vectorstore..."):
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
self.chain = get_chain_cached(
|
| 239 |
model_name=self.model_name,
|
|
@@ -444,10 +480,14 @@ def main() -> None:
|
|
| 444 |
"""Main entry point for running the Abalone RAG Chatbot app."""
|
| 445 |
app = AbaloneRAGApp()
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
if not ensure_openai_key():
|
| 448 |
st.stop()
|
| 449 |
|
| 450 |
-
app.handle_rebuild()
|
| 451 |
app.ensure_chain_ready()
|
| 452 |
app.render_chat_history()
|
| 453 |
app.handle_user_input()
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import List, Dict, Tuple, Optional, Any
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import logging
|
| 5 |
+
from datetime import datetime
|
| 6 |
|
| 7 |
# Disable telemetry for LangChain and Chroma by default
|
| 8 |
os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
|
| 9 |
os.environ.setdefault("LANGCHAIN_DISABLE_TELEMETRY", "true")
|
| 10 |
os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
|
| 11 |
|
|
|
|
|
|
|
| 12 |
from src.utils.rag_runtime import (
|
| 13 |
run_ingest_cli,
|
| 14 |
build_or_load_retriever_cached,
|
|
|
|
| 230 |
unsafe_allow_html=True,
|
| 231 |
)
|
| 232 |
|
| 233 |
+
# add a small UI log for rebuild actions
|
| 234 |
+
def _ui_log(msg: str):
|
| 235 |
+
try:
|
| 236 |
+
os.makedirs(self.persist_dir, exist_ok=True)
|
| 237 |
+
with open(os.path.join(self.persist_dir, "ui_rebuild.log"), "a", encoding="utf-8") as fh:
|
| 238 |
+
fh.write(f"{msg}\n")
|
| 239 |
+
except Exception:
|
| 240 |
+
pass
|
| 241 |
+
|
| 242 |
if confirm:
|
| 243 |
+
_ui_log(f"{datetime.utcnow().isoformat()} - Confirm rebuild clicked by user")
|
| 244 |
with st.spinner("Rebuilding vectorstore..."):
|
| 245 |
+
try:
|
| 246 |
+
out = run_ingest_cli(data_dir=self.data_dir, persist_dir=self.persist_dir)
|
| 247 |
+
_ui_log(f"{datetime.utcnow().isoformat()} - Rebuild succeeded")
|
| 248 |
+
except Exception as e:
|
| 249 |
+
import subprocess as _sp
|
| 250 |
+
_ui_log(f"{datetime.utcnow().isoformat()} - Rebuild failed: {e}")
|
| 251 |
+
if isinstance(e, _sp.CalledProcessError):
|
| 252 |
+
stderr = getattr(e, 'stderr', None)
|
| 253 |
+
stdout = getattr(e, 'output', None) or getattr(e, 'stdout', None)
|
| 254 |
+
st.error("Rebuild failed. See logs below.")
|
| 255 |
+
if stdout:
|
| 256 |
+
st.markdown("**ingest stdout:**")
|
| 257 |
+
st.code(stdout)
|
| 258 |
+
if stderr:
|
| 259 |
+
st.markdown("**ingest stderr:**")
|
| 260 |
+
st.code(stderr)
|
| 261 |
+
else:
|
| 262 |
+
st.error(f"Rebuild failed: {e}")
|
| 263 |
+
st.session_state["rebuild_pending"] = False
|
| 264 |
+
return
|
| 265 |
+
|
| 266 |
+
# On success, clear cached retriever/chain and reload
|
| 267 |
+
try:
|
| 268 |
+
build_or_load_retriever_cached.clear()
|
| 269 |
+
get_chain_cached.clear()
|
| 270 |
+
except Exception:
|
| 271 |
+
# if clearing cache fails, just log it in UI log
|
| 272 |
+
_ui_log(f"{datetime.utcnow().isoformat()} - Warning: failed to clear cached functions")
|
| 273 |
|
| 274 |
self.chain = get_chain_cached(
|
| 275 |
model_name=self.model_name,
|
|
|
|
| 480 |
"""Main entry point for running the Abalone RAG Chatbot app."""
|
| 481 |
app = AbaloneRAGApp()
|
| 482 |
|
| 483 |
+
# Allow rebuild actions before enforcing OPENAI key so users can inspect logs
|
| 484 |
+
# and trigger rebuild operations even when the key isn't set. Chain init
|
| 485 |
+
# requires the key, so enforce it after handling rebuild requests.
|
| 486 |
+
app.handle_rebuild()
|
| 487 |
+
|
| 488 |
if not ensure_openai_key():
|
| 489 |
st.stop()
|
| 490 |
|
|
|
|
| 491 |
app.ensure_chain_ready()
|
| 492 |
app.render_chat_history()
|
| 493 |
app.handle_user_input()
|
requirements.txt
CHANGED
|
@@ -9,3 +9,4 @@ numpy==1.24.4
|
|
| 9 |
streamlit>=1.25.0
|
| 10 |
python-dotenv==1.0.0
|
| 11 |
pytest==7.2.0
|
|
|
|
|
|
| 9 |
streamlit>=1.25.0
|
| 10 |
python-dotenv==1.0.0
|
| 11 |
pytest==7.2.0
|
| 12 |
+
rdflib
|
src/ingest.py
CHANGED
|
@@ -1,26 +1,25 @@
|
|
| 1 |
import argparse
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 5 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 6 |
-
from langchain_community.vectorstores import Chroma
|
| 7 |
-
from langchain_community.embeddings import OpenAIEmbeddings
|
| 8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 9 |
from langchain_community.vectorstores import Chroma
|
| 10 |
from langchain_community.embeddings import OpenAIEmbeddings
|
| 11 |
|
| 12 |
-
#
|
|
|
|
|
|
|
|
|
|
| 13 |
import uuid
|
| 14 |
import json
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
from src.kg.retriever import KGRetriever
|
| 20 |
-
_HAS_KG = True
|
| 21 |
-
except Exception:
|
| 22 |
-
_HAS_KG = False
|
| 23 |
-
|
| 24 |
|
| 25 |
def load_documents(data_dir: str):
|
| 26 |
from pathlib import Path
|
|
@@ -40,22 +39,22 @@ def load_documents(data_dir: str):
|
|
| 40 |
loaded = loader.load()
|
| 41 |
docs.extend(loaded)
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
for d in docs:
|
| 46 |
-
meta = d.metadata or {}
|
| 47 |
-
path = meta.get("source") or meta.get("file_path") or meta.get("path")
|
| 48 |
-
print(f" - {os.path.abspath(path) if path else 'Unknown file'}")
|
| 49 |
return docs
|
| 50 |
|
| 51 |
|
| 52 |
-
def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int):
|
|
|
|
|
|
|
| 53 |
if not os.path.exists(data_dir):
|
|
|
|
| 54 |
raise ValueError(f"Data directory does not exist: {data_dir}")
|
| 55 |
|
| 56 |
docs = load_documents(data_dir)
|
| 57 |
|
| 58 |
if not docs:
|
|
|
|
| 59 |
raise ValueError(f"No .txt documents found in {data_dir}")
|
| 60 |
|
| 61 |
splitter = RecursiveCharacterTextSplitter(
|
|
@@ -63,25 +62,53 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
|
|
| 63 |
chunk_overlap=chunk_overlap,
|
| 64 |
)
|
| 65 |
split_docs = splitter.split_documents(docs)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
embeddings = OpenAIEmbeddings()
|
| 69 |
|
|
|
|
| 70 |
os.makedirs(persist_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# Prepare KG store and local chunk index
|
| 73 |
chunks_index = {}
|
| 74 |
kg_path = os.path.join(persist_dir, "kg_store.ttl")
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
# Annotate chunks with stable chunk_id and optionally extract/link KG triples
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
| 85 |
meta = d.metadata or {}
|
| 86 |
chunk_id = meta.get("chunk_id") or str(uuid.uuid4())
|
| 87 |
if not meta:
|
|
@@ -94,54 +121,83 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
|
|
| 94 |
"metadata": d.metadata,
|
| 95 |
}
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
try:
|
| 100 |
-
|
| 101 |
-
for t in triples:
|
| 102 |
-
try:
|
| 103 |
-
kg.add_triple(
|
| 104 |
-
t.get("subject"),
|
| 105 |
-
t.get("predicate"),
|
| 106 |
-
t.get("object"),
|
| 107 |
-
provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
|
| 108 |
-
)
|
| 109 |
-
kg.link_chunk_to_entity(
|
| 110 |
-
chunk_id,
|
| 111 |
-
t.get("subject"),
|
| 112 |
-
sentence=t.get("sentence"),
|
| 113 |
-
confidence=t.get("confidence"),
|
| 114 |
-
)
|
| 115 |
-
except Exception:
|
| 116 |
-
# non-fatal: continue
|
| 117 |
-
continue
|
| 118 |
except Exception:
|
| 119 |
-
# LLM extraction failed or not configured; skip KG extraction
|
| 120 |
pass
|
| 121 |
-
|
| 122 |
-
# Persist Chroma vectorstore
|
| 123 |
-
Chroma.from_documents(
|
| 124 |
-
split_docs,
|
| 125 |
-
embedding=embeddings,
|
| 126 |
-
persist_directory=persist_dir,
|
| 127 |
-
)
|
| 128 |
-
print(f"Vectorstore built and persisted to {persist_dir}")
|
| 129 |
|
| 130 |
# Persist chunks index for runtime (simple json mapping)
|
| 131 |
try:
|
| 132 |
idx_path = os.path.join(persist_dir, "chunks_index.json")
|
| 133 |
with open(idx_path, "w", encoding="utf-8") as fh:
|
| 134 |
json.dump(chunks_index, fh)
|
|
|
|
| 135 |
except Exception:
|
| 136 |
-
|
| 137 |
|
| 138 |
-
# Persist KG
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
def main():
|
|
@@ -150,6 +206,7 @@ def main():
|
|
| 150 |
parser.add_argument("--persist-dir", type=str, default="./vectorstore")
|
| 151 |
parser.add_argument("--chunk-size", type=int, default=200)
|
| 152 |
parser.add_argument("--chunk-overlap", type=int, default=50)
|
|
|
|
| 153 |
args = parser.parse_args()
|
| 154 |
|
| 155 |
ingest(
|
|
@@ -157,6 +214,7 @@ def main():
|
|
| 157 |
persist_dir=args.persist_dir,
|
| 158 |
chunk_size=args.chunk_size,
|
| 159 |
chunk_overlap=args.chunk_overlap,
|
|
|
|
| 160 |
)
|
| 161 |
|
| 162 |
|
|
|
|
| 1 |
import argparse
|
| 2 |
import os
|
| 3 |
+
import logging
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
|
| 6 |
+
# Disable Chroma telemetry to avoid opentelemetry compatibility errors during ingestion
|
| 7 |
+
os.environ.setdefault("CHROMA_TELEMETRY_ENABLED", "false")
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
from langchain_community.vectorstores import Chroma
|
| 11 |
from langchain_community.embeddings import OpenAIEmbeddings
|
| 12 |
|
| 13 |
+
# KG integration: import unconditionally so errors propagate if dependencies missing
|
| 14 |
+
from src.kg.extract import extract_triples_with_llm
|
| 15 |
+
from src.kg.store import KGStore
|
| 16 |
+
|
| 17 |
import uuid
|
| 18 |
import json
|
| 19 |
|
| 20 |
+
# Module logger
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
logger.setLevel(logging.INFO)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def load_documents(data_dir: str):
|
| 25 |
from pathlib import Path
|
|
|
|
| 39 |
loaded = loader.load()
|
| 40 |
docs.extend(loaded)
|
| 41 |
|
| 42 |
+
logger.info(f"Loaded {len(docs)} documents from {data_dir}")
|
| 43 |
+
logger.debug("Documents ingested: %s", [ (d.metadata or {}).get('source') for d in docs ])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
return docs
|
| 45 |
|
| 46 |
|
| 47 |
+
def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int, openai_api_key: str = None):
|
| 48 |
+
logger.info("Starting ingest: data_dir=%s persist_dir=%s chunk_size=%s chunk_overlap=%s", data_dir, persist_dir, chunk_size, chunk_overlap)
|
| 49 |
+
|
| 50 |
if not os.path.exists(data_dir):
|
| 51 |
+
logger.error("Data directory does not exist: %s", data_dir)
|
| 52 |
raise ValueError(f"Data directory does not exist: {data_dir}")
|
| 53 |
|
| 54 |
docs = load_documents(data_dir)
|
| 55 |
|
| 56 |
if not docs:
|
| 57 |
+
logger.error("No documents found in %s", data_dir)
|
| 58 |
raise ValueError(f"No .txt documents found in {data_dir}")
|
| 59 |
|
| 60 |
splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 62 |
chunk_overlap=chunk_overlap,
|
| 63 |
)
|
| 64 |
split_docs = splitter.split_documents(docs)
|
| 65 |
+
logger.info("Split into %d chunks", len(split_docs))
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
# Ensure persist dir exists and add file handler to logger
|
| 68 |
os.makedirs(persist_dir, exist_ok=True)
|
| 69 |
+
# Add file handler for detailed logs in persist_dir/ingest.log
|
| 70 |
+
try:
|
| 71 |
+
fh = logging.FileHandler(os.path.join(persist_dir, 'ingest.log'))
|
| 72 |
+
fh.setLevel(logging.DEBUG)
|
| 73 |
+
fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
| 74 |
+
# Avoid adding multiple file handlers on repeated calls
|
| 75 |
+
if not any(isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', None) == fh.baseFilename for h in logger.handlers):
|
| 76 |
+
logger.addHandler(fh)
|
| 77 |
+
except Exception:
|
| 78 |
+
logger.exception('Failed to add file handler for ingest log')
|
| 79 |
|
| 80 |
# Prepare KG store and local chunk index
|
| 81 |
chunks_index = {}
|
| 82 |
kg_path = os.path.join(persist_dir, "kg_store.ttl")
|
| 83 |
+
|
| 84 |
+
# Initialize embeddings; provide a clear error if OpenAI API key is missing
|
| 85 |
+
try:
|
| 86 |
+
logger.info('Initializing embeddings')
|
| 87 |
+
# If an API key was provided on the CLI, inject it into the environment
|
| 88 |
+
if openai_api_key:
|
| 89 |
+
os.environ['OPENAI_API_KEY'] = openai_api_key
|
| 90 |
+
logger.debug('Set OPENAI_API_KEY from CLI flag')
|
| 91 |
+
embeddings = OpenAIEmbeddings()
|
| 92 |
+
logger.info('Embeddings initialized')
|
| 93 |
+
except Exception as e:
|
| 94 |
+
logger.exception("Failed to initialize OpenAI embeddings. Ensure OPENAI_API_KEY is set in the environment or pass --openai-api-key.")
|
| 95 |
+
raise
|
| 96 |
+
|
| 97 |
+
# Initialize KG store unconditionally so errors are visible
|
| 98 |
+
try:
|
| 99 |
+
logger.info('Initializing KG store at %s', kg_path)
|
| 100 |
+
kg = KGStore(path=kg_path)
|
| 101 |
+
logger.info('KG store initialized')
|
| 102 |
+
except Exception:
|
| 103 |
+
logger.exception('Failed to initialize KGStore')
|
| 104 |
+
# re-raise so caller sees the failure
|
| 105 |
+
raise
|
| 106 |
|
| 107 |
# Annotate chunks with stable chunk_id and optionally extract/link KG triples
|
| 108 |
+
start_time = datetime.now(timezone.utc)
|
| 109 |
+
logger.info('Beginning per-chunk processing at %s UTC', start_time.isoformat())
|
| 110 |
+
for i, d in enumerate(split_docs, start=1):
|
| 111 |
+
print(i, d)
|
| 112 |
meta = d.metadata or {}
|
| 113 |
chunk_id = meta.get("chunk_id") or str(uuid.uuid4())
|
| 114 |
if not meta:
|
|
|
|
| 121 |
"metadata": d.metadata,
|
| 122 |
}
|
| 123 |
|
| 124 |
+
# Log progress at intervals
|
| 125 |
+
if i % 50 == 0 or i <= 5:
|
| 126 |
+
logger.debug('Processing chunk %d/%d (id=%s)', i, len(split_docs), chunk_id)
|
| 127 |
+
|
| 128 |
+
# Attempt to extract triples and link the chunk (errors during extraction are non-fatal)
|
| 129 |
+
try:
|
| 130 |
+
triples = extract_triples_with_llm(chunks_index[chunk_id]["text"], max_triples=4)
|
| 131 |
+
if triples:
|
| 132 |
+
logger.debug('Extracted %d triples for chunk %s', len(triples), chunk_id)
|
| 133 |
+
for t in triples:
|
| 134 |
+
try:
|
| 135 |
+
kg.add_triple(
|
| 136 |
+
t.get("subject"),
|
| 137 |
+
t.get("predicate"),
|
| 138 |
+
t.get("object"),
|
| 139 |
+
provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
|
| 140 |
+
)
|
| 141 |
+
kg.link_chunk_to_entity(
|
| 142 |
+
chunk_id,
|
| 143 |
+
t.get("subject"),
|
| 144 |
+
sentence=t.get("sentence"),
|
| 145 |
+
confidence=t.get("confidence"),
|
| 146 |
+
)
|
| 147 |
+
except Exception:
|
| 148 |
+
logger.exception('Non-fatal error while adding triple or linking chunk %s', chunk_id)
|
| 149 |
+
continue
|
| 150 |
+
except Exception:
|
| 151 |
+
# LLM extraction failed or not configured; skip KG extraction for this chunk
|
| 152 |
+
logger.exception('KG extraction failed for chunk %s (continuing)', chunk_id)
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
end_time = datetime.now(timezone.utc)
|
| 156 |
+
logger.info('Finished per-chunk processing at %s UTC (duration %s)', end_time.isoformat(), end_time - start_time)
|
| 157 |
+
|
| 158 |
+
# Persist Chroma vectorstore
|
| 159 |
+
try:
|
| 160 |
+
logger.info('Persisting Chroma vectorstore to %s', persist_dir)
|
| 161 |
+
Chroma.from_documents(
|
| 162 |
+
split_docs,
|
| 163 |
+
embedding=embeddings,
|
| 164 |
+
persist_directory=persist_dir,
|
| 165 |
+
)
|
| 166 |
+
logger.info('Vectorstore built and persisted to %s', persist_dir)
|
| 167 |
+
except Exception as e:
|
| 168 |
+
import traceback, sys
|
| 169 |
+
logger.exception('Chroma.from_documents failed to write the vectorstore:')
|
| 170 |
+
# ensure the log is flushed to file
|
| 171 |
+
for h in logger.handlers:
|
| 172 |
try:
|
| 173 |
+
h.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
except Exception:
|
|
|
|
| 175 |
pass
|
| 176 |
+
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
# Persist chunks index for runtime (simple json mapping)
|
| 179 |
try:
|
| 180 |
idx_path = os.path.join(persist_dir, "chunks_index.json")
|
| 181 |
with open(idx_path, "w", encoding="utf-8") as fh:
|
| 182 |
json.dump(chunks_index, fh)
|
| 183 |
+
logger.info('Wrote chunks_index.json (%d entries)', len(chunks_index))
|
| 184 |
except Exception:
|
| 185 |
+
logger.exception('Failed to write chunks_index.json')
|
| 186 |
|
| 187 |
+
# Persist KG
|
| 188 |
+
try:
|
| 189 |
+
kg.save()
|
| 190 |
+
logger.info('KG persisted to %s', kg_path)
|
| 191 |
+
except Exception:
|
| 192 |
+
import traceback, sys
|
| 193 |
+
logger.exception('Failed to persist KG to disk:')
|
| 194 |
+
# ensure the log is flushed to file
|
| 195 |
+
for h in logger.handlers:
|
| 196 |
+
try:
|
| 197 |
+
h.flush()
|
| 198 |
+
except Exception:
|
| 199 |
+
pass
|
| 200 |
+
sys.exit(1)
|
| 201 |
|
| 202 |
|
| 203 |
def main():
|
|
|
|
| 206 |
parser.add_argument("--persist-dir", type=str, default="./vectorstore")
|
| 207 |
parser.add_argument("--chunk-size", type=int, default=200)
|
| 208 |
parser.add_argument("--chunk-overlap", type=int, default=50)
|
| 209 |
+
parser.add_argument("--openai-api-key", type=str, default=None, help="Optional OpenAI API key to use for embeddings (overrides env var)")
|
| 210 |
args = parser.parse_args()
|
| 211 |
|
| 212 |
ingest(
|
|
|
|
| 214 |
persist_dir=args.persist_dir,
|
| 215 |
chunk_size=args.chunk_size,
|
| 216 |
chunk_overlap=args.chunk_overlap,
|
| 217 |
+
openai_api_key=args.openai_api_key,
|
| 218 |
)
|
| 219 |
|
| 220 |
|
src/utils/rag_runtime.py
CHANGED
|
@@ -8,27 +8,20 @@ from src.vectorstore import get_retriever
|
|
| 8 |
from src.qa_chain import make_conversational_chain
|
| 9 |
import os
|
| 10 |
import json
|
| 11 |
-
from typing import Dict, List, Tuple
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
_HAS_KG = True
|
| 17 |
-
except Exception:
|
| 18 |
-
_HAS_KG = False
|
| 19 |
|
| 20 |
|
| 21 |
-
def run_ingest_cli(data_dir: str, persist_dir: str) ->
|
| 22 |
"""Run the ingestion module to rebuild the vectorstore.
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
Raises:
|
| 29 |
-
CalledProcessError: If the underlying subprocess fails.
|
| 30 |
"""
|
| 31 |
-
|
| 32 |
cmd = [
|
| 33 |
sys.executable,
|
| 34 |
"-m",
|
|
@@ -38,7 +31,29 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
|
|
| 38 |
"--persist-dir",
|
| 39 |
persist_dir,
|
| 40 |
]
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
def _load_chunks_index(persist_dir: str) -> Dict[str, Dict]:
|
| 44 |
idx_path = os.path.join(persist_dir, "chunks_index.json")
|
|
@@ -69,25 +84,25 @@ def answer_with_kg(
|
|
| 69 |
# Load chunks index mapping
|
| 70 |
chunks_index = _load_chunks_index(persist_dir)
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
|
| 92 |
kg_context = "\n\n".join(kg_text_parts) if kg_text_parts else ""
|
| 93 |
if kg_context:
|
|
@@ -113,24 +128,31 @@ def build_or_load_retriever_cached(
|
|
| 113 |
Args:
|
| 114 |
data_dir: Directory containing input documents.
|
| 115 |
persist_dir: Directory where the Chroma vectorstore is stored.
|
| 116 |
-
top_k: Number of chunks to retrieve
|
| 117 |
retrieval_mode: Retrieval strategy (mmr, similarity, hybrid).
|
| 118 |
|
| 119 |
Returns:
|
| 120 |
An initialized retriever instance.
|
| 121 |
"""
|
| 122 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
return get_retriever(
|
| 124 |
persist_dir=persist_dir,
|
| 125 |
top_k=top_k,
|
| 126 |
-
retrieval_mode=
|
| 127 |
)
|
| 128 |
except Exception:
|
| 129 |
run_ingest_cli(data_dir=data_dir, persist_dir=persist_dir)
|
|
|
|
|
|
|
|
|
|
| 130 |
return get_retriever(
|
| 131 |
persist_dir=persist_dir,
|
| 132 |
top_k=top_k,
|
| 133 |
-
retrieval_mode=
|
| 134 |
)
|
| 135 |
|
| 136 |
|
|
|
|
| 8 |
from src.qa_chain import make_conversational_chain
|
| 9 |
import os
|
| 10 |
import json
|
| 11 |
+
from typing import Dict, List, Tuple, cast
|
| 12 |
|
| 13 |
+
# Unconditionally import KG modules; let import errors propagate so failures are visible
|
| 14 |
+
from src.kg.store import KGStore
|
| 15 |
+
from src.kg.retriever import KGRetriever
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
+
def run_ingest_cli(data_dir: str, persist_dir: str) -> str:
|
| 19 |
"""Run the ingestion module to rebuild the vectorstore.
|
| 20 |
|
| 21 |
+
Runs the ingest CLI as a subprocess and returns stdout on success.
|
| 22 |
+
On failure raises subprocess.CalledProcessError with captured stdout/stderr so callers
|
| 23 |
+
(for example the Streamlit UI) can display a helpful error message.
|
|
|
|
|
|
|
|
|
|
| 24 |
"""
|
|
|
|
| 25 |
cmd = [
|
| 26 |
sys.executable,
|
| 27 |
"-m",
|
|
|
|
| 31 |
"--persist-dir",
|
| 32 |
persist_dir,
|
| 33 |
]
|
| 34 |
+
try:
|
| 35 |
+
# Add a timeout to avoid indefinite hanging; 600s (10 minutes) is generous for large ingests
|
| 36 |
+
completed = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
|
| 37 |
+
except subprocess.TimeoutExpired as te:
|
| 38 |
+
# Provide helpful error including partial output
|
| 39 |
+
raise subprocess.CalledProcessError(
|
| 40 |
+
returncode=124,
|
| 41 |
+
cmd=cmd,
|
| 42 |
+
output=getattr(te, 'output', '') or '',
|
| 43 |
+
stderr=f"Ingest process timed out after {te.timeout} seconds",
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# Check return code and raise with captured output on failure
|
| 47 |
+
if completed.returncode != 0:
|
| 48 |
+
# Raise with captured output to make it easy to present to the user
|
| 49 |
+
raise subprocess.CalledProcessError(
|
| 50 |
+
returncode=completed.returncode,
|
| 51 |
+
cmd=cmd,
|
| 52 |
+
output=completed.stdout,
|
| 53 |
+
stderr=completed.stderr,
|
| 54 |
+
)
|
| 55 |
+
return completed.stdout
|
| 56 |
+
|
| 57 |
|
| 58 |
def _load_chunks_index(persist_dir: str) -> Dict[str, Dict]:
|
| 59 |
idx_path = os.path.join(persist_dir, "chunks_index.json")
|
|
|
|
| 84 |
# Load chunks index mapping
|
| 85 |
chunks_index = _load_chunks_index(persist_dir)
|
| 86 |
|
| 87 |
+
# Load KG unconditionally; let import or parse errors raise so callers can see them.
|
| 88 |
+
kg_path = os.path.join(persist_dir, "kg_store.ttl")
|
| 89 |
+
try:
|
| 90 |
+
kg = KGStore(path=kg_path)
|
| 91 |
+
retr = KGRetriever(kg)
|
| 92 |
+
chunk_ids, summaries = retr.get_context_for_question(question, hops=kg_hops)
|
| 93 |
+
if summaries:
|
| 94 |
+
kg_text_parts.append("KG entities: " + ", ".join(summaries))
|
| 95 |
+
# add chunk snippets
|
| 96 |
+
for cid in chunk_ids:
|
| 97 |
+
info = chunks_index.get(cid)
|
| 98 |
+
if info:
|
| 99 |
+
txt = info.get("text", "")
|
| 100 |
+
if txt:
|
| 101 |
+
snippet = txt.strip().replace("\n", " ")[:min(len(txt), kg_context_max_chars)]
|
| 102 |
+
kg_text_parts.append(f"[KG chunk {cid}]: {snippet}")
|
| 103 |
+
except Exception:
|
| 104 |
+
# If KG load or query fails, skip KG augmentation (allow the exception to surface in logs)
|
| 105 |
+
kg_text_parts = []
|
| 106 |
|
| 107 |
kg_context = "\n\n".join(kg_text_parts) if kg_text_parts else ""
|
| 108 |
if kg_context:
|
|
|
|
| 128 |
Args:
|
| 129 |
data_dir: Directory containing input documents.
|
| 130 |
persist_dir: Directory where the Chroma vectorstore is stored.
|
| 131 |
+
top_k: Number of chunks to retrieve.
|
| 132 |
retrieval_mode: Retrieval strategy (mmr, similarity, hybrid).
|
| 133 |
|
| 134 |
Returns:
|
| 135 |
An initialized retriever instance.
|
| 136 |
"""
|
| 137 |
try:
|
| 138 |
+
# Cast retrieval_mode to the expected literal type to satisfy type checkers
|
| 139 |
+
from typing import Literal
|
| 140 |
+
RetrievalMode = Literal["mmr", "similarity", "hybrid"]
|
| 141 |
+
mode = cast(RetrievalMode, retrieval_mode)
|
| 142 |
return get_retriever(
|
| 143 |
persist_dir=persist_dir,
|
| 144 |
top_k=top_k,
|
| 145 |
+
retrieval_mode=mode,
|
| 146 |
)
|
| 147 |
except Exception:
|
| 148 |
run_ingest_cli(data_dir=data_dir, persist_dir=persist_dir)
|
| 149 |
+
from typing import Literal
|
| 150 |
+
RetrievalMode = Literal["mmr", "similarity", "hybrid"]
|
| 151 |
+
mode = cast(RetrievalMode, retrieval_mode)
|
| 152 |
return get_retriever(
|
| 153 |
persist_dir=persist_dir,
|
| 154 |
top_k=top_k,
|
| 155 |
+
retrieval_mode=mode,
|
| 156 |
)
|
| 157 |
|
| 158 |
|
vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/data_level0.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f4f2354cfac4766af62b3206edeadc037befeda51170642759cfde636719b6d0
|
| 3 |
-
size 6284000
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/header.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d683b2d00a21eda899c7d7b067fa8afd0eadec8a6a2f4bbcf835c3a028bf30ed
|
| 3 |
-
size 100
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/index_metadata.pickle
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e246e950cff89753a1014bf89d814426f6d22f1c0f0fa673dc8d3b11986afb4d
|
| 3 |
-
size 55974
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/length.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c10f6f8590ed8a62b4b5da7c3c431202130fc835d35f8318945b0c1fcfd1bb56
|
| 3 |
-
size 4000
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/1f5695a5-7499-40c9-8804-b3c330d70966/link_lists.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5f4382f14c7a7b600756f34e3493f67f810192c9a1d4afef2cd3d3d335fb092c
|
| 3 |
-
size 8148
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/chroma-embeddings.parquet
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1e572394a2cfa7976f286fa157be4e5eaf287d0721581969eed4c4df7874f04a
|
| 3 |
-
size 3380376
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/chroma.sqlite3
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6f9fe3c3f3cddec7b92e068aa41dab33d8c0c831fab65768f7567a457be81fad
|
| 3 |
-
size 12951552
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/chunks_index.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vectorstore/index/id_to_uuid_6855eddb-ade5-445b-9e7f-a8293769c768.pkl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5fee2dec9920bfc53613b6de9edf335643f12002c9aa363756e403115419c097
|
| 3 |
-
size 8714
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/index/index_6855eddb-ade5-445b-9e7f-a8293769c768.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ba0cc06f9ba2330e6768388c4467d1b6a5cb39e7421ef240df400fb89baf2b92
|
| 3 |
-
size 1730044
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/index/index_metadata_6855eddb-ade5-445b-9e7f-a8293769c768.pkl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c0bc6676e35c07eb122909628dcc80dd749794bdfb45c099a06ef1bcb59be763
|
| 3 |
-
size 105
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/index/uuid_to_id_6855eddb-ade5-445b-9e7f-a8293769c768.pkl
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7c41d12045aa07c6d3cfa93a06b0b0e9414c7f9adab57e9f22f1bb3b7328dee1
|
| 3 |
-
size 10211
|
|
|
|
|
|
|
|
|
|
|
|