Adding files
Browse files- .DS_Store +0 -0
- .gitignore +17 -0
- .idea/.gitignore +0 -8
- .idea/material_theme_project_new.xml +0 -12
- app.py +53 -28
- src/__pycache__/ingest.cpython-310.pyc +0 -0
- src/__pycache__/qa_chain.cpython-310.pyc +0 -0
- src/__pycache__/vectorstore.cpython-310.pyc +0 -0
- src/ingest.py +96 -0
- src/kg/extract.py +75 -0
- src/kg/retriever.py +28 -0
- src/kg/store.py +91 -0
- src/utils/rag_runtime.py +78 -0
- src/vectorstore.py +4 -0
- vectorstore/chroma-collections.parquet +0 -3
- vectorstore/chroma-embeddings.parquet +0 -3
.DS_Store
DELETED
|
Binary file (8.2 kB)
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ignore generated vectorstore data and caches
|
| 2 |
+
/vectorstore/
|
| 3 |
+
# python cache
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.pyc
|
| 6 |
+
# macOS finder
|
| 7 |
+
.DS_Store
|
| 8 |
+
# IDE
|
| 9 |
+
.idea/
|
| 10 |
+
# optional: ignore local sample data if you don't want it in repo
|
| 11 |
+
# /data/
|
| 12 |
+
# tests / validation artifacts
|
| 13 |
+
# /validation/
|
| 14 |
+
EOF
|
| 15 |
+
|
| 16 |
+
git add .gitignore
|
| 17 |
+
# do NOT commit this yet if you prefer to review
|
.idea/.gitignore
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
# Default ignored files
|
| 2 |
-
/shelf/
|
| 3 |
-
/workspace.xml
|
| 4 |
-
# Editor-based HTTP Client requests
|
| 5 |
-
/httpRequests/
|
| 6 |
-
# Datasource local storage ignored files
|
| 7 |
-
/dataSources/
|
| 8 |
-
/dataSources.local.xml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.idea/material_theme_project_new.xml
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
-
<project version="4">
|
| 3 |
-
<component name="MaterialThemeProjectNewConfig">
|
| 4 |
-
<option name="metadata">
|
| 5 |
-
<MTProjectMetadataState>
|
| 6 |
-
<option name="migrated" value="true" />
|
| 7 |
-
<option name="pristineConfig" value="false" />
|
| 8 |
-
<option name="userId" value="-3a906995:19986b060ad:-7ffc" />
|
| 9 |
-
</MTProjectMetadataState>
|
| 10 |
-
</option>
|
| 11 |
-
</component>
|
| 12 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
from typing import List, Dict, Tuple, Optional
|
| 3 |
|
| 4 |
# Disable telemetry for LangChain and Chroma by default
|
| 5 |
os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
|
|
@@ -12,6 +12,7 @@ from src.utils.rag_runtime import (
|
|
| 12 |
run_ingest_cli,
|
| 13 |
build_or_load_retriever_cached,
|
| 14 |
get_chain_cached,
|
|
|
|
| 15 |
)
|
| 16 |
from src.utils.metrics import compute_quality_scores
|
| 17 |
from src.utils.formatting import format_source_label
|
|
@@ -25,12 +26,22 @@ class AbaloneRAGApp:
|
|
| 25 |
"""Initialize the Streamlit page and application state."""
|
| 26 |
st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
|
| 27 |
|
| 28 |
-
|
| 29 |
-
st.
|
| 30 |
-
|
| 31 |
-
"
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Data and vectorstore locations
|
| 36 |
self.data_dir = "./data"
|
|
@@ -49,17 +60,22 @@ class AbaloneRAGApp:
|
|
| 49 |
self.temperature,
|
| 50 |
self.answer_length,
|
| 51 |
self.style_instruction,
|
| 52 |
-
self.
|
|
|
|
| 53 |
) = self._build_sidebar()
|
| 54 |
|
|
|
|
|
|
|
|
|
|
| 55 |
# QA chain instance (loaded lazily)
|
| 56 |
-
|
|
|
|
| 57 |
|
| 58 |
# ------------------------------------------------------------------
|
| 59 |
# Sidebar configuration
|
| 60 |
# ------------------------------------------------------------------
|
| 61 |
|
| 62 |
-
def _build_sidebar(self) -> Tuple[str, int, str, float, str, str, bool]:
|
| 63 |
"""Render all sidebar controls and return model configuration.
|
| 64 |
|
| 65 |
Returns:
|
|
@@ -95,7 +111,7 @@ class AbaloneRAGApp:
|
|
| 95 |
retrieval_mode_label = st.sidebar.selectbox(
|
| 96 |
"Retrieval mode",
|
| 97 |
["MMR (diverse)", "Similarity", "Hybrid (dense + MMR)"],
|
| 98 |
-
index=
|
| 99 |
)
|
| 100 |
retrieval_mode_map = {
|
| 101 |
"MMR (diverse)": "mmr",
|
|
@@ -104,6 +120,12 @@ class AbaloneRAGApp:
|
|
| 104 |
}
|
| 105 |
retrieval_mode = retrieval_mode_map[retrieval_mode_label]
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
st.sidebar.markdown("---")
|
| 108 |
|
| 109 |
# Answer style
|
|
@@ -123,20 +145,9 @@ class AbaloneRAGApp:
|
|
| 123 |
index=1,
|
| 124 |
)
|
| 125 |
|
|
|
|
| 126 |
st.sidebar.markdown("---")
|
| 127 |
-
|
| 128 |
-
# Vectorstore controls
|
| 129 |
-
st.sidebar.header("Vectorstore Controls")
|
| 130 |
-
|
| 131 |
-
rebuild_clicked = st.sidebar.button(
|
| 132 |
-
"Rebuild vectorstore",
|
| 133 |
-
use_container_width=True,
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
st.sidebar.markdown(
|
| 137 |
-
"<small>Use this when you add or modify files in <code>./data</code>.</small>",
|
| 138 |
-
unsafe_allow_html=True,
|
| 139 |
-
)
|
| 140 |
|
| 141 |
# Build style instruction for the LLM
|
| 142 |
length_instruction_map = {
|
|
@@ -158,7 +169,8 @@ class AbaloneRAGApp:
|
|
| 158 |
temperature,
|
| 159 |
answer_length,
|
| 160 |
style_instruction,
|
| 161 |
-
|
|
|
|
| 162 |
)
|
| 163 |
|
| 164 |
# ------------------------------------------------------------------
|
|
@@ -238,6 +250,7 @@ class AbaloneRAGApp:
|
|
| 238 |
st.session_state["rebuild_pending"] = False
|
| 239 |
st.info("Rebuild canceled.")
|
| 240 |
|
|
|
|
| 241 |
# ------------------------------------------------------------------
|
| 242 |
# Chain loading
|
| 243 |
# ------------------------------------------------------------------
|
|
@@ -260,6 +273,7 @@ class AbaloneRAGApp:
|
|
| 260 |
else:
|
| 261 |
st.success("Knowledge base and model are ready.")
|
| 262 |
|
|
|
|
| 263 |
# ------------------------------------------------------------------
|
| 264 |
# Chat UI
|
| 265 |
# ------------------------------------------------------------------
|
|
@@ -296,9 +310,20 @@ class AbaloneRAGApp:
|
|
| 296 |
|
| 297 |
styled_question = self.style_instruction + "\n\nQuestion: " + user_input
|
| 298 |
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
answer = (
|
| 304 |
result.get("answer")
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import List, Dict, Tuple, Optional, Any
|
| 3 |
|
| 4 |
# Disable telemetry for LangChain and Chroma by default
|
| 5 |
os.environ.setdefault("LANGCHAIN_TELEMETRY_ENABLED", "false")
|
|
|
|
| 12 |
run_ingest_cli,
|
| 13 |
build_or_load_retriever_cached,
|
| 14 |
get_chain_cached,
|
| 15 |
+
answer_with_kg,
|
| 16 |
)
|
| 17 |
from src.utils.metrics import compute_quality_scores
|
| 18 |
from src.utils.formatting import format_source_label
|
|
|
|
| 26 |
"""Initialize the Streamlit page and application state."""
|
| 27 |
st.set_page_config(page_title="Abalone RAG Chatbot", page_icon="🐚")
|
| 28 |
|
| 29 |
+
# Header row: title/subtitle on the left, rebuild action on the right
|
| 30 |
+
header_col, action_col = st.columns([5, 1])
|
| 31 |
+
with header_col:
|
| 32 |
+
st.title("Abalone RAG Chatbot")
|
| 33 |
+
st.write(
|
| 34 |
+
"Ask natural-language questions about abalone biology, ecology, "
|
| 35 |
+
"and research datasets. The app uses a local Chroma vectorstore "
|
| 36 |
+
"and OpenAI to retrieve and answer questions accurately."
|
| 37 |
+
)
|
| 38 |
+
with action_col:
|
| 39 |
+
# A compact, prominent rebuild control placed in the header
|
| 40 |
+
self._top_rebuild_clicked = st.button(
|
| 41 |
+
"Rebuild vectorstore",
|
| 42 |
+
key="top_rebuild",
|
| 43 |
+
use_container_width=True,
|
| 44 |
+
)
|
| 45 |
|
| 46 |
# Data and vectorstore locations
|
| 47 |
self.data_dir = "./data"
|
|
|
|
| 60 |
self.temperature,
|
| 61 |
self.answer_length,
|
| 62 |
self.style_instruction,
|
| 63 |
+
self.use_kg,
|
| 64 |
+
self.kg_hops,
|
| 65 |
) = self._build_sidebar()
|
| 66 |
|
| 67 |
+
# Ensure rebuild_clicked reflects the top-right control
|
| 68 |
+
self.rebuild_clicked = bool(getattr(self, "_top_rebuild_clicked", False))
|
| 69 |
+
|
| 70 |
# QA chain instance (loaded lazily)
|
| 71 |
+
# typing as Any avoids static warnings when calling the chain object
|
| 72 |
+
self.chain: Optional[Any] = None
|
| 73 |
|
| 74 |
# ------------------------------------------------------------------
|
| 75 |
# Sidebar configuration
|
| 76 |
# ------------------------------------------------------------------
|
| 77 |
|
| 78 |
+
def _build_sidebar(self) -> Tuple[str, int, str, float, str, str, bool, int]:
|
| 79 |
"""Render all sidebar controls and return model configuration.
|
| 80 |
|
| 81 |
Returns:
|
|
|
|
| 111 |
retrieval_mode_label = st.sidebar.selectbox(
|
| 112 |
"Retrieval mode",
|
| 113 |
["MMR (diverse)", "Similarity", "Hybrid (dense + MMR)"],
|
| 114 |
+
index=2,
|
| 115 |
)
|
| 116 |
retrieval_mode_map = {
|
| 117 |
"MMR (diverse)": "mmr",
|
|
|
|
| 120 |
}
|
| 121 |
retrieval_mode = retrieval_mode_map[retrieval_mode_label]
|
| 122 |
|
| 123 |
+
# Knowledge graph toggle (placed under Retrieval Configuration)
|
| 124 |
+
st.sidebar.markdown("---")
|
| 125 |
+
st.sidebar.header("Knowledge Graph")
|
| 126 |
+
use_kg = st.sidebar.checkbox("Use knowledge graph for retrieval", value=False)
|
| 127 |
+
kg_hops = st.sidebar.slider("KG hops", min_value=1, max_value=3, value=1)
|
| 128 |
+
|
| 129 |
st.sidebar.markdown("---")
|
| 130 |
|
| 131 |
# Answer style
|
|
|
|
| 145 |
index=1,
|
| 146 |
)
|
| 147 |
|
| 148 |
+
# (Vectorstore rebuild moved to top-right action button)
|
| 149 |
st.sidebar.markdown("---")
|
| 150 |
+
st.sidebar.markdown("<small>To rebuild the vectorstore use the top-right \"Rebuild vectorstore\" button.</small>", unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
# Build style instruction for the LLM
|
| 153 |
length_instruction_map = {
|
|
|
|
| 169 |
temperature,
|
| 170 |
answer_length,
|
| 171 |
style_instruction,
|
| 172 |
+
use_kg,
|
| 173 |
+
kg_hops,
|
| 174 |
)
|
| 175 |
|
| 176 |
# ------------------------------------------------------------------
|
|
|
|
| 250 |
st.session_state["rebuild_pending"] = False
|
| 251 |
st.info("Rebuild canceled.")
|
| 252 |
|
| 253 |
+
|
| 254 |
# ------------------------------------------------------------------
|
| 255 |
# Chain loading
|
| 256 |
# ------------------------------------------------------------------
|
|
|
|
| 273 |
else:
|
| 274 |
st.success("Knowledge base and model are ready.")
|
| 275 |
|
| 276 |
+
|
| 277 |
# ------------------------------------------------------------------
|
| 278 |
# Chat UI
|
| 279 |
# ------------------------------------------------------------------
|
|
|
|
| 310 |
|
| 311 |
styled_question = self.style_instruction + "\n\nQuestion: " + user_input
|
| 312 |
|
| 313 |
+
if self.chain is None:
|
| 314 |
+
st.error("Model not initialized. Please wait for the knowledge base and model to be ready or rebuild the vectorstore.")
|
| 315 |
+
return
|
| 316 |
+
|
| 317 |
+
if getattr(self, 'use_kg', False):
|
| 318 |
+
result = answer_with_kg(
|
| 319 |
+
self.chain,
|
| 320 |
+
styled_question,
|
| 321 |
+
prior_history,
|
| 322 |
+
persist_dir=self.persist_dir,
|
| 323 |
+
kg_hops=self.kg_hops,
|
| 324 |
+
)
|
| 325 |
+
else:
|
| 326 |
+
result = self.chain({"question": styled_question, "chat_history": prior_history})
|
| 327 |
|
| 328 |
answer = (
|
| 329 |
result.get("answer")
|
src/__pycache__/ingest.cpython-310.pyc
DELETED
|
Binary file (2.18 kB)
|
|
|
src/__pycache__/qa_chain.cpython-310.pyc
CHANGED
|
Binary files a/src/__pycache__/qa_chain.cpython-310.pyc and b/src/__pycache__/qa_chain.cpython-310.pyc differ
|
|
|
src/__pycache__/vectorstore.cpython-310.pyc
DELETED
|
Binary file (2.1 kB)
|
|
|
src/ingest.py
CHANGED
|
@@ -1,10 +1,28 @@
|
|
| 1 |
import argparse
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 6 |
from langchain_community.vectorstores import Chroma
|
| 7 |
from langchain_community.embeddings import OpenAIEmbeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def load_documents(data_dir: str):
|
|
@@ -54,20 +72,98 @@ def ingest(data_dir: str, persist_dir: str, chunk_size: int, chunk_overlap: int)
|
|
| 54 |
|
| 55 |
os.makedirs(persist_dir, exist_ok=True)
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
Chroma.from_documents(
|
| 58 |
split_docs,
|
| 59 |
embedding=embeddings,
|
| 60 |
persist_directory=persist_dir,
|
| 61 |
)
|
| 62 |
print(f"Vectorstore built and persisted to {persist_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
def main():
|
| 66 |
parser = argparse.ArgumentParser()
|
| 67 |
parser.add_argument("--data-dir", type=str, default="./data")
|
| 68 |
parser.add_argument("--persist-dir", type=str, default="./vectorstore")
|
|
|
|
| 69 |
parser.add_argument("--chunk-size", type=int, default=800)
|
| 70 |
parser.add_argument("--chunk-overlap", type=int, default=200)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
args = parser.parse_args()
|
| 72 |
|
| 73 |
ingest(
|
|
|
|
| 1 |
import argparse
|
| 2 |
import os
|
| 3 |
|
| 4 |
+
<<<<<<< HEAD
|
| 5 |
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
from langchain_community.vectorstores import Chroma
|
| 8 |
from langchain_community.embeddings import OpenAIEmbeddings
|
| 9 |
+
=======
|
| 10 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 11 |
+
from langchain_community.vectorstores import Chroma
|
| 12 |
+
from langchain_community.embeddings import OpenAIEmbeddings
|
| 13 |
+
|
| 14 |
+
# New: KG integration imports
|
| 15 |
+
import uuid
|
| 16 |
+
import json
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from src.kg.extract import extract_triples_with_llm
|
| 20 |
+
from src.kg.store import KGStore
|
| 21 |
+
from src.kg.retriever import KGRetriever
|
| 22 |
+
_HAS_KG = True
|
| 23 |
+
except Exception:
|
| 24 |
+
_HAS_KG = False
|
| 25 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 26 |
|
| 27 |
|
| 28 |
def load_documents(data_dir: str):
|
|
|
|
| 72 |
|
| 73 |
os.makedirs(persist_dir, exist_ok=True)
|
| 74 |
|
| 75 |
+
<<<<<<< HEAD
|
| 76 |
+
=======
|
| 77 |
+
# Prepare KG store and local chunk index
|
| 78 |
+
chunks_index = {}
|
| 79 |
+
kg_path = os.path.join(persist_dir, "kg_store.ttl")
|
| 80 |
+
if _HAS_KG:
|
| 81 |
+
try:
|
| 82 |
+
kg = KGStore(path=kg_path)
|
| 83 |
+
except Exception:
|
| 84 |
+
kg = None
|
| 85 |
+
else:
|
| 86 |
+
kg = None
|
| 87 |
+
|
| 88 |
+
# Annotate chunks with stable chunk_id and optionally extract/link KG triples
|
| 89 |
+
for d in split_docs:
|
| 90 |
+
meta = d.metadata or {}
|
| 91 |
+
chunk_id = meta.get("chunk_id") or str(uuid.uuid4())
|
| 92 |
+
if not meta:
|
| 93 |
+
d.metadata = {}
|
| 94 |
+
d.metadata["chunk_id"] = chunk_id
|
| 95 |
+
|
| 96 |
+
# Save minimal chunk index for runtime retrieval (text and source metadata)
|
| 97 |
+
chunks_index[chunk_id] = {
|
| 98 |
+
"text": getattr(d, "page_content", "") or getattr(d, "content", ""),
|
| 99 |
+
"metadata": d.metadata,
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
# If KG is available, attempt to extract triples and link the chunk
|
| 103 |
+
if kg is not None:
|
| 104 |
+
try:
|
| 105 |
+
triples = extract_triples_with_llm(chunks_index[chunk_id]["text"], max_triples=4)
|
| 106 |
+
for t in triples:
|
| 107 |
+
try:
|
| 108 |
+
kg.add_triple(
|
| 109 |
+
t.get("subject"),
|
| 110 |
+
t.get("predicate"),
|
| 111 |
+
t.get("object"),
|
| 112 |
+
provenance={"sentence": t.get("sentence"), "confidence": t.get("confidence")},
|
| 113 |
+
)
|
| 114 |
+
kg.link_chunk_to_entity(
|
| 115 |
+
chunk_id,
|
| 116 |
+
t.get("subject"),
|
| 117 |
+
sentence=t.get("sentence"),
|
| 118 |
+
confidence=t.get("confidence"),
|
| 119 |
+
)
|
| 120 |
+
except Exception:
|
| 121 |
+
# non-fatal: continue
|
| 122 |
+
continue
|
| 123 |
+
except Exception:
|
| 124 |
+
# LLM extraction failed or not configured; skip KG extraction
|
| 125 |
+
pass
|
| 126 |
+
|
| 127 |
+
# Persist Chroma vectorstore
|
| 128 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 129 |
Chroma.from_documents(
|
| 130 |
split_docs,
|
| 131 |
embedding=embeddings,
|
| 132 |
persist_directory=persist_dir,
|
| 133 |
)
|
| 134 |
print(f"Vectorstore built and persisted to {persist_dir}")
|
| 135 |
+
<<<<<<< HEAD
|
| 136 |
+
=======
|
| 137 |
+
|
| 138 |
+
# Persist chunks index for runtime (simple json mapping)
|
| 139 |
+
try:
|
| 140 |
+
idx_path = os.path.join(persist_dir, "chunks_index.json")
|
| 141 |
+
with open(idx_path, "w", encoding="utf-8") as fh:
|
| 142 |
+
json.dump(chunks_index, fh)
|
| 143 |
+
except Exception:
|
| 144 |
+
pass
|
| 145 |
+
|
| 146 |
+
# Persist KG if available
|
| 147 |
+
if kg is not None:
|
| 148 |
+
try:
|
| 149 |
+
kg.save()
|
| 150 |
+
print(f"KG persisted to {kg_path}")
|
| 151 |
+
except Exception:
|
| 152 |
+
pass
|
| 153 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 154 |
|
| 155 |
|
| 156 |
def main():
|
| 157 |
parser = argparse.ArgumentParser()
|
| 158 |
parser.add_argument("--data-dir", type=str, default="./data")
|
| 159 |
parser.add_argument("--persist-dir", type=str, default="./vectorstore")
|
| 160 |
+
<<<<<<< HEAD
|
| 161 |
parser.add_argument("--chunk-size", type=int, default=800)
|
| 162 |
parser.add_argument("--chunk-overlap", type=int, default=200)
|
| 163 |
+
=======
|
| 164 |
+
parser.add_argument("--chunk-size", type=int, default=200)
|
| 165 |
+
parser.add_argument("--chunk-overlap", type=int, default=50)
|
| 166 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 167 |
args = parser.parse_args()
|
| 168 |
|
| 169 |
ingest(
|
src/kg/extract.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM-backed triple/entity extractor for PoC.
|
| 2 |
+
|
| 3 |
+
This module provides a small wrapper that asks the LLM (via LangChain ChatOpenAI)
|
| 4 |
+
to extract a small set of triples from a text chunk. It returns a list of dicts:
|
| 5 |
+
{"subject": ..., "predicate": ..., "object": ..., "sentence": ..., "confidence": float}
|
| 6 |
+
|
| 7 |
+
The implementation is intentionally conservative and small for a Spaces-compatible PoC.
|
| 8 |
+
"""
|
| 9 |
+
from typing import List, Dict
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
from langchain.chat_models import ChatOpenAI
|
| 13 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def extract_triples_with_llm(text: str, max_triples: int = 6, model_name: str = "gpt-3.5-turbo") -> List[Dict]:
|
| 17 |
+
"""Extract triples from text using a Chat LLM. Returns parsed JSON triples.
|
| 18 |
+
|
| 19 |
+
Note: requires OPENAI_API_KEY in env for ChatOpenAI to work.
|
| 20 |
+
"""
|
| 21 |
+
prompt = (
|
| 22 |
+
"You are an assistant that extracts factual triples from a short text.\n"
|
| 23 |
+
"Return a JSON array where each element is an object with keys: subject, predicate, object, sentence, confidence.\n"
|
| 24 |
+
"Be concise and only return JSON. Confidence should be a float between 0.0 and 1.0.\n"
|
| 25 |
+
f"Limit results to at most {max_triples} triples.\n\n"
|
| 26 |
+
"Text:\n<<<TEXT_START>>>\n"
|
| 27 |
+
+ text
|
| 28 |
+
+ "\n<<<TEXT_END>>>\n"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
# system message to instruct format strictly
|
| 32 |
+
system = SystemMessage(content="You output only JSON arrays. Do not add any extra text.")
|
| 33 |
+
human = HumanMessage(content=prompt)
|
| 34 |
+
|
| 35 |
+
llm = ChatOpenAI(model_name=model_name, temperature=0.0)
|
| 36 |
+
resp = llm([system, human])
|
| 37 |
+
raw = resp.content.strip()
|
| 38 |
+
|
| 39 |
+
# Attempt to find JSON in the output
|
| 40 |
+
try:
|
| 41 |
+
data = json.loads(raw)
|
| 42 |
+
except Exception:
|
| 43 |
+
# try to find first JSON substring
|
| 44 |
+
start = raw.find("[")
|
| 45 |
+
end = raw.rfind("]")
|
| 46 |
+
if start != -1 and end != -1:
|
| 47 |
+
try:
|
| 48 |
+
data = json.loads(raw[start:end+1])
|
| 49 |
+
except Exception:
|
| 50 |
+
data = []
|
| 51 |
+
else:
|
| 52 |
+
data = []
|
| 53 |
+
|
| 54 |
+
cleaned: List[Dict] = []
|
| 55 |
+
for item in data:
|
| 56 |
+
if not isinstance(item, dict):
|
| 57 |
+
continue
|
| 58 |
+
subj = item.get("subject") or item.get("s")
|
| 59 |
+
pred = item.get("predicate") or item.get("p")
|
| 60 |
+
obj = item.get("object") or item.get("o")
|
| 61 |
+
sent = item.get("sentence") or ""
|
| 62 |
+
conf = item.get("confidence")
|
| 63 |
+
try:
|
| 64 |
+
conf = float(conf) if conf is not None else 0.5
|
| 65 |
+
except Exception:
|
| 66 |
+
conf = 0.5
|
| 67 |
+
if subj and pred and obj:
|
| 68 |
+
cleaned.append({
|
| 69 |
+
"subject": str(subj),
|
| 70 |
+
"predicate": str(pred),
|
| 71 |
+
"object": str(obj),
|
| 72 |
+
"sentence": str(sent),
|
| 73 |
+
"confidence": conf,
|
| 74 |
+
})
|
| 75 |
+
return cleaned
|
src/kg/retriever.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""KG retriever that returns chunk IDs and short node summaries for a question."""
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from .store import KGStore
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class KGRetriever:
|
| 7 |
+
def __init__(self, kg_store: KGStore):
|
| 8 |
+
self.kg = kg_store
|
| 9 |
+
|
| 10 |
+
def get_context_for_question(self, question: str, hops: int = 1) -> Tuple[List[str], List[str]]:
|
| 11 |
+
"""Return (chunk_ids, node_summaries).
|
| 12 |
+
|
| 13 |
+
This simple implementation finds entities whose labels appear in the question
|
| 14 |
+
and returns linked chunk ids. For hops >1 you could expand to related entities.
|
| 15 |
+
"""
|
| 16 |
+
entity_uris = self.kg.query_entities(question)
|
| 17 |
+
chunk_ids = []
|
| 18 |
+
summaries = []
|
| 19 |
+
for e in entity_uris:
|
| 20 |
+
# uri like http://.../entity/<label>
|
| 21 |
+
label = e.split("/entity/", 1)[-1].replace("_", " ")
|
| 22 |
+
chunks = self.kg.find_chunks_for_entity(label)
|
| 23 |
+
chunk_ids.extend(chunks)
|
| 24 |
+
summaries.append(label)
|
| 25 |
+
# dedupe
|
| 26 |
+
chunk_ids = list(dict.fromkeys(chunk_ids))
|
| 27 |
+
return chunk_ids, summaries
|
| 28 |
+
|
src/kg/store.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RDFLib-backed KG store for PoC.
|
| 2 |
+
|
| 3 |
+
Stores nodes and simple edges; links DocumentChunk IDs to KG entities using
|
| 4 |
+
`mentions` predicate. Persists to a TTL file.
|
| 5 |
+
"""
|
| 6 |
+
try:
|
| 7 |
+
from rdflib import Graph, URIRef, Literal, Namespace
|
| 8 |
+
from rdflib.namespace import RDF, RDFS
|
| 9 |
+
_HAS_RDFLIB = True
|
| 10 |
+
except Exception:
|
| 11 |
+
_HAS_RDFLIB = False
|
| 12 |
+
|
| 13 |
+
from typing import List, Dict, Optional
|
| 14 |
+
import uuid
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
NS_URI = "http://example.org/abalone/"
|
| 18 |
+
|
| 19 |
+
if _HAS_RDFLIB:
|
| 20 |
+
NS = Namespace(NS_URI)
|
| 21 |
+
|
| 22 |
+
class KGStore:
|
| 23 |
+
def __init__(self, path: str = "./kg_store.ttl"):
|
| 24 |
+
self.path = path
|
| 25 |
+
self.graph = Graph()
|
| 26 |
+
if os.path.exists(self.path):
|
| 27 |
+
try:
|
| 28 |
+
self.graph.parse(self.path, format="turtle")
|
| 29 |
+
except Exception:
|
| 30 |
+
# start empty if parse fails
|
| 31 |
+
self.graph = Graph()
|
| 32 |
+
|
| 33 |
+
def _entity_uri(self, label: str) -> URIRef:
|
| 34 |
+
safe = label.strip().lower().replace(" ", "_")
|
| 35 |
+
return URIRef(f"{NS_URI}entity/{safe}")
|
| 36 |
+
|
| 37 |
+
def _chunk_uri(self, chunk_id: str) -> URIRef:
|
| 38 |
+
return URIRef(f"{NS_URI}chunk/{chunk_id}")
|
| 39 |
+
|
| 40 |
+
def add_entity(self, label: str, description: Optional[str] = None) -> URIRef:
|
| 41 |
+
u = self._entity_uri(label)
|
| 42 |
+
self.graph.add((u, RDFS.label, Literal(label)))
|
| 43 |
+
if description:
|
| 44 |
+
self.graph.add((u, NS.description, Literal(description)))
|
| 45 |
+
return u
|
| 46 |
+
|
| 47 |
+
def link_chunk_to_entity(self, chunk_id: str, entity_label: str, sentence: str = "", confidence: float = 0.5):
|
| 48 |
+
e = self.add_entity(entity_label)
|
| 49 |
+
c = self._chunk_uri(chunk_id)
|
| 50 |
+
self.graph.add((c, NS.mentions, e))
|
| 51 |
+
# add provenance as reified data on the chunk node
|
| 52 |
+
self.graph.add((c, NS.sentence, Literal(sentence)))
|
| 53 |
+
self.graph.add((c, NS.confidence, Literal(str(confidence))))
|
| 54 |
+
|
| 55 |
+
def add_triple(self, subj_label: str, pred_label: str, obj_label: str, provenance: Optional[Dict] = None):
|
| 56 |
+
s = self.add_entity(subj_label)
|
| 57 |
+
o = self.add_entity(obj_label)
|
| 58 |
+
p = URIRef(f"{NS_URI}relation/{pred_label.strip().lower().replace(' ', '_')}")
|
| 59 |
+
self.graph.add((s, p, o))
|
| 60 |
+
if provenance:
|
| 61 |
+
# store provenance on subject node for simplicity
|
| 62 |
+
self.graph.add((s, NS.provenance, Literal(str(provenance))))
|
| 63 |
+
|
| 64 |
+
def save(self):
|
| 65 |
+
self.graph.serialize(destination=self.path, format="turtle")
|
| 66 |
+
|
| 67 |
+
def find_chunks_for_entity(self, entity_label: str) -> List[str]:
|
| 68 |
+
e = self._entity_uri(entity_label)
|
| 69 |
+
q = f"SELECT ?chunk WHERE {{ ?chunk <{NS_URI}mentions> <{e}> . }}"
|
| 70 |
+
res = self.graph.query(q)
|
| 71 |
+
out = []
|
| 72 |
+
for r in res:
|
| 73 |
+
uri = str(r[0])
|
| 74 |
+
if uri.startswith(NS_URI + "chunk/"):
|
| 75 |
+
out.append(uri.split("chunk/", 1)[1])
|
| 76 |
+
return out
|
| 77 |
+
|
| 78 |
+
def query_entities(self, text: str) -> List[str]:
|
| 79 |
+
# naive: find entities whose label appears in text
|
| 80 |
+
text_l = text.lower()
|
| 81 |
+
out = []
|
| 82 |
+
for s, p, o in self.graph.triples((None, RDFS.label, None)):
|
| 83 |
+
label = str(o).lower()
|
| 84 |
+
if label in text_l:
|
| 85 |
+
out.append(str(s))
|
| 86 |
+
return out
|
| 87 |
+
|
| 88 |
+
else:
|
| 89 |
+
class KGStore:
|
| 90 |
+
def __init__(self, *args, **kwargs):
|
| 91 |
+
raise RuntimeError("rdflib is required for KGStore. Install with `pip install rdflib`")
|
src/utils/rag_runtime.py
CHANGED
|
@@ -6,6 +6,19 @@ import streamlit as st
|
|
| 6 |
|
| 7 |
from src.vectorstore import get_retriever
|
| 8 |
from src.qa_chain import make_conversational_chain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
|
|
@@ -18,6 +31,10 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
|
|
| 18 |
Raises:
|
| 19 |
CalledProcessError: If the underlying subprocess fails.
|
| 20 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
cmd = [
|
| 22 |
sys.executable,
|
| 23 |
"-m",
|
|
@@ -30,6 +47,67 @@ def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
|
|
| 30 |
subprocess.run(cmd, check=True)
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
@st.cache_resource(show_spinner=False)
|
| 34 |
def build_or_load_retriever_cached(
|
| 35 |
data_dir: str,
|
|
|
|
| 6 |
|
| 7 |
from src.vectorstore import get_retriever
|
| 8 |
from src.qa_chain import make_conversational_chain
|
| 9 |
+
<<<<<<< HEAD
|
| 10 |
+
=======
|
| 11 |
+
import os
|
| 12 |
+
import json
|
| 13 |
+
from typing import Dict, List, Tuple
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from src.kg.store import KGStore
|
| 17 |
+
from src.kg.retriever import KGRetriever
|
| 18 |
+
_HAS_KG = True
|
| 19 |
+
except Exception:
|
| 20 |
+
_HAS_KG = False
|
| 21 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 22 |
|
| 23 |
|
| 24 |
def run_ingest_cli(data_dir: str, persist_dir: str) -> None:
|
|
|
|
| 31 |
Raises:
|
| 32 |
CalledProcessError: If the underlying subprocess fails.
|
| 33 |
"""
|
| 34 |
+
<<<<<<< HEAD
|
| 35 |
+
=======
|
| 36 |
+
# Updated to point to the CLI module inside the ingest package
|
| 37 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 38 |
cmd = [
|
| 39 |
sys.executable,
|
| 40 |
"-m",
|
|
|
|
| 47 |
subprocess.run(cmd, check=True)
|
| 48 |
|
| 49 |
|
| 50 |
+
<<<<<<< HEAD
|
| 51 |
+
=======
|
| 52 |
+
def _load_chunks_index(persist_dir: str) -> Dict[str, Dict]:
|
| 53 |
+
idx_path = os.path.join(persist_dir, "chunks_index.json")
|
| 54 |
+
if not os.path.exists(idx_path):
|
| 55 |
+
return {}
|
| 56 |
+
try:
|
| 57 |
+
with open(idx_path, "r", encoding="utf-8") as fh:
|
| 58 |
+
return json.load(fh)
|
| 59 |
+
except Exception:
|
| 60 |
+
return {}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def answer_with_kg(
|
| 64 |
+
chain,
|
| 65 |
+
question: str,
|
| 66 |
+
chat_history: List[Tuple[str, str]],
|
| 67 |
+
persist_dir: str,
|
| 68 |
+
kg_hops: int = 1,
|
| 69 |
+
kg_context_max_chars: int = 1000,
|
| 70 |
+
) -> Any:
|
| 71 |
+
"""Augment question with KG context (if available) and run the chain.
|
| 72 |
+
|
| 73 |
+
This is a low-risk integration: we build a short textual summary from the KG
|
| 74 |
+
(node labels and short chunk snippets from chunks_index.json) and prepend it to
|
| 75 |
+
the question. The chain's retriever still runs; KG context is additional grounding.
|
| 76 |
+
"""
|
| 77 |
+
kg_text_parts: List[str] = []
|
| 78 |
+
# Load chunks index mapping
|
| 79 |
+
chunks_index = _load_chunks_index(persist_dir)
|
| 80 |
+
|
| 81 |
+
if _HAS_KG:
|
| 82 |
+
kg_path = os.path.join(persist_dir, "kg_store.ttl")
|
| 83 |
+
try:
|
| 84 |
+
kg = KGStore(path=kg_path)
|
| 85 |
+
retr = KGRetriever(kg)
|
| 86 |
+
chunk_ids, summaries = retr.get_context_for_question(question, hops=kg_hops)
|
| 87 |
+
if summaries:
|
| 88 |
+
kg_text_parts.append("KG entities: " + ", ".join(summaries))
|
| 89 |
+
# add chunk snippets
|
| 90 |
+
for cid in chunk_ids:
|
| 91 |
+
info = chunks_index.get(cid)
|
| 92 |
+
if info:
|
| 93 |
+
txt = info.get("text", "")
|
| 94 |
+
if txt:
|
| 95 |
+
snippet = txt.strip().replace("\n", " ")[:min(len(txt), kg_context_max_chars)]
|
| 96 |
+
kg_text_parts.append(f"[KG chunk {cid}]: {snippet}")
|
| 97 |
+
except Exception:
|
| 98 |
+
# If KG load fails, skip KG augmentation
|
| 99 |
+
kg_text_parts = []
|
| 100 |
+
|
| 101 |
+
kg_context = "\n\n".join(kg_text_parts) if kg_text_parts else ""
|
| 102 |
+
if kg_context:
|
| 103 |
+
augmented_question = f"KG CONTEXT:\n{kg_context}\n\nUser Question:\n{question}"
|
| 104 |
+
else:
|
| 105 |
+
augmented_question = question
|
| 106 |
+
|
| 107 |
+
return chain({"question": augmented_question, "chat_history": chat_history})
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 111 |
@st.cache_resource(show_spinner=False)
|
| 112 |
def build_or_load_retriever_cached(
|
| 113 |
data_dir: str,
|
src/vectorstore.py
CHANGED
|
@@ -51,7 +51,11 @@ class HybridRetriever(BaseRetriever):
|
|
| 51 |
def get_retriever(
|
| 52 |
persist_dir: str,
|
| 53 |
top_k: int,
|
|
|
|
| 54 |
retrieval_mode: RetrievalMode = "mmr",
|
|
|
|
|
|
|
|
|
|
| 55 |
):
|
| 56 |
db = get_vectorstore(persist_dir=persist_dir)
|
| 57 |
mode = retrieval_mode.lower()
|
|
|
|
| 51 |
def get_retriever(
|
| 52 |
persist_dir: str,
|
| 53 |
top_k: int,
|
| 54 |
+
<<<<<<< HEAD
|
| 55 |
retrieval_mode: RetrievalMode = "mmr",
|
| 56 |
+
=======
|
| 57 |
+
retrieval_mode: RetrievalMode = "hybrid",
|
| 58 |
+
>>>>>>> ba5a1f4 (Adding kg to deployment)
|
| 59 |
):
|
| 60 |
db = get_vectorstore(persist_dir=persist_dir)
|
| 61 |
mode = retrieval_mode.lower()
|
vectorstore/chroma-collections.parquet
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d2f2f346a015c1ffec6f8a3c535ac4ea2a99fe14f441a424e373b42248ac0fbe
|
| 3 |
-
size 601
|
|
|
|
|
|
|
|
|
|
|
|
vectorstore/chroma-embeddings.parquet
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1e572394a2cfa7976f286fa157be4e5eaf287d0721581969eed4c4df7874f04a
|
| 3 |
-
size 3380376
|
|
|
|
|
|
|
|
|
|
|
|