Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,6 @@ import sqlite3
|
|
| 6 |
import json
|
| 7 |
import numpy as np
|
| 8 |
import datetime
|
| 9 |
-
|
| 10 |
from langchain.chains import RetrievalQA
|
| 11 |
from langchain.schema import Document
|
| 12 |
from langchain_core.retrievers import BaseRetriever
|
|
@@ -32,17 +31,25 @@ if "modal_title" not in st.session_state:
|
|
| 32 |
st.session_state.modal_title = ""
|
| 33 |
|
| 34 |
st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
|
| 35 |
-
st.title("Chat with Your Vectorized JSON Files (
|
| 36 |
|
| 37 |
uploaded_files = st.file_uploader(
|
| 38 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 39 |
)
|
| 40 |
|
|
|
|
| 41 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 42 |
items = {}
|
| 43 |
if isinstance(obj, dict):
|
| 44 |
for k, v in obj.items():
|
| 45 |
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
items.update(flatten_json_obj(v, new_key, sep=sep))
|
| 47 |
elif isinstance(obj, list):
|
| 48 |
for i, v in enumerate(obj):
|
|
@@ -113,6 +120,7 @@ def ingest_json_files(files):
|
|
| 113 |
if uploaded_files and st.button("Ingest batch to database"):
|
| 114 |
ingest_json_files(uploaded_files)
|
| 115 |
|
|
|
|
| 116 |
def query_vector_db(user_query, top_k=5):
|
| 117 |
query_emb = get_embedding(user_query)
|
| 118 |
conn = sqlite3.connect(DB_PATH)
|
|
@@ -132,24 +140,63 @@ def query_vector_db(user_query, top_k=5):
|
|
| 132 |
"id": row[0],
|
| 133 |
"batch_time": str(row[1]),
|
| 134 |
"source_file": row[2],
|
| 135 |
-
"similarity": f"{sim:.4f}",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
"raw_json": row[3],
|
| 137 |
}
|
| 138 |
docs.append(Document(page_content=row[4], metadata=meta))
|
| 139 |
return docs
|
| 140 |
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
top_k: int = Field(default=5)
|
| 143 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
| 144 |
-
return
|
| 145 |
|
| 146 |
-
# --- SYSTEM PROMPT &
|
| 147 |
system_prompt = (
|
| 148 |
"You are a JSON data assistant. Always give a direct, concise answer based only on the context provided. "
|
| 149 |
"If you do not see the answer in the context, reply: 'I don’t have that information.' "
|
| 150 |
"Never make up information. Never ask for clarification."
|
| 151 |
)
|
| 152 |
-
|
| 153 |
prompt = ChatPromptTemplate.from_messages([
|
| 154 |
("system", system_prompt),
|
| 155 |
("human", "Context:\n{context}\n\nQuestion: {question}")
|
|
@@ -157,7 +204,7 @@ prompt = ChatPromptTemplate.from_messages([
|
|
| 157 |
|
| 158 |
llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
|
| 159 |
|
| 160 |
-
retriever =
|
| 161 |
qa_chain = RetrievalQA.from_chain_type(
|
| 162 |
llm=llm,
|
| 163 |
retriever=retriever,
|
|
@@ -165,7 +212,7 @@ qa_chain = RetrievalQA.from_chain_type(
|
|
| 165 |
return_source_documents=True,
|
| 166 |
)
|
| 167 |
|
| 168 |
-
# --- Conversation Area
|
| 169 |
st.markdown("### Ask any question about your data, just like ChatGPT.")
|
| 170 |
for msg in st.session_state.messages:
|
| 171 |
if msg["role"] == "user":
|
|
@@ -204,7 +251,7 @@ def send_message():
|
|
| 204 |
return
|
| 205 |
st.session_state.messages.append({"role": "user", "content": user_input})
|
| 206 |
with st.spinner("Thinking..."):
|
| 207 |
-
# Correct
|
| 208 |
result = qa_chain({"query": user_input})
|
| 209 |
answer = result['result']
|
| 210 |
st.session_state.messages.append({"role": "assistant", "content": answer})
|
|
@@ -214,12 +261,12 @@ def send_message():
|
|
| 214 |
doc_list.append({
|
| 215 |
"file": doc.metadata["source_file"],
|
| 216 |
"id": doc.metadata["id"],
|
|
|
|
| 217 |
"record": json.loads(doc.metadata["raw_json"])
|
| 218 |
})
|
| 219 |
st.session_state.messages.append({"role": "function", "content": json.dumps(doc_list, indent=2)})
|
| 220 |
st.session_state.temp_input = ""
|
| 221 |
|
| 222 |
-
|
| 223 |
st.text_input("Your message:", key="temp_input", on_change=send_message)
|
| 224 |
|
| 225 |
if st.button("Clear chat"):
|
|
|
|
| 6 |
import json
|
| 7 |
import numpy as np
|
| 8 |
import datetime
|
|
|
|
| 9 |
from langchain.chains import RetrievalQA
|
| 10 |
from langchain.schema import Document
|
| 11 |
from langchain_core.retrievers import BaseRetriever
|
|
|
|
| 31 |
st.session_state.modal_title = ""
|
| 32 |
|
| 33 |
st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
|
| 34 |
+
st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM)")
|
| 35 |
|
| 36 |
uploaded_files = st.file_uploader(
|
| 37 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 38 |
)
|
| 39 |
|
| 40 |
+
# --- Improved Flattening: extracts entity from emails/user fields for better matching
|
| 41 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 42 |
items = {}
|
| 43 |
if isinstance(obj, dict):
|
| 44 |
for k, v in obj.items():
|
| 45 |
new_key = f"{parent_key}{sep}{k}" if parent_key else k
|
| 46 |
+
# Entity extraction: add name from email
|
| 47 |
+
if (
|
| 48 |
+
k.lower() in {"customer", "user", "email", "username"} and
|
| 49 |
+
isinstance(v, str) and "@" in v
|
| 50 |
+
):
|
| 51 |
+
local = v.split("@")[0]
|
| 52 |
+
items[new_key + "_name"] = local
|
| 53 |
items.update(flatten_json_obj(v, new_key, sep=sep))
|
| 54 |
elif isinstance(obj, list):
|
| 55 |
for i, v in enumerate(obj):
|
|
|
|
| 120 |
if uploaded_files and st.button("Ingest batch to database"):
|
| 121 |
ingest_json_files(uploaded_files)
|
| 122 |
|
| 123 |
+
# --- VECTOR RETRIEVAL
|
| 124 |
def query_vector_db(user_query, top_k=5):
|
| 125 |
query_emb = get_embedding(user_query)
|
| 126 |
conn = sqlite3.connect(DB_PATH)
|
|
|
|
| 140 |
"id": row[0],
|
| 141 |
"batch_time": str(row[1]),
|
| 142 |
"source_file": row[2],
|
| 143 |
+
"similarity": f"{sim:.4f} (embedding)",
|
| 144 |
+
"raw_json": row[3],
|
| 145 |
+
}
|
| 146 |
+
docs.append(Document(page_content=row[4], metadata=meta))
|
| 147 |
+
return docs
|
| 148 |
+
|
| 149 |
+
# --- PYTHON FUZZY/KEYWORD SEARCH
|
| 150 |
+
def python_fuzzy_match(user_query, top_k=5):
    """Rank stored records by how many distinct query terms they contain.

    The query is lower-cased, ``@`` and ``.`` are replaced by spaces, and the
    result is split on whitespace into a set of terms. Each row of
    ``json_records`` earns one point per term that occurs as a substring of
    its lower-cased ``flat_text``; rows that score zero are dropped.

    Args:
        user_query: Free-text query string.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` ``Document`` objects, highest score first. Each one
        carries ``flat_text`` as ``page_content`` and the metadata keys
        ``id``, ``batch_time``, ``source_file``, ``similarity`` (formatted as
        ``"<score> (fuzzy)"``) and ``raw_json``.
    """
    query_terms = set(user_query.lower().replace("@", " ").replace(".", " ").split())
    if not query_terms:
        # Nothing to search for: every row would score 0, so skip the scan.
        return []

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT id, batch_time, source_file, raw_json, flat_text FROM json_records")
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    results = []
    for row in rows:
        flat_text = row[4].lower()
        # One point per distinct term present in the text. The earlier nested
        # any()/sum() expression gave every matching row the same score (the
        # number of query terms), so rows were never ranked by relevance.
        score = sum(term in flat_text for term in query_terms)
        if score > 0:
            results.append((score, row))

    # Sort on (score, id) only, highest first. This keeps the tie-break the
    # whole-tuple sort used to give (larger id wins) without comparing the
    # remaining columns. Assumes id values are mutually comparable, as the
    # previous ordering also did.
    results.sort(key=lambda scored: (scored[0], scored[1][0]), reverse=True)

    return [
        Document(
            page_content=row[4],
            metadata={
                "id": row[0],
                "batch_time": str(row[1]),
                "source_file": row[2],
                "similarity": f"{score} (fuzzy)",
                "raw_json": row[3],
            },
        )
        for score, row in results[:top_k]
    ]
|
| 175 |
|
| 176 |
+
# --- HYBRID RETRIEVER
|
| 177 |
+
def hybrid_query(user_query, top_k=5):
    """Merge embedding-based and keyword-based hits into one ranked list.

    Both ``query_vector_db`` and ``python_fuzzy_match`` are asked for up to
    ``top_k`` documents. The two ranked lists are then interleaved (best
    vector hit, best fuzzy hit, second vector hit, ...) and de-duplicated on
    the ``id`` metadata key, keeping the first occurrence.

    Interleaving matters: simply appending the fuzzy hits after the vector
    hits and truncating to ``top_k`` would drop every fuzzy-only hit whenever
    the vector search fills all ``top_k`` slots.
    NOTE(review): assumes ``query_vector_db`` returns its results best-first
    and at most ``top_k`` of them -- confirm against its implementation.

    Args:
        user_query: Free-text query string.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` ``Document`` objects with unique ids.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import zip_longest

    vector_docs = query_vector_db(user_query, top_k=top_k)
    fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)

    seen_ids = set()
    all_docs = []
    for ranked_pair in zip_longest(vector_docs, fuzzy_docs):
        for doc in ranked_pair:
            if doc is None:
                # zip_longest pads the shorter list with None.
                continue
            doc_id = doc.metadata.get("id")
            if doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)
            all_docs.append(doc)
    return all_docs[:top_k]
|
| 188 |
+
|
| 189 |
+
class HybridRetriever(BaseRetriever):
    """Retriever that delegates document lookup to ``hybrid_query``."""

    # Maximum number of documents requested from hybrid_query per call.
    top_k: int = Field(default=5)

    def _get_relevant_documents(self, query, run_manager=None, **kwargs):
        """Return the documents ``hybrid_query`` selects for ``query``.

        ``run_manager`` and any extra keyword arguments are accepted but are
        not used by this implementation.
        """
        limit = self.top_k
        return hybrid_query(query, limit)
|
| 193 |
|
| 194 |
+
# --- SYSTEM PROMPT & PROMPT TEMPLATE
|
| 195 |
system_prompt = (
|
| 196 |
"You are a JSON data assistant. Always give a direct, concise answer based only on the context provided. "
|
| 197 |
"If you do not see the answer in the context, reply: 'I don’t have that information.' "
|
| 198 |
"Never make up information. Never ask for clarification."
|
| 199 |
)
|
|
|
|
| 200 |
prompt = ChatPromptTemplate.from_messages([
|
| 201 |
("system", system_prompt),
|
| 202 |
("human", "Context:\n{context}\n\nQuestion: {question}")
|
|
|
|
| 204 |
|
| 205 |
llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
|
| 206 |
|
| 207 |
+
retriever = HybridRetriever(top_k=5)
|
| 208 |
qa_chain = RetrievalQA.from_chain_type(
|
| 209 |
llm=llm,
|
| 210 |
retriever=retriever,
|
|
|
|
| 212 |
return_source_documents=True,
|
| 213 |
)
|
| 214 |
|
| 215 |
+
# --- Chat UI and Conversation Area ---
|
| 216 |
st.markdown("### Ask any question about your data, just like ChatGPT.")
|
| 217 |
for msg in st.session_state.messages:
|
| 218 |
if msg["role"] == "user":
|
|
|
|
| 251 |
return
|
| 252 |
st.session_state.messages.append({"role": "user", "content": user_input})
|
| 253 |
with st.spinner("Thinking..."):
|
| 254 |
+
# Correct key: "query"
|
| 255 |
result = qa_chain({"query": user_input})
|
| 256 |
answer = result['result']
|
| 257 |
st.session_state.messages.append({"role": "assistant", "content": answer})
|
|
|
|
| 261 |
doc_list.append({
|
| 262 |
"file": doc.metadata["source_file"],
|
| 263 |
"id": doc.metadata["id"],
|
| 264 |
+
"similarity": doc.metadata["similarity"],
|
| 265 |
"record": json.loads(doc.metadata["raw_json"])
|
| 266 |
})
|
| 267 |
st.session_state.messages.append({"role": "function", "content": json.dumps(doc_list, indent=2)})
|
| 268 |
st.session_state.temp_input = ""
|
| 269 |
|
|
|
|
| 270 |
st.text_input("Your message:", key="temp_input", on_change=send_message)
|
| 271 |
|
| 272 |
if st.button("Clear chat"):
|