Update app.py
Browse files
app.py
CHANGED
|
@@ -22,21 +22,15 @@ if "ingested_batches" not in st.session_state:
|
|
| 22 |
st.session_state.ingested_batches = 0
|
| 23 |
if "messages" not in st.session_state:
|
| 24 |
st.session_state.messages = []
|
| 25 |
-
if "modal_open" not in st.session_state:
|
| 26 |
-
st.session_state.modal_open = False
|
| 27 |
-
if "modal_content" not in st.session_state:
|
| 28 |
-
st.session_state.modal_content = ""
|
| 29 |
-
if "modal_title" not in st.session_state:
|
| 30 |
-
st.session_state.modal_title = ""
|
| 31 |
|
| 32 |
-
st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
|
| 33 |
-
st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM)")
|
| 34 |
|
| 35 |
uploaded_files = st.file_uploader(
|
| 36 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 37 |
)
|
| 38 |
|
| 39 |
-
# --- Enhanced
|
| 40 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 41 |
items = {}
|
| 42 |
if isinstance(obj, dict):
|
|
@@ -62,6 +56,25 @@ def flatten_json_obj(obj, parent_key="", sep="."):
|
|
| 62 |
items[parent_key] = obj
|
| 63 |
return items
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def get_embedding(text):
|
| 66 |
client = openai.OpenAI(api_key=OPENAI_API_KEY)
|
| 67 |
response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
|
|
@@ -88,6 +101,7 @@ def ingest_json_files(files):
|
|
| 88 |
rows = []
|
| 89 |
batch_time = datetime.datetime.utcnow().isoformat()
|
| 90 |
for file in files:
|
|
|
|
| 91 |
raw = json.load(file)
|
| 92 |
source_name = file.name
|
| 93 |
if isinstance(raw, list):
|
|
@@ -199,7 +213,6 @@ def filter_records_by_entity(records, entity):
|
|
| 199 |
matches.append(doc)
|
| 200 |
return matches if matches else records
|
| 201 |
|
| 202 |
-
|
| 203 |
def hybrid_query(user_query, top_k=5):
|
| 204 |
vector_docs = query_vector_db(user_query, top_k=top_k)
|
| 205 |
fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
|
|
@@ -211,11 +224,13 @@ def hybrid_query(user_query, top_k=5):
|
|
| 211 |
all_docs.append(doc)
|
| 212 |
seen_ids.add(doc_id)
|
| 213 |
entity = extract_main_entity(user_query)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
|
| 215 |
-
# Show only the most relevant record (to make LLM's job easier)
|
| 216 |
if entity_docs:
|
| 217 |
doc = entity_docs[0]
|
| 218 |
-
# Optionally, highlight entity in context
|
| 219 |
if entity:
|
| 220 |
doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
|
| 221 |
st.markdown("#### Context shown to LLM")
|
|
@@ -229,7 +244,6 @@ class HybridRetriever(BaseRetriever):
|
|
| 229 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
| 230 |
return hybrid_query(query, self.top_k)
|
| 231 |
|
| 232 |
-
# --- Prompt (explicitly tells LLM what to do)
|
| 233 |
system_prompt = (
|
| 234 |
"You are a JSON data assistant. "
|
| 235 |
"If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
|
|
@@ -244,7 +258,6 @@ prompt = ChatPromptTemplate.from_messages([
|
|
| 244 |
])
|
| 245 |
|
| 246 |
llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
|
| 247 |
-
|
| 248 |
retriever = HybridRetriever(top_k=5)
|
| 249 |
qa_chain = RetrievalQA.from_chain_type(
|
| 250 |
llm=llm,
|
|
|
|
| 22 |
st.session_state.ingested_batches = 0
|
| 23 |
if "messages" not in st.session_state:
|
| 24 |
st.session_state.messages = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
st.set_page_config(page_title="Chat with Your JSON Vectors (DEBUG)", layout="wide")
|
| 27 |
+
st.title("Chat with Your Vectorized JSON Files (Hybrid Retrieval, SQLite, LLM, DEBUG)")
|
| 28 |
|
| 29 |
uploaded_files = st.file_uploader(
|
| 30 |
"Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
|
| 31 |
)
|
| 32 |
|
| 33 |
+
# --- Enhanced flattening with name extraction for emails/user fields
|
| 34 |
def flatten_json_obj(obj, parent_key="", sep="."):
|
| 35 |
items = {}
|
| 36 |
if isinstance(obj, dict):
|
|
|
|
| 56 |
items[parent_key] = obj
|
| 57 |
return items
|
| 58 |
|
| 59 |
+
# --- DEBUG: Show flattening of uploaded JSONs
# For every uploaded file, parse it as JSON, pick out its records and render
# each record's flattened form with st.code. Any exception raised while
# parsing or flattening a file is shown as a Streamlit warning and the loop
# moves on to the next file.
if uploaded_files:
    st.markdown("#### DEBUG: Flat view of all uploaded JSON records")
    for upload in uploaded_files:
        # Rewind so json.load starts reading from the beginning of the upload.
        upload.seek(0)
        try:
            payload = json.load(upload)
            if isinstance(payload, dict):
                # A dict is treated as a wrapper: its first list-valued entry
                # is used as the record list; with no list values the dict
                # itself is the single record.
                nested_lists = [
                    value for value in payload.values() if isinstance(value, list)
                ]
                debug_records = nested_lists[0] if nested_lists else [payload]
            elif isinstance(payload, list):
                # A top-level list is already the record list.
                debug_records = payload
            else:
                # Any other JSON value is shown as one record.
                debug_records = [payload]
            for record in debug_records:
                st.code(flatten_json_obj(record))
        except Exception as err:
            # Best-effort debug output: report the problem, keep going.
            st.warning(str(err))
|
| 77 |
+
|
| 78 |
def get_embedding(text):
|
| 79 |
client = openai.OpenAI(api_key=OPENAI_API_KEY)
|
| 80 |
response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
|
|
|
|
| 101 |
rows = []
|
| 102 |
batch_time = datetime.datetime.utcnow().isoformat()
|
| 103 |
for file in files:
|
| 104 |
+
file.seek(0)
|
| 105 |
raw = json.load(file)
|
| 106 |
source_name = file.name
|
| 107 |
if isinstance(raw, list):
|
|
|
|
| 213 |
matches.append(doc)
|
| 214 |
return matches if matches else records
|
| 215 |
|
|
|
|
| 216 |
def hybrid_query(user_query, top_k=5):
|
| 217 |
vector_docs = query_vector_db(user_query, top_k=top_k)
|
| 218 |
fuzzy_docs = python_fuzzy_match(user_query, top_k=top_k)
|
|
|
|
| 224 |
all_docs.append(doc)
|
| 225 |
seen_ids.add(doc_id)
|
| 226 |
entity = extract_main_entity(user_query)
|
| 227 |
+
st.markdown(f"#### DEBUG: Extracted entity from question: {entity}")
|
| 228 |
+
st.markdown("#### DEBUG: All retrieved docs for your query")
|
| 229 |
+
for idx, doc in enumerate(all_docs):
|
| 230 |
+
st.code(doc.page_content)
|
| 231 |
entity_docs = filter_records_by_entity(all_docs, entity) if entity else all_docs
|
|
|
|
| 232 |
if entity_docs:
|
| 233 |
doc = entity_docs[0]
|
|
|
|
| 234 |
if entity:
|
| 235 |
doc.page_content = re.sub(rf"({re.escape(entity)})", r"**\1**", doc.page_content, flags=re.IGNORECASE)
|
| 236 |
st.markdown("#### Context shown to LLM")
|
|
|
|
| 244 |
def _get_relevant_documents(self, query, run_manager=None, **kwargs):
|
| 245 |
return hybrid_query(query, self.top_k)
|
| 246 |
|
|
|
|
| 247 |
system_prompt = (
|
| 248 |
"You are a JSON data assistant. "
|
| 249 |
"If the question mentions a name or email (e.g. Johnny), match it to any field value (even as part of an email) "
|
|
|
|
| 258 |
])
|
| 259 |
|
| 260 |
llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
|
|
|
|
| 261 |
retriever = HybridRetriever(top_k=5)
|
| 262 |
qa_chain = RetrievalQA.from_chain_type(
|
| 263 |
llm=llm,
|