Multi_Model_AI_AGENT_VectorDB_langchain_json

Sleeping

App Files Files Community

Seth0330 committed on Jun 12, 2025

Commit

a84926c

verified ·

1 Parent(s): e145b0c

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -46

app.py CHANGED Viewed

@@ -11,18 +11,19 @@ from langchain.chains import RetrievalQA
 from langchain.schema import Document
 from langchain_core.retrievers import BaseRetriever
 from pydantic import Field
-from langchain_openai import ChatOpenAI  # FIXED: Use ChatOpenAI for chat models
 # --- CONFIG ---
 DB_PATH = "json_vector.db"
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 EMBEDDING_MODEL = "text-embedding-ada-002"
-# --- Streamlit State Initialization ---
 if "ingested_batches" not in st.session_state:
     st.session_state.ingested_batches = 0
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
 if "modal_open" not in st.session_state:
     st.session_state.modal_open = False
 if "modal_content" not in st.session_state:
@@ -30,14 +31,13 @@ if "modal_content" not in st.session_state:
 if "modal_title" not in st.session_state:
     st.session_state.modal_title = ""
-st.set_page_config(page_title="Cumulative JSON Vector Search (SQLite)", layout="wide")
-st.title("LLM-Powered Analytics: Cumulative JSON Vector DB (SQLite, Local)")
 uploaded_files = st.file_uploader(
     "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
 )
-# --- Helper: Flatten any unstructured JSON (handles dict, list, nested, various keys) ---
 def flatten_json_obj(obj, parent_key="", sep="."):
     items = {}
     if isinstance(obj, dict):
@@ -52,13 +52,11 @@ def flatten_json_obj(obj, parent_key="", sep="."):
         items[parent_key] = obj
     return items
-# --- Embedding function (openai>=1.0.0 style) ---
 def get_embedding(text):
     client = openai.OpenAI(api_key=OPENAI_API_KEY)
     response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
     return response.data[0].embedding
-# --- Ensure DB Table (accumulates all uploads, never deletes old data) ---
 def ensure_table():
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
@@ -75,7 +73,6 @@ def ensure_table():
     conn.commit()
     conn.close()
-# --- Ingest and accumulate uploaded files ---
 def ingest_json_files(files):
     ensure_table()
     rows = []
@@ -83,15 +80,11 @@ def ingest_json_files(files):
     for file in files:
         raw = json.load(file)
         source_name = file.name
-        # Handle top-level list or dict
         if isinstance(raw, list):
             records = raw
         elif isinstance(raw, dict):
             main_lists = [v for v in raw.values() if isinstance(v, list)]
-            if main_lists:
-                records = main_lists[0]
-            else:
-                records = [raw]
         else:
             records = [raw]
         for rec in records:
@@ -104,7 +97,6 @@ def ingest_json_files(files):
     df = pd.DataFrame(rows, columns=["batch_time", "source_file", "raw_json", "flat_text"])
     st.write(f"Flattened {len(df)} records. Generating embeddings (this may take time, please wait)...")
     df["embedding"] = df["flat_text"].apply(get_embedding)
-    # Insert into DB
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
     for _, row in df.iterrows():
@@ -121,7 +113,6 @@ def ingest_json_files(files):
 if uploaded_files and st.button("Ingest batch to database"):
     ingest_json_files(uploaded_files)
-# --- Query entire cumulative DB (ALL past and present records) ---
 def query_vector_db(user_query, top_k=5):
     query_emb = get_embedding(user_query)
     conn = sqlite3.connect(DB_PATH)
@@ -130,7 +121,7 @@ def query_vector_db(user_query, top_k=5):
     results = []
     for row in cursor.fetchall():
         db_emb = np.frombuffer(row[5], dtype=np.float32)
-        if len(db_emb) != len(query_emb): continue  # Skip malformed
         sim = np.dot(query_emb, db_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(db_emb))
         results.append((sim, row))
     conn.close()
@@ -147,33 +138,58 @@ def query_vector_db(user_query, top_k=5):
         docs.append(Document(page_content=row[4], metadata=meta))
     return docs
-# --- LangChain Retriever (BaseRetriever subclass, Pydantic v2 compliant) ---
 class SQLiteVectorRetriever(BaseRetriever):
     top_k: int = Field(default=5)
     def _get_relevant_documents(self, query, run_manager=None, **kwargs):
         return query_vector_db(query, self.top_k)
-llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)  # FIXED: use ChatOpenAI!
 retriever = SQLiteVectorRetriever(top_k=5)
 qa_chain = RetrievalQA.from_chain_type(
     llm=llm,
     retriever=retriever,
     return_source_documents=True,
 )
-# --- Chat UI & Conversation Loop (with modal) ---
-st.header("Chat with all accumulated records")
 def show_json_links_and_modal():
-    for speaker, msg in reversed(st.session_state.chat_history):
-        if speaker == "AI_DOCS":
-            docs = msg
-            for idx, doc in enumerate(docs):
-                if st.button(f"View JSON: {doc.metadata['source_file']} (#{doc.metadata['id']})", key=f"modal_{idx}"):
-                    st.session_state.modal_open = True
-                    st.session_state.modal_content = json.dumps(json.loads(doc.metadata["raw_json"]), indent=2)
-                    st.session_state.modal_title = f"{doc.metadata['source_file']} (#{doc.metadata['id']})"
             break
     if st.session_state.modal_open:
         with st.expander(f"JSON Record: {st.session_state.modal_title}", expanded=True):
@@ -181,23 +197,32 @@ def show_json_links_and_modal():
             if st.button("Close", key="close_modal"):
                 st.session_state.modal_open = False
-user_input = st.text_input("Ask a question about ALL data (old and new):", key="user_input")
-if st.button("Send") and user_input:
-    with st.spinner("Thinking..."):
-        result = qa_chain(user_input)
-        st.session_state.chat_history.append(("User", user_input))
-        st.session_state.chat_history.append(("AI", result['result']))
-        st.session_state.chat_history.append(("AI_DOCS", result['source_documents']))
-for speaker, msg in st.session_state.chat_history:
-    if speaker == "User":
-        st.markdown(f"<div style='color: #4F8BF9;'><b>User:</b> {msg}</div>", unsafe_allow_html=True)
-    elif speaker == "AI":
-        st.markdown(f"<div style='color: #1C6E4C;'><b>Agent:</b> {msg}</div>", unsafe_allow_html=True)
 show_json_links_and_modal()
 if st.button("Clear chat"):
-    st.session_state.chat_history = []
 st.info(f"Batches ingested so far (this session): {st.session_state.ingested_batches}")

 from langchain.schema import Document
 from langchain_core.retrievers import BaseRetriever
 from pydantic import Field
+from langchain_openai import ChatOpenAI
+from langchain.prompts import ChatPromptTemplate
 # --- CONFIG ---
 DB_PATH = "json_vector.db"
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 EMBEDDING_MODEL = "text-embedding-ada-002"
+# --- State Initialization ---
 if "ingested_batches" not in st.session_state:
     st.session_state.ingested_batches = 0
+if "messages" not in st.session_state:
+    st.session_state.messages = []
 if "modal_open" not in st.session_state:
     st.session_state.modal_open = False
 if "modal_content" not in st.session_state:
 if "modal_title" not in st.session_state:
     st.session_state.modal_title = ""
+st.set_page_config(page_title="Chat with Your JSON Vectors", layout="wide")
+st.title("Chat with Your Vectorized JSON Files (LangChain, SQLite, LLM)")
 uploaded_files = st.file_uploader(
     "Upload JSON files in batches (any structure)", type="json", accept_multiple_files=True
 )
 def flatten_json_obj(obj, parent_key="", sep="."):
     items = {}
     if isinstance(obj, dict):
         items[parent_key] = obj
     return items
 def get_embedding(text):
     """Return the embedding vector for ``text`` from the OpenAI embeddings API.

     A new ``openai.OpenAI`` client is built on every call using
     ``OPENAI_API_KEY``; the request uses the model named by
     ``EMBEDDING_MODEL``.
     """
     client = openai.OpenAI(api_key=OPENAI_API_KEY)
     # The endpoint is called with a one-element input list.
     response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
     # Only one input was sent, so the first result is the one for ``text``.
     return response.data[0].embedding
 def ensure_table():
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
     conn.commit()
     conn.close()
 def ingest_json_files(files):
     ensure_table()
     rows = []
     for file in files:
         raw = json.load(file)
         source_name = file.name
         if isinstance(raw, list):
             records = raw
         elif isinstance(raw, dict):
             main_lists = [v for v in raw.values() if isinstance(v, list)]
+            records = main_lists[0] if main_lists else [raw]
         else:
             records = [raw]
         for rec in records:
     df = pd.DataFrame(rows, columns=["batch_time", "source_file", "raw_json", "flat_text"])
     st.write(f"Flattened {len(df)} records. Generating embeddings (this may take time, please wait)...")
     df["embedding"] = df["flat_text"].apply(get_embedding)
     conn = sqlite3.connect(DB_PATH)
     cursor = conn.cursor()
     for _, row in df.iterrows():
 if uploaded_files and st.button("Ingest batch to database"):
     ingest_json_files(uploaded_files)
 def query_vector_db(user_query, top_k=5):
     query_emb = get_embedding(user_query)
     conn = sqlite3.connect(DB_PATH)
     results = []
     for row in cursor.fetchall():
         db_emb = np.frombuffer(row[5], dtype=np.float32)
+        if len(db_emb) != len(query_emb): continue
         sim = np.dot(query_emb, db_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(db_emb))
         results.append((sim, row))
     conn.close()
         docs.append(Document(page_content=row[4], metadata=meta))
     return docs
 class SQLiteVectorRetriever(BaseRetriever):
     """LangChain retriever that delegates every lookup to ``query_vector_db``."""
     # Number of documents requested from query_vector_db per query.
     top_k: int = Field(default=5)
     def _get_relevant_documents(self, query, run_manager=None, **kwargs):
         """Return the documents ``query_vector_db`` yields for ``query``.

         ``run_manager`` and any extra keyword arguments are accepted but unused.
         """
         return query_vector_db(query, self.top_k)
+# --- SYSTEM PROMPT TUNED FOR DIRECT ANSWERS ---
+# RetrievalQA's default "stuff" chain fills two prompt variables: {context}
+# (the retrieved documents) and {question} (the user's query). Both must
+# appear in the template; a prompt without {context} is rejected when the
+# chain is built, and the model would never see the retrieved records.
+system_prompt = (
+    "You are a JSON data assistant. Always give a direct, concise answer based only on the context provided. "
+    "If you do not see the answer in the context, reply: 'I don’t have that information.' "
+    "Never make up information. Never ask for clarification."
+)
+prompt = ChatPromptTemplate.from_messages([
+    ("system", system_prompt),
+    ("human", "Context:\n{context}\n\nQuestion: {question}")
+])
+# temperature=0 keeps answers as deterministic as the model allows.
+llm = ChatOpenAI(model="gpt-4.1", openai_api_key=OPENAI_API_KEY, temperature=0)
 retriever = SQLiteVectorRetriever(top_k=5)
 qa_chain = RetrievalQA.from_chain_type(
     llm=llm,
     retriever=retriever,
+    chain_type_kwargs={"prompt": prompt},
     return_source_documents=True,
 )
+# --- Conversation Area (chat-style transcript) ---
+st.markdown("### Ask any question about your data, just like ChatGPT.")
+# Message text originates from the user, the LLM and uploaded JSON files, and
+# it is rendered with unsafe_allow_html=True. Escape it first so stray markup
+# in that text is shown literally instead of being interpreted as HTML.
+import html
+for msg in st.session_state.messages:
+    safe_content = html.escape(str(msg.get("content", "")))
+    if msg["role"] == "user":
+        st.markdown(f"<div style='color: #4F8BF9;'><b>User:</b> {safe_content}</div>", unsafe_allow_html=True)
+    elif msg["role"] == "assistant":
+        st.markdown(f"<div style='color: #1C6E4C;'><b>Agent:</b> {safe_content}</div>", unsafe_allow_html=True)
+    elif msg["role"] == "function":
+        # Retrieved records are kept collapsed so they do not dominate the transcript.
+        st.markdown(f"<details><summary><b>Function Output:</b></summary><pre>{safe_content}</pre></details>", unsafe_allow_html=True)
 def show_json_links_and_modal():
+    # Look for last function message (top results) and display view buttons
+    for msg in reversed(st.session_state.messages):
+        if msg.get("role") == "function" and msg.get("content"):
+            try:
+                docs = json.loads(msg["content"])
+                if isinstance(docs, list):
+                    for idx, doc in enumerate(docs):
+                        if isinstance(doc, dict) and "record" in doc:
+                            if st.button(f"View JSON: {doc.get('file', 'unknown')} record #{idx+1}", key=f"modal_function_{idx}"):
+                                st.session_state.modal_open = True
+                                st.session_state.modal_content = json.dumps(doc["record"], indent=2)
+                                st.session_state.modal_title = f"{doc.get('file', 'unknown')} record #{idx+1}"
+            except Exception:
+                continue
             break
     if st.session_state.modal_open:
         with st.expander(f"JSON Record: {st.session_state.modal_title}", expanded=True):
             if st.button("Close", key="close_modal"):
                 st.session_state.modal_open = False
 show_json_links_and_modal()
+def send_message():
+    """Handle a submitted chat message (used as the input box's ``on_change``).
+
+    Reads the text from ``st.session_state.temp_input``, runs it through
+    ``qa_chain`` and appends three entries to ``st.session_state.messages``:
+    the user message, the assistant answer, and a "function" message whose
+    content is a JSON string of the retrieved source records. The input box
+    is cleared afterwards. Blank input is ignored.
+    """
+    user_input = st.session_state.temp_input.strip()
+    if not user_input:
+        return
+    st.session_state.messages.append({"role": "user", "content": user_input})
+    with st.spinner("Thinking..."):
+        # RetrievalQA expects its input under the "query" key; the chain maps
+        # that text onto the prompt's {question} variable itself.
+        result = qa_chain({"query": user_input})
+        answer = result['result']
+        st.session_state.messages.append({"role": "assistant", "content": answer})
+        # Keep the retrieved records alongside the answer so the UI can offer
+        # a "View JSON" button for each of them.
+        doc_list = [
+            {
+                "file": doc.metadata["source_file"],
+                "id": doc.metadata["id"],
+                "record": json.loads(doc.metadata["raw_json"]),
+            }
+            for doc in result['source_documents']
+        ]
+        st.session_state.messages.append({"role": "function", "content": json.dumps(doc_list, indent=2)})
+    st.session_state.temp_input = ""
+# The box is bound to session key "temp_input"; send_message runs as its
+# on_change callback and reads the text from that key.
+st.text_input("Your message:", key="temp_input", on_change=send_message)
 if st.button("Clear chat"):
+    # Drop the whole transcript, including the stored retrieval results.
+    st.session_state.messages = []
 st.info(f"Batches ingested so far (this session): {st.session_state.ingested_batches}")