import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import openai
import faiss
import numpy as np

# ------------------ RAG FUNCTIONS ------------------ #

@st.cache_resource
def build_faiss_index(csv_file: str):
    """
    Load the CSV, embed each row (with full info from every column) plus a
    dedicated summary chunk, and build a FAISS index over the embeddings.

    Args:
        csv_file: path to the CSV file to index.

    Returns:
        (index, texts, df): the FAISS index, the list of text chunks in
        index order (summary chunk first), and the loaded DataFrame.
    """
    df = pd.read_csv(csv_file)

    # Summary chunk carries key aggregate facts (e.g. total row count) so
    # they are retrievable even though no single row states them.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One detailed chunk per row: "Row N: col: value | col: value | ..."
    text_chunks = []
    for idx, row in df.iterrows():
        row_text = " | ".join([f"{col}: {row[col]}" for col in df.columns])
        text_chunks.append(f"Row {idx}: {row_text}")

    # Prepend the summary chunk so key facts are included in retrieval.
    texts = [summary_chunk] + text_chunks

    # Embed every chunk with OpenAI's embedding model.
    embeddings = []
    for text in texts:
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text
        )
        embeddings.append(response["data"][0]["embedding"])

    # FAISS requires float32.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # IndexFlatIP scores by raw inner product, which is biased by vector
    # magnitude; L2-normalizing first makes the score equivalent to cosine
    # similarity. Queries are normalized the same way in
    # get_relevant_chunks to keep the two sides consistent.
    faiss.normalize_L2(embedding_matrix)
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)

    return index, texts, df


def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Embed `query` and return the top_k most similar text chunks from the
    FAISS index.
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query
    )
    query_embedding = np.array(
        response["data"][0]["embedding"], dtype=np.float32
    ).reshape(1, -1)

    # Normalize to match the normalized index vectors (cosine similarity).
    faiss.normalize_L2(query_embedding)

    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads results with -1 when the index holds fewer than top_k
    # vectors; skip those sentinels instead of indexing texts[-1].
    return [texts[i] for i in indices[0] if i != -1]


def answer_query(query: str, index, texts):
    """
    Retrieve relevant context chunks from the FAISS index and call the
    ChatCompletion API. The prompt instructs GPT to answer solely using
    the provided data, to reduce hallucination.
    """
    relevant_chunks = get_relevant_chunks(query, index, texts, top_k=10)
    combined_context = "\n\n".join(relevant_chunks)

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant. Answer ONLY using the provided data context. "
                "Do not add any information that isn't in the context."
            )
        },
        {
            "role": "user",
            "content": (
                f"Data:\n{combined_context}\n\n"
                f"Question: {query}\n\n"
                "Answer using only the data above."
            )
        }
    ]

    # temperature=0.0 keeps answers deterministic and grounded.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.0
    )
    return response["choices"][0]["message"]["content"]


# ------------------ EXISTING APP LAYOUT ------------------ #

# Set page config to wide layout.
st.set_page_config(layout="wide")
st.title("Model Insights")

# Embed the Looker Studio dashboard in the main area.
# BUG FIX: the original f-string was empty, so new_dashboard_url was never
# rendered; embed it in an actual iframe.
new_dashboard_url = "https://lookerstudio.google.com/embed/reporting/b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
iframe_code = f'''
<iframe src="{new_dashboard_url}" width="100%" height="800"
        frameborder="0" style="border:0" allowfullscreen></iframe>
'''
components.html(iframe_code, height=800)

# ------------------ SIDEBAR FOR Q&A ------------------ #
with st.sidebar:
    # NOTE(review): the original markdown HTML was garbled in the source;
    # reconstructed as a simple header around the visible "Ask Your Data"
    # text — confirm against the intended styling.
    st.markdown("<h2>Ask Your Data</h2>", unsafe_allow_html=True)
    openai_api_key = st.text_input("Enter your OpenAI API key:", type="password")
    user_message = st.text_input("", placeholder="Ask a question from the data...")
    send_button = st.button('Generate Answer')

    # Only proceed if the API key is provided.
    if openai_api_key:
        openai.api_key = openai_api_key
        # Build (or reuse the cached) FAISS index from the CSV file.
        index, texts, df = build_faiss_index("Final_Clean_Data.csv")

        if send_button and user_message:
            query_lower = user_message.lower()
            # Handle factual queries directly to avoid hallucination.
            if "total rows" in query_lower or "how many rows" in query_lower:
                answer = f"The CSV has {df.shape[0]} rows."
            elif "distribution of" in query_lower:
                # For example, if the query asks for "distribution of ae_cluster".
                if "ae_cluster" in query_lower:
                    if "ae_cluster" in df.columns:
                        distribution = df["ae_cluster"].value_counts()
                        answer = f"Distribution of ae_cluster:\n{distribution.to_string()}"
                    else:
                        answer = "The CSV does not contain a column named 'ae_cluster'."
                else:
                    # Fallback: retrieval-based answer for other distributions.
                    answer = answer_query(user_message, index, texts)
            else:
                # Retrieval-augmented generation for general queries.
                answer = answer_query(user_message, index, texts)

            st.sidebar.subheader("Answer")
            st.sidebar.write(answer)
    else:
        st.sidebar.warning("Please enter your OpenAI API key to proceed.")