import streamlit as st import streamlit.components.v1 as components import pandas as pd import openai import faiss import numpy as np # ------------------ RAG FUNCTIONS ------------------ # @st.cache_resource def build_faiss_index(csv_file: str): """ Loads the CSV, creates embeddings for each row (with full info from every column) and adds a dedicated summary chunk. Builds and returns a FAISS index, list of text chunks, and the DataFrame. """ # Load CSV data df = pd.read_csv(csv_file) # Create a summary chunk with key facts (e.g., total row count) summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}." # Create detailed text chunks for each row (include column names and their values) text_chunks = [] for idx, row in df.iterrows(): row_text = " | ".join([f"{col}: {row[col]}" for col in df.columns]) full_text = f"Row {idx}: {row_text}" text_chunks.append(full_text) # Prepend the summary chunk to ensure key data is included in retrieval texts = [summary_chunk] + text_chunks # Create embeddings for each chunk using OpenAI’s embedding model embeddings = [] for text in texts: response = openai.Embedding.create( model="text-embedding-ada-002", input=text ) embedding = response["data"][0]["embedding"] embeddings.append(embedding) # Convert embeddings to a NumPy array for FAISS (float32) embedding_matrix = np.array(embeddings, dtype=np.float32) dimension = embedding_matrix.shape[1] # Create a FAISS index using inner product similarity index = faiss.IndexFlatIP(dimension) # (Optional) For cosine similarity, normalize the embedding vectors: # faiss.normalize_L2(embedding_matrix) index.add(embedding_matrix) return index, texts, df def get_relevant_chunks(query: str, index, texts, top_k=10): """ Given a query, embeds it and searches the FAISS index to return the top_k relevant text chunks. """ response = openai.Embedding.create( model="text-embedding-ada-002", input=query ) query_embedding = np.array(response["data"][0]["embedding"], dtype=np.float32).reshape(1, -1) # If using cosine similarity, normalize query_embedding here as well. distances, indices = index.search(query_embedding, top_k) relevant_texts = [texts[i] for i in indices[0]] return relevant_texts def answer_query(query: str, index, texts): """ Retrieves relevant context chunks from the FAISS index and calls the ChatCompletion API. The prompt instructs GPT to answer solely using the provided data. """ relevant_chunks = get_relevant_chunks(query, index, texts, top_k=10) combined_context = "\n\n".join(relevant_chunks) messages = [ { "role": "system", "content": ( "You are a helpful assistant. Answer ONLY using the provided data context. " "Do not add any information that isn't in the context." ) }, { "role": "user", "content": ( f"Data:\n{combined_context}\n\n" f"Question: {query}\n\n" "Answer using only the data above." ) } ] response = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=messages, temperature=0.0 ) return response["choices"][0]["message"]["content"] # ------------------ EXISTING APP LAYOUT ------------------ # # Set page config to wide layout st.set_page_config(layout="wide") st.title("Model Insights") # Embed the new dashboard URL in the main area new_dashboard_url = "https://lookerstudio.google.com/embed/reporting/b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF" iframe_code = f''' ''' components.html(iframe_code, height=800) # ------------------ SIDEBAR FOR Q&A ------------------ # with st.sidebar: st.markdown("