# NOTE(review): the lines below were Hugging Face Spaces page chrome
# ("Spaces: Running") captured in a copy-paste — kept only as a comment
# so the module remains valid Python.
| import streamlit as st | |
| import streamlit.components.v1 as components | |
| import pandas as pd | |
| import openai | |
| import faiss | |
| import numpy as np | |
# ------------------ RAG FUNCTIONS ------------------ #
def build_faiss_index(csv_file: str):
    """
    Load the CSV, embed each row plus a summary chunk, and build a FAISS index.

    Every row becomes one text chunk containing all "column: value" pairs, and
    a dedicated summary chunk (total row count) is prepended so key facts are
    retrievable.

    Args:
        csv_file: path to the CSV file to index.

    Returns:
        index: FAISS IndexFlatIP over the chunk embeddings.
        texts: list of chunk strings (summary first, then one per row),
            aligned with the index's vector order.
        df: the loaded pandas DataFrame.
    """
    df = pd.read_csv(csv_file)

    # Summary chunk carries key facts (e.g. total row count) so retrieval
    # can surface them directly.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One detailed chunk per row, with column names and their values.
    text_chunks = [
        f"Row {idx}: " + " | ".join(f"{col}: {row[col]}" for col in df.columns)
        for idx, row in df.iterrows()
    ]
    texts = [summary_chunk] + text_chunks

    # BUGFIX/perf: batch the embedding requests instead of one API call per
    # chunk — the Embeddings API accepts a list input and returns one
    # embedding per item, so this cuts round-trips from O(rows) to O(rows/100).
    embeddings = []
    batch_size = 100  # stay well under the API's per-request input limits
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=batch,
        )
        # Sort by the returned "index" field to be robust to out-of-order
        # results within a batch.
        for item in sorted(response["data"], key=lambda d: d["index"]):
            embeddings.append(item["embedding"])

    # FAISS requires float32 vectors.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # Inner-product index. For true cosine similarity, L2-normalize the
    # corpus (and query) vectors first:
    # faiss.normalize_L2(embedding_matrix)
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)
    return index, texts, df
def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Embed the query and return up to top_k most similar text chunks.

    Args:
        query: natural-language question to embed.
        index: FAISS index built over the embeddings of `texts`.
        texts: chunk list aligned with the index's vector order.
        top_k: maximum number of chunks to return.

    Returns:
        List of chunk strings, best match first (may be shorter than top_k
        when the index holds fewer vectors).
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query,
    )
    query_embedding = np.array(
        response["data"][0]["embedding"], dtype=np.float32
    ).reshape(1, -1)
    # If the corpus was L2-normalized for cosine similarity, normalize
    # query_embedding here as well.
    distances, indices = index.search(query_embedding, top_k)
    # BUGFIX: FAISS pads the result with -1 when top_k exceeds the number of
    # stored vectors; texts[-1] would then silently duplicate the last chunk.
    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]
def answer_query(query: str, index, texts):
    """
    Answer `query` via retrieval-augmented generation.

    Retrieves the most relevant chunks from the FAISS index, then asks the
    ChatCompletion API to answer strictly from that retrieved context.
    """
    context_chunks = get_relevant_chunks(query, index, texts, top_k=10)
    context_block = "\n\n".join(context_chunks)

    # System message pins the model to the retrieved data only.
    system_prompt = (
        "You are a helpful assistant. Answer ONLY using the provided data context. "
        "Do not add any information that isn't in the context."
    )
    user_prompt = (
        f"Data:\n{context_block}\n\n"
        f"Question: {query}\n\n"
        "Answer using only the data above."
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,  # deterministic, context-grounded answers
    )
    return response["choices"][0]["message"]["content"]
# ------------------ EXISTING APP LAYOUT ------------------ #
st.set_page_config(layout="wide")  # wide layout suits the embedded dashboard
st.title("Model Insights")

# Embed the Looker Studio dashboard in the main area.
new_dashboard_url = (
    "https://lookerstudio.google.com/embed/reporting/"
    "b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
)
iframe_code = f'''
<iframe width="100%" height="100%"
src="{new_dashboard_url}"
frameborder="0"
style="border:0; margin:0; padding:0; height: calc(100vh - 4rem);"
allowfullscreen
sandbox="allow-storage-access-by-user-activation allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox">
</iframe>
'''
components.html(iframe_code, height=800)
# ------------------ SIDEBAR FOR Q&A ------------------ #
with st.sidebar:
    st.markdown(
        "<h2 style='border-bottom: 1px solid #ccc; color: #3949ab;'>Ask Your Data</h2>",
        unsafe_allow_html=True,
    )
    openai_api_key = st.text_input("Enter your OpenAI API key:", type="password")
    user_message = st.text_input("", placeholder="Ask a question from the data...")
    send_button = st.button('Generate Answer')

# Only proceed if the API key is provided.
if openai_api_key:
    openai.api_key = openai_api_key
    if send_button and user_message:
        query_lower = user_message.lower()
        # Handle simple factual queries directly from the CSV to avoid
        # hallucination — and without any embedding calls.
        if "total rows" in query_lower or "how many rows" in query_lower:
            df = pd.read_csv("Final_Clean_Data.csv")
            answer = f"The CSV has {df.shape[0]} rows."
        elif "distribution of" in query_lower and "ae_cluster" in query_lower:
            # e.g. "distribution of ae_cluster"
            df = pd.read_csv("Final_Clean_Data.csv")
            if "ae_cluster" in df.columns:
                distribution = df["ae_cluster"].value_counts()
                answer = f"Distribution of ae_cluster:\n{distribution.to_string()}"
            else:
                answer = "The CSV does not contain a column named 'ae_cluster'."
        else:
            # BUGFIX: build the FAISS index only when a query actually needs
            # it. Previously it was rebuilt — one embedding API call per CSV
            # row — on every Streamlit rerun as soon as a key was entered,
            # even with no question asked.
            index, texts, df = build_faiss_index("Final_Clean_Data.csv")
            # Retrieval-augmented generation for general queries (including
            # distributions of other columns).
            answer = answer_query(user_message, index, texts)
        st.sidebar.subheader("Answer")
        st.sidebar.write(answer)
else:
    st.sidebar.warning("Please enter your OpenAI API key to proceed.")