# NOTE(review): the three lines below are scrape residue from the hosting page
# (Hugging Face Space header), not Python — commented out so the file parses.
# santu24's picture
# committed to main
# ea646b7 verified
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import openai
import faiss
import numpy as np
# ------------------ RAG FUNCTIONS ------------------ #
@st.cache_resource
def build_faiss_index(csv_file: str):
    """
    Load the CSV, embed every row (all columns) plus a dedicated summary chunk,
    and build a FAISS inner-product index over the embeddings.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file to index.

    Returns
    -------
    tuple
        (faiss.Index, list[str] of text chunks in index order, pandas.DataFrame).

    Notes
    -----
    Cached with @st.cache_resource so the index is built once per session.
    Embedding requests are batched: the Embeddings endpoint accepts a list
    input, which avoids one HTTP round-trip per row.
    """
    df = pd.read_csv(csv_file)

    # Summary chunk carries key facts (row count) so they are retrievable.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One detailed chunk per row, including column names and values.
    text_chunks = []
    for idx, row in df.iterrows():
        row_text = " | ".join([f"{col}: {row[col]}" for col in df.columns])
        text_chunks.append(f"Row {idx}: {row_text}")

    # Prepend the summary chunk so key data is always part of the corpus.
    texts = [summary_chunk] + text_chunks

    # Embed in batches; sort each response by its "index" field because the
    # API does not guarantee result order matches input order.
    embeddings = []
    batch_size = 100  # stays well under the endpoint's per-request limits
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=batch
        )
        for item in sorted(response["data"], key=lambda d: d["index"]):
            embeddings.append(item["embedding"])

    # FAISS requires float32 vectors.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # Inner-product similarity index.
    # (Optional) For cosine similarity, normalize the embedding vectors:
    # faiss.normalize_L2(embedding_matrix)
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)
    return index, texts, df
def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Embed *query* and return the top_k most similar text chunks from the index.

    Parameters
    ----------
    query : str
        The user's question.
    index : faiss.Index
        Index built by build_faiss_index.
    texts : list[str]
        Text chunks in the same order they were added to the index.
    top_k : int, optional
        Maximum number of chunks to return (default 10).

    Returns
    -------
    list[str]
        Up to top_k relevant chunks (fewer if the corpus is smaller).
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query
    )
    query_embedding = np.array(
        response["data"][0]["embedding"], dtype=np.float32
    ).reshape(1, -1)
    # If using cosine similarity, normalize query_embedding here as well.

    # Clamp k: asking FAISS for more neighbours than stored vectors makes it
    # pad results with -1, which would mis-index (or crash) texts[i] below.
    k = min(top_k, len(texts))
    distances, indices = index.search(query_embedding, k)
    return [texts[i] for i in indices[0] if i != -1]
def answer_query(query: str, index, texts):
    """
    Answer *query* with the ChatCompletion API, grounded strictly in the
    chunks retrieved from the FAISS index.

    Parameters
    ----------
    query : str
        The user's question.
    index : faiss.Index
        Index built by build_faiss_index.
    texts : list[str]
        Text chunks backing the index.

    Returns
    -------
    str
        The model's answer, constrained to the retrieved context.
    """
    # Retrieve context and collapse it into a single prompt section.
    combined_context = "\n\n".join(
        get_relevant_chunks(query, index, texts, top_k=10)
    )

    system_prompt = (
        "You are a helpful assistant. Answer ONLY using the provided data context. "
        "Do not add any information that isn't in the context."
    )
    user_prompt = (
        f"Data:\n{combined_context}\n\n"
        f"Question: {query}\n\n"
        "Answer using only the data above."
    )

    # temperature=0.0 keeps the answer deterministic and grounded.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0
    )
    return completion["choices"][0]["message"]["content"]
# ------------------ EXISTING APP LAYOUT ------------------ #
# Set page config to wide layout (must be the first Streamlit call on the page).
st.set_page_config(layout="wide")
st.title("Model Insights")

# Embed the Looker Studio dashboard in the main area via an iframe.
new_dashboard_url = "https://lookerstudio.google.com/embed/reporting/b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
# Height stretches to the viewport minus the Streamlit header (calc(100vh - 4rem));
# the sandbox attributes allow the embedded report's scripts, popups and storage.
iframe_code = f'''
<iframe width="100%" height="100%"
src="{new_dashboard_url}"
frameborder="0"
style="border:0; margin:0; padding:0; height: calc(100vh - 4rem);"
allowfullscreen
sandbox="allow-storage-access-by-user-activation allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox">
</iframe>
'''
# components.html needs an explicit pixel height for the component container.
components.html(iframe_code, height=800)
# ------------------ SIDEBAR FOR Q&A ------------------ #
# Sidebar widgets: API key, question input, and a submit button.
with st.sidebar:
    st.markdown("<h2 style='border-bottom: 1px solid #ccc; color: #3949ab;'>Ask Your Data</h2>", unsafe_allow_html=True)
    openai_api_key = st.text_input("Enter your OpenAI API key:", type="password")
    user_message = st.text_input("", placeholder="Ask a question from the data...")
    send_button = st.button('Generate Answer')

# Only proceed if the API key is provided
if openai_api_key:
    openai.api_key = openai_api_key
    # Build (or fetch from cache) the FAISS index over Final_Clean_Data.csv.
    index, texts, df = build_faiss_index("Final_Clean_Data.csv")
    if send_button and user_message:
        query_lower = user_message.lower()
        # Handle factual queries directly to avoid hallucination.
        if "total rows" in query_lower or "how many rows" in query_lower:
            # Row count is answered from the DataFrame, not the LLM.
            answer = f"The CSV has {df.shape[0]} rows."
        elif "distribution of" in query_lower:
            # Special-cased column: "distribution of ae_cluster" is computed
            # exactly with value_counts() rather than asked of the LLM.
            if "ae_cluster" in query_lower:
                if "ae_cluster" in df.columns:
                    distribution = df["ae_cluster"].value_counts()
                    answer = f"Distribution of ae_cluster:\n{distribution.to_string()}"
                else:
                    answer = "The CSV does not contain a column named 'ae_cluster'."
            else:
                # Fallback: use the retrieval-based answer for other distributions.
                answer = answer_query(user_message, index, texts)
        else:
            # General queries go through retrieval-augmented generation.
            answer = answer_query(user_message, index, texts)
        st.sidebar.subheader("Answer")
        st.sidebar.write(answer)
else:
    # No key entered yet: prompt the user instead of calling the API.
    st.sidebar.warning("Please enter your OpenAI API key to proceed.")