# Streamlit app: embedded Looker Studio dashboard with a RAG-based Q&A sidebar.
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import openai
import faiss
import numpy as np
# ------------------ RAG FUNCTIONS ------------------ #
@st.cache_resource
def build_faiss_index(csv_file: str):
    """
    Build a FAISS index over the rows of a CSV file.

    Loads the CSV, creates one detailed text chunk per row (every column
    name and value) plus a leading summary chunk with the total row count,
    embeds all chunks with OpenAI's text-embedding-ada-002 model, and
    stores the embeddings in an inner-product FAISS index.

    Args:
        csv_file: Path to the CSV file to index.

    Returns:
        Tuple of (index, texts, df): the FAISS index, the list of embedded
        text chunks (summary chunk first, then one per row), and the
        loaded DataFrame.
    """
    df = pd.read_csv(csv_file)

    # Summary chunk carries key aggregate facts (row count) so questions
    # like "how many rows" can be answered from retrieval alone.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One chunk per row; "col: value" pairs keep column context attached.
    text_chunks = [
        f"Row {idx}: " + " | ".join(f"{col}: {row[col]}" for col in df.columns)
        for idx, row in df.iterrows()
    ]
    # Prepend the summary chunk so key data is available to retrieval.
    texts = [summary_chunk] + text_chunks

    # Embed in batches: the Embedding endpoint accepts a list of inputs,
    # so this makes one API call per batch instead of one per chunk —
    # far faster and cheaper than the per-row round trip for large CSVs.
    embeddings = []
    batch_size = 100
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=batch
        )
        # Order results by the returned "index" field rather than assuming
        # the response preserves input order.
        for item in sorted(response["data"], key=lambda d: d["index"]):
            embeddings.append(item["embedding"])

    # FAISS requires float32.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # Inner-product index. NOTE(review): ada-002 embeddings are documented
    # as unit-normalized, so inner product == cosine similarity; for other
    # models, call faiss.normalize_L2(embedding_matrix) first.
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)
    return index, texts, df
def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Return the top_k text chunks most relevant to a query.

    Embeds the query with text-embedding-ada-002 and searches the FAISS
    index for the nearest neighbors.

    Args:
        query: The user's natural-language question.
        index: FAISS index built over `texts`.
        texts: The chunk list the index was built from (same order).
        top_k: Maximum number of chunks to return.

    Returns:
        The matching chunks, most similar first. May contain fewer than
        top_k entries when the index holds fewer vectors.
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query
    )
    query_embedding = np.array(response["data"][0]["embedding"], dtype=np.float32).reshape(1, -1)
    # If using cosine similarity with non-normalized embeddings, normalize
    # query_embedding here as well.
    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads results with -1 when top_k exceeds the number of stored
    # vectors; without the filter, texts[-1] would wrap around and silently
    # duplicate the last chunk.
    return [texts[i] for i in indices[0] if i >= 0]
def answer_query(query: str, index, texts):
    """
    Answer a question with retrieval-augmented generation.

    Pulls the most relevant chunks from the FAISS index, packs them into a
    grounding prompt, and asks gpt-3.5-turbo to answer strictly from that
    context.

    Args:
        query: The user's question.
        index: FAISS index over `texts`.
        texts: Chunk list backing the index.

    Returns:
        The model's answer as a string.
    """
    # Retrieve context and join it into a single grounding section.
    context_blocks = get_relevant_chunks(query, index, texts, top_k=10)
    grounding = "\n\n".join(context_blocks)

    # Instruct the model to stay within the retrieved data only.
    system_prompt = (
        "You are a helpful assistant. Answer ONLY using the provided data context. "
        "Do not add any information that isn't in the context."
    )
    user_prompt = (
        f"Data:\n{grounding}\n\n"
        f"Question: {query}\n\n"
        "Answer using only the data above."
    )

    # temperature=0.0 keeps answers deterministic and grounded.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0
    )
    return completion["choices"][0]["message"]["content"]
# ------------------ EXISTING APP LAYOUT ------------------ #
# Set page config to wide layout
st.set_page_config(layout="wide")
st.title("Model Insights")
# Embed the new dashboard URL in the main area
new_dashboard_url = "https://lookerstudio.google.com/embed/reporting/b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
# Raw iframe HTML for the Looker Studio report. The sandbox attribute
# grants only the capabilities the embed needs (scripts, same-origin
# storage, pop-ups); height uses calc() so the frame fills the viewport
# minus the Streamlit header.
iframe_code = f'''
<iframe width="100%" height="100%"
src="{new_dashboard_url}"
frameborder="0"
style="border:0; margin:0; padding:0; height: calc(100vh - 4rem);"
allowfullscreen
sandbox="allow-storage-access-by-user-activation allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox">
</iframe>
'''
# height=800 sizes the Streamlit component container; the iframe's own
# CSS height governs the visible report area inside it.
components.html(iframe_code, height=800)
# ------------------ SIDEBAR FOR Q&A ------------------ #
# Sidebar widgets: API key entry, question box, and submit button.
with st.sidebar:
    st.markdown("<h2 style='border-bottom: 1px solid #ccc; color: #3949ab;'>Ask Your Data</h2>", unsafe_allow_html=True)
    api_key_input = st.text_input("Enter your OpenAI API key:", type="password")
    question = st.text_input("", placeholder="Ask a question from the data...")
    ask_clicked = st.button('Generate Answer')

# Guard: nothing works without an API key.
if not api_key_input:
    st.sidebar.warning("Please enter your OpenAI API key to proceed.")
else:
    openai.api_key = api_key_input
    # Cached by @st.cache_resource, so this only builds once per session.
    index, texts, df = build_faiss_index("Final_Clean_Data.csv")
    if ask_clicked and question:
        lowered = question.lower()
        if "total rows" in lowered or "how many rows" in lowered:
            # Answer row-count questions directly from the DataFrame to
            # avoid any chance of hallucination.
            answer = f"The CSV has {df.shape[0]} rows."
        elif "distribution of" in lowered and "ae_cluster" in lowered:
            # Compute the ae_cluster distribution exactly from the data.
            if "ae_cluster" in df.columns:
                counts = df["ae_cluster"].value_counts()
                answer = f"Distribution of ae_cluster:\n{counts.to_string()}"
            else:
                answer = "The CSV does not contain a column named 'ae_cluster'."
        else:
            # Everything else goes through retrieval-augmented generation.
            answer = answer_query(question, index, texts)
        st.sidebar.subheader("Answer")
        st.sidebar.write(answer)