File size: 6,091 Bytes
ea646b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import openai
import faiss
import numpy as np

# ------------------ RAG FUNCTIONS ------------------ #
@st.cache_resource
def build_faiss_index(csv_file: str):
    """
    Load the CSV, embed every row (all columns) plus a dedicated summary chunk,
    and build a FAISS inner-product index over the embeddings.

    Args:
        csv_file: Path to the CSV file to index.

    Returns:
        tuple: (faiss.Index, list[str] text chunks aligned with index positions,
        pd.DataFrame of the raw CSV).
    """
    df = pd.read_csv(csv_file)

    # Dedicated summary chunk so aggregate facts (e.g. row count) are retrievable.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One detailed chunk per row, including column names so the model sees schema.
    text_chunks = []
    for idx, row in df.iterrows():
        row_text = " | ".join(f"{col}: {row[col]}" for col in df.columns)
        text_chunks.append(f"Row {idx}: {row_text}")

    # Prepend the summary chunk so key facts are candidates for retrieval.
    texts = [summary_chunk] + text_chunks

    # Embed in batches: the Embeddings endpoint accepts a list input, so this
    # replaces one HTTP round trip per chunk with one per batch of 100.
    embeddings = []
    batch_size = 100
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=batch
        )
        # Sort by the response's "index" field to guarantee alignment with `texts`.
        for item in sorted(response["data"], key=lambda d: d["index"]):
            embeddings.append(item["embedding"])

    # FAISS requires float32 input.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # Inner-product index. NOTE(review): for true cosine similarity, L2-normalize
    # both corpus and query vectors (faiss.normalize_L2) before add/search.
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)
    return index, texts, df

def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Embed `query` and return up to `top_k` most similar text chunks.

    FAISS pads search results with index -1 when the index holds fewer than
    `top_k` vectors; those placeholders are filtered out here. (Previously a
    -1 silently indexed `texts[-1]`, returning a spurious duplicate of the
    last chunk.)
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query
    )
    query_embedding = np.array(response["data"][0]["embedding"], dtype=np.float32).reshape(1, -1)
    # NOTE(review): if the corpus vectors are L2-normalized for cosine
    # similarity, normalize query_embedding here as well.
    distances, indices = index.search(query_embedding, top_k)
    # Skip the -1 placeholders FAISS emits when top_k exceeds index.ntotal.
    relevant_texts = [texts[i] for i in indices[0] if i >= 0]
    return relevant_texts

def answer_query(query: str, index, texts):
    """
    Answer `query` with gpt-3.5-turbo, grounded in chunks retrieved from the
    FAISS index. The system prompt restricts the model to the supplied context.
    """
    context_chunks = get_relevant_chunks(query, index, texts, top_k=10)
    context_blob = "\n\n".join(context_chunks)

    # Prompts built up front; the system message forbids out-of-context answers.
    system_prompt = (
        "You are a helpful assistant. Answer ONLY using the provided data context. "
        "Do not add any information that isn't in the context."
    )
    user_prompt = (
        f"Data:\n{context_blob}\n\n"
        f"Question: {query}\n\n"
        "Answer using only the data above."
    )

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0
    )

    return completion["choices"][0]["message"]["content"]

# ------------------ EXISTING APP LAYOUT ------------------ #

# Set page config to wide layout so the embedded dashboard fills the page.
st.set_page_config(layout="wide")
st.title("Model Insights")

# Embed the Looker Studio dashboard in the main area via a raw iframe.
# The sandbox attribute whitelists only the capabilities the embed needs.
new_dashboard_url = "https://lookerstudio.google.com/embed/reporting/b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
iframe_code = f'''
<iframe width="100%" height="100%" 
src="{new_dashboard_url}" 
frameborder="0" 
style="border:0; margin:0; padding:0; height: calc(100vh - 4rem);" 
allowfullscreen 
sandbox="allow-storage-access-by-user-activation allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox">
</iframe>
'''
# components.html gets a fixed 800px frame; the inline CSS stretches the iframe
# to the viewport height inside it.
components.html(iframe_code, height=800)

# ------------------ SIDEBAR FOR Q&A ------------------ #
with st.sidebar:
    st.markdown("<h2 style='border-bottom: 1px solid #ccc; color: #3949ab;'>Ask Your Data</h2>", unsafe_allow_html=True)
    openai_api_key = st.text_input("Enter your OpenAI API key:", type="password")
    user_message = st.text_input("", placeholder="Ask a question from the data...")
    send_button = st.button('Generate Answer')

# The Q&A pipeline only runs once an API key has been supplied.
if not openai_api_key:
    st.sidebar.warning("Please enter your OpenAI API key to proceed.")
else:
    openai.api_key = openai_api_key

    # Cached across reruns by @st.cache_resource on build_faiss_index.
    index, texts, df = build_faiss_index("Final_Clean_Data.csv")

    if send_button and user_message:
        normalized_query = user_message.lower()

        if "total rows" in normalized_query or "how many rows" in normalized_query:
            # Row-count questions are answered straight from the DataFrame
            # to avoid hallucination.
            answer = f"The CSV has {df.shape[0]} rows."
        elif "distribution of" in normalized_query and "ae_cluster" in normalized_query:
            # Distribution of ae_cluster is computed directly when the column exists.
            if "ae_cluster" in df.columns:
                counts = df["ae_cluster"].value_counts()
                answer = f"Distribution of ae_cluster:\n{counts.to_string()}"
            else:
                answer = "The CSV does not contain a column named 'ae_cluster'."
        else:
            # Everything else (including other distributions) goes through
            # retrieval-augmented generation.
            answer = answer_query(user_message, index, texts)

        st.sidebar.subheader("Answer")
        st.sidebar.write(answer)