import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import openai
import faiss
import numpy as np

# ------------------ RAG FUNCTIONS ------------------ #

@st.cache_resource
def build_faiss_index(csv_file: str):
    """
    Load the CSV, embed each row (with full info from every column) plus a
    dedicated summary chunk, and build a FAISS index over the embeddings.

    Args:
        csv_file: path to the CSV file to index.

    Returns:
        (index, texts, df): the FAISS index, the list of text chunks in
        index order (summary chunk first), and the loaded DataFrame.
    """
    df = pd.read_csv(csv_file)

    # Summary chunk carries key aggregate facts (e.g. total row count) so
    # they are retrievable even though no single row states them.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One detailed chunk per row: "Row N: col: value | col: value | ..."
    text_chunks = []
    for idx, row in df.iterrows():
        row_text = " | ".join([f"{col}: {row[col]}" for col in df.columns])
        text_chunks.append(f"Row {idx}: {row_text}")

    # Prepend the summary chunk so key facts are included in retrieval.
    texts = [summary_chunk] + text_chunks

    # Embed every chunk with OpenAI's embedding model.
    embeddings = []
    for text in texts:
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text
        )
        embeddings.append(response["data"][0]["embedding"])

    # FAISS requires float32.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # IndexFlatIP scores by raw inner product, which is biased by vector
    # magnitude; L2-normalizing first makes the score equivalent to cosine
    # similarity. Queries are normalized the same way in
    # get_relevant_chunks to keep the two sides consistent.
    faiss.normalize_L2(embedding_matrix)
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)

    return index, texts, df


def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Embed `query` and return the top_k most similar text chunks from the
    FAISS index.
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query
    )
    query_embedding = np.array(
        response["data"][0]["embedding"], dtype=np.float32
    ).reshape(1, -1)

    # Normalize to match the normalized index vectors (cosine similarity).
    faiss.normalize_L2(query_embedding)

    distances, indices = index.search(query_embedding, top_k)
    # FAISS pads results with -1 when the index holds fewer than top_k
    # vectors; skip those sentinels instead of indexing texts[-1].
    return [texts[i] for i in indices[0] if i != -1]


def answer_query(query: str, index, texts):
    """
    Retrieve relevant context chunks from the FAISS index and call the
    ChatCompletion API. The prompt instructs GPT to answer solely using
    the provided data, to reduce hallucination.
    """
    relevant_chunks = get_relevant_chunks(query, index, texts, top_k=10)
    combined_context = "\n\n".join(relevant_chunks)

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant. Answer ONLY using the provided data context. "
                "Do not add any information that isn't in the context."
            )
        },
        {
            "role": "user",
            "content": (
                f"Data:\n{combined_context}\n\n"
                f"Question: {query}\n\n"
                "Answer using only the data above."
            )
        }
    ]

    # temperature=0.0 keeps answers deterministic and grounded.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.0
    )
    return response["choices"][0]["message"]["content"]


# ------------------ EXISTING APP LAYOUT ------------------ #

# Set page config to wide layout.
st.set_page_config(layout="wide")
st.title("Model Insights")

# Embed the Looker Studio dashboard in the main area.
# BUG FIX: the original f-string was empty, so new_dashboard_url was never
# rendered; embed it in an actual iframe.
new_dashboard_url = "https://lookerstudio.google.com/embed/reporting/b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
iframe_code = f'''
<iframe src="{new_dashboard_url}" width="100%" height="800"
        frameborder="0" style="border:0" allowfullscreen></iframe>
'''
components.html(iframe_code, height=800)

# ------------------ SIDEBAR FOR Q&A ------------------ #
with st.sidebar:
    # NOTE(review): the original markdown HTML was garbled in the source;
    # reconstructed as a simple header around the visible "Ask Your Data"
    # text — confirm against the intended styling.
    st.markdown("<h2>Ask Your Data</h2>", unsafe_allow_html=True)
    openai_api_key = st.text_input("Enter your OpenAI API key:", type="password")
    user_message = st.text_input("", placeholder="Ask a question from the data...")
    send_button = st.button('Generate Answer')

    # Only proceed if the API key is provided.
    if openai_api_key:
        openai.api_key = openai_api_key
        # Build (or reuse the cached) FAISS index from the CSV file.
        index, texts, df = build_faiss_index("Final_Clean_Data.csv")

        if send_button and user_message:
            query_lower = user_message.lower()
            # Handle factual queries directly to avoid hallucination.
            if "total rows" in query_lower or "how many rows" in query_lower:
                answer = f"The CSV has {df.shape[0]} rows."
            elif "distribution of" in query_lower:
                # For example, if the query asks for "distribution of ae_cluster".
                if "ae_cluster" in query_lower:
                    if "ae_cluster" in df.columns:
                        distribution = df["ae_cluster"].value_counts()
                        answer = f"Distribution of ae_cluster:\n{distribution.to_string()}"
                    else:
                        answer = "The CSV does not contain a column named 'ae_cluster'."
                else:
                    # Fallback: retrieval-based answer for other distributions.
                    answer = answer_query(user_message, index, texts)
            else:
                # Retrieval-augmented generation for general queries.
                answer = answer_query(user_message, index, texts)

            st.sidebar.subheader("Answer")
            st.sidebar.write(answer)
    else:
        st.sidebar.warning("Please enter your OpenAI API key to proceed.")