# NOTE(review): the lines below were Hugging Face Spaces page chrome
# ("Spaces: Running") captured in a copy-paste — kept only as a comment
# so the module remains valid Python.
| import streamlit as st | |
| import streamlit.components.v1 as components | |
| import pandas as pd | |
| import openai | |
| import faiss | |
| import numpy as np | |
# ------------------ RAG FUNCTIONS ------------------ #
def build_faiss_index(csv_file: str):
    """
    Load the CSV, embed each row plus a summary chunk, and build a FAISS index.

    Every row becomes one text chunk containing all "column: value" pairs, and
    a dedicated summary chunk (total row count) is prepended so key facts are
    retrievable.

    Args:
        csv_file: path to the CSV file to index.

    Returns:
        index: FAISS IndexFlatIP over the chunk embeddings.
        texts: list of chunk strings (summary first, then one per row),
            aligned with the index's vector order.
        df: the loaded pandas DataFrame.
    """
    df = pd.read_csv(csv_file)

    # Summary chunk carries key facts (e.g. total row count) so retrieval
    # can surface them directly.
    summary_chunk = f"SUMMARY: Total rows in the CSV = {df.shape[0]}."

    # One detailed chunk per row, with column names and their values.
    text_chunks = [
        f"Row {idx}: " + " | ".join(f"{col}: {row[col]}" for col in df.columns)
        for idx, row in df.iterrows()
    ]
    texts = [summary_chunk] + text_chunks

    # BUGFIX/perf: batch the embedding requests instead of one API call per
    # chunk — the Embeddings API accepts a list input and returns one
    # embedding per item, so this cuts round-trips from O(rows) to O(rows/100).
    embeddings = []
    batch_size = 100  # stay well under the API's per-request input limits
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=batch,
        )
        # Sort by the returned "index" field to be robust to out-of-order
        # results within a batch.
        for item in sorted(response["data"], key=lambda d: d["index"]):
            embeddings.append(item["embedding"])

    # FAISS requires float32 vectors.
    embedding_matrix = np.array(embeddings, dtype=np.float32)
    dimension = embedding_matrix.shape[1]

    # Inner-product index. For true cosine similarity, L2-normalize the
    # corpus (and query) vectors first:
    # faiss.normalize_L2(embedding_matrix)
    index = faiss.IndexFlatIP(dimension)
    index.add(embedding_matrix)
    return index, texts, df
def get_relevant_chunks(query: str, index, texts, top_k=10):
    """
    Embed the query and return up to top_k most similar text chunks.

    Args:
        query: natural-language question to embed.
        index: FAISS index built over the embeddings of `texts`.
        texts: chunk list aligned with the index's vector order.
        top_k: maximum number of chunks to return.

    Returns:
        List of chunk strings, best match first (may be shorter than top_k
        when the index holds fewer vectors).
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=query,
    )
    query_embedding = np.array(
        response["data"][0]["embedding"], dtype=np.float32
    ).reshape(1, -1)
    # If the corpus was L2-normalized for cosine similarity, normalize
    # query_embedding here as well.
    distances, indices = index.search(query_embedding, top_k)
    # BUGFIX: FAISS pads the result with -1 when top_k exceeds the number of
    # stored vectors; texts[-1] would then silently duplicate the last chunk.
    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]
def answer_query(query: str, index, texts):
    """
    Answer `query` via retrieval-augmented generation.

    Retrieves the most relevant chunks from the FAISS index, then asks the
    ChatCompletion API to answer strictly from that retrieved context.
    """
    context_chunks = get_relevant_chunks(query, index, texts, top_k=10)
    context_block = "\n\n".join(context_chunks)

    # System message pins the model to the retrieved data only.
    system_prompt = (
        "You are a helpful assistant. Answer ONLY using the provided data context. "
        "Do not add any information that isn't in the context."
    )
    user_prompt = (
        f"Data:\n{context_block}\n\n"
        f"Question: {query}\n\n"
        "Answer using only the data above."
    )

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,  # deterministic, context-grounded answers
    )
    return response["choices"][0]["message"]["content"]
# ------------------ EXISTING APP LAYOUT ------------------ #
st.set_page_config(layout="wide")  # wide layout suits the embedded dashboard
st.title("Model Insights")

# Embed the Looker Studio dashboard in the main area.
new_dashboard_url = (
    "https://lookerstudio.google.com/embed/reporting/"
    "b3fcc2c4-24c5-4869-b128-c71e658b3f16/page/7m1DF"
)
iframe_code = f'''
<iframe width="100%" height="100%"
src="{new_dashboard_url}"
frameborder="0"
style="border:0; margin:0; padding:0; height: calc(100vh - 4rem);"
allowfullscreen
sandbox="allow-storage-access-by-user-activation allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox">
</iframe>
'''
components.html(iframe_code, height=800)
# ------------------ SIDEBAR FOR Q&A ------------------ #
with st.sidebar:
    st.markdown(
        "<h2 style='border-bottom: 1px solid #ccc; color: #3949ab;'>Ask Your Data</h2>",
        unsafe_allow_html=True,
    )
    openai_api_key = st.text_input("Enter your OpenAI API key:", type="password")
    user_message = st.text_input("", placeholder="Ask a question from the data...")
    send_button = st.button('Generate Answer')

# Only proceed if the API key is provided.
if openai_api_key:
    openai.api_key = openai_api_key
    if send_button and user_message:
        query_lower = user_message.lower()
        # Handle simple factual queries directly from the CSV to avoid
        # hallucination — and without any embedding calls.
        if "total rows" in query_lower or "how many rows" in query_lower:
            df = pd.read_csv("Final_Clean_Data.csv")
            answer = f"The CSV has {df.shape[0]} rows."
        elif "distribution of" in query_lower and "ae_cluster" in query_lower:
            # e.g. "distribution of ae_cluster"
            df = pd.read_csv("Final_Clean_Data.csv")
            if "ae_cluster" in df.columns:
                distribution = df["ae_cluster"].value_counts()
                answer = f"Distribution of ae_cluster:\n{distribution.to_string()}"
            else:
                answer = "The CSV does not contain a column named 'ae_cluster'."
        else:
            # BUGFIX: build the FAISS index only when a query actually needs
            # it. Previously it was rebuilt — one embedding API call per CSV
            # row — on every Streamlit rerun as soon as a key was entered,
            # even with no question asked.
            index, texts, df = build_faiss_index("Final_Clean_Data.csv")
            # Retrieval-augmented generation for general queries (including
            # distributions of other columns).
            answer = answer_query(user_message, index, texts)
        st.sidebar.subheader("Answer")
        st.sidebar.write(answer)
else:
    st.sidebar.warning("Please enter your OpenAI API key to proceed.")