mayankjen's picture
first upload
fe07c43 verified
import streamlit as st
from dotenv import load_dotenv
import os
import sqlite3
import pandas as pd
import google.generativeai as genai
import re
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
import tempfile
# Load environment variables
load_dotenv()
# Configure API keys
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
groq_api_key = os.getenv("GROQ_API_KEY")
# Function to load Google Gemini Model and get response
def get_gemini_response(question, prompt, schema_info):
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content([prompt, schema_info, question])
return response.text
# Function to retrieve query from the database
def read_sql_query(sql, db):
try:
conn = sqlite3.connect(db)
df = pd.read_sql_query(sql, conn)
conn.close()
return df
except sqlite3.Error as e:
st.error(f"An error occurred: {e.args[0]}")
return None
# Function to convert DataFrame to SQLite database
def dataframe_to_sqlite(df, db_name, table_name):
conn = sqlite3.connect(db_name)
df.to_sql(table_name, conn, if_exists='replace', index=False)
conn.close()
st.success("Data successfully loaded into the database!")
# Function to get schema information
def get_schema_info(df):
columns = df.columns.tolist()
dtypes = df.dtypes.astype(str).tolist()
schema_info = "Table name: DATA\nColumns:\n"
for col, dtype in zip(columns, dtypes):
schema_info += f"- {col} ({dtype})\n"
return schema_info
# Function to clean SQL query
def clean_sql_query(query):
query = re.sub(r'```sql|```', '', query)
query = query.strip()
return query
# Function to process PDF file
def process_document(file):
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
tmp_file.write(file.getvalue())
tmp_file_path = tmp_file.name
loader = PyPDFLoader(tmp_file_path)
documents = loader.load()
os.unlink(tmp_file_path) # Delete the temporary file
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(documents)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
return vectorstore.as_retriever()
# Streamlit App
st.set_page_config(page_title="AI-Powered Data Assistant")
st.header("AI-Powered Data Assistant")
# File upload section
st.subheader("Upload Your Data")
excel_file = st.file_uploader("Choose an Excel or CSV file", type=["xlsx", "csv"])
pdf_file = st.file_uploader("Choose a PDF file", type=["pdf"])
if excel_file:
try:
if excel_file.name.endswith('.csv'):
df = pd.read_csv(excel_file)
else:
df = pd.read_excel(excel_file)
dataframe_to_sqlite(df, "data.db", "DATA")
st.write(df.head())
st.success("Excel/CSV file successfully uploaded and data loaded into the database!")
st.session_state['schema_info'] = get_schema_info(df)
except Exception as e:
st.error(f"Error processing the Excel/CSV file: {str(e)}")
if pdf_file:
try:
retriever = process_document(pdf_file)
st.success("PDF file successfully processed!")
st.session_state['retriever'] = retriever
except Exception as e:
st.error(f"Error processing the PDF file: {str(e)}")
# Add radio button for data source selection
data_source = st.radio(
"Choose your data source for the query:",
("PDF", "Excel/CSV"),
index=None,
key="data_source"
)
# User query input
question = st.text_input("Ask a question about your data:", key="input")
submit = st.button("Get Answer")
if submit and question:
if not data_source:
st.error("Please select a data source (PDF or Excel/CSV) before submitting your question.")
elif data_source == "PDF" and 'retriever' in st.session_state:
with st.spinner("Processing your question using the PDF content..."):
llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
pdf_prompt = PromptTemplate.from_template("""
You are an AI assistant specialized in analyzing and answering questions about PDF documents.
Use the provided context to answer the user's question accurately and concisely.
If the answer is not directly stated in the context, use your knowledge to provide a reasonable response,
but make it clear when you're inferring or speculating.
If you cannot answer the question based on the given context, say so clearly.
Context: {context}
Question: {input}
Provide a clear, concise, and informative answer:
""")
rag_chain = (
{"context": st.session_state['retriever'], "input": RunnablePassthrough()}
| pdf_prompt
| llm
)
response = rag_chain.invoke(question)
st.subheader("Answer:")
st.write(response.content)
elif data_source == "Excel/CSV" and 'schema_info' in st.session_state:
with st.spinner("Generating SQL query..."):
sql_query = get_gemini_response(question, "Generate SQL for this query:", st.session_state['schema_info'])
clean_query = clean_sql_query(sql_query)
st.subheader("Generated SQL Query:")
st.code(clean_query, language="sql")
with st.spinner("Executing query and fetching results..."):
result_df = read_sql_query(clean_query, "data.db")
if result_df is not None and not result_df.empty:
st.subheader("Query Result:")
st.dataframe(result_df)
else:
st.warning("No data found or an error occurred while executing the query.")
else:
st.error(f"Unable to process the query. Please make sure you've uploaded the appropriate file ({data_source}).")