import streamlit as st from dotenv import load_dotenv import os import sqlite3 import pandas as pd import google.generativeai as genai import re from langchain import hub from langchain_chroma import Chroma from langchain_community.document_loaders import PyPDFLoader from langchain_openai import OpenAIEmbeddings from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_groq import ChatGroq from langchain.prompts import PromptTemplate from langchain.schema.runnable import RunnablePassthrough import tempfile # Load environment variables load_dotenv() # Configure API keys genai.configure(api_key=os.getenv("GOOGLE_API_KEY")) groq_api_key = os.getenv("GROQ_API_KEY") # Function to load Google Gemini Model and get response def get_gemini_response(question, prompt, schema_info): model = genai.GenerativeModel('gemini-pro') response = model.generate_content([prompt, schema_info, question]) return response.text # Function to retrieve query from the database def read_sql_query(sql, db): try: conn = sqlite3.connect(db) df = pd.read_sql_query(sql, conn) conn.close() return df except sqlite3.Error as e: st.error(f"An error occurred: {e.args[0]}") return None # Function to convert DataFrame to SQLite database def dataframe_to_sqlite(df, db_name, table_name): conn = sqlite3.connect(db_name) df.to_sql(table_name, conn, if_exists='replace', index=False) conn.close() st.success("Data successfully loaded into the database!") # Function to get schema information def get_schema_info(df): columns = df.columns.tolist() dtypes = df.dtypes.astype(str).tolist() schema_info = "Table name: DATA\nColumns:\n" for col, dtype in zip(columns, dtypes): schema_info += f"- {col} ({dtype})\n" return schema_info # Function to clean SQL query def clean_sql_query(query): query = re.sub(r'```sql|```', '', query) query = query.strip() return query # Function to process PDF file def process_document(file): with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file: tmp_file.write(file.getvalue()) tmp_file_path = tmp_file.name loader = PyPDFLoader(tmp_file_path) documents = loader.load() os.unlink(tmp_file_path) # Delete the temporary file text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) splits = text_splitter.split_documents(documents) vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings()) return vectorstore.as_retriever() # Streamlit App st.set_page_config(page_title="AI-Powered Data Assistant") st.header("AI-Powered Data Assistant") # File upload section st.subheader("Upload Your Data") excel_file = st.file_uploader("Choose an Excel or CSV file", type=["xlsx", "csv"]) pdf_file = st.file_uploader("Choose a PDF file", type=["pdf"]) if excel_file: try: if excel_file.name.endswith('.csv'): df = pd.read_csv(excel_file) else: df = pd.read_excel(excel_file) dataframe_to_sqlite(df, "data.db", "DATA") st.write(df.head()) st.success("Excel/CSV file successfully uploaded and data loaded into the database!") st.session_state['schema_info'] = get_schema_info(df) except Exception as e: st.error(f"Error processing the Excel/CSV file: {str(e)}") if pdf_file: try: retriever = process_document(pdf_file) st.success("PDF file successfully processed!") st.session_state['retriever'] = retriever except Exception as e: st.error(f"Error processing the PDF file: {str(e)}") # Add radio button for data source selection data_source = st.radio( "Choose your data source for the query:", ("PDF", "Excel/CSV"), index=None, key="data_source" ) # User query input question = st.text_input("Ask a question about your data:", key="input") submit = st.button("Get Answer") if submit and question: if not data_source: st.error("Please select a data source (PDF or Excel/CSV) before submitting your question.") elif data_source == "PDF" and 'retriever' in st.session_state: with st.spinner("Processing your question using the PDF content..."): llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768") pdf_prompt = PromptTemplate.from_template(""" You are an AI assistant specialized in analyzing and answering questions about PDF documents. Use the provided context to answer the user's question accurately and concisely. If the answer is not directly stated in the context, use your knowledge to provide a reasonable response, but make it clear when you're inferring or speculating. If you cannot answer the question based on the given context, say so clearly. Context: {context} Question: {input} Provide a clear, concise, and informative answer: """) rag_chain = ( {"context": st.session_state['retriever'], "input": RunnablePassthrough()} | pdf_prompt | llm ) response = rag_chain.invoke(question) st.subheader("Answer:") st.write(response.content) elif data_source == "Excel/CSV" and 'schema_info' in st.session_state: with st.spinner("Generating SQL query..."): sql_query = get_gemini_response(question, "Generate SQL for this query:", st.session_state['schema_info']) clean_query = clean_sql_query(sql_query) st.subheader("Generated SQL Query:") st.code(clean_query, language="sql") with st.spinner("Executing query and fetching results..."): result_df = read_sql_query(clean_query, "data.db") if result_df is not None and not result_df.empty: st.subheader("Query Result:") st.dataframe(result_df) else: st.warning("No data found or an error occurred while executing the query.") else: st.error(f"Unable to process the query. Please make sure you've uploaded the appropriate file ({data_source}).")