File size: 6,508 Bytes
fe07c43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import streamlit as st
from dotenv import load_dotenv
import os
import sqlite3
import pandas as pd
import google.generativeai as genai
import re
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
import tempfile

# Load environment variables
load_dotenv()

# Configure API keys
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
groq_api_key = os.getenv("GROQ_API_KEY")

# Function to load Google Gemini Model and get response
def get_gemini_response(question, prompt, schema_info):
    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content([prompt, schema_info, question])
    return response.text

# Function to retrieve query from the database
def read_sql_query(sql, db):
    try:
        conn = sqlite3.connect(db)
        df = pd.read_sql_query(sql, conn)
        conn.close()
        return df
    except sqlite3.Error as e:
        st.error(f"An error occurred: {e.args[0]}")
        return None

# Function to convert DataFrame to SQLite database
def dataframe_to_sqlite(df, db_name, table_name):
    conn = sqlite3.connect(db_name)
    df.to_sql(table_name, conn, if_exists='replace', index=False)
    conn.close()
    st.success("Data successfully loaded into the database!")

# Function to get schema information
def get_schema_info(df):
    columns = df.columns.tolist()
    dtypes = df.dtypes.astype(str).tolist()
    schema_info = "Table name: DATA\nColumns:\n"
    for col, dtype in zip(columns, dtypes):
        schema_info += f"- {col} ({dtype})\n"
    return schema_info

# Function to clean SQL query
def clean_sql_query(query):
    query = re.sub(r'```sql|```', '', query)
    query = query.strip()
    return query

# Function to process PDF file
def process_document(file):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file_path = tmp_file.name

    loader = PyPDFLoader(tmp_file_path)
    documents = loader.load()
    
    os.unlink(tmp_file_path)  # Delete the temporary file
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = text_splitter.split_documents(documents)
    
    vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
    return vectorstore.as_retriever()

# Streamlit App
st.set_page_config(page_title="AI-Powered Data Assistant")
st.header("AI-Powered Data Assistant")

# File upload section
st.subheader("Upload Your Data")
excel_file = st.file_uploader("Choose an Excel or CSV file", type=["xlsx", "csv"])
pdf_file = st.file_uploader("Choose a PDF file", type=["pdf"])

if excel_file:
    try:
        if excel_file.name.endswith('.csv'):
            df = pd.read_csv(excel_file)
        else:
            df = pd.read_excel(excel_file)
        
        dataframe_to_sqlite(df, "data.db", "DATA")
        st.write(df.head())
        st.success("Excel/CSV file successfully uploaded and data loaded into the database!")
        
        st.session_state['schema_info'] = get_schema_info(df)
    except Exception as e:
        st.error(f"Error processing the Excel/CSV file: {str(e)}")

if pdf_file:
    try:
        retriever = process_document(pdf_file)
        st.success("PDF file successfully processed!")
        st.session_state['retriever'] = retriever
    except Exception as e:
        st.error(f"Error processing the PDF file: {str(e)}")

# Add radio button for data source selection
data_source = st.radio(
    "Choose your data source for the query:",
    ("PDF", "Excel/CSV"),
    index=None,
    key="data_source"
)

# User query input
question = st.text_input("Ask a question about your data:", key="input")
submit = st.button("Get Answer")

if submit and question:
    if not data_source:
        st.error("Please select a data source (PDF or Excel/CSV) before submitting your question.")
    elif data_source == "PDF" and 'retriever' in st.session_state:
        with st.spinner("Processing your question using the PDF content..."):
            llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
            pdf_prompt = PromptTemplate.from_template("""

            You are an AI assistant specialized in analyzing and answering questions about PDF documents. 

            Use the provided context to answer the user's question accurately and concisely. 

            If the answer is not directly stated in the context, use your knowledge to provide a reasonable response, 

            but make it clear when you're inferring or speculating. 

            If you cannot answer the question based on the given context, say so clearly.



            Context: {context}



            Question: {input}



            Provide a clear, concise, and informative answer:

            """)
            
            rag_chain = (
                {"context": st.session_state['retriever'], "input": RunnablePassthrough()}
                | pdf_prompt
                | llm
            )
            
            response = rag_chain.invoke(question)
            
            st.subheader("Answer:")
            st.write(response.content)

    elif data_source == "Excel/CSV" and 'schema_info' in st.session_state:
        with st.spinner("Generating SQL query..."):
            sql_query = get_gemini_response(question, "Generate SQL for this query:", st.session_state['schema_info'])
            clean_query = clean_sql_query(sql_query)
        
        st.subheader("Generated SQL Query:")
        st.code(clean_query, language="sql")
        
        with st.spinner("Executing query and fetching results..."):
            result_df = read_sql_query(clean_query, "data.db")
        
        if result_df is not None and not result_df.empty:
            st.subheader("Query Result:")
            st.dataframe(result_df)
        else:
            st.warning("No data found or an error occurred while executing the query.")

    else:
        st.error(f"Unable to process the query. Please make sure you've uploaded the appropriate file ({data_source}).")