"""Streamlit app: chat with an uploaded data file (CSV/XLSX/JSON/PDF/DOCX).

Tabular files are loaded into a pandas DataFrame and served through a
pandas-dataframe agent; document files are chunked and indexed into a
persistent Chroma vector store that a RetrievalQA chain queries.
"""
import io
import json
import os

import pandas as pd
import PyPDF2
import streamlit as st
from docx import Document
from dotenv import load_dotenv, find_dotenv
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

_ = load_dotenv(find_dotenv())

# API key comes from the environment (optionally populated from a .env file).
API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize the persistent Chroma vector store used by the RetrievalQA chain.
embeddings_model = OpenAIEmbeddings(openai_api_key=API_KEY)
persist_directory = "db"
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings_model,
)


def extract_text_from_file(file_content, file_type):
    """Extract raw text from a PDF or Word (.docx) document.

    Args:
        file_content: Raw bytes of the uploaded file.
        file_type: Either "pdf" or "docx".

    Returns:
        The concatenated text of all pages (PDF) or paragraphs (DOCX).

    Raises:
        ValueError: For any other file type.
    """
    if file_type == "pdf":
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # extract_text() may return None for image-only pages; guard with "or ''"
        # so the join never sees a non-string.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    elif file_type == "docx":
        doc = Document(io.BytesIO(file_content))
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
    return text


def create_agent(file_content, file_type):
    """Load an uploaded file into a DataFrame, index it in Chroma, and build an agent.

    Args:
        file_content: Raw bytes of the uploaded file.
        file_type: One of "csv", "xlsx", "json", "pdf", "docx".

    Returns:
        A LangChain pandas-dataframe agent over the loaded data.

    Raises:
        ValueError: If file_type is not supported.
    """
    if file_type == "csv":
        df = pd.read_csv(io.StringIO(file_content.decode("utf-8")), header=0)
    elif file_type == "xlsx":
        # read_excel expects a path or binary buffer, not raw bytes.
        df = pd.read_excel(io.BytesIO(file_content), header=0)
    elif file_type == "json":
        df = pd.DataFrame(json.loads(file_content.decode("utf-8")))
    elif file_type in ("pdf", "docx"):
        text = extract_text_from_file(file_content, file_type)
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        df = pd.DataFrame({"text": text_splitter.split_text(text)})
    else:
        raise ValueError(f"Unsupported file type: {file_type}")

    # Index the data in Chroma, but only on first load (empty collection).
    # BUG FIX: the original unconditionally read df['text'], which raised a
    # KeyError for tabular uploads (csv/xlsx/json have their own columns);
    # fall back to a row-wise JSON rendering for those.
    if not vectorstore._collection.count():
        if "text" in df.columns:
            texts = df["text"].tolist()
        else:
            texts = [row.to_json() for _, row in df.iterrows()]
        vectorstore.add_texts(
            texts=texts,
            metadatas=[{"source": file_type}] * len(texts),
        )

    llm = OpenAI(openai_api_key=API_KEY)
    return create_pandas_dataframe_agent(llm, df, verbose=False)


def query_agent(query):
    """Answer a natural-language query with a RetrievalQA chain over Chroma.

    Args:
        query: The user's question.

    Returns:
        The chain's answer as a string.
    """
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(openai_api_key=API_KEY),
        chain_type="stuff",
        # Retrieve the 5 most similar chunks to stuff into the prompt.
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    )
    result = qa_chain({"query": query})
    return result["result"]


# --- Streamlit app ---
st.title("👨‍💻 Chat with your data")
st.write("Please upload your data file below.")

uploaded_file = st.file_uploader(
    "Upload a file", type=["csv", "xlsx", "json", "pdf", "docx"]
)

if uploaded_file is not None:
    file_content = uploaded_file.read()
    # Lower-case the extension so "DATA.CSV" is handled like "data.csv".
    file_type = uploaded_file.name.rsplit(".", 1)[-1].lower()

    query = st.text_area("Type your query here")

    if st.button("Submit Query", type="primary"):
        # Load and persist the data on first use (empty Chroma collection).
        if not vectorstore._collection.count():
            create_agent(file_content, file_type)
            vectorstore.persist()
            st.write("Data loaded and persisted to Chroma.")
        response = query_agent(query)
        st.write(response)