Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| from llama_parse import LlamaParse | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings.fastembed import FastEmbedEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_community.document_loaders import UnstructuredMarkdownLoader | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains import RetrievalQA | |
| from langchain_groq import ChatGroq | |
| import joblib | |
| import tempfile | |
# API keys.
# Credentials are read from the environment instead of being committed to the
# source file. Set LLAMA_CLOUD_API_KEY and GROQ_API_KEY (for example as
# deployment secrets) before starting the app. Each value is None when the
# corresponding variable is unset.
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")
def load_or_parse_data(uploaded_file):
    """Parse an uploaded PDF with LlamaParse and return the parsed documents.

    The upload's bytes are written to a temporary file whose path is handed
    to ``LlamaParse.load_data``; the temporary file is removed afterwards,
    whether or not parsing succeeds. Despite the function's name, nothing is
    cached or reloaded here: every call parses the file again.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the file
            content as bytes (the script passes the value returned by
            ``st.file_uploader``).

    Returns:
        The value returned by ``LlamaParse.load_data`` for the file.
    """
    parsing_instruction = (
        "The provided document is a quarterly report filed by Uber Technologies,\n"
        "Inc. with the Securities and Exchange Commission (SEC)...\n"
    )
    # Build the parser first so a configuration error cannot leave a stray
    # temporary file behind.
    parser = LlamaParse(
        api_key=llama_cloud_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
        max_timeout=5000,
    )

    # delete=False: the file must outlive the ``with`` block so the parser can
    # open it by path. The uploader only accepts PDFs, so give the file a
    # ".pdf" suffix.
    # NOTE(review): LlamaParse appears to identify the file type from the
    # extension -- confirm against the installed version.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    try:
        return parser.load_data(temp_file_path)
    finally:
        # Always clean up, even when parsing raises.
        os.remove(temp_file_path)
# Streamlit script body: the user uploads a PDF, the parsed text is written to
# a markdown file, chunked, embedded into a Chroma collection, and a fixed set
# of example questions is answered through a RetrievalQA chain.
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    llama_parse_documents = load_or_parse_data(uploaded_file)
    if llama_parse_documents:
        # Create data directory if it doesn't exist
        os.makedirs("data", exist_ok=True)

        # Write the parsed text to a markdown file. The file is overwritten
        # ("w") rather than appended to, so text from earlier uploads or
        # reruns does not pile up and get indexed again.
        markdown_path = "data/output.md"
        with open(markdown_path, "w", encoding="utf-8") as f:
            f.writelines(doc.text + "\n" for doc in llama_parse_documents)

        loader = UnstructuredMarkdownLoader(markdown_path)
        documents = loader.load()

        # Split loaded documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        docs = text_splitter.split_documents(documents)

        # Initialize Embeddings
        embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

        if docs:
            # Create and persist a Chroma vector database from the chunked documents
            # NOTE(review): the same persist directory and collection name are
            # used on every run, so chunks from previous runs may still be in
            # the collection -- confirm whether it should be reset first.
            vs = Chroma.from_documents(
                documents=docs,
                embedding=embed_model,
                persist_directory="chroma_db_llamaparse1",
                collection_name="rag",
            )

            # Initialize ChatGroq model
            chat_model = ChatGroq(
                temperature=0,
                model_name="mixtral-8x7b-32768",
                api_key=groq_api_key,
            )

            # Prompt that injects the retrieved context and the user question
            custom_prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
"""
            prompt = PromptTemplate(
                template=custom_prompt_template,
                input_variables=['context', 'question'],
            )

            # Initialize RetrievalQA; the retriever is asked for k=3 chunks
            qa = RetrievalQA.from_chain_type(
                llm=chat_model,
                chain_type="stuff",
                retriever=vs.as_retriever(search_kwargs={'k': 3}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": prompt},
            )

            def ask_question(question):
                """Run ``question`` through the QA chain and return its "result" entry."""
                response = qa.invoke({"query": question})
                return response["result"]

            # Example questions
            example_questions = [
                "What is the Balance of UBER TECHNOLOGIES, INC. as of December 31, 2021?",
                "What is the Cash flows from operating activities associated with bad expense specified in the document?",
                "What is Loss (income) from equity method investments, net?",
            ]

            # Ask questions and display answers
            for idx, question in enumerate(example_questions, start=1):
                st.subheader(f"Question {idx}: {question}")
                answer = ask_question(question)
                st.write(f"Answer: {answer}")
    else:
        st.write("No documents were parsed.")