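"""VecDBCompare: a Streamlit app for comparing three vector-database retrieval
strategies (smaller chunks, summaries, hypothetical questions) over an
uploaded PDF, each backed by a LangChain MultiVectorRetriever."""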
import os
import uuid
import tempfile

import streamlit as st
import openai
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.document import Document
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.document_loaders import PyPDFLoader
# Set the OpenAI API key from Streamlit secrets
OPENAI_API_KEY = st.secrets.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    st.error("OPENAI_API_KEY not set in Streamlit secrets!")
    st.stop()
openai.api_key = OPENAI_API_KEY
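# Load the uploaded PDF from a temporary file and split it into ~1000-character
# parent chunks; these parent documents are shared by all three strategies.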
def process_pdf(uploaded_file):
    with st.spinner("Processing PDF..."):
        # Persist the upload to a temporary file so PyPDFLoader can read it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(uploaded_file.getvalue())
            tmp_path = tmp.name
        docs = PyPDFLoader(tmp_path).load()
        os.unlink(tmp_path)  # clean up the temporary file
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
        docs = text_splitter.split_documents(docs)
    return docs
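# Strategy 1: "smaller chunks" — split each parent document into small child
# chunks, embed the child chunks for similarity search, and return the full
# parent document (looked up by doc_id) at answer time.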
def smaller_chunks_strategy(docs):
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="1")
    if prompt:
        with st.spinner('Processing with smaller_chunks_strategy'):
            vectorstore = Chroma(
                collection_name="full_documents",
                embedding_function=OpenAIEmbeddings()
            )
            store = InMemoryStore()
            id_key = "doc_id"
            retriever = MultiVectorRetriever(
                vectorstore=vectorstore,
                docstore=store,
                id_key=id_key,
            )
            doc_ids = [str(uuid.uuid4()) for _ in docs]
            child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
            sub_docs = []
            for i, doc in enumerate(docs):
                _id = doc_ids[i]
                _sub_docs = child_text_splitter.split_documents([doc])
                for _doc in _sub_docs:
                    _doc.metadata[id_key] = _id
                sub_docs.extend(_sub_docs)
            retriever.vectorstore.add_documents(sub_docs)
            retriever.docstore.mset(list(zip(doc_ids, docs)))
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
            st.info(prompt, icon="🧐")
            result = qa({"question": prompt})
            st.success(result['answer'], icon="🤖")
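# Strategy 2: "summary" — generate an LLM summary of each document, embed the
# summaries for similarity search, and return the full original document at
# answer time.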
def summary_strategy(docs):
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="2")
    if prompt:
        with st.spinner('Processing with summary_strategy'):
            chain = (
                {"doc": lambda x: x.page_content}
                | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
                | ChatOpenAI(max_retries=0)
                | StrOutputParser()
            )
            summaries = chain.batch(docs, {"max_concurrency": 5})
            vectorstore = Chroma(
                collection_name="summaries",
                embedding_function=OpenAIEmbeddings()
            )
            store = InMemoryStore()
            id_key = "doc_id"
            retriever = MultiVectorRetriever(
                vectorstore=vectorstore,
                docstore=store,
                id_key=id_key,
            )
            doc_ids = [str(uuid.uuid4()) for _ in docs]
            summary_docs = [
                Document(page_content=s, metadata={id_key: doc_ids[i]})
                for i, s in enumerate(summaries)
            ]
            retriever.vectorstore.add_documents(summary_docs)
            retriever.docstore.mset(list(zip(doc_ids, docs)))
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
            st.info(prompt, icon="🧐")
            result = qa({"question": prompt})
            st.success(result['answer'], icon="🤖")
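# Strategy 3: "hypothetical questions" — use OpenAI function calling to
# generate questions each document could answer, embed those questions for
# similarity search, and return the full original document at answer time.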
def hypothetical_questions_strategy(docs):
    prompt = st.text_input("Enter Your Question:", placeholder="Ask something", key="3")
    if prompt:
        with st.spinner('Processing with hypothetical_questions_strategy'):
            functions = [
                {
                    "name": "hypothetical_questions",
                    "description": "Generate hypothetical questions",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "questions": {
                                "type": "array",
                                "items": {"type": "string"},
                            },
                        },
                        "required": ["questions"],
                    },
                }
            ]
            chain = (
                {"doc": lambda x: x.page_content}
                | ChatPromptTemplate.from_template(
                    "Generate a list of 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
                )
                | ChatOpenAI(max_retries=0, model="gpt-4").bind(
                    functions=functions, function_call={"name": "hypothetical_questions"}
                )
                | JsonKeyOutputFunctionsParser(key_name="questions")
            )
            hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})
            vectorstore = Chroma(
                collection_name="hypo-questions",
                embedding_function=OpenAIEmbeddings()
            )
            store = InMemoryStore()
            id_key = "doc_id"
            retriever = MultiVectorRetriever(
                vectorstore=vectorstore,
                docstore=store,
                id_key=id_key,
            )
            doc_ids = [str(uuid.uuid4()) for _ in docs]
            question_docs = []
            for i, question_list in enumerate(hypothetical_questions):
                question_docs.extend(
                    [Document(page_content=s, metadata={id_key: doc_ids[i]}) for s in question_list]
                )
            retriever.vectorstore.add_documents(question_docs)
            retriever.docstore.mset(list(zip(doc_ids, docs)))
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever, memory=memory)
            st.info(prompt, icon="🧐")
            result = qa({"question": prompt})
            st.success(result['answer'], icon="🤖")
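# Streamlit UI: upload a PDF, pick one of the three retrieval strategies, and
# ask questions against it.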
def app():
    image_path = "icon.png"
    st.sidebar.image(image_path, caption="icon", use_column_width=True)
    st.title("VecDBCompare 0.0.1")
    st.sidebar.markdown("""
# 🚀 **VecDBCompare: Your Vector DB Strategy Tester**

## 🔍 **What is it?**
VecDBCompare lets you evaluate and compare three vector database retrieval strategies in a snap!

## 🤔 **How to Use?**
1. **Upload a PDF** 📄
2. Get **Three QABots** 🤖🤖🤖, each with a different strategy.
3. **Ask questions** ❓ and see how each bot responds differently.
4. **Decide** ✅ which strategy works best for you!

## 🌟 **Why VecDBCompare?**
- **Simple & Fast** ⚡: Upload, ask, and compare!
- **Real-time Comparison** 📊: See strategies in action side-by-side.
- **Empower Your Choice** 💡: Pick the best strategy for your needs.

Dive in and discover with VecDBCompare! 🚀
""")
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
    if uploaded_file:
        docs = process_pdf(uploaded_file)
        option = st.selectbox(
            "Which retrieval strategy would you like to use?",
            ("Smaller Chunks", "Summary", "Hypothetical Questions")
        )
        if option == 'Smaller Chunks':
            smaller_chunks_strategy(docs)
        elif option == 'Summary':
            summary_strategy(docs)
        elif option == 'Hypothetical Questions':
            hypothetical_questions_strategy(docs)

if __name__ == "__main__":
    st.set_page_config(layout="wide")
    app()