# Hugging Face Spaces app — PDF question-answering over FAISS (status at capture time: runtime error).
| import os | |
| from langchain.document_loaders import PyPDFium2Loader | |
| from langchain.embeddings.openai import OpenAIEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import FAISS | |
| from sllim import chat | |
# Prompt template for answering a question over retrieved PDF chunks.
| template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct. | |
| DOCUMENTS: | |
| {docs} | |
| --- | |
| QUERY: | |
| {query} | |
| """ | |
| embeddings = OpenAIEmbeddings() | |
def process_file(file_path):
    """Build and persist a FAISS index for *file_path*.

    No-op when an index directory for this file already exists on disk.
    """
    index_path = get_index_name(file_path)
    if os.path.exists(index_path):
        # Already indexed on a previous run — nothing to do.
        return
    # Load the PDF and split it into overlapping ~1000-char chunks.
    pages = PyPDFium2Loader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = splitter.split_documents(pages)
    # Embed the chunks and persist the vector store next to the script.
    FAISS.from_documents(chunks, embeddings).save_local(index_path)
def get_index_name(file_path):
    """Derive the on-disk FAISS index directory name for *file_path*.

    Uses the file's base name without its final extension, suffixed
    with ``_faiss_index``.
    """
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    return f"{stem}_faiss_index"
def ask_question_all(history):
    """Answer the pending user message using every PDF uploaded in *history*.

    *history* is a list of (user, bot) turns. A non-string ``user`` entry is
    treated as a file upload (a sequence whose first element is the path);
    a turn with a ``bot`` reply is replayed verbatim; a turn with no reply
    is the pending question, answered with context retrieved from all
    previously uploaded files.
    """
    index_names = []
    retrieved = []
    messages = []
    for user_turn, bot_turn in history:
        if not isinstance(user_turn, str):
            # File upload — remember its index; assumes user_turn[0] is the
            # uploaded file's path (gradio-style tuple) — TODO confirm.
            index_names.append(get_index_name(user_turn[0]))
        elif bot_turn:
            # Completed exchange: replay both sides for conversational context.
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": bot_turn})
        else:
            # Pending message: search every uploaded file's index.
            # NOTE: `retrieved` accumulates across pending turns by design
            # of the original implementation.
            for name in index_names:
                store = FAISS.load_local(name, embeddings)
                retrieved.extend(store.similarity_search(user_turn))
            context = "\n".join(doc.page_content for doc in retrieved)
            messages.append(
                {
                    "role": "user",
                    "content": template.format(query=user_turn, docs=context),
                }
            )
    # Send the assembled conversation plus retrieved context to the model.
    return chat(messages, model="gpt-3.5-turbo")
def ask_question(query, upload_file, history=None):
    """Answer *query* against a single uploaded PDF.

    Indexes the file on first use (delegating to ``process_file`` instead of
    duplicating the load/split/embed pipeline inline, which the original did),
    then retrieves the most similar chunks and sends them with the question
    to the chat model.

    :param query: the user's question about the document.
    :param upload_file: object with a ``.name`` attribute holding the file path
        (e.g. a gradio upload) — TODO confirm against caller.
    :param history: accepted for interface compatibility; unused.
    :return: the model's answer string from ``chat``.
    """
    file_path = upload_file.name
    # Build the index if it does not exist yet (no-op otherwise), then load it.
    process_file(file_path)
    db = FAISS.load_local(get_index_name(file_path), embeddings)
    # Retrieve chunks most similar to the question.
    docs = db.similarity_search(query)
    messages = [
        {
            "role": "user",
            "content": template.format(
                query=query, docs="\n".join(doc.page_content for doc in docs)
            ),
        }
    ]
    # Send similar paragraphs with the question to the model.
    return chat(messages, model="gpt-3.5-turbo")