Spaces:
Runtime error
Runtime error
add pdf capability
Browse files- app.py +18 -7
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -10,7 +10,11 @@ from aimakerspace.openai_utils.prompts import (
|
|
| 10 |
from aimakerspace.openai_utils.embedding import EmbeddingModel
|
| 11 |
from aimakerspace.vectordatabase import VectorDatabase
|
| 12 |
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
|
|
|
|
|
|
|
| 13 |
import chainlit as cl
|
|
|
|
|
|
|
| 14 |
|
| 15 |
system_template = """\
|
| 16 |
Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
|
|
@@ -47,24 +51,31 @@ class RetrievalAugmentedQAPipeline:
|
|
| 47 |
|
| 48 |
return {"response": generate_response(), "context": context_list}
|
| 49 |
|
| 50 |
-
text_splitter = CharacterTextSplitter()
|
| 51 |
-
|
| 52 |
|
| 53 |
def process_text_file(file: AskFileResponse):
|
| 54 |
-
|
| 55 |
|
| 56 |
-
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=
|
| 57 |
temp_file_path = temp_file.name
|
| 58 |
|
| 59 |
with open(temp_file_path, "wb") as f:
|
| 60 |
f.write(file.content)
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return texts
|
| 66 |
|
| 67 |
|
|
|
|
| 68 |
@cl.on_chat_start
|
| 69 |
async def on_chat_start():
|
| 70 |
files = None
|
|
|
|
| 10 |
from aimakerspace.openai_utils.embedding import EmbeddingModel
|
| 11 |
from aimakerspace.vectordatabase import VectorDatabase
|
| 12 |
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
|
| 13 |
+
from langchain_experimental.text_splitter import SemanticChunker
|
| 14 |
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
| 15 |
import chainlit as cl
|
| 16 |
+
import tempfile
|
| 17 |
+
from langchain_community.document_loaders.pdf import PyPDFLoader
|
| 18 |
|
| 19 |
system_template = """\
|
| 20 |
Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
|
|
|
|
| 51 |
|
| 52 |
return {"response": generate_response(), "context": context_list}
|
| 53 |
|
| 54 |
+
# text_splitter = CharacterTextSplitter()
# Semantic splitter: breaks documents at embedding-similarity breakpoints
# instead of fixed character counts; "standard_deviation" selects how the
# breakpoint threshold is computed. Requires OPENAI_API_KEY at import time
# since OpenAIEmbeddings() is constructed here at module level.
text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="standard_deviation")
|
| 56 |
|
| 57 |
def process_text_file(file: AskFileResponse):
    """Load an uploaded .txt or .pdf file and split it into text chunks.

    The uploaded bytes are written to a temporary file so the loaders
    (which expect a filesystem path) can read them; the temporary file is
    always removed afterwards.

    Args:
        file: Chainlit upload response exposing `.name`, `.type`, and
            `.content` (raw bytes).

    Returns:
        list[str]: chunk texts produced by the module-level `text_splitter`.

    Raises:
        ValueError: if the MIME type is neither text/plain nor application/pdf.
    """
    import os

    # Single binary-mode open: the original wrote a text-mode temp file,
    # closed it, then reopened the path in "wb" — one open is enough.
    # Keep the original filename as the suffix so the loader sees a
    # sensible extension (e.g. ".pdf").
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=file.name) as temp_file:
        temp_file.write(file.content)
        temp_file_path = temp_file.name

    try:
        if file.type == 'text/plain':
            text_loader = TextFileLoader(temp_file_path)
            documents = text_loader.load_documents()
        elif file.type == 'application/pdf':
            pdf_loader = PyPDFLoader(temp_file_path)
            documents = pdf_loader.load()
        else:
            raise ValueError("Provide a .txt or .pdf file")
    finally:
        # NamedTemporaryFile(delete=False) leaks one file per upload unless
        # we remove it explicitly.
        os.unlink(temp_file_path)

    texts = [x.page_content for x in text_splitter.transform_documents(documents)]
    return texts
|
| 76 |
|
| 77 |
|
| 78 |
+
|
| 79 |
@cl.on_chat_start
|
| 80 |
async def on_chat_start():
|
| 81 |
files = None
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
numpy
|
| 2 |
chainlit==0.7.700
|
| 3 |
-
openai
|
|
|
|
|
|
|
|
|
numpy
chainlit==0.7.700
openai
langchain_experimental
langchain_openai
langchain_community
pypdf
|