Cheselle committed on
Commit
81fd7f8
·
1 Parent(s): d4239dd

Trying pdf feature

Browse files
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. app.py +29 -6
.DS_Store ADDED
Binary file (8.2 kB). View file
 
app.py CHANGED
@@ -11,6 +11,7 @@ from aimakerspace.openai_utils.embedding import EmbeddingModel
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
 
14
 
15
  system_template = """\
16
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
@@ -49,7 +50,6 @@ class RetrievalAugmentedQAPipeline:
49
 
50
  text_splitter = CharacterTextSplitter()
51
 
52
-
53
  def process_text_file(file: AskFileResponse):
54
  import tempfile
55
 
@@ -64,6 +64,22 @@ def process_text_file(file: AskFileResponse):
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  @cl.on_chat_start
69
  async def on_chat_start():
@@ -72,8 +88,8 @@ async def on_chat_start():
72
  # Wait for the user to upload a file
73
  while files == None:
74
  files = await cl.AskFileMessage(
75
- content="Please upload a Text File file to begin!",
76
- accept=["text/plain"],
77
  max_size_mb=2,
78
  timeout=180,
79
  ).send()
@@ -85,8 +101,15 @@ async def on_chat_start():
85
  )
86
  await msg.send()
87
 
88
- # load the file
89
- texts = process_text_file(file)
 
 
 
 
 
 
 
90
 
91
  print(f"Processing {len(texts)} text chunks")
92
 
@@ -119,4 +142,4 @@ async def main(message):
119
  async for stream_resp in result["response"]:
120
  await msg.stream_token(stream_resp)
121
 
122
- await msg.send()
 
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
14
+ import fitz # PyMuPDF
15
 
16
  system_template = """\
17
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
 
50
 
51
  text_splitter = CharacterTextSplitter()
52
 
 
53
  def process_text_file(file: AskFileResponse):
54
  import tempfile
55
 
 
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
67
def process_pdf_file(file: AskFileResponse):
    """Extract text from an uploaded PDF and split it into chunks.

    Args:
        file: Chainlit AskFileResponse whose ``content`` attribute holds the
            raw PDF bytes.

    Returns:
        list[str]: text chunks produced by the module-level ``text_splitter``,
        mirroring what ``process_text_file`` returns for plain-text uploads.
    """
    import os
    import tempfile

    # Persist the uploaded bytes to a named temp file so PyMuPDF can open it
    # by path. delete=False is required so the file survives the `with` block
    # (on Windows the open handle would also block a second open).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file.content)
        temp_file_path = temp_file.name

    try:
        # Use the Document as a context manager so the handle is closed even
        # if text extraction fails.
        with fitz.open(temp_file_path) as doc:
            pages = [page.get_text() for page in doc]
    finally:
        # Always remove the temp file, including on extraction errors.
        os.remove(temp_file_path)

    # Split page texts the same way process_text_file splits documents, so
    # downstream embedding sees consistently sized chunks.
    return text_splitter.split_texts(pages)
83
 
84
  @cl.on_chat_start
85
  async def on_chat_start():
 
88
  # Wait for the user to upload a file
89
  while files == None:
90
  files = await cl.AskFileMessage(
91
+ content="Please upload a Text or PDF file to begin!",
92
+ accept=["text/plain", "application/pdf"],
93
  max_size_mb=2,
94
  timeout=180,
95
  ).send()
 
101
  )
102
  await msg.send()
103
 
104
+ # Load the file based on its type
105
+ if file.type == "text/plain":
106
+ texts = process_text_file(file)
107
+ elif file.type == "application/pdf":
108
+ texts = process_pdf_file(file)
109
+ else:
110
+ msg.content = "Unsupported file type."
111
+ await msg.update()
112
+ return
113
 
114
  print(f"Processing {len(texts)} text chunks")
115
 
 
142
  async for stream_resp in result["response"]:
143
  await msg.stream_token(stream_resp)
144
 
145
+ await msg.send()