Trying pdf feature
Browse files
.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from aimakerspace.openai_utils.embedding import EmbeddingModel
|
|
| 11 |
from aimakerspace.vectordatabase import VectorDatabase
|
| 12 |
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
|
| 13 |
import chainlit as cl
|
|
|
|
| 14 |
|
| 15 |
system_template = """\
|
| 16 |
Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
|
|
@@ -49,7 +50,6 @@ class RetrievalAugmentedQAPipeline:
|
|
| 49 |
|
| 50 |
text_splitter = CharacterTextSplitter()
|
| 51 |
|
| 52 |
-
|
| 53 |
def process_text_file(file: AskFileResponse):
|
| 54 |
import tempfile
|
| 55 |
|
|
@@ -64,6 +64,22 @@ def process_text_file(file: AskFileResponse):
|
|
| 64 |
texts = text_splitter.split_texts(documents)
|
| 65 |
return texts
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
@cl.on_chat_start
|
| 69 |
async def on_chat_start():
|
|
@@ -72,8 +88,8 @@ async def on_chat_start():
|
|
| 72 |
# Wait for the user to upload a file
|
| 73 |
while files == None:
|
| 74 |
files = await cl.AskFileMessage(
|
| 75 |
-
content="Please upload a Text
|
| 76 |
-
accept=["text/plain"],
|
| 77 |
max_size_mb=2,
|
| 78 |
timeout=180,
|
| 79 |
).send()
|
|
@@ -85,8 +101,15 @@ async def on_chat_start():
|
|
| 85 |
)
|
| 86 |
await msg.send()
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
print(f"Processing {len(texts)} text chunks")
|
| 92 |
|
|
@@ -119,4 +142,4 @@ async def main(message):
|
|
| 119 |
async for stream_resp in result["response"]:
|
| 120 |
await msg.stream_token(stream_resp)
|
| 121 |
|
| 122 |
-
await msg.send()
|
|
|
|
| 11 |
from aimakerspace.vectordatabase import VectorDatabase
|
| 12 |
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
|
| 13 |
import chainlit as cl
|
| 14 |
+
import fitz # PyMuPDF
|
| 15 |
|
| 16 |
system_template = """\
|
| 17 |
Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
|
|
|
|
| 50 |
|
| 51 |
text_splitter = CharacterTextSplitter()
|
| 52 |
|
|
|
|
| 53 |
def process_text_file(file: AskFileResponse):
|
| 54 |
import tempfile
|
| 55 |
|
|
|
|
| 64 |
texts = text_splitter.split_texts(documents)
|
| 65 |
return texts
|
| 66 |
|
| 67 |
+
def process_pdf_file(file: AskFileResponse):
    """Extract the text of every page of an uploaded PDF.

    The uploaded bytes (``file.content``) are written to a temporary ``.pdf``
    file, opened with PyMuPDF (``fitz``), and read page by page.

    Args:
        file: The uploaded file; its ``content`` attribute is written to disk
            as the PDF payload.

    Returns:
        A list with one string per page, in page order, as returned by
        ``page.get_text()``.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable
        PDF. The temporary file is removed in that case too.
    """
    # Imported locally (like tempfile) so the function does not depend on a
    # module-level ``import os`` being present.
    import os
    import tempfile

    # delete=False: the file must outlive this ``with`` so PyMuPDF can reopen
    # it by path after the handle is closed and the bytes are flushed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file.content)
        temp_file_path = temp_file.name

    try:
        # Close the document before deleting the file; an open handle makes
        # the removal fail on Windows and leaks the descriptor elsewhere.
        with fitz.open(temp_file_path) as doc:
            # NOTE(review): pages are returned whole, not passed through a
            # text splitter -- confirm that per-page chunks are intended.
            return [page.get_text() for page in doc]
    finally:
        # Always clean up the temporary file, even if parsing failed.
        os.remove(temp_file_path)
|
| 83 |
|
| 84 |
@cl.on_chat_start
|
| 85 |
async def on_chat_start():
|
|
|
|
| 88 |
# Wait for the user to upload a file
|
| 89 |
while files == None:
|
| 90 |
files = await cl.AskFileMessage(
|
| 91 |
+
content="Please upload a Text or PDF file to begin!",
|
| 92 |
+
accept=["text/plain", "application/pdf"],
|
| 93 |
max_size_mb=2,
|
| 94 |
timeout=180,
|
| 95 |
).send()
|
|
|
|
| 101 |
)
|
| 102 |
await msg.send()
|
| 103 |
|
| 104 |
+
# Load the file based on its type
|
| 105 |
+
if file.type == "text/plain":
|
| 106 |
+
texts = process_text_file(file)
|
| 107 |
+
elif file.type == "application/pdf":
|
| 108 |
+
texts = process_pdf_file(file)
|
| 109 |
+
else:
|
| 110 |
+
msg.content = "Unsupported file type."
|
| 111 |
+
await msg.update()
|
| 112 |
+
return
|
| 113 |
|
| 114 |
print(f"Processing {len(texts)} text chunks")
|
| 115 |
|
|
|
|
| 142 |
async for stream_resp in result["response"]:
|
| 143 |
await msg.stream_token(stream_resp)
|
| 144 |
|
| 145 |
+
await msg.send()
|