Spaces:
Sleeping
Sleeping
Commit
·
9ccc897
1
Parent(s):
1dd8837
Adds pdf reader
Browse files- aimakerspace/text_utils.py +39 -0
- app.py +19 -3
aimakerspace/text_utils.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
from typing import List
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
class TextFileLoader:
|
|
@@ -35,6 +36,44 @@ class TextFileLoader:
|
|
| 35 |
self.load()
|
| 36 |
return self.documents
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
class CharacterTextSplitter:
|
| 40 |
def __init__(
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import List
|
| 3 |
+
from PyPDF2 import PdfReader
|
| 4 |
|
| 5 |
|
| 6 |
class TextFileLoader:
|
|
|
|
| 36 |
self.load()
|
| 37 |
return self.documents
|
| 38 |
|
| 39 |
+
class PDFFileLoader:
    """Load the text of a single PDF file, or of every PDF under a directory.

    Each PDF contributes exactly one string to ``self.documents``: the text
    of all its pages concatenated in page order, with no separator.
    """

    def __init__(self, path: str):
        # Accumulates one extracted string per PDF; only reset here, so
        # repeated load() calls on the same instance append again.
        self.documents = []
        self.path = path

    @staticmethod
    def _is_pdf(name: str) -> bool:
        """Return True when *name* ends with ``.pdf``, ignoring case."""
        return name.lower().endswith(".pdf")

    @staticmethod
    def _extract_text(pdf_path: str) -> str:
        """Return the concatenated text of every page of *pdf_path*."""
        with open(pdf_path, "rb") as handle:
            reader = PdfReader(handle)
            # join() avoids repeated string reallocation; ``or ""`` keeps the
            # join safe should a page yield a falsy value instead of text.
            return "".join(page.extract_text() or "" for page in reader.pages)

    def load(self):
        """Populate ``self.documents`` from ``self.path``.

        Raises:
            ValueError: if the path is neither a directory nor an existing
                file with a ``.pdf`` extension.
        """
        if os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self._is_pdf(self.path):
            self.load_file()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a .pdf file."
            )

    def load_file(self):
        """Extract the PDF at ``self.path`` and append its text."""
        self.documents.append(self._extract_text(self.path))

    def load_directory(self):
        """Walk ``self.path`` recursively and append the text of each PDF."""
        for root, _, files in os.walk(self.path):
            for file_name in files:
                if self._is_pdf(file_name):
                    self.documents.append(
                        self._extract_text(os.path.join(root, file_name))
                    )

    def load_documents(self) -> List[str]:
        """Run :meth:`load` and return the accumulated list of documents."""
        self.load()
        return self.documents
|
| 77 |
|
| 78 |
class CharacterTextSplitter:
|
| 79 |
def __init__(
|
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
from typing import List
|
| 3 |
from chainlit.types import AskFileResponse
|
| 4 |
-
from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader
|
| 5 |
from aimakerspace.openai_utils.prompts import (
|
| 6 |
UserRolePrompt,
|
| 7 |
SystemRolePrompt,
|
|
@@ -64,6 +64,19 @@ def process_text_file(file: AskFileResponse):
|
|
| 64 |
texts = text_splitter.split_texts(documents)
|
| 65 |
return texts
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
@cl.on_chat_start
|
| 69 |
async def on_chat_start():
|
|
@@ -73,7 +86,7 @@ async def on_chat_start():
|
|
| 73 |
while files == None:
|
| 74 |
files = await cl.AskFileMessage(
|
| 75 |
content="Please upload a Text File file to begin!",
|
| 76 |
-
accept=["text/plain"],
|
| 77 |
max_size_mb=2,
|
| 78 |
timeout=180,
|
| 79 |
).send()
|
|
@@ -86,7 +99,10 @@ async def on_chat_start():
|
|
| 86 |
await msg.send()
|
| 87 |
|
| 88 |
# load the file
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
print(f"Processing {len(texts)} text chunks")
|
| 92 |
|
|
|
|
| 1 |
import os
|
| 2 |
from typing import List
|
| 3 |
from chainlit.types import AskFileResponse
|
| 4 |
+
from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader, PDFFileLoader
|
| 5 |
from aimakerspace.openai_utils.prompts import (
|
| 6 |
UserRolePrompt,
|
| 7 |
SystemRolePrompt,
|
|
|
|
| 64 |
texts = text_splitter.split_texts(documents)
|
| 65 |
return texts
|
| 66 |
|
| 67 |
+
def process_pdf_file(file: AskFileResponse):
    """Split an uploaded PDF into text chunks.

    The upload's bytes (``file.content``) are written to a temporary ``.pdf``
    file so ``PDFFileLoader`` can read it from disk; the extracted text is
    then split with the module-level ``text_splitter``.

    Returns:
        The chunks produced by ``text_splitter.split_texts``.
    """
    import tempfile

    # Write the bytes in one binary-mode pass. delete=False keeps the file on
    # disk after the handle closes so the loader can reopen it by name.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file.content)
        temp_file_path = temp_file.name

    try:
        text_loader = PDFFileLoader(temp_file_path)
        documents = text_loader.load_documents()
    finally:
        # The loader has fully read the file by now; remove it so uploads do
        # not accumulate in the temp directory.
        os.remove(temp_file_path)

    texts = text_splitter.split_texts(documents)
    return texts
|
| 80 |
|
| 81 |
@cl.on_chat_start
|
| 82 |
async def on_chat_start():
|
|
|
|
| 86 |
while files == None:
|
| 87 |
files = await cl.AskFileMessage(
|
| 88 |
content="Please upload a Text File file to begin!",
|
| 89 |
+
accept=["text/plain", "application/pdf"],
|
| 90 |
max_size_mb=2,
|
| 91 |
timeout=180,
|
| 92 |
).send()
|
|
|
|
| 99 |
await msg.send()
|
| 100 |
|
| 101 |
# load the file
|
| 102 |
+
# Dispatch on the upload's extension: PDFs go through the PDF pipeline,
# everything else is treated as plain text. (The branches were swapped.)
if file.path.endswith(".pdf"):
    texts = process_pdf_file(file)
else:
    texts = process_text_file(file)
|
| 106 |
|
| 107 |
print(f"Processing {len(texts)} text chunks")
|
| 108 |
|