danicafisher committed on
Commit
9ccc897
·
1 Parent(s): 1dd8837

Adds pdf reader

Browse files
Files changed (2) hide show
  1. aimakerspace/text_utils.py +39 -0
  2. app.py +19 -3
aimakerspace/text_utils.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  from typing import List
 
3
 
4
 
5
  class TextFileLoader:
@@ -35,6 +36,44 @@ class TextFileLoader:
35
  self.load()
36
  return self.documents
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  class CharacterTextSplitter:
40
  def __init__(
 
1
  import os
2
  from typing import List
3
+ from PyPDF2 import PdfReader
4
 
5
 
6
  class TextFileLoader:
 
36
  self.load()
37
  return self.documents
38
 
39
class PDFFileLoader:
    """Collect the extracted text of one PDF file or of every PDF in a tree.

    ``path`` may name a single ``.pdf`` file or a directory. Each PDF that is
    read contributes exactly one string (the text of all its pages, joined
    without a separator) to ``self.documents``.
    """

    def __init__(self, path: str):
        # One entry per PDF read so far; repeated load() calls keep appending.
        self.documents: List[str] = []
        self.path = path

    def load(self):
        """Read ``self.path`` into ``self.documents``.

        Raises:
            ValueError: if the path is neither a directory nor an existing
                file whose name ends in ``.pdf`` (compared case-insensitively).
        """
        if os.path.isdir(self.path):
            self.load_directory()
        elif os.path.isfile(self.path) and self._is_pdf(self.path):
            self.load_file()
        else:
            raise ValueError(
                "Provided path is neither a valid directory nor a .pdf file."
            )

    def load_file(self):
        """Append the text of the single PDF at ``self.path``."""
        self.documents.append(self._read_pdf(self.path))

    def load_directory(self):
        """Walk ``self.path`` recursively and append the text of every PDF found."""
        for root, _, files in os.walk(self.path):
            for name in files:
                if self._is_pdf(name):
                    self.documents.append(
                        self._read_pdf(os.path.join(root, name))
                    )

    def load_documents(self):
        """Run :meth:`load` and return the accumulated list of document texts."""
        self.load()
        return self.documents

    @staticmethod
    def _is_pdf(name: str) -> bool:
        """Return True when ``name`` has a ``.pdf`` extension, ignoring case."""
        # Uploaded or scanned files are frequently named "REPORT.PDF".
        return name.lower().endswith(".pdf")

    @staticmethod
    def _read_pdf(file_path: str) -> str:
        """Return the concatenated text of every page of the PDF at ``file_path``."""
        with open(file_path, "rb") as handle:
            reader = PdfReader(handle)
            # ``or ""`` keeps a page that yields no text (None) from raising
            # TypeError during concatenation; join avoids repeated ``+=``.
            return "".join(page.extract_text() or "" for page in reader.pages)
77
 
78
  class CharacterTextSplitter:
79
  def __init__(
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  from typing import List
3
  from chainlit.types import AskFileResponse
4
- from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader
5
  from aimakerspace.openai_utils.prompts import (
6
  UserRolePrompt,
7
  SystemRolePrompt,
@@ -64,6 +64,19 @@ def process_text_file(file: AskFileResponse):
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  @cl.on_chat_start
69
  async def on_chat_start():
@@ -73,7 +86,7 @@ async def on_chat_start():
73
  while files == None:
74
  files = await cl.AskFileMessage(
75
  content="Please upload a Text File file to begin!",
76
- accept=["text/plain"],
77
  max_size_mb=2,
78
  timeout=180,
79
  ).send()
@@ -86,7 +99,10 @@ async def on_chat_start():
86
  await msg.send()
87
 
88
  # load the file
89
- texts = process_text_file(file)
 
 
 
90
 
91
  print(f"Processing {len(texts)} text chunks")
92
 
 
1
  import os
2
  from typing import List
3
  from chainlit.types import AskFileResponse
4
+ from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader, PDFFileLoader
5
  from aimakerspace.openai_utils.prompts import (
6
  UserRolePrompt,
7
  SystemRolePrompt,
 
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
67
def process_pdf_file(file: AskFileResponse):
    """Split an uploaded PDF into text chunks.

    The upload's bytes (``file.content``) are written to a temporary ``.pdf``
    file so that ``PDFFileLoader`` can read them from disk; the extracted
    documents are then passed to the module-level ``text_splitter``.

    Returns:
        Whatever ``text_splitter.split_texts`` returns for the loaded documents.
    """
    import tempfile

    # Write the bytes through the handle we already have (binary mode) rather
    # than creating the file in text mode and reopening it.
    with tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".pdf"
    ) as temp_file:
        temp_file.write(file.content)
        temp_file_path = temp_file.name

    try:
        documents = PDFFileLoader(temp_file_path).load_documents()
    finally:
        # delete=False means nobody else cleans this up; without the removal
        # every upload would leave a file behind in the temp directory.
        os.remove(temp_file_path)

    return text_splitter.split_texts(documents)
80
 
81
  @cl.on_chat_start
82
  async def on_chat_start():
 
86
  while files == None:
87
  files = await cl.AskFileMessage(
88
  content="Please upload a Text File file to begin!",
89
+ accept=["text/plain", "application/pdf"],
90
  max_size_mb=2,
91
  timeout=180,
92
  ).send()
 
99
  await msg.send()
100
 
101
  # load the file
102
# Dispatch on the upload's extension. The branches were previously swapped,
# sending PDFs to the plain-text loader and text files to the PDF loader.
if file.path.lower().endswith(".pdf"):
    texts = process_pdf_file(file)
else:
    texts = process_text_file(file)
106
 
107
  print(f"Processing {len(texts)} text chunks")
108