Cheselle committed on
Commit
81fd7f8
·
1 Parent(s): d4239dd

Trying pdf feature

Browse files
Files changed (2) hide show
  1. .DS_Store +0 -0
  2. app.py +29 -6
.DS_Store ADDED
Binary file (8.2 kB). View file
 
app.py CHANGED
@@ -11,6 +11,7 @@ from aimakerspace.openai_utils.embedding import EmbeddingModel
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
 
14
 
15
  system_template = """\
16
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
@@ -49,7 +50,6 @@ class RetrievalAugmentedQAPipeline:
49
 
50
  text_splitter = CharacterTextSplitter()
51
 
52
-
53
  def process_text_file(file: AskFileResponse):
54
  import tempfile
55
 
@@ -64,6 +64,22 @@ def process_text_file(file: AskFileResponse):
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  @cl.on_chat_start
69
  async def on_chat_start():
@@ -72,8 +88,8 @@ async def on_chat_start():
72
  # Wait for the user to upload a file
73
  while files == None:
74
  files = await cl.AskFileMessage(
75
- content="Please upload a Text File file to begin!",
76
- accept=["text/plain"],
77
  max_size_mb=2,
78
  timeout=180,
79
  ).send()
@@ -85,8 +101,15 @@ async def on_chat_start():
85
  )
86
  await msg.send()
87
 
88
- # load the file
89
- texts = process_text_file(file)
 
 
 
 
 
 
 
90
 
91
  print(f"Processing {len(texts)} text chunks")
92
 
@@ -119,4 +142,4 @@ async def main(message):
119
  async for stream_resp in result["response"]:
120
  await msg.stream_token(stream_resp)
121
 
122
- await msg.send()
 
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
  import chainlit as cl
14
+ import fitz # PyMuPDF
15
 
16
  system_template = """\
17
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
 
50
 
51
  text_splitter = CharacterTextSplitter()
52
 
 
53
  def process_text_file(file: AskFileResponse):
54
  import tempfile
55
 
 
64
  texts = text_splitter.split_texts(documents)
65
  return texts
66
 
67
def process_pdf_file(file: AskFileResponse):
    """Extract text from an uploaded PDF and split it into chunks.

    Args:
        file: Chainlit AskFileResponse whose ``content`` attribute holds the
            raw PDF bytes.

    Returns:
        list[str]: text chunks produced by the module-level ``text_splitter``,
        mirroring what ``process_text_file`` returns for plain-text uploads.
    """
    import os
    import tempfile

    # Persist the uploaded bytes to a named temp file so PyMuPDF can open it
    # by path. delete=False is required so the file survives the `with` block
    # (on Windows the open handle would also block a second open).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file.content)
        temp_file_path = temp_file.name

    try:
        # Use the Document as a context manager so the handle is closed even
        # if text extraction fails.
        with fitz.open(temp_file_path) as doc:
            pages = [page.get_text() for page in doc]
    finally:
        # Always remove the temp file, including on extraction errors.
        os.remove(temp_file_path)

    # Split page texts the same way process_text_file splits documents, so
    # downstream embedding sees consistently sized chunks.
    return text_splitter.split_texts(pages)
83
 
84
  @cl.on_chat_start
85
  async def on_chat_start():
 
88
  # Wait for the user to upload a file
89
  while files == None:
90
  files = await cl.AskFileMessage(
91
+ content="Please upload a Text or PDF file to begin!",
92
+ accept=["text/plain", "application/pdf"],
93
  max_size_mb=2,
94
  timeout=180,
95
  ).send()
 
101
  )
102
  await msg.send()
103
 
104
+ # Load the file based on its type
105
+ if file.type == "text/plain":
106
+ texts = process_text_file(file)
107
+ elif file.type == "application/pdf":
108
+ texts = process_pdf_file(file)
109
+ else:
110
+ msg.content = "Unsupported file type."
111
+ await msg.update()
112
+ return
113
 
114
  print(f"Processing {len(texts)} text chunks")
115
 
 
142
  async for stream_resp in result["response"]:
143
  await msg.stream_token(stream_resp)
144
 
145
+ await msg.send()