Marthee committed on
Commit
d58d00f
·
verified ·
1 Parent(s): 30980a7

Update pdftotext.py

Browse files
Files changed (1) hide show
  1. pdftotext.py +27 -12
pdftotext.py CHANGED
@@ -141,18 +141,33 @@ def apiFiltering(apitext):
141
  return filtered_items
142
 
143
 
144
-
145
- import fitz
146
-
147
- def texts_from_pdfAllText(input_pdf_data):
148
- pdf_document = fitz.open('pdf',input_pdf_data)
149
-
150
- for page_num in range(pdf_document.page_count):
151
- page = pdf_document[page_num]
152
- text_instances = page.get_text()
153
-
154
- print(text_instances)
155
- return text_instances
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  # import fitz
157
 
158
  # import tsadropboxretrieval
 
141
  return filtered_items
142
 
143
 
144
def texts_from_pdfAllText(link, *, timeout=60):
    """Download a PDF from a shareable link and return the text of all pages.

    Args:
        link: URL of the PDF. Only values containing ``'http'`` or
            ``'dropbox'`` are downloaded. A ``dl=0`` substring is rewritten
            to ``dl=1`` before the request is made.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.

    Returns:
        The text of every page, concatenated in page order. For a
        single-page document this is exactly that page's text; for a
        document with no pages it is the empty string.

    Raises:
        ValueError: If ``link`` is empty/``None`` or contains neither
            ``'http'`` nor ``'dropbox'``.
        requests.HTTPError: If the server answers with an error status.
    """
    # Guard clause: nothing can be downloaded without something URL-like.
    if not (link and ('http' in link or 'dropbox' in link)):
        raise ValueError("No valid PDF content found.")

    # Modify Dropbox link for direct download
    if 'dl=0' in link:
        link = link.replace('dl=0', 'dl=1')

    # Download the PDF content from the shareable link.
    response = requests.get(link, timeout=timeout)
    # Fail here on 4xx/5xx instead of feeding an HTML error page to the
    # PDF parser, which would only produce a confusing parse error later.
    response.raise_for_status()
    pdf_content = BytesIO(response.content)  # Store the content in memory
    print('Downloaded from shareable link.')

    page_texts = []
    # Open the PDF using fitz (PyMuPDF) directly from memory; the context
    # manager closes the document even if text extraction raises.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
        print('PDF opened in memory.')
        for page in pdf_document:
            page_text = page.get_text()
            print(page_text)
            # Collect every page: previously each page overwrote the last,
            # so only a single page's text was ever returned.
            page_texts.append(page_text)

    return "".join(page_texts)
171
  # import fitz
172
 
173
  # import tsadropboxretrieval