Spaces:
Paused
Paused
Update pdftotext.py
Browse files- pdftotext.py +27 -12
pdftotext.py
CHANGED
|
@@ -141,18 +141,33 @@ def apiFiltering(apitext):
|
|
| 141 |
return filtered_items
|
| 142 |
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
# import fitz
|
| 157 |
|
| 158 |
# import tsadropboxretrieval
|
|
|
|
| 141 |
return filtered_items
|
| 142 |
|
| 143 |
|
| 144 |
+
def texts_from_pdfAllText(link):
    """Download a PDF from a shareable link and return the text of all pages.

    The PDF is fetched over HTTP and parsed entirely in memory; nothing is
    written to disk.

    Args:
        link: URL of the PDF. A link containing ``dl=0`` (the Dropbox
            "preview" form) is rewritten to ``dl=1`` to get the raw file.

    Returns:
        The text of every page, concatenated in page order. An empty
        string is returned if the document has no pages.

    Raises:
        ValueError: If ``link`` is empty/None or does not look like a URL.
        requests.HTTPError: If the server answers with an error status.
    """
    pdf_content = None

    if link and ('http' in link or 'dropbox' in link):
        # Modify Dropbox link for direct download
        if 'dl=0' in link:
            link = link.replace('dl=0', 'dl=1')

        # Download the PDF content from the shareable link. The timeout keeps
        # an unresponsive host from blocking the caller indefinitely.
        response = requests.get(link, timeout=60)
        # Fail here on 4xx/5xx rather than handing an HTML error page to the
        # PDF parser further down.
        response.raise_for_status()
        pdf_content = BytesIO(response.content)  # Store the content in memory
        print('Downloaded from shareable link.')

    # Check if the PDF content is available
    if pdf_content is None:
        raise ValueError("No valid PDF content found.")

    # Open the PDF using fitz (PyMuPDF) directly from memory
    pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
    print('PDF opened in memory.')

    page_texts = []
    try:
        for page in pdf_document:
            page_text = page.get_text()
            print(page_text)
            # Collect every page; previously each iteration overwrote the
            # result so only a single page's text reached the caller.
            page_texts.append(page_text)
    finally:
        # Release the parser's resources even if text extraction fails.
        pdf_document.close()

    return "".join(page_texts)
|
| 171 |
# import fitz
|
| 172 |
|
| 173 |
# import tsadropboxretrieval
|