Spaces:
Paused
Paused
Update pdftotext.py
Browse files- pdftotext.py +52 -14
pdftotext.py
CHANGED
|
@@ -5,19 +5,57 @@ import tsadropboxretrieval
|
|
| 5 |
|
| 6 |
def texts_from_pdf(dbpdfpath):
|
| 7 |
print('intexts')
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
return alltexts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def texts_from_pdf(dbpdfpath):
    """Return the concatenated text of every page of a PDF.

    The PDF is fetched entirely into memory and parsed with PyMuPDF
    (``fitz``); nothing is written to disk.

    Args:
        dbpdfpath: Either an ``http(s)://`` shareable link to the PDF or a
            Dropbox path that the team client can download.

    Returns:
        The text of all pages, in page order, joined without a separator.

    Raises:
        ValueError: If ``dbpdfpath`` is empty or ``None``.
        requests.HTTPError: If the shareable link answers with an HTTP
            error status.
    """
    print('intexts')

    # The original body read an undefined name (``pdfshareablelink``) here,
    # which raised NameError on every call; the link is now taken from the
    # single argument the function actually receives.
    if not dbpdfpath:
        raise ValueError("No valid PDF content found.")

    # Case 1: a shareable link. Only a real URL scheme counts, so a Dropbox
    # path that merely contains "http" or "dropbox" is not misclassified.
    if dbpdfpath.lower().startswith(('http://', 'https://')):
        pdfshareablelink = dbpdfpath
        # Dropbox serves an HTML preview for dl=0; dl=1 returns the file.
        if 'dl=0' in pdfshareablelink:
            pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')

        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(pdfshareablelink, timeout=60)
        # Fail loudly instead of feeding an error page to the PDF parser.
        response.raise_for_status()
        pdf_bytes = response.content
        print('Downloaded from shareable link.')

    # Case 2: a Dropbox path, downloaded through the Dropbox team client.
    else:
        dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
        print('Dropbox team access initialized.')
        md, res = dbxTeam.files_download(path=dbpdfpath)
        pdf_bytes = res.content
        print('Downloaded from Dropbox path.')

    # Open the PDF directly from the in-memory bytes.
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    print('PDF opened in memory.')
    try:
        alltexts = ''.join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )
    finally:
        # Release the parser's resources even if a page fails to extract.
        pdf_document.close()

    return alltexts
|
| 45 |
+
# print('intexts')
|
| 46 |
+
# dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')
|
| 47 |
+
# print('dbdone')
|
| 48 |
+
# md, res =dbxTeam.files_download(path=dbpdfpath)
|
| 49 |
+
# print('downloaded')
|
| 50 |
+
# dataDoc = res.content
|
| 51 |
+
# print('l')
|
| 52 |
+
# pdf_document = fitz.open('pdf',dataDoc)
|
| 53 |
+
# print('k')
|
| 54 |
+
# alltexts=''
|
| 55 |
+
# for page_num in range(pdf_document.page_count):
|
| 56 |
+
# page = pdf_document[page_num]
|
| 57 |
+
# text_instances = page.get_text()
|
| 58 |
+
# alltexts+=text_instances
|
| 59 |
+
|
| 60 |
+
# # alltexts = alltexts.replace('\n', ' ')
|
| 61 |
+
# return alltexts
|