Spaces:

Marthee
/

PdftoText1

Paused

App Files Community

Marthee committed on Sep 19, 2024

Commit

ffc4abe

verified ·

1 Parent(s): f57c685

Update pdftotext.py

Browse files

Files changed (1) hide show

pdftotext.py +52 -14

pdftotext.py CHANGED Viewed

@@ -5,19 +5,57 @@ import tsadropboxretrieval
 def texts_from_pdf(dbpdfpath):
     # Pre-commit body of texts_from_pdf: the lines marked '-' are the ones this
     # commit removed. They download the file at dbpdfpath through the client
     # returned by tsadropboxretrieval.ADR_Access_DropboxTeam('user'), open the
     # downloaded bytes with fitz as a PDF, and append page.get_text() of every
     # page to alltexts, which is returned.
     print('intexts')
-    dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')
-    print('dbdone')
-    md, res =dbxTeam.files_download(path=dbpdfpath)
-    print('downloaded')
-    dataDoc = res.content
-    print('l')
-    pdf_document = fitz.open('pdf',dataDoc)
-    print('k')
-    alltexts=''
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document[page_num]
-        text_instances = page.get_text()
-        alltexts+=text_instances
-    # alltexts = alltexts.replace('\n', ' ')
     return alltexts

 def texts_from_pdf(dbpdfpath, pdfshareablelink=None):
     """Return the concatenated text of every page of a PDF.

     The PDF is fetched either over HTTP from a shareable link or through the
     Dropbox team client, then parsed in memory with fitz (PyMuPDF).

     Args:
         dbpdfpath: Dropbox path of the PDF. For convenience, a value that
             starts with ``http://`` or ``https://`` is treated as a
             shareable link instead of a Dropbox path.
         pdfshareablelink: Optional shareable link to the PDF. When given it
             takes precedence over ``dbpdfpath``. The previous version read
             this name without ever defining it, so every call failed with
             UnboundLocalError; it is now an explicit optional parameter.

     Returns:
         The text of all pages, joined in page order.

     Raises:
         ValueError: If neither a link nor a path is supplied, or the
             download produced no bytes.
         requests.HTTPError: If the shareable link answers with an error
             status.
     """
     print('intexts')
     link = pdfshareablelink
     # A Dropbox API path never starts with a URL scheme, so a scheme prefix
     # reliably distinguishes a link from a path (a substring test for
     # 'dropbox' would misroute paths that merely contain that word).
     if not link and isinstance(dbpdfpath, str) and dbpdfpath.startswith(('http://', 'https://')):
         link = dbpdfpath
     if link:
         # 'dl=0' opens Dropbox's preview page; 'dl=1' serves the file itself.
         response = requests.get(link.replace('dl=0', 'dl=1'), timeout=60)
         # Fail here on 4xx/5xx instead of feeding an HTML error page to fitz.
         response.raise_for_status()
         pdf_bytes = response.content
         print('Downloaded from shareable link.')
     elif dbpdfpath:
         dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
         print('Dropbox team access initialized.')
         _metadata, res = dbxTeam.files_download(path=dbpdfpath)
         pdf_bytes = res.content
         print('Downloaded from Dropbox path.')
     else:
         raise ValueError("No valid PDF content found.")
     if not pdf_bytes:
         raise ValueError("No valid PDF content found.")
     # The context manager closes the document even if text extraction fails.
     with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
         print('PDF opened in memory.')
         return ''.join(page.get_text() for page in pdf_document)