Spaces:

Marthee
/

PdftoText1

Paused

Marthee committed on Jun 4, 2025

Commit

d58d00f

verified ·

1 Parent(s): 30980a7

Update pdftotext.py

Files changed (1) hide show

pdftotext.py CHANGED Viewed

@@ -141,18 +141,33 @@ def apiFiltering(apitext):
     return filtered_items
-import fitz
-def texts_from_pdfAllText(input_pdf_data):
-    pdf_document = fitz.open('pdf',input_pdf_data)
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document[page_num]
-        text_instances = page.get_text()
-    print(text_instances)
-    return text_instances
 # import fitz
 # import tsadropboxretrieval

     return filtered_items
+def texts_from_pdfAllText(link):
+    """Download a PDF from a shareable link and return the text of all pages.
+
+    Args:
+        link: URL of the PDF. If it contains ``dl=0`` (Dropbox share links),
+            that is rewritten to ``dl=1`` so the file itself is downloaded.
+
+    Returns:
+        The text of every page, concatenated in page order (an empty string
+        if the document has no pages).
+
+    Raises:
+        ValueError: If ``link`` is empty or contains neither ``'http'`` nor
+            ``'dropbox'``.
+        requests.HTTPError: If the download returns an HTTP error status.
+    """
+    if not link or not ('http' in link or 'dropbox' in link):
+        raise ValueError("No valid PDF content found.")
+    # Modify Dropbox link for direct download
+    if 'dl=0' in link:
+        link = link.replace('dl=0', 'dl=1')
+    # Download the PDF content from the shareable link
+    response = requests.get(link)
+    # Fail on 4xx/5xx here instead of handing an error page to the PDF parser.
+    response.raise_for_status()
+    pdf_content = BytesIO(response.content)  # Store the content in memory
+    print('Downloaded from shareable link.')
+    # Imported locally because this change removes the module-level
+    # `import fitz` that used to sit directly above this function.
+    import fitz
+    # Open the PDF using fitz (PyMuPDF) directly from memory; the context
+    # manager closes the document once the text has been extracted.
+    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
+        print('PDF opened in memory.')
+        # Collect every page's text; the earlier loop overwrote the variable
+        # on each iteration, so only the last page was ever returned.
+        text_instances = "".join(page.get_text() for page in pdf_document)
+    print(text_instances)
+    return text_instances
 # import fitz
 # import tsadropboxretrieval