Spaces:

Marthee
/

PdftoText1

Paused

App Files Files Community

Marthee committed on Nov 11, 2024

Commit

43bf96b

verified ·

1 Parent(s): ad8236d

Update pdftotext.py

Browse files

Files changed (1) hide show

pdftotext.py +104 -102

pdftotext.py CHANGED Viewed

@@ -2,119 +2,121 @@ import fitz  # PyMuPDF
 from io import BytesIO
 import re
 import requests
-def texts_from_pdf(pdfshareablelink, heading_to_search):
-    print('intexts')
-    pdf_content = None
     # Case 1: If it's a shareable link
-    if pdfshareablelink and ('http' in pdfshareablelink or 'dropbox' in pdfshareablelink):
         # Modify Dropbox link for direct download
-        if 'dl=0' in pdfshareablelink:
-            pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')
         # Download the PDF content from the shareable link
-        response = requests.get(pdfshareablelink)
         pdf_content = BytesIO(response.content)  # Store the content in memory
         print('Downloaded from shareable link.')
-    elif dbpdfpath:
-        dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
-        print('Dropbox team access initialized.')
-        md, res = dbxTeam.files_download(path=dbpdfpath)
-        pdf_content = BytesIO(res.content)  # Store the content in memory
-        print('Downloaded from Dropbox path.')
     # Check if the PDF content is available
-    if pdf_content is None:
-        raise ValueError("No valid PDF content found.")
-    # Open the PDF using fitz (PyMuPDF) directly from memory
-    pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
-    print('PDF opened in memory.')
-    all_text = ""  # Initialize a string to store all text
-    current_line = ""  # To build the current line
-    collecting_text = False  # Track whether we're currently collecting text under the heading
-    f10_count = 0  # Counter for F10 headings
-    current_y = None  # To track the y-coordinate
-    # Define a regex pattern to match headings
-    heading_pattern = re.compile(r"[A-Za-z]\d{2}")  # Heading pattern (letter followed by two numbers)
-    # Loop through each page in the PDF
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document.load_page(page_num)
-        # Get text as dictionary to extract lines
-        text_dict = page.get_text("dict")
-        # Collect header y-coordinates to determine header area
-        header_y_values = []
-        # First pass to collect y-coordinates for detecting header area
-        for block in text_dict['blocks']:
-            if 'lines' in block:  # Check if 'lines' key exists
-                for line in block['lines']:
-                    for span in line['spans']:
-                        header_y_values.append(span['bbox'][1])  # Collect top y-coordinates of spans
-        # Determine a threshold for the header area (e.g., top 20% of the page height)
-        header_threshold = min(header_y_values) + (page.rect.height * 0.1)  # Adding 10% for a buffer
-        print(f"Header threshold for page {page_num + 1}: {header_threshold}")
-        # Iterate over blocks, lines, and spans to extract lines of text
-        for block in text_dict['blocks']:
-            if 'lines' in block:  # Check if 'lines' key exists
-                for line in block['lines']:
-                    for span in line['spans']:
-                        span_text = span['text'].strip()
-                        span_y = span['bbox'][1]  # Get the top y-coordinate of the span
-                        # Check if it's a heading based on the format
-                        if heading_pattern.match(span_text):
-                            if heading_to_search in span_text:
-                                f10_count += 1  # Increment the F10 counter
-                                # Start collecting text under the second occurrence of F10
-                                if f10_count == 2:
-                                    collecting_text = True  # Start collecting text
-                                    print(f"Starting collection under heading: {span_text}")
-                            # Stop collecting text if we reach a new heading
-                            if collecting_text:
-                                # If we encounter a new heading, we stop the collection
-                                if heading_pattern.match(span_text) and span_text != heading_to_search:
-                                    print(f"Ending collection at heading: {span_text}")
-                                    collecting_text = False  # Stop collecting
-                                    return all_text.strip()  # Return collected text
-                        # If we're collecting text, add it to the output
-                        if collecting_text:
-                            # Exclude spans that fall within the header area
-                            if span_y < header_threshold:
-                                continue  # Skip spans in the header area
-                            if current_y is None:
-                                current_y = span_y  # Initialize the first y-coordinate
-                            # Check if the current span belongs to the same line (based on y-coordinate)
-                            if abs(current_y - span_y) < 2:  # Threshold to determine if it's the same line
-                                current_line += " " + span_text  # Add span text to the current line
-                            else:
-                                # If it's a new line, append the current line to all_text
-                                all_text += current_line.strip() + '\n'  # Add line to all_text with a newline
-                                current_line = span_text  # Start the new line with the current span
-                                current_y = span_y  # Update the y-coordinate for the new line
-        # Append the current line if we hit a new line at the end of the page
-        if current_line:
-            all_text += current_line.strip() + '\n'
-            current_line = ""  # Reset for the next line
-    # print(f"\nCollected Text:\n{all_text.strip()}")
     return all_text.strip() if f10_count > 1 else "Heading not found"
 # import fitz
 # import tsadropboxretrieval

 from io import BytesIO
 import re
 import requests
+def texts_from_pdf(pdfshareablelinks, heading_to_search):
+    print('intexts',pdfshareablelinks)
     # Case 1: If it's a shareable link
+    for link in pdfshareablelinks:
+      pdf_content = None
+      if link and ('http' in link or 'dropbox' in link):
         # Modify Dropbox link for direct download
+        if 'dl=0' in link:
+            link = link.replace('dl=0', 'dl=1')
         # Download the PDF content from the shareable link
+        response = requests.get(link)
         pdf_content = BytesIO(response.content)  # Store the content in memory
         print('Downloaded from shareable link.')
+    # elif dbpdfpath:
+    #     dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
+    #     print('Dropbox team access initialized.')
+    #     md, res = dbxTeam.files_download(path=dbpdfpath)
+    #     pdf_content = BytesIO(res.content)  # Store the content in memory
+    #     print('Downloaded from Dropbox path.')
     # Check if the PDF content is available
+      if pdf_content is None:
+          raise ValueError("No valid PDF content found.")
+      # Open the PDF using fitz (PyMuPDF) directly from memory
+      pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
+      print('PDF opened in memory.')
+      all_text = ""  # Initialize a string to store all text
+      current_line = ""  # To build the current line
+      collecting_text = False  # Track whether we're currently collecting text under the heading
+      f10_count = 0  # Counter for F10 headings
+      current_y = None  # To track the y-coordinate
+      # Define a regex pattern to match headings
+      heading_pattern = re.compile(r"[A-Za-z]\d{2}")  # Heading pattern (letter followed by two numbers)
+      # Loop through each page in the PDF
+      for page_num in range(pdf_document.page_count):
+          page = pdf_document.load_page(page_num)
+          # Get text as dictionary to extract lines
+          text_dict = page.get_text("dict")
+          # Collect header y-coordinates to determine header area
+          header_y_values = []
+          # First pass to collect y-coordinates for detecting header area
+          for block in text_dict['blocks']:
+              if 'lines' in block:  # Check if 'lines' key exists
+                  for line in block['lines']:
+                      for span in line['spans']:
+                          header_y_values.append(span['bbox'][1])  # Collect top y-coordinates of spans
+          # Determine a threshold for the header area: topmost span y plus 10% of the page height
+          header_threshold = min(header_y_values) + (page.rect.height * 0.1)  # Adding 10% for a buffer
+          # print(f"Header threshold for page {page_num + 1}: {header_threshold}")
+          # Iterate over blocks, lines, and spans to extract lines of text
+          for block in text_dict['blocks']:
+              if 'lines' in block:  # Check if 'lines' key exists
+                  for line in block['lines']:
+                      for span in line['spans']:
+                          span_text = span['text'].strip()
+                          span_y = span['bbox'][1]  # Get the top y-coordinate of the span
+                          # Check if it's a heading based on the format
+                          if heading_pattern.match(span_text):
+                              if heading_to_search in span_text:
+                                  f10_count += 1  # Increment the F10 counter
+                                  # Start collecting text under the second occurrence of the searched heading
+                                  if f10_count == 2:
+                                      collecting_text = True  # Start collecting text
+                                      print(f"Starting collection under heading: {span_text}")
+                              # Stop collecting text if we reach a new heading
+                              if collecting_text:
+                                  # If we encounter a new heading, we stop the collection
+                                  if heading_pattern.match(span_text) and span_text != heading_to_search:
+                                      print(f"Ending collection at heading: {span_text}")
+                                      collecting_text = False  # Stop collecting
+                                      return all_text.strip()  # Return collected text
+                          # If we're collecting text, add it to the output
+                          if collecting_text:
+                              # Exclude spans that fall within the header area
+                              if span_y < header_threshold:
+                                  continue  # Skip spans in the header area
+                              if current_y is None:
+                                  current_y = span_y  # Initialize the first y-coordinate
+                              # Check if the current span belongs to the same line (based on y-coordinate)
+                              if abs(current_y - span_y) < 2:  # Threshold to determine if it's the same line
+                                  current_line += " " + span_text  # Add span text to the current line
+                              else:
+                                  # If it's a new line, append the current line to all_text
+                                  all_text += current_line.strip() + '\n'  # Add line to all_text with a newline
+                                  current_line = span_text  # Start the new line with the current span
+                                  current_y = span_y  # Update the y-coordinate for the new line
+          # Flush any partially built line to all_text once the page has been processed
+          if current_line:
+              all_text += current_line.strip() + '\n'
+              current_line = ""  # Reset for the next line
+      # print(f"\nCollected Text:\n{all_text.strip()}")
     return all_text.strip() if f10_count > 1 else "Heading not found"
 # import fitz
 # import tsadropboxretrieval