Marthee committed on
Commit
43bf96b
·
verified ·
1 Parent(s): ad8236d

Update pdftotext.py

Browse files
Files changed (1) hide show
  1. pdftotext.py +104 -102
pdftotext.py CHANGED
@@ -2,119 +2,121 @@ import fitz # PyMuPDF
2
  from io import BytesIO
3
  import re
4
  import requests
5
- def texts_from_pdf(pdfshareablelink, heading_to_search):
6
- print('intexts')
7
 
8
- pdf_content = None
9
 
10
  # Case 1: If it's a shareable link
11
- if pdfshareablelink and ('http' in pdfshareablelink or 'dropbox' in pdfshareablelink):
 
 
 
12
  # Modify Dropbox link for direct download
13
- if 'dl=0' in pdfshareablelink:
14
- pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')
15
 
16
  # Download the PDF content from the shareable link
17
- response = requests.get(pdfshareablelink)
18
  pdf_content = BytesIO(response.content) # Store the content in memory
19
  print('Downloaded from shareable link.')
20
- elif dbpdfpath:
21
- dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
22
- print('Dropbox team access initialized.')
23
- md, res = dbxTeam.files_download(path=dbpdfpath)
24
- pdf_content = BytesIO(res.content) # Store the content in memory
25
- print('Downloaded from Dropbox path.')
26
 
27
  # Check if the PDF content is available
28
- if pdf_content is None:
29
- raise ValueError("No valid PDF content found.")
30
-
31
- # Open the PDF using fitz (PyMuPDF) directly from memory
32
- pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
33
- print('PDF opened in memory.')
34
-
35
- all_text = "" # Initialize a string to store all text
36
- current_line = "" # To build the current line
37
- collecting_text = False # Track whether we're currently collecting text under the heading
38
- f10_count = 0 # Counter for F10 headings
39
- current_y = None # To track the y-coordinate
40
-
41
- # Define a regex pattern to match headings
42
- heading_pattern = re.compile(r"[A-Za-z]\d{2}") # Heading pattern (letter followed by two numbers)
43
-
44
- # Loop through each page in the PDF
45
- for page_num in range(pdf_document.page_count):
46
- page = pdf_document.load_page(page_num)
47
-
48
- # Get text as dictionary to extract lines
49
- text_dict = page.get_text("dict")
50
-
51
- # Collect header y-coordinates to determine header area
52
- header_y_values = []
53
-
54
- # First pass to collect y-coordinates for detecting header area
55
- for block in text_dict['blocks']:
56
- if 'lines' in block: # Check if 'lines' key exists
57
- for line in block['lines']:
58
- for span in line['spans']:
59
- header_y_values.append(span['bbox'][1]) # Collect top y-coordinates of spans
60
-
61
- # Determine a threshold for the header area (e.g., top 20% of the page height)
62
- header_threshold = min(header_y_values) + (page.rect.height * 0.1) # Adding 10% for a buffer
63
- print(f"Header threshold for page {page_num + 1}: {header_threshold}")
64
-
65
- # Iterate over blocks, lines, and spans to extract lines of text
66
- for block in text_dict['blocks']:
67
- if 'lines' in block: # Check if 'lines' key exists
68
- for line in block['lines']:
69
- for span in line['spans']:
70
- span_text = span['text'].strip()
71
- span_y = span['bbox'][1] # Get the top y-coordinate of the span
72
-
73
- # Check if it's a heading based on the format
74
- if heading_pattern.match(span_text):
75
- if heading_to_search in span_text:
76
- f10_count += 1 # Increment the F10 counter
77
-
78
- # Start collecting text under the second occurrence of F10
79
- if f10_count == 2:
80
- collecting_text = True # Start collecting text
81
- print(f"Starting collection under heading: {span_text}")
82
-
83
- # Stop collecting text if we reach a new heading
84
- if collecting_text:
85
- # If we encounter a new heading, we stop the collection
86
- if heading_pattern.match(span_text) and span_text != heading_to_search:
87
- print(f"Ending collection at heading: {span_text}")
88
- collecting_text = False # Stop collecting
89
-
90
- return all_text.strip() # Return collected text
91
-
92
- # If we're collecting text, add it to the output
93
- if collecting_text:
94
- # Exclude spans that fall within the header area
95
- if span_y < header_threshold:
96
- continue # Skip spans in the header area
97
-
98
- if current_y is None:
99
- current_y = span_y # Initialize the first y-coordinate
100
-
101
- # Check if the current span belongs to the same line (based on y-coordinate)
102
- if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
103
- current_line += " " + span_text # Add span text to the current line
104
- else:
105
- # If it's a new line, append the current line to all_text
106
- all_text += current_line.strip() + '\n' # Add line to all_text with a newline
107
- current_line = span_text # Start the new line with the current span
108
- current_y = span_y # Update the y-coordinate for the new line
109
-
110
- # Append the current line if we hit a new line at the end of the page
111
- if current_line:
112
- all_text += current_line.strip() + '\n'
113
- current_line = "" # Reset for the next line
114
-
115
- # print(f"\nCollected Text:\n{all_text.strip()}")
116
  return all_text.strip() if f10_count > 1 else "Heading not found"
117
-
118
  # import fitz
119
 
120
  # import tsadropboxretrieval
 
2
  from io import BytesIO
3
  import re
4
  import requests
5
+ def texts_from_pdf(pdfshareablelinks, heading_to_search):
6
+ print('intexts',pdfshareablelinks)
7
 
8
+
9
 
10
  # Case 1: If it's a shareable link
11
+ for link in pdfshareablelinks:
12
+ pdf_content = None
13
+
14
+ if link and ('http' in link or 'dropbox' in link):
15
  # Modify Dropbox link for direct download
16
+ if 'dl=0' in link:
17
+ link = link.replace('dl=0', 'dl=1')
18
 
19
  # Download the PDF content from the shareable link
20
+ response = requests.get(link)
21
  pdf_content = BytesIO(response.content) # Store the content in memory
22
  print('Downloaded from shareable link.')
23
+ # elif dbpdfpath:
24
+ # dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
25
+ # print('Dropbox team access initialized.')
26
+ # md, res = dbxTeam.files_download(path=dbpdfpath)
27
+ # pdf_content = BytesIO(res.content) # Store the content in memory
28
+ # print('Downloaded from Dropbox path.')
29
 
30
  # Check if the PDF content is available
31
+ if pdf_content is None:
32
+ raise ValueError("No valid PDF content found.")
33
+
34
+ # Open the PDF using fitz (PyMuPDF) directly from memory
35
+ pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
36
+ print('PDF opened in memory.')
37
+
38
+ all_text = "" # Initialize a string to store all text
39
+ current_line = "" # To build the current line
40
+ collecting_text = False # Track whether we're currently collecting text under the heading
41
+ f10_count = 0 # Counter for F10 headings
42
+ current_y = None # To track the y-coordinate
43
+
44
+ # Define a regex pattern to match headings
45
+ heading_pattern = re.compile(r"[A-Za-z]\d{2}") # Heading pattern (letter followed by two numbers)
46
+
47
+ # Loop through each page in the PDF
48
+ for page_num in range(pdf_document.page_count):
49
+ page = pdf_document.load_page(page_num)
50
+
51
+ # Get text as dictionary to extract lines
52
+ text_dict = page.get_text("dict")
53
+
54
+ # Collect header y-coordinates to determine header area
55
+ header_y_values = []
56
+
57
+ # First pass to collect y-coordinates for detecting header area
58
+ for block in text_dict['blocks']:
59
+ if 'lines' in block: # Check if 'lines' key exists
60
+ for line in block['lines']:
61
+ for span in line['spans']:
62
+ header_y_values.append(span['bbox'][1]) # Collect top y-coordinates of spans
63
+
64
+ # Determine a threshold for the header area: the topmost span's y-coordinate plus 10% of the page height
65
+ header_threshold = min(header_y_values) + (page.rect.height * 0.1) # Adding 10% for a buffer
66
+ # print(f"Header threshold for page {page_num + 1}: {header_threshold}")
67
+
68
+ # Iterate over blocks, lines, and spans to extract lines of text
69
+ for block in text_dict['blocks']:
70
+ if 'lines' in block: # Check if 'lines' key exists
71
+ for line in block['lines']:
72
+ for span in line['spans']:
73
+ span_text = span['text'].strip()
74
+ span_y = span['bbox'][1] # Get the top y-coordinate of the span
75
+
76
+ # Check if it's a heading based on the format
77
+ if heading_pattern.match(span_text):
78
+ if heading_to_search in span_text:
79
+ f10_count += 1 # Increment the F10 counter
80
+
81
+ # Start collecting text under the second occurrence of F10
82
+ if f10_count == 2:
83
+ collecting_text = True # Start collecting text
84
+ print(f"Starting collection under heading: {span_text}")
85
+
86
+ # Stop collecting text if we reach a new heading
87
+ if collecting_text:
88
+ # If we encounter a new heading, we stop the collection
89
+ if heading_pattern.match(span_text) and span_text != heading_to_search:
90
+ print(f"Ending collection at heading: {span_text}")
91
+ collecting_text = False # Stop collecting
92
+
93
+ return all_text.strip() # Return collected text
94
+
95
+ # If we're collecting text, add it to the output
96
+ if collecting_text:
97
+ # Exclude spans that fall within the header area
98
+ if span_y < header_threshold:
99
+ continue # Skip spans in the header area
100
+
101
+ if current_y is None:
102
+ current_y = span_y # Initialize the first y-coordinate
103
+
104
+ # Check if the current span belongs to the same line (based on y-coordinate)
105
+ if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
106
+ current_line += " " + span_text # Add span text to the current line
107
+ else:
108
+ # If it's a new line, append the current line to all_text
109
+ all_text += current_line.strip() + '\n' # Add line to all_text with a newline
110
+ current_line = span_text # Start the new line with the current span
111
+ current_y = span_y # Update the y-coordinate for the new line
112
+
113
+ # Append the current line if we hit a new line at the end of the page
114
+ if current_line:
115
+ all_text += current_line.strip() + '\n'
116
+ current_line = "" # Reset for the next line
117
+
118
+ # print(f"\nCollected Text:\n{all_text.strip()}")
119
  return all_text.strip() if f10_count > 1 else "Heading not found"
 
120
  # import fitz
121
 
122
  # import tsadropboxretrieval