Marthee committed on
Commit
7c0ffb1
·
verified ·
1 Parent(s): 79594bb

Update pdftotext.py

Browse files
Files changed (1) hide show
  1. pdftotext.py +166 -50
pdftotext.py CHANGED
@@ -1,25 +1,23 @@
1
- import fitz
2
-
3
- import tsadropboxretrieval
4
  from io import BytesIO
 
5
  import requests
6
- def texts_from_pdf(pdfshareablelink):
 
7
  print('intexts')
8
-
9
  pdf_content = None
10
-
11
  # Case 1: If it's a shareable link
12
  if pdfshareablelink and ('http' in pdfshareablelink or 'dropbox' in pdfshareablelink):
13
  # Modify Dropbox link for direct download
14
  if 'dl=0' in pdfshareablelink:
15
  pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')
16
-
17
  # Download the PDF content from the shareable link
18
  response = requests.get(pdfshareablelink)
19
  pdf_content = BytesIO(response.content) # Store the content in memory
20
  print('Downloaded from shareable link.')
21
-
22
- # Case 2: If it's a Dropbox path, use the Dropbox API to download
23
  elif dbpdfpath:
24
  dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
25
  print('Dropbox team access initialized.')
@@ -34,62 +32,180 @@ def texts_from_pdf(pdfshareablelink):
34
  # Open the PDF using fitz (PyMuPDF) directly from memory
35
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
36
  print('PDF opened in memory.')
37
-
38
  all_text = "" # Initialize a string to store all text
39
  current_line = "" # To build the current line
40
- current_y = None # Track the y-coordinate of the current line
41
-
 
 
 
 
 
42
  # Loop through each page in the PDF
43
  for page_num in range(pdf_document.page_count):
44
  page = pdf_document.load_page(page_num)
45
-
46
  # Get text as dictionary to extract lines
47
  text_dict = page.get_text("dict")
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
 
49
  # Iterate over blocks, lines, and spans to extract lines of text
50
  for block in text_dict['blocks']:
51
  if 'lines' in block: # Check if 'lines' key exists
52
  for line in block['lines']:
53
  for span in line['spans']:
54
  span_text = span['text'].strip()
55
- span_y = span['bbox'][1] # Y-coordinate of the span (bbox[1] is the top y-coordinate)
56
-
57
- # Check if the current span belongs to the same line (based on y-coordinate)
58
- if current_y is None:
59
- current_y = span_y # Initialize the first y-coordinate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
62
- # If the y-coordinate is close enough, add to the current line
63
- current_line += " " + span_text
64
- else:
65
- # If it's a new line, append the current line and reset
66
- all_text += current_line.strip() + '\n' # Add line to all_text with a newline
67
- current_line = span_text # Start the new line with the current span
68
- current_y = span_y # Update the y-coordinate for the new line
69
-
70
- # Append the last line of the page (if there's any)
71
  if current_line:
72
  all_text += current_line.strip() + '\n'
73
- current_line = "" # Reset after each page
74
- # all_text = all_text.replace('\n', ' ')
75
- # return all_lines
76
- print(all_text)
77
- return all_text
78
- # print('intexts')
79
- # dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')
80
- # print('dbdone')
81
- # md, res =dbxTeam.files_download(path=dbpdfpath)
82
- # print('downloaded')
83
- # dataDoc = res.content
84
- # print('l')
85
- # pdf_document = fitz.open('pdf',dataDoc)
86
- # print('k')
87
- # alltexts=''
88
- # for page_num in range(pdf_document.page_count):
89
- # page = pdf_document[page_num]
90
- # text_instances = page.get_text()
91
- # alltexts+=text_instances
92
-
93
- # # alltexts = alltexts.replace('\n', ' ')
94
- # return alltexts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
 
1
+ import fitz # PyMuPDF
 
 
2
  from io import BytesIO
3
+ import re
4
  import requests
5
+
6
+ def texts_from_pdf(pdfshareablelink, heading_to_search):
7
  print('intexts')
8
+
9
  pdf_content = None
10
+
11
  # Case 1: If it's a shareable link
12
  if pdfshareablelink and ('http' in pdfshareablelink or 'dropbox' in pdfshareablelink):
13
  # Modify Dropbox link for direct download
14
  if 'dl=0' in pdfshareablelink:
15
  pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')
16
+
17
  # Download the PDF content from the shareable link
18
  response = requests.get(pdfshareablelink)
19
  pdf_content = BytesIO(response.content) # Store the content in memory
20
  print('Downloaded from shareable link.')
 
 
21
  elif dbpdfpath:
22
  dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
23
  print('Dropbox team access initialized.')
 
32
  # Open the PDF using fitz (PyMuPDF) directly from memory
33
  pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
34
  print('PDF opened in memory.')
35
+
36
  all_text = "" # Initialize a string to store all text
37
  current_line = "" # To build the current line
38
+ collecting_text = False # Track whether we're currently collecting text under the heading
39
+ f10_count = 0 # Counter for F10 headings
40
+ current_y = None # To track the y-coordinate
41
+
42
+ # Define a regex pattern to match headings
43
+ heading_pattern = re.compile(r"[A-Za-z]\d{2}") # Heading pattern (letter followed by two numbers)
44
+
45
  # Loop through each page in the PDF
46
  for page_num in range(pdf_document.page_count):
47
  page = pdf_document.load_page(page_num)
48
+
49
  # Get text as dictionary to extract lines
50
  text_dict = page.get_text("dict")
51
+
52
+ # Collect header y-coordinates to determine header area
53
+ header_y_values = []
54
+
55
+ # First pass to collect y-coordinates for detecting header area
56
+ for block in text_dict['blocks']:
57
+ if 'lines' in block: # Check if 'lines' key exists
58
+ for line in block['lines']:
59
+ for span in line['spans']:
60
+ header_y_values.append(span['bbox'][1]) # Collect top y-coordinates of spans
61
 
62
+ # Determine a threshold for the header area: the topmost span's y-coordinate plus 10% of the page height
63
+ header_threshold = min(header_y_values) + (page.rect.height * 0.1) # Adding 10% for a buffer
64
+ print(f"Header threshold for page {page_num + 1}: {header_threshold}")
65
+
66
  # Iterate over blocks, lines, and spans to extract lines of text
67
  for block in text_dict['blocks']:
68
  if 'lines' in block: # Check if 'lines' key exists
69
  for line in block['lines']:
70
  for span in line['spans']:
71
  span_text = span['text'].strip()
72
+ span_y = span['bbox'][1] # Get the top y-coordinate of the span
73
+
74
+ # Check if it's a heading based on the format
75
+ if heading_pattern.match(span_text):
76
+ if heading_to_search in span_text:
77
+ f10_count += 1 # Increment the F10 counter
78
+
79
+ # Start collecting text under the second occurrence of the searched heading (heading_to_search)
80
+ if f10_count == 2:
81
+ collecting_text = True # Start collecting text
82
+ print(f"Starting collection under heading: {span_text}")
83
+
84
+ # Stop collecting text if we reach a new heading
85
+ if collecting_text:
86
+ # If we encounter a new heading, we stop the collection
87
+ if heading_pattern.match(span_text) and span_text != heading_to_search:
88
+ print(f"Ending collection at heading: {span_text}")
89
+ collecting_text = False # Stop collecting
90
+ return all_text.strip() # Return collected text
91
+
92
+ # If we're collecting text, add it to the output
93
+ if collecting_text:
94
+ # Exclude spans that fall within the header area
95
+ if span_y < header_threshold:
96
+ continue # Skip spans in the header area
97
+
98
+ if current_y is None:
99
+ current_y = span_y # Initialize the first y-coordinate
100
 
101
+ # Check if the current span belongs to the same line (based on y-coordinate)
102
+ if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
103
+ current_line += " " + span_text # Add span text to the current line
104
+ else:
105
+ # If it's a new line, append the current line to all_text
106
+ all_text += current_line.strip() + '\n' # Add line to all_text with a newline
107
+ current_line = span_text # Start the new line with the current span
108
+ current_y = span_y # Update the y-coordinate for the new line
109
+
110
+ # Append the pending line, if any, once the end of the page is reached
111
  if current_line:
112
  all_text += current_line.strip() + '\n'
113
+ current_line = "" # Reset for the next line
114
+
115
+ return all_text.strip() if f10_count == 2 else "Second heading not found"
116
+
117
+ # import fitz
118
+
119
+ # import tsadropboxretrieval
120
+ # from io import BytesIO
121
+ # import requests
122
+ # def texts_from_pdf(pdfshareablelink):
123
+ # print('intexts')
124
+
125
+ # pdf_content = None
126
+
127
+ # # Case 1: If it's a shareable link
128
+ # if pdfshareablelink and ('http' in pdfshareablelink or 'dropbox' in pdfshareablelink):
129
+ # # Modify Dropbox link for direct download
130
+ # if 'dl=0' in pdfshareablelink:
131
+ # pdfshareablelink = pdfshareablelink.replace('dl=0', 'dl=1')
132
+
133
+ # # Download the PDF content from the shareable link
134
+ # response = requests.get(pdfshareablelink)
135
+ # pdf_content = BytesIO(response.content) # Store the content in memory
136
+ # print('Downloaded from shareable link.')
137
+
138
+ # # Case 2: If it's a Dropbox path, use the Dropbox API to download
139
+ # elif dbpdfpath:
140
+ # dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
141
+ # print('Dropbox team access initialized.')
142
+ # md, res = dbxTeam.files_download(path=dbpdfpath)
143
+ # pdf_content = BytesIO(res.content) # Store the content in memory
144
+ # print('Downloaded from Dropbox path.')
145
+
146
+ # # Check if the PDF content is available
147
+ # if pdf_content is None:
148
+ # raise ValueError("No valid PDF content found.")
149
+
150
+ # # Open the PDF using fitz (PyMuPDF) directly from memory
151
+ # pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
152
+ # print('PDF opened in memory.')
153
+
154
+ # all_text = "" # Initialize a string to store all text
155
+ # current_line = "" # To build the current line
156
+ # current_y = None # Track the y-coordinate of the current line
157
+
158
+ # # Loop through each page in the PDF
159
+ # for page_num in range(pdf_document.page_count):
160
+ # page = pdf_document.load_page(page_num)
161
+
162
+ # # Get text as dictionary to extract lines
163
+ # text_dict = page.get_text("dict")
164
+
165
+ # # Iterate over blocks, lines, and spans to extract lines of text
166
+ # for block in text_dict['blocks']:
167
+ # if 'lines' in block: # Check if 'lines' key exists
168
+ # for line in block['lines']:
169
+ # for span in line['spans']:
170
+ # span_text = span['text'].strip()
171
+ # span_y = span['bbox'][1] # Y-coordinate of the span (bbox[1] is the top y-coordinate)
172
+
173
+ # # Check if the current span belongs to the same line (based on y-coordinate)
174
+ # if current_y is None:
175
+ # current_y = span_y # Initialize the first y-coordinate
176
+
177
+ # if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
178
+ # # If the y-coordinate is close enough, add to the current line
179
+ # current_line += " " + span_text
180
+ # else:
181
+ # # If it's a new line, append the current line and reset
182
+ # all_text += current_line.strip() + '\n' # Add line to all_text with a newline
183
+ # current_line = span_text # Start the new line with the current span
184
+ # current_y = span_y # Update the y-coordinate for the new line
185
+
186
+ # # Append the last line of the page (if there's any)
187
+ # if current_line:
188
+ # all_text += current_line.strip() + '\n'
189
+ # current_line = "" # Reset after each page
190
+ # # all_text = all_text.replace('\n', ' ')
191
+ # # return all_lines
192
+ # print(all_text)
193
+ # return all_text
194
+ # # print('intexts')
195
+ # # dbxTeam= tsadropboxretrieval.ADR_Access_DropboxTeam('user')
196
+ # # print('dbdone')
197
+ # # md, res =dbxTeam.files_download(path=dbpdfpath)
198
+ # # print('downloaded')
199
+ # # dataDoc = res.content
200
+ # # print('l')
201
+ # # pdf_document = fitz.open('pdf',dataDoc)
202
+ # # print('k')
203
+ # # alltexts=''
204
+ # # for page_num in range(pdf_document.page_count):
205
+ # # page = pdf_document[page_num]
206
+ # # text_instances = page.get_text()
207
+ # # alltexts+=text_instances
208
+
209
+ # # # alltexts = alltexts.replace('\n', ' ')
210
+ # # return alltexts
211