Marthee committed on
Commit
79594bb
·
verified ·
1 Parent(s): 13f591b

Update pdftotext.py

Browse files
Files changed (1) hide show
  1. pdftotext.py +18 -17
pdftotext.py CHANGED
@@ -48,24 +48,25 @@ def texts_from_pdf(pdfshareablelink):
48
 
49
  # Iterate over blocks, lines, and spans to extract lines of text
50
  for block in text_dict['blocks']:
51
- for line in block['lines']:
52
- for span in line['spans']:
53
- span_text = span['text'].strip()
54
- span_y = span['bbox'][1] # Y-coordinate of the span (bbox[1] is the top y-coordinate)
55
-
56
- # Check if the current span belongs to the same line (based on y-coordinate)
57
- if current_y is None:
58
- current_y = span_y # Initialize the first y-coordinate
59
 
60
- if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
61
- # If the y-coordinate is close enough, add to the current line
62
- current_line += " " + span_text
63
- else:
64
- # If it's a new line, append the current line and reset
65
- all_text += current_line.strip() + '\n' # Add line to all_text with a newline
66
- current_line = span_text # Start the new line with the current span
67
- current_y = span_y # Update the y-coordinate for the new line
68
-
 
 
 
 
69
  # Append the last line of the page (if there's any)
70
  if current_line:
71
  all_text += current_line.strip() + '\n'
 
48
 
49
  # Iterate over blocks, lines, and spans to extract lines of text
50
  for block in text_dict['blocks']:
51
+ if 'lines' in block: # Check if 'lines' key exists
52
+ for line in block['lines']:
53
+ for span in line['spans']:
54
+ span_text = span['text'].strip()
55
+ span_y = span['bbox'][1] # Y-coordinate of the span (bbox[1] is the top y-coordinate)
 
 
 
56
 
57
+ # Check if the current span belongs to the same line (based on y-coordinate)
58
+ if current_y is None:
59
+ current_y = span_y # Initialize the first y-coordinate
60
+
61
+ if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
62
+ # If the y-coordinate is close enough, add to the current line
63
+ current_line += " " + span_text
64
+ else:
65
+ # If it's a new line, append the current line and reset
66
+ all_text += current_line.strip() + '\n' # Add line to all_text with a newline
67
+ current_line = span_text # Start the new line with the current span
68
+ current_y = span_y # Update the y-coordinate for the new line
69
+
70
  # Append the last line of the page (if there's any)
71
  if current_line:
72
  all_text += current_line.strip() + '\n'