Spaces:
Paused
Paused
Update pdftotext.py
Browse files
- pdftotext.py (+18 -17)
pdftotext.py
CHANGED
|
@@ -48,24 +48,25 @@ def texts_from_pdf(pdfshareablelink):
|
|
| 48 |
|
| 49 |
# Iterate over blocks, lines, and spans to extract lines of text
|
| 50 |
for block in text_dict['blocks']:
|
| 51 |
-
|
| 52 |
-
for
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
# Check if the current span belongs to the same line (based on y-coordinate)
|
| 57 |
-
if current_y is None:
|
| 58 |
-
current_y = span_y # Initialize the first y-coordinate
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# Append the last line of the page (if there's any)
|
| 70 |
if current_line:
|
| 71 |
all_text += current_line.strip() + '\n'
|
|
|
|
| 48 |
|
| 49 |
# Iterate over blocks, lines, and spans to extract lines of text
|
| 50 |
for block in text_dict['blocks']:
|
| 51 |
+
if 'lines' in block: # Check if 'lines' key exists
|
| 52 |
+
for line in block['lines']:
|
| 53 |
+
for span in line['spans']:
|
| 54 |
+
span_text = span['text'].strip()
|
| 55 |
+
span_y = span['bbox'][1] # Y-coordinate of the span (bbox[1] is the top y-coordinate)
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
# Check if the current span belongs to the same line (based on y-coordinate)
|
| 58 |
+
if current_y is None:
|
| 59 |
+
current_y = span_y # Initialize the first y-coordinate
|
| 60 |
+
|
| 61 |
+
if abs(current_y - span_y) < 2: # Threshold to determine if it's the same line
|
| 62 |
+
# If the y-coordinate is close enough, add to the current line
|
| 63 |
+
current_line += " " + span_text
|
| 64 |
+
else:
|
| 65 |
+
# If it's a new line, append the current line and reset
|
| 66 |
+
all_text += current_line.strip() + '\n' # Add line to all_text with a newline
|
| 67 |
+
current_line = span_text # Start the new line with the current span
|
| 68 |
+
current_y = span_y # Update the y-coordinate for the new line
|
| 69 |
+
|
| 70 |
# Append the last line of the page (if there's any)
|
| 71 |
if current_line:
|
| 72 |
all_text += current_line.strip() + '\n'
|