Spaces:
Sleeping
Sleeping
Update pdftotext.py
Browse files- pdftotext.py +4 -3
pdftotext.py
CHANGED
|
@@ -2,8 +2,7 @@ import fitz # PyMuPDF
|
|
| 2 |
from io import BytesIO
|
| 3 |
import re
|
| 4 |
import requests
|
| 5 |
-
|
| 6 |
-
def texts_from_pdf(pdfshareablelink, heading_to_search):
|
| 7 |
print('intexts')
|
| 8 |
|
| 9 |
pdf_content = None
|
|
@@ -87,6 +86,7 @@ def texts_from_pdf(pdfshareablelink, heading_to_search):
|
|
| 87 |
if heading_pattern.match(span_text) and span_text != heading_to_search:
|
| 88 |
print(f"Ending collection at heading: {span_text}")
|
| 89 |
collecting_text = False # Stop collecting
|
|
|
|
| 90 |
return all_text.strip() # Return collected text
|
| 91 |
|
| 92 |
# If we're collecting text, add it to the output
|
|
@@ -112,7 +112,8 @@ def texts_from_pdf(pdfshareablelink, heading_to_search):
|
|
| 112 |
all_text += current_line.strip() + '\n'
|
| 113 |
current_line = "" # Reset for the next line
|
| 114 |
|
| 115 |
-
|
|
|
|
| 116 |
|
| 117 |
# import fitz
|
| 118 |
|
|
|
|
| 2 |
from io import BytesIO
|
| 3 |
import re
|
| 4 |
import requests
|
| 5 |
+
def texts_from_pdf(pdfshareablelink, heading_to_search):
|
|
|
|
| 6 |
print('intexts')
|
| 7 |
|
| 8 |
pdf_content = None
|
|
|
|
| 86 |
if heading_pattern.match(span_text) and span_text != heading_to_search:
|
| 87 |
print(f"Ending collection at heading: {span_text}")
|
| 88 |
collecting_text = False # Stop collecting
|
| 89 |
+
|
| 90 |
return all_text.strip() # Return collected text
|
| 91 |
|
| 92 |
# If we're collecting text, add it to the output
|
|
|
|
| 112 |
all_text += current_line.strip() + '\n'
|
| 113 |
current_line = "" # Reset for the next line
|
| 114 |
|
| 115 |
+
# print(f"\nCollected Text:\n{all_text.strip()}")
|
| 116 |
+
return all_text.strip() if f10_count > 1 else "Heading not found"
|
| 117 |
|
| 118 |
# import fitz
|
| 119 |
|