Spaces:
Paused
Paused
Update pdftotext.py
Browse files - pdftotext.py +11 -2
pdftotext.py
CHANGED
|
@@ -141,6 +141,13 @@ def apiFiltering(apitext):
|
|
| 141 |
return filtered_items
|
| 142 |
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
def texts_from_pdfAllText(link):
|
| 145 |
|
| 146 |
pdf_content = None
|
|
@@ -167,8 +174,10 @@ def texts_from_pdfAllText(link):
|
|
| 167 |
text_instances = page.get_text()
|
| 168 |
all_text+=text_instances
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
# import fitz
|
| 173 |
|
| 174 |
# import tsadropboxretrieval
|
|
|
|
| 141 |
return filtered_items
|
| 142 |
|
| 143 |
|
| 144 |
+
def clean_text(text):
|
| 145 |
+
# Replace all newlines and tabs with a space
|
| 146 |
+
text = re.sub(r'[\n\t]+', ' ', text)
|
| 147 |
+
# Collapse multiple spaces into one
|
| 148 |
+
text = re.sub(r'\s+', ' ', text)
|
| 149 |
+
return text.strip()
|
| 150 |
+
|
| 151 |
def texts_from_pdfAllText(link):
|
| 152 |
|
| 153 |
pdf_content = None
|
|
|
|
| 174 |
text_instances = page.get_text()
|
| 175 |
all_text+=text_instances
|
| 176 |
|
| 177 |
+
|
| 178 |
+
cleaned_text = clean_text(all_text)
|
| 179 |
+
print(cleaned_text)
|
| 180 |
+
return cleaned_text
|
| 181 |
# import fitz
|
| 182 |
|
| 183 |
# import tsadropboxretrieval
|