Marthee committed on
Commit
96ac8c5
·
verified ·
1 Parent(s): f8d7052

Update pdftotext.py

Browse files
Files changed (1) hide show
  1. pdftotext.py +11 -2
pdftotext.py CHANGED
@@ -141,6 +141,13 @@ def apiFiltering(apitext):
141
  return filtered_items
142
 
143
 
 
 
 
 
 
 
 
144
  def texts_from_pdfAllText(link):
145
 
146
  pdf_content = None
@@ -167,8 +174,10 @@ def texts_from_pdfAllText(link):
167
  text_instances = page.get_text()
168
  all_text+=text_instances
169
 
170
- print(all_text)
171
- return all_text
 
 
172
  # import fitz
173
 
174
  # import tsadropboxretrieval
 
141
  return filtered_items
142
 
143
 
144
+ def clean_text(text):
145
+ # Replace all newlines and tabs with a space
146
+ text = re.sub(r'[\n\t]+', ' ', text)
147
+ # Collapse multiple spaces into one
148
+ text = re.sub(r'\s+', ' ', text)
149
+ return text.strip()
150
+
151
  def texts_from_pdfAllText(link):
152
 
153
  pdf_content = None
 
174
  text_instances = page.get_text()
175
  all_text+=text_instances
176
 
177
+
178
+ cleaned_text = clean_text(all_text)
179
+ print(cleaned_text)
180
+ return cleaned_text
181
  # import fitz
182
 
183
  # import tsadropboxretrieval