Spaces:
Running
Running
cd@bziiit.com
committed on
Commit
·
85a9e7b
1
Parent(s):
a8ccf10
Remove ebook converter demo watermarks from PDF text extraction
Browse files
- pdf_processing.py +1 -0
pdf_processing.py
CHANGED
|
@@ -17,6 +17,7 @@ def load_and_preprocess_pdf(pdf_path):
|
|
| 17 |
for page in pdf.pages:
|
| 18 |
text += page.extract_text() or ""
|
| 19 |
|
|
|
|
| 20 |
return text
|
| 21 |
|
| 22 |
def split_text(text):
|
|
|
|
| 17 |
for page in pdf.pages:
|
| 18 |
text += page.extract_text() or ""
|
| 19 |
|
| 20 |
+
text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text, flags=re.IGNORECASE)
|
| 21 |
return text
|
| 22 |
|
| 23 |
def split_text(text):
|