cd@bziiit.com committed on
Commit
85a9e7b
·
1 Parent(s): a8ccf10

Remove ebook converter demo watermarks from PDF text extraction

Browse files
Files changed (1) hide show
  1. pdf_processing.py +1 -0
pdf_processing.py CHANGED
@@ -17,6 +17,7 @@ def load_and_preprocess_pdf(pdf_path):
17
  for page in pdf.pages:
18
  text += page.extract_text() or ""
19
 
 
20
  return text
21
 
22
  def split_text(text):
 
17
  for page in pdf.pages:
18
  text += page.extract_text() or ""
19
 
20
+ text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text, flags=re.IGNORECASE)
21
  return text
22
 
23
  def split_text(text):