Spaces:
Sleeping
Sleeping
File size: 894 Bytes
f499d4b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | import re
from typing import List, Dict
# Patterns are compiled once at import time so the per-page loop only runs them.
_WHITESPACE_RE = re.compile(r"\s+")
_ARXIV_ID_RE = re.compile(r"arXiv:\d+\.\d+(v\d+)?", flags=re.IGNORECASE)
_COPYRIGHT_RE = re.compile(r"©.*?All rights reserved\.", flags=re.IGNORECASE)


def clean_pages(pages: List[Dict]) -> List[Dict]:
    """
    Clean extracted PDF pages while preserving scientific content.

    Each page's text has runs of whitespace collapsed to a single space,
    new-style arXiv identifiers (``arXiv:NNNN.NNNNN`` with optional ``vN``)
    and ``© ... All rights reserved.`` notices removed, and leading/trailing
    whitespace stripped. The input list and its dictionaries are not modified.

    Parameters
    ----------
    pages : List[Dict]
        Page dictionaries with keys: 'page_num', 'text'. A ``None`` value
        for 'text' is treated as empty text.

    Returns
    -------
    List[Dict]
        New page dictionaries holding only 'page_num' and the cleaned
        'text', in the same order as the input.

    Raises
    ------
    KeyError
        If a page dictionary lacks 'page_num' or 'text'.
    """
    cleaned = []
    for page in pages:
        # Treat a missing-but-present text value (None) as an empty page
        # instead of failing inside the regex engine.
        text = page["text"] or ""

        # First pass: flatten all whitespace (including newlines) so the
        # copyright pattern below can match notices that wrap across lines.
        text = _WHITESPACE_RE.sub(" ", text)

        # Remove common boilerplate patterns.
        text = _ARXIV_ID_RE.sub("", text)
        # NOTE(review): the lazy span is unbounded, so a stray "©" far ahead
        # of the first "All rights reserved." removes everything in between.
        text = _COPYRIGHT_RE.sub("", text)

        # Second pass: a removal in the middle of a sentence leaves the two
        # spaces that surrounded it; collapse them back to one.
        text = _WHITESPACE_RE.sub(" ", text)

        cleaned.append(
            {
                "page_num": page["page_num"],
                "text": text.strip(),
            }
        )
    return cleaned
|