Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List, Dict | |
# Patterns are compiled once at import time rather than on every page.
_WHITESPACE_RE = re.compile(r"\s+")
_ARXIV_ID_RE = re.compile(r"arXiv:\d+\.\d+(v\d+)?", flags=re.IGNORECASE)
# NOTE(review): once whitespace is collapsed the page is a single line, so
# this lazy match can span from a "©" to the nearest later
# "All rights reserved." however far away it is. Consider bounding the gap
# if real content is ever observed between the two.
_COPYRIGHT_RE = re.compile(r"©.*?All rights reserved\.", flags=re.IGNORECASE)


def clean_pages(pages: List[Dict]) -> List[Dict]:
    """
    Clean extracted PDF pages while preserving scientific content.

    Each page's text has runs of whitespace collapsed to single spaces,
    arXiv identifiers (``arXiv:NNNN.NNNN`` with an optional ``vN`` suffix)
    removed, and copyright notices of the form ``© ... All rights reserved.``
    removed. Both removals are case-insensitive.

    Parameters
    ----------
    pages : List[Dict]
        Page dictionaries with keys: 'page_num', 'text'. A 'text' value of
        ``None`` is treated as an empty string.

    Returns
    -------
    List[Dict]
        New list of new dictionaries, one per input page, each holding only
        'page_num' (copied unchanged) and the cleaned 'text'. The input
        dictionaries are not modified.

    Raises
    ------
    KeyError
        If a page dictionary lacks 'page_num' or 'text'.
    """
    cleaned = []
    for page in pages:
        text = page["text"]
        if text is None:
            # Tolerate pages with no extracted text instead of failing in re.
            text = ""

        # Collapse whitespace first so a copyright notice that wraps across
        # lines is still matched by the single-line pattern below.
        text = _WHITESPACE_RE.sub(" ", text)

        # Remove common boilerplate patterns.
        text = _ARXIV_ID_RE.sub("", text)
        text = _COPYRIGHT_RE.sub("", text)

        # Deleting boilerplate from the middle of a line leaves the spaces on
        # both sides adjacent; collapse again so no double spaces remain.
        text = _WHITESPACE_RE.sub(" ", text).strip()

        cleaned.append(
            {
                "page_num": page["page_num"],
                "text": text,
            }
        )
    return cleaned