File size: 894 Bytes
f499d4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
from typing import List, Dict


def clean_pages(pages: List[Dict]) -> List[Dict]:
    """
    Clean extracted PDF pages while preserving scientific content.

    For every page the text is collapsed to single-space-separated form,
    arXiv identifiers and short copyright notices are removed, and the
    whitespace is normalized once more so that a removed span does not
    leave a double space behind.

    Parameters
    ----------
    pages : List[Dict]
        Page dictionaries with keys: 'page_num', 'text'. A 'text' value of
        ``None`` is treated as an empty page.

    Returns
    -------
    List[Dict]
        New dictionaries with keys 'page_num' and the cleaned 'text'. The
        input dictionaries are not modified.

    Raises
    ------
    KeyError
        If a page dictionary lacks 'text' or 'page_num'.
    """
    # Compile once per call rather than looking the patterns up per page.
    whitespace = re.compile(r"\s+")
    boilerplate = (
        re.compile(r"arXiv:\d+\.\d+(v\d+)?", re.IGNORECASE),
        # The notice length is bounded: after whitespace collapsing the page
        # is a single line, so an unbounded ".*?" could swallow everything
        # between a header "©" and a footer "All rights reserved.".
        re.compile(r"©.{0,200}?All rights reserved\.", re.IGNORECASE),
    )

    cleaned = []

    for page in pages:
        # Guard against a None text value (e.g. a page with no text layer).
        text = page["text"] or ""

        # Collapse whitespace first so the boilerplate patterns can match
        # notices that were wrapped across several lines.
        text = whitespace.sub(" ", text)

        for pattern in boilerplate:
            text = pattern.sub("", text)

        # Removing a span from the middle of a sentence leaves two adjacent
        # spaces; normalize again before trimming.
        text = whitespace.sub(" ", text).strip()

        cleaned.append(
            {
                "page_num": page["page_num"],
                "text": text,
            }
        )

    return cleaned