# Hugging Face Space: rohitdeshmukh318/RAG (status: Sleeping)
# File size: 894 Bytes
# Revision: f499d4b

import re
from typing import List, Dict


# Patterns are compiled once at import time so the per-page loop does not
# repeat the pattern lookup.
_WHITESPACE_RE = re.compile(r"\s+")
_ARXIV_STAMP_RE = re.compile(r"arXiv:\d+\.\d+(v\d+)?", re.IGNORECASE)
# The span between "©" and "All rights reserved." is capped at 200 characters
# and may not cross another "©". Without the cap, a "©" near the top of a page
# and an "All rights reserved." near the bottom would delete all the text in
# between, because the page has already been flattened onto a single line.
_COPYRIGHT_RE = re.compile(r"©[^©]{0,200}?All rights reserved\.", re.IGNORECASE)


def _clean_text(text: str) -> str:
    """Return *text* with boilerplate removed and whitespace normalised."""
    # Flatten first so notices broken across lines still match the patterns.
    text = _WHITESPACE_RE.sub(" ", text)
    text = _ARXIV_STAMP_RE.sub("", text)
    text = _COPYRIGHT_RE.sub("", text)
    # Removing a span leaves the spaces on both sides of it behind, so the
    # whitespace has to be collapsed again after the removals.
    return _WHITESPACE_RE.sub(" ", text).strip()


def clean_pages(pages: List[Dict]) -> List[Dict]:
    """
    Clean extracted PDF pages while preserving scientific content.

    For every page the text is flattened to single-space-separated words,
    arXiv identifier stamps (e.g. ``arXiv:1706.03762v5``) and short
    "© ... All rights reserved." notices are removed, and leading/trailing
    whitespace is stripped. The input dictionaries are not modified.

    Parameters
    ----------
    pages : List[Dict]
        Page dictionaries with keys: 'page_num', 'text'. A 'text' value of
        ``None`` is treated as an empty string.

    Returns
    -------
    List[Dict]
        New dictionaries, one per input page and in the same order, each
        holding only 'page_num' and the cleaned 'text'.

    Raises
    ------
    KeyError
        If a page dictionary lacks 'page_num' or 'text'.
    """
    return [
        {
            "page_num": page["page_num"],
            # `or ""` guards against a None text value, which would
            # otherwise raise TypeError inside re.
            "text": _clean_text(page["text"] or ""),
        }
        for page in pages
    ]