File size: 997 Bytes
1bc3f18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re


# PDF extraction artifacts of the form "(cid:1234)".
_CID_PATTERN = re.compile(r'\(cid:\d+\)')

# One or more emoji/pictograph characters, each optionally followed by the
# invisible code points that only exist to modify or glue emoji together:
# variation selectors (U+FE0E / U+FE0F) and the zero-width joiner (U+200D).
# Consuming them together with the emoji prevents orphaned invisible
# characters from surviving in the output (e.g. the U+FE0F in "❤️"), while a
# joiner that is not attached to an emoji is left untouched.
_EMOJI_PATTERN = re.compile(
    "(?:"
    "["
    "\U00002500-\U00002BFF"  # box drawing .. misc symbols, dingbats, arrows
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "]"
    "[\uFE0E\uFE0F\u200D]*"
    ")+"
)

# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_PATTERN = re.compile(r'\s+')


def normalize_text(text: str) -> str:
    """Clean and normalize extracted text from any format (PDF/DOCX/MD/TXT).

    The following steps are applied in order:

    1. ``(cid:<digits>)`` artifacts are removed.
    2. Emoji and pictographs are removed, together with any variation
       selectors or zero-width joiners that directly follow them.
    3. Every run of whitespace (including newlines and tabs) is collapsed
       into a single space, and leading/trailing whitespace is stripped.

    Removed characters are deleted rather than replaced by a space, so text
    on either side of a removed character is joined.

    Args:
        text: The raw extracted text. Any falsy value (empty string or
            ``None``) is accepted.

    Returns:
        The normalized text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    text = _CID_PATTERN.sub('', text)
    text = _EMOJI_PATTERN.sub('', text)

    # Newlines and tabs need no separate handling: ``\s+`` matches them, so
    # this single pass both converts them to spaces and collapses runs.
    return _WHITESPACE_PATTERN.sub(' ', text).strip()