File size: 553 Bytes
06e73d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import re
import html
from typing import List


def clean_text(text: str) -> str:
    """Clean and normalize raw text.

    The following steps are applied in order:

    1. HTML entities are decoded with ``html.unescape``.
    2. URLs are removed: every ``http://`` or ``https://`` prefix together
       with the run of non-whitespace characters that follows it.
    3. Curly double quotes (U+201C, U+201D) become ``"`` and the en dash
       (U+2013) becomes ``-``.
    4. Every run of whitespace is collapsed to a single space and the
       result is stripped of leading/trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = html.unescape(text)
    # Require the "://" scheme separator so that ordinary words which merely
    # contain "http" (e.g. "httpd", "http-based") are not deleted.
    text = re.sub(r'https?://\S+', '', text)
    # One translate pass replaces the typographic characters with ASCII.
    text = text.translate({0x201C: '"', 0x201D: '"', 0x2013: '-'})
    # \s+ also covers newlines/tabs, so multi-line input becomes one line.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def preprocess_batch(texts: List[str]) -> List[str]:
    """Return a new list holding ``clean_text`` of each input, in order."""
    return list(map(clean_text, texts))