fake-news-api / src /data /preprocessing.py
aviseth's picture
Initial deployment
06e73d2
raw
history blame contribute delete
553 Bytes
import re
import html
from typing import List
# Matches only genuine web URLs: an explicit http:// or https:// scheme
# followed by the rest of the whitespace-delimited token. Requiring "://"
# keeps ordinary words such as "https" or "httpd" from being deleted.
_URL_PATTERN = re.compile(r'https?://\S+')
_WHITESPACE_RUN = re.compile(r'\s+')
# Curly double quotes and the en dash mapped to their ASCII equivalents.
_PUNCTUATION_MAP = str.maketrans({'\u201c': '"', '\u201d': '"', '\u2013': '-'})


def clean_text(text: str) -> str:
    """Clean and normalize raw text.

    Steps, applied in this order:

    1. Decode HTML entities (``&amp;`` -> ``&``) with ``html.unescape``.
    2. Remove ``http://`` / ``https://`` URLs.
    3. Replace curly double quotes with ``"`` and the en dash with ``-``.
    4. Collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; an empty string if nothing but whitespace/URLs
        remains.
    """
    # Entities are decoded first, so URL stripping sees the decoded text.
    decoded = html.unescape(text)
    without_urls = _URL_PATTERN.sub('', decoded)
    # One pass over the string instead of three chained .replace() calls.
    ascii_punct = without_urls.translate(_PUNCTUATION_MAP)
    # Removing URLs can leave doubled spaces; collapse them here.
    return _WHITESPACE_RUN.sub(' ', ascii_punct).strip()
def preprocess_batch(texts: List[str]) -> List[str]:
    """Run clean_text over every string in ``texts``.

    Args:
        texts: The strings to clean.

    Returns:
        A new list holding the cleaned strings, in the same order as the
        input.
    """
    return list(map(clean_text, texts))