multilabel-news-classifier / utils /russian_text_utils.py
Solareva Taisia
feat(deploy): add Railway.app support
1b82a45
"""Russian text preprocessing helpers.
Keep this module lightweight: it is imported by the FastAPI service at startup.
"""
from __future__ import annotations
import re
from typing import Optional
_WS_RE = re.compile(r"\s+")
def prepare_text_for_tokenization(text: Optional[str]) -> str:
"""Prepare raw text for tokenizer input.
- Handles None safely
- Strips surrounding whitespace
- Collapses internal whitespace/newlines
"""
if text is None:
return ""
# Normalize whitespace and strip.
s = _WS_RE.sub(" ", str(text)).strip()
return s