Solareva Taisia committed
Commit · 090e11e
1 Parent(s): 28caba5
fix(api): add internal utils package to avoid bad imports
- scripts/make_public_snapshot.py +1 -0
- utils/__init__.py +48 -0
- utils/data_processing.py +62 -0
- utils/russian_text_utils.py +27 -0
- utils/text_processing.py +43 -0
- utils/tokenization.py +145 -0
scripts/make_public_snapshot.py CHANGED

@@ -40,6 +40,7 @@ INCLUDE_DIRS = [
     "experiments",
     "models",      # python code only; weights excluded by patterns below
     "monitoring",  # python code only; prediction logs excluded by patterns below
+    "utils",
     "nginx",
     "pages",
     "scripts",
utils/__init__.py ADDED

@@ -0,0 +1,48 @@
+"""Utility helpers used across API, training scripts, and dashboards.
+
+Important: keep this package lightweight at import time. In production, we want
+`uvicorn api.main:app` to import quickly and bind to the port; heavy deps like
+transformers/torch should only be imported when actually needed.
+
+This package also prevents ambiguous imports where `import utils` could resolve
+to an unrelated third-party PyPI package named `utils`.
+"""
+
+from __future__ import annotations
+
+from importlib import import_module
+from typing import Any
+
+__all__ = [
+    "RussianTextTokenizer",
+    "create_tokenizer",
+    "tokenize_text_pair",
+    "prepare_text_for_tokenization",
+    "normalise_text",
+    "create_vocab",
+    "process_tags",
+    "build_label_mapping",
+    "create_target_encoding",
+]
+
+_LAZY: dict[str, tuple[str, str]] = {
+    "RussianTextTokenizer": ("utils.tokenization", "RussianTextTokenizer"),
+    "create_tokenizer": ("utils.tokenization", "create_tokenizer"),
+    "tokenize_text_pair": ("utils.tokenization", "tokenize_text_pair"),
+    "prepare_text_for_tokenization": ("utils.russian_text_utils", "prepare_text_for_tokenization"),
+    "normalise_text": ("utils.text_processing", "normalise_text"),
+    "create_vocab": ("utils.text_processing", "create_vocab"),
+    "process_tags": ("utils.data_processing", "process_tags"),
+    "build_label_mapping": ("utils.data_processing", "build_label_mapping"),
+    "create_target_encoding": ("utils.data_processing", "create_target_encoding"),
+}
+
+
+def __getattr__(name: str) -> Any:
+    if name not in _LAZY:
+        raise AttributeError(f"module 'utils' has no attribute {name!r}")
+    module_name, attr_name = _LAZY[name]
+    mod = import_module(module_name)
+    return getattr(mod, attr_name)
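The package relies on module-level `__getattr__` (PEP 562), so `import utils` stays cheap and `transformers`/`torch` only load on first attribute access. A minimal usage sketch (illustrative, not part of the commit):

    import utils  # fast: nothing from _LAZY has been imported yet

    # First access of the name triggers import_module("utils.text_processing");
    # import_module caches the submodule in sys.modules afterwards.
    clean = utils.normalise_text("Привет, мир!")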
utils/data_processing.py ADDED

@@ -0,0 +1,62 @@
+"""Label/tag processing helpers for multi-label classification."""
+
+from __future__ import annotations
+
+from typing import Dict, Iterable, List, Sequence, Union
+
+import torch
+
+
+def process_tags(tags: Union[str, Sequence[str], None], sep: str = ",") -> List[str]:
+    """Convert raw tags to a list of normalized tag strings."""
+    if tags is None:
+        return []
+    if isinstance(tags, str):
+        parts = [t.strip() for t in tags.split(sep)]
+        return [p for p in parts if p]
+    # Sequence[str]
+    out: List[str] = []
+    for t in tags:
+        if t is None:
+            continue
+        s = str(t).strip()
+        if s:
+            out.append(s)
+    return out
+
+
+def build_label_mapping(
+    df,
+    *,
+    tags_col: str = "tags",
+    sep: str = ",",
+) -> Dict[str, int]:
+    """Build a tag->index mapping from a dataframe-like object.
+
+    Expects `df[tags_col]` to contain either comma-separated strings or lists.
+    """
+    tag_set = set()
+    for raw in df[tags_col].tolist():
+        tag_set.update(process_tags(raw, sep=sep))
+    return {tag: i for i, tag in enumerate(sorted(tag_set))}
+
+
+def create_target_encoding(
+    tag_lists: Iterable[Union[str, Sequence[str], None]],
+    label_to_idx: Dict[str, int],
+    *,
+    sep: str = ",",
+    dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """Create a multi-hot target tensor of shape [N, num_labels]."""
+    tag_lists = list(tag_lists)
+    num_labels = len(label_to_idx)
+    y = torch.zeros((len(tag_lists), num_labels), dtype=dtype)
+    for i, raw in enumerate(tag_lists):
+        for tag in process_tags(raw, sep=sep):
+            j = label_to_idx.get(tag)
+            if j is not None:
+                y[i, j] = 1.0
+    return y
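A quick sketch of the multi-hot path, using hypothetical tags (`create_target_encoding` accepts a string, a sequence, or None per row):

    label_to_idx = {"politics": 0, "sport": 1, "tech": 2}
    y = create_target_encoding(["sport, tech", None, ["politics"]], label_to_idx)
    # y -> tensor([[0., 1., 1.],
    #              [0., 0., 0.],    # None normalizes to an empty tag list
    #              [1., 0., 0.]])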
utils/russian_text_utils.py ADDED

@@ -0,0 +1,27 @@
+"""Russian text preprocessing helpers.
+
+Keep this module lightweight: it is imported by the FastAPI service at startup.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+_WS_RE = re.compile(r"\s+")
+
+
+def prepare_text_for_tokenization(text: Optional[str]) -> str:
+    """Prepare raw text for tokenizer input.
+
+    - Handles None safely
+    - Strips surrounding whitespace
+    - Collapses internal whitespace/newlines
+    """
+    if text is None:
+        return ""
+    # Normalize whitespace and strip.
+    s = _WS_RE.sub(" ", str(text)).strip()
+    return s
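The whitespace contract, shown on hypothetical inputs (a sketch, not repo tests):

    assert prepare_text_for_tokenization(None) == ""
    assert prepare_text_for_tokenization("  Привет\n\nмир\t!  ") == "Привет мир !"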
utils/text_processing.py ADDED

@@ -0,0 +1,43 @@
+"""Basic text normalization and vocabulary building utilities."""
+
+from __future__ import annotations
+
+import re
+from collections import Counter
+from typing import Dict
+
+# Keep letters (Latin + Cyrillic), digits, and whitespace.
+_CLEAN_RE = re.compile(r"[^0-9a-zA-Z\u0400-\u04FF\s]+", flags=re.UNICODE)
+_WS_RE = re.compile(r"\s+")
+
+
+def normalise_text(text: str) -> str:
+    """Lowercase, remove punctuation/special chars, and collapse whitespace."""
+    s = (text or "").lower()
+    s = _CLEAN_RE.sub(" ", s)
+    s = _WS_RE.sub(" ", s).strip()
+    return s
+
+
+def create_vocab(text: str, vocab_size: int = 50000) -> Dict[str, int]:
+    """Create a simple frequency-based vocabulary mapping.
+
+    Always includes:
+    - #PAD# -> 0
+    - #UNKN# -> 1
+    """
+    vocab: Dict[str, int] = {"#PAD#": 0, "#UNKN#": 1}
+    if vocab_size <= 0:
+        return vocab
+
+    tokens = normalise_text(text).split()
+    counts = Counter(tokens)
+
+    for word, _ in counts.most_common(max(0, vocab_size)):
+        if word in vocab:
+            continue
+        vocab[word] = len(vocab)
+
+    return vocab
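A toy sketch of `create_vocab` ordering (hypothetical corpus; entries rank by frequency after the two reserved ids):

    vocab = create_vocab("Привет, мир! Привет...", vocab_size=10)
    # normalise_text lowercases and strips punctuation, so counts are
    # {"привет": 2, "мир": 1}, giving:
    # {"#PAD#": 0, "#UNKN#": 1, "привет": 2, "мир": 3}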
utils/tokenization.py ADDED

@@ -0,0 +1,145 @@
+"""Tokenization utilities used for transformer models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+from transformers import AutoTokenizer
+
+
+@dataclass
+class RussianTextTokenizer:
+    """Thin wrapper around a HuggingFace tokenizer with sane defaults."""
+
+    model_name: str = "DeepPavlov/rubert-base-cased"
+    max_length: int = 128
+    padding: Union[bool, str] = "max_length"
+    truncation: bool = True
+
+    def __post_init__(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
+
+    def get_vocab_size(self) -> int:
+        return int(getattr(self.tokenizer, "vocab_size", len(self.tokenizer.get_vocab())))
+
+    def get_special_tokens(self) -> Dict[str, Optional[int]]:
+        return {
+            "pad_token_id": self.tokenizer.pad_token_id,
+            "cls_token_id": self.tokenizer.cls_token_id,
+            "sep_token_id": self.tokenizer.sep_token_id,
+            "unk_token_id": self.tokenizer.unk_token_id,
+        }
+
+    def tokenize(self, text: str, add_special_tokens: bool = True) -> List[str]:
+        return self.tokenizer.tokenize(text or "", add_special_tokens=add_special_tokens)
+
+    def encode(
+        self,
+        text: str,
+        *,
+        max_length: Optional[int] = None,
+        padding: Optional[Union[bool, str]] = None,
+        truncation: Optional[bool] = None,
+        return_tensors: Optional[str] = "pt",
+    ) -> Dict[str, Any]:
+        """Encode a single text.
+
+        Returns a dict containing `input_ids` and `attention_mask`.
+        """
+        max_length_eff = max_length or self.max_length
+        padding_eff = self.padding if padding is None else padding
+        truncation_eff = self.truncation if truncation is None else truncation
+
+        if return_tensors is None:
+            enc = self.tokenizer(
+                text or "",
+                max_length=max_length_eff,
+                padding=padding_eff,
+                truncation=truncation_eff,
+                return_attention_mask=True,
+                return_tensors=None,
+            )
+            # HuggingFace returns lists for a single example; standardize to batch-like shape.
+            return {
+                "input_ids": [enc["input_ids"]],
+                "attention_mask": [enc["attention_mask"]],
+            }
+
+        return self.tokenizer(
+            text or "",
+            max_length=max_length_eff,
+            padding=padding_eff,
+            truncation=truncation_eff,
+            return_attention_mask=True,
+            return_tensors=return_tensors,
+        )
+
+    def encode_batch(
+        self,
+        texts: List[str],
+        *,
+        max_length: Optional[int] = None,
+        padding: Optional[Union[bool, str]] = None,
+        truncation: Optional[bool] = None,
+        return_tensors: str = "pt",
+    ) -> Dict[str, Any]:
+        max_length_eff = max_length or self.max_length
+        padding_eff = self.padding if padding is None else padding
+        truncation_eff = self.truncation if truncation is None else truncation
+        return self.tokenizer(
+            [t or "" for t in texts],
+            max_length=max_length_eff,
+            padding=padding_eff,
+            truncation=truncation_eff,
+            return_attention_mask=True,
+            return_tensors=return_tensors,
+        )
+
+    def decode(self, token_ids: Union[List[int], Any], skip_special_tokens: bool = True) -> str:
+        # Avoid importing torch at module import time; handle torch tensors via duck-typing.
+        if hasattr(token_ids, "detach") and hasattr(token_ids, "cpu") and hasattr(token_ids, "tolist"):
+            token_ids = token_ids.detach().cpu().tolist()
+        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+    def get_token_info(self, token_id: int) -> Dict[str, Any]:
+        tok = self.tokenizer.convert_ids_to_tokens(int(token_id))
+        specials = set(self.tokenizer.all_special_ids)
+        return {
+            "token_id": int(token_id),
+            "token": tok,
+            "is_special": int(token_id) in specials,
+        }
+
+
+def create_tokenizer(model_name: str = "DeepPavlov/rubert-base-cased", max_length: int = 128) -> RussianTextTokenizer:
+    return RussianTextTokenizer(model_name=model_name, max_length=max_length)
+
+
+def tokenize_text_pair(
+    *,
+    title: str,
+    snippet: Optional[str],
+    tokenizer: RussianTextTokenizer,
+    max_title_len: int = 128,
+    max_snippet_len: int = 256,
+) -> Dict[str, Any]:
+    """Tokenize (title, snippet) as two independent sequences (not a single pair encoding)."""
+    title_enc = tokenizer.encode(title or "", max_length=max_title_len, return_tensors="pt")
+    out: Dict[str, Any] = {
+        "title_input_ids": title_enc["input_ids"].squeeze(0),
+        "title_attention_mask": title_enc["attention_mask"].squeeze(0),
+    }
+
+    if snippet is not None:
+        snip_enc = tokenizer.encode(snippet or "", max_length=max_snippet_len, return_tensors="pt")
+        out.update(
+            {
+                "snippet_input_ids": snip_enc["input_ids"].squeeze(0),
+                "snippet_attention_mask": snip_enc["attention_mask"].squeeze(0),
+            }
+        )
+
+    return out
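An end-to-end sketch under stated assumptions (first construction downloads DeepPavlov/rubert-base-cased; shapes follow from the "max_length" padding defaults above; sample strings are hypothetical):

    tok = create_tokenizer(max_length=64)
    enc = tok.encode("Новости дня")         # input_ids / attention_mask tensors of shape [1, 64]
    batch = tok.encode_batch(["спорт", "политика"])   # tensors of shape [2, 64]
    feats = tokenize_text_pair(title="Заголовок", snippet="Краткий текст", tokenizer=tok)
    assert feats["title_input_ids"].shape == (128,)   # max_title_len default, not tok.max_length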