Solareva Taisia committed
Commit · 090e11e
1 Parent(s): 28caba5
fix(api): add internal utils package to avoid bad imports
- scripts/make_public_snapshot.py +1 -0
- utils/__init__.py +48 -0
- utils/data_processing.py +62 -0
- utils/russian_text_utils.py +27 -0
- utils/text_processing.py +43 -0
- utils/tokenization.py +145 -0
scripts/make_public_snapshot.py CHANGED

@@ -40,6 +40,7 @@ INCLUDE_DIRS = [
     "experiments",
     "models",      # python code only; weights excluded by patterns below
     "monitoring",  # python code only; prediction logs excluded by patterns below
+    "utils",
     "nginx",
     "pages",
     "scripts",
utils/__init__.py ADDED

@@ -0,0 +1,48 @@
+"""Utility helpers used across API, training scripts, and dashboards.
+
+Important: keep this package lightweight at import time. In production, we want
+`uvicorn api.main:app` to import quickly and bind to the port; heavy deps like
+transformers/torch should only be imported when actually needed.
+
+This package also prevents ambiguous imports where `import utils` could resolve
+to an unrelated third-party PyPI package named `utils`.
+"""
+
+from __future__ import annotations
+
+from importlib import import_module
+from typing import Any
+
+__all__ = [
+    "RussianTextTokenizer",
+    "create_tokenizer",
+    "tokenize_text_pair",
+    "prepare_text_for_tokenization",
+    "normalise_text",
+    "create_vocab",
+    "process_tags",
+    "build_label_mapping",
+    "create_target_encoding",
+]
+
+_LAZY: dict[str, tuple[str, str]] = {
+    "RussianTextTokenizer": ("utils.tokenization", "RussianTextTokenizer"),
+    "create_tokenizer": ("utils.tokenization", "create_tokenizer"),
+    "tokenize_text_pair": ("utils.tokenization", "tokenize_text_pair"),
+    "prepare_text_for_tokenization": ("utils.russian_text_utils", "prepare_text_for_tokenization"),
+    "normalise_text": ("utils.text_processing", "normalise_text"),
+    "create_vocab": ("utils.text_processing", "create_vocab"),
+    "process_tags": ("utils.data_processing", "process_tags"),
+    "build_label_mapping": ("utils.data_processing", "build_label_mapping"),
+    "create_target_encoding": ("utils.data_processing", "create_target_encoding"),
+}
+
+
+def __getattr__(name: str) -> Any:
+    if name not in _LAZY:
+        raise AttributeError(f"module 'utils' has no attribute {name!r}")
+    module_name, attr_name = _LAZY[name]
+    mod = import_module(module_name)
+    return getattr(mod, attr_name)
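The package relies on module-level `__getattr__` (PEP 562), so `import utils` stays cheap and `transformers`/`torch` only load on first attribute access. A minimal usage sketch (illustrative, not part of the commit):

    import utils  # fast: nothing from _LAZY has been imported yet

    # First access of the name triggers import_module("utils.text_processing");
    # import_module caches the submodule in sys.modules afterwards.
    clean = utils.normalise_text("Привет, мир!")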
utils/data_processing.py ADDED

@@ -0,0 +1,62 @@
+"""Label/tag processing helpers for multi-label classification."""
+
+from __future__ import annotations
+
+from typing import Dict, Iterable, List, Sequence, Union
+
+import torch
+
+
+def process_tags(tags: Union[str, Sequence[str], None], sep: str = ",") -> List[str]:
+    """Convert raw tags to a list of normalized tag strings."""
+    if tags is None:
+        return []
+    if isinstance(tags, str):
+        parts = [t.strip() for t in tags.split(sep)]
+        return [p for p in parts if p]
+    # Sequence[str]
+    out: List[str] = []
+    for t in tags:
+        if t is None:
+            continue
+        s = str(t).strip()
+        if s:
+            out.append(s)
+    return out
+
+
+def build_label_mapping(
+    df,
+    *,
+    tags_col: str = "tags",
+    sep: str = ",",
+) -> Dict[str, int]:
+    """Build a tag->index mapping from a dataframe-like object.
+
+    Expects `df[tags_col]` to contain either comma-separated strings or lists.
+    """
+    tag_set = set()
+    for raw in df[tags_col].tolist():
+        tag_set.update(process_tags(raw, sep=sep))
+    return {tag: i for i, tag in enumerate(sorted(tag_set))}
+
+
+def create_target_encoding(
+    tag_lists: Iterable[Union[str, Sequence[str], None]],
+    label_to_idx: Dict[str, int],
+    *,
+    sep: str = ",",
+    dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """Create a multi-hot target tensor of shape [N, num_labels]."""
+    tag_lists = list(tag_lists)
+    num_labels = len(label_to_idx)
+    y = torch.zeros((len(tag_lists), num_labels), dtype=dtype)
+    for i, raw in enumerate(tag_lists):
+        for tag in process_tags(raw, sep=sep):
+            j = label_to_idx.get(tag)
+            if j is not None:
+                y[i, j] = 1.0
+    return y
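A quick sketch of the multi-hot path, using hypothetical tags (`create_target_encoding` accepts a string, a sequence, or None per row):

    label_to_idx = {"politics": 0, "sport": 1, "tech": 2}
    y = create_target_encoding(["sport, tech", None, ["politics"]], label_to_idx)
    # y -> tensor([[0., 1., 1.],
    #              [0., 0., 0.],    # None normalizes to an empty tag list
    #              [1., 0., 0.]])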
utils/russian_text_utils.py ADDED

@@ -0,0 +1,27 @@
+"""Russian text preprocessing helpers.
+
+Keep this module lightweight: it is imported by the FastAPI service at startup.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+_WS_RE = re.compile(r"\s+")
+
+
+def prepare_text_for_tokenization(text: Optional[str]) -> str:
+    """Prepare raw text for tokenizer input.
+
+    - Handles None safely
+    - Strips surrounding whitespace
+    - Collapses internal whitespace/newlines
+    """
+    if text is None:
+        return ""
+    # Normalize whitespace and strip.
+    s = _WS_RE.sub(" ", str(text)).strip()
+    return s
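The whitespace contract, shown on hypothetical inputs (a sketch, not repo tests):

    assert prepare_text_for_tokenization(None) == ""
    assert prepare_text_for_tokenization("  Привет\n\nмир\t!  ") == "Привет мир !"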
utils/text_processing.py ADDED

@@ -0,0 +1,43 @@
+"""Basic text normalization and vocabulary building utilities."""
+
+from __future__ import annotations
+
+import re
+from collections import Counter
+from typing import Dict
+
+# Keep letters (Latin + Cyrillic), digits, and whitespace.
+_CLEAN_RE = re.compile(r"[^0-9a-zA-Z\u0400-\u04FF\s]+", flags=re.UNICODE)
+_WS_RE = re.compile(r"\s+")
+
+
+def normalise_text(text: str) -> str:
+    """Lowercase, remove punctuation/special chars, and collapse whitespace."""
+    s = (text or "").lower()
+    s = _CLEAN_RE.sub(" ", s)
+    s = _WS_RE.sub(" ", s).strip()
+    return s
+
+
+def create_vocab(text: str, vocab_size: int = 50000) -> Dict[str, int]:
+    """Create a simple frequency-based vocabulary mapping.
+
+    Always includes:
+    - #PAD# -> 0
+    - #UNKN# -> 1
+    """
+    vocab: Dict[str, int] = {"#PAD#": 0, "#UNKN#": 1}
+    if vocab_size <= 0:
+        return vocab
+
+    tokens = normalise_text(text).split()
+    counts = Counter(tokens)
+
+    for word, _ in counts.most_common(max(0, vocab_size)):
+        if word in vocab:
+            continue
+        vocab[word] = len(vocab)
+
+    return vocab
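A toy sketch of `create_vocab` ordering (hypothetical corpus; entries rank by frequency after the two reserved ids):

    vocab = create_vocab("Привет, мир! Привет...", vocab_size=10)
    # normalise_text lowercases and strips punctuation, so counts are
    # {"привет": 2, "мир": 1}, giving:
    # {"#PAD#": 0, "#UNKN#": 1, "привет": 2, "мир": 3}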
utils/tokenization.py ADDED

@@ -0,0 +1,145 @@
+"""Tokenization utilities used for transformer models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+from transformers import AutoTokenizer
+
+
+@dataclass
+class RussianTextTokenizer:
+    """Thin wrapper around a HuggingFace tokenizer with sane defaults."""
+
+    model_name: str = "DeepPavlov/rubert-base-cased"
+    max_length: int = 128
+    padding: Union[bool, str] = "max_length"
+    truncation: bool = True
+
+    def __post_init__(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
+
+    def get_vocab_size(self) -> int:
+        return int(getattr(self.tokenizer, "vocab_size", len(self.tokenizer.get_vocab())))
+
+    def get_special_tokens(self) -> Dict[str, Optional[int]]:
+        return {
+            "pad_token_id": self.tokenizer.pad_token_id,
+            "cls_token_id": self.tokenizer.cls_token_id,
+            "sep_token_id": self.tokenizer.sep_token_id,
+            "unk_token_id": self.tokenizer.unk_token_id,
+        }
+
+    def tokenize(self, text: str, add_special_tokens: bool = True) -> List[str]:
+        return self.tokenizer.tokenize(text or "", add_special_tokens=add_special_tokens)
+
+    def encode(
+        self,
+        text: str,
+        *,
+        max_length: Optional[int] = None,
+        padding: Optional[Union[bool, str]] = None,
+        truncation: Optional[bool] = None,
+        return_tensors: Optional[str] = "pt",
+    ) -> Dict[str, Any]:
+        """Encode a single text.
+
+        Returns a dict containing `input_ids` and `attention_mask`.
+        """
+        max_length_eff = max_length or self.max_length
+        padding_eff = self.padding if padding is None else padding
+        truncation_eff = self.truncation if truncation is None else truncation
+
+        if return_tensors is None:
+            enc = self.tokenizer(
+                text or "",
+                max_length=max_length_eff,
+                padding=padding_eff,
+                truncation=truncation_eff,
+                return_attention_mask=True,
+                return_tensors=None,
+            )
+            # HuggingFace returns lists for a single example; standardize to batch-like shape.
+            return {
+                "input_ids": [enc["input_ids"]],
+                "attention_mask": [enc["attention_mask"]],
+            }
+
+        return self.tokenizer(
+            text or "",
+            max_length=max_length_eff,
+            padding=padding_eff,
+            truncation=truncation_eff,
+            return_attention_mask=True,
+            return_tensors=return_tensors,
+        )
+
+    def encode_batch(
+        self,
+        texts: List[str],
+        *,
+        max_length: Optional[int] = None,
+        padding: Optional[Union[bool, str]] = None,
+        truncation: Optional[bool] = None,
+        return_tensors: str = "pt",
+    ) -> Dict[str, Any]:
+        max_length_eff = max_length or self.max_length
+        padding_eff = self.padding if padding is None else padding
+        truncation_eff = self.truncation if truncation is None else truncation
+        return self.tokenizer(
+            [t or "" for t in texts],
+            max_length=max_length_eff,
+            padding=padding_eff,
+            truncation=truncation_eff,
+            return_attention_mask=True,
+            return_tensors=return_tensors,
+        )
+
+    def decode(self, token_ids: Union[List[int], Any], skip_special_tokens: bool = True) -> str:
+        # Avoid importing torch at module import time; handle torch tensors via duck-typing.
+        if hasattr(token_ids, "detach") and hasattr(token_ids, "cpu") and hasattr(token_ids, "tolist"):
+            token_ids = token_ids.detach().cpu().tolist()
+        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+    def get_token_info(self, token_id: int) -> Dict[str, Any]:
+        tok = self.tokenizer.convert_ids_to_tokens(int(token_id))
+        specials = set(self.tokenizer.all_special_ids)
+        return {
+            "token_id": int(token_id),
+            "token": tok,
+            "is_special": int(token_id) in specials,
+        }
+
+
+def create_tokenizer(model_name: str = "DeepPavlov/rubert-base-cased", max_length: int = 128) -> RussianTextTokenizer:
+    return RussianTextTokenizer(model_name=model_name, max_length=max_length)
+
+
+def tokenize_text_pair(
+    *,
+    title: str,
+    snippet: Optional[str],
+    tokenizer: RussianTextTokenizer,
+    max_title_len: int = 128,
+    max_snippet_len: int = 256,
+) -> Dict[str, Any]:
+    """Tokenize (title, snippet) as two independent sequences (not a single pair encoding)."""
+    title_enc = tokenizer.encode(title or "", max_length=max_title_len, return_tensors="pt")
+    out: Dict[str, Any] = {
+        "title_input_ids": title_enc["input_ids"].squeeze(0),
+        "title_attention_mask": title_enc["attention_mask"].squeeze(0),
+    }
+
+    if snippet is not None:
+        snip_enc = tokenizer.encode(snippet or "", max_length=max_snippet_len, return_tensors="pt")
+        out.update(
+            {
+                "snippet_input_ids": snip_enc["input_ids"].squeeze(0),
+                "snippet_attention_mask": snip_enc["attention_mask"].squeeze(0),
+            }
+        )
+
+    return out
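An end-to-end sketch under stated assumptions (first construction downloads DeepPavlov/rubert-base-cased; shapes follow from the "max_length" padding defaults above; sample strings are hypothetical):

    tok = create_tokenizer(max_length=64)
    enc = tok.encode("Новости дня")         # input_ids / attention_mask tensors of shape [1, 64]
    batch = tok.encode_batch(["спорт", "политика"])   # tensors of shape [2, 64]
    feats = tokenize_text_pair(title="Заголовок", snippet="Краткий текст", tokenizer=tok)
    assert feats["title_input_ids"].shape == (128,)   # max_title_len default, not tok.max_length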