Solareva Taisia committed
Commit 090e11e · 1 Parent(s): 28caba5

fix(api): add internal utils package to avoid bad imports

scripts/make_public_snapshot.py CHANGED
@@ -40,6 +40,7 @@ INCLUDE_DIRS = [
  "experiments",
  "models", # python code only; weights excluded by patterns below
  "monitoring", # python code only; prediction logs excluded by patterns below
+ "utils",
  "nginx",
  "pages",
  "scripts",
utils/__init__.py ADDED
@@ -0,0 +1,48 @@
+"""Utility helpers used across API, training scripts, and dashboards.
+
+Important: keep this package lightweight at import time. In production, we want
+`uvicorn api.main:app` to import quickly and bind to the port; heavy deps like
+transformers/torch should only be imported when actually needed.
+
+This package also prevents ambiguous imports where `import utils` could resolve
+to an unrelated third-party PyPI package named `utils`.
+"""
+
+from __future__ import annotations
+
+from importlib import import_module
+from typing import Any
+
+__all__ = [
+    "RussianTextTokenizer",
+    "create_tokenizer",
+    "tokenize_text_pair",
+    "prepare_text_for_tokenization",
+    "normalise_text",
+    "create_vocab",
+    "process_tags",
+    "build_label_mapping",
+    "create_target_encoding",
+]
+
+_LAZY: dict[str, tuple[str, str]] = {
+    "RussianTextTokenizer": ("utils.tokenization", "RussianTextTokenizer"),
+    "create_tokenizer": ("utils.tokenization", "create_tokenizer"),
+    "tokenize_text_pair": ("utils.tokenization", "tokenize_text_pair"),
+    "prepare_text_for_tokenization": ("utils.russian_text_utils", "prepare_text_for_tokenization"),
+    "normalise_text": ("utils.text_processing", "normalise_text"),
+    "create_vocab": ("utils.text_processing", "create_vocab"),
+    "process_tags": ("utils.data_processing", "process_tags"),
+    "build_label_mapping": ("utils.data_processing", "build_label_mapping"),
+    "create_target_encoding": ("utils.data_processing", "create_target_encoding"),
+}
+
+
+def __getattr__(name: str) -> Any:
+    if name not in _LAZY:
+        raise AttributeError(f"module 'utils' has no attribute {name!r}")
+    module_name, attr_name = _LAZY[name]
+    mod = import_module(module_name)
+    return getattr(mod, attr_name)
+
+
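The lazy table above means `import utils` stays cheap and heavy submodules are loaded only on first attribute access. A minimal sketch of the intended behaviour (illustrative only; it assumes the repository root is on `sys.path`):

import sys

import utils  # runs only utils/__init__.py; no torch/transformers import yet

assert "utils.text_processing" not in sys.modules

# First attribute access goes through __getattr__, which imports utils.text_processing.
cleaned = utils.normalise_text("Привет,   МИР!!!")
assert "utils.text_processing" in sys.modules
print(cleaned)  # -> "привет мир"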
utils/data_processing.py ADDED
@@ -0,0 +1,62 @@
+"""Label/tag processing helpers for multi-label classification."""
+
+from __future__ import annotations
+
+from typing import Dict, Iterable, List, Sequence, Tuple, Union
+
+import torch
+
+
+def process_tags(tags: Union[str, Sequence[str], None], sep: str = ",") -> List[str]:
+    """Convert raw tags to a list of normalized tag strings."""
+    if tags is None:
+        return []
+    if isinstance(tags, str):
+        parts = [t.strip() for t in tags.split(sep)]
+        return [p for p in parts if p]
+    # Sequence[str]
+    out: List[str] = []
+    for t in tags:
+        if t is None:
+            continue
+        s = str(t).strip()
+        if s:
+            out.append(s)
+    return out
+
+
+def build_label_mapping(
+    df,
+    *,
+    tags_col: str = "tags",
+    sep: str = ",",
+) -> Dict[str, int]:
+    """Build a tag->index mapping from a dataframe-like object.
+
+    Expects `df[tags_col]` to contain either comma-separated strings or lists.
+    """
+    tag_set = set()
+    for raw in df[tags_col].tolist():
+        tag_set.update(process_tags(raw, sep=sep))
+    return {tag: i for i, tag in enumerate(sorted(tag_set))}
+
+
+def create_target_encoding(
+    tag_lists: Iterable[Union[str, Sequence[str], None]],
+    label_to_idx: Dict[str, int],
+    *,
+    sep: str = ",",
+    dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """Create a multi-hot target tensor of shape [N, num_labels]."""
+    tag_lists = list(tag_lists)
+    num_labels = len(label_to_idx)
+    y = torch.zeros((len(tag_lists), num_labels), dtype=dtype)
+    for i, raw in enumerate(tag_lists):
+        for tag in process_tags(raw, sep=sep):
+            j = label_to_idx.get(tag)
+            if j is not None:
+                y[i, j] = 1.0
+    return y
+
+
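For illustration, a small end-to-end sketch of the three helpers above. The tag values are made up, and a tiny stand-in object replaces the pandas-style dataframe, since `build_label_mapping` only needs `df[tags_col].tolist()`:

from utils.data_processing import build_label_mapping, create_target_encoding, process_tags

class _Col(list):
    # Minimal stand-in for a pandas Series: only .tolist() is required here.
    def tolist(self):
        return list(self)

raw_tags = ["politics, economy", ["sport", " economy "], None]
df = {"tags": _Col(raw_tags)}  # stand-in for a dataframe with a "tags" column

print(process_tags(raw_tags[0]))        # ['politics', 'economy']
label_to_idx = build_label_mapping(df)  # {'economy': 0, 'politics': 1, 'sport': 2}
y = create_target_encoding(raw_tags, label_to_idx)
print(y.tolist())
# [[1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0]]  -- unknown tags and None rows stay all-zero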
utils/russian_text_utils.py ADDED
@@ -0,0 +1,27 @@
+"""Russian text preprocessing helpers.
+
+Keep this module lightweight: it is imported by the FastAPI service at startup.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+_WS_RE = re.compile(r"\s+")
+
+
+def prepare_text_for_tokenization(text: Optional[str]) -> str:
+    """Prepare raw text for tokenizer input.
+
+    - Handles None safely
+    - Strips surrounding whitespace
+    - Collapses internal whitespace/newlines
+    """
+    if text is None:
+        return ""
+    # Normalize whitespace and strip.
+    s = _WS_RE.sub(" ", str(text)).strip()
+    return s
+
+
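A quick illustrative call (the input string is made up). Unlike `normalise_text` further below, this keeps case and punctuation and only normalizes whitespace:

from utils.russian_text_utils import prepare_text_for_tokenization

raw = "  Новости\n\nдня:   курс    валют  "
print(repr(prepare_text_for_tokenization(raw)))   # 'Новости дня: курс валют'
print(repr(prepare_text_for_tokenization(None)))  # ''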
utils/text_processing.py ADDED
@@ -0,0 +1,43 @@
+"""Basic text normalization and vocabulary building utilities."""
+
+from __future__ import annotations
+
+import re
+from collections import Counter
+from typing import Dict
+
+# Keep letters (Latin + Cyrillic), digits, and whitespace.
+_CLEAN_RE = re.compile(r"[^0-9a-zA-Z\u0400-\u04FF\s]+", flags=re.UNICODE)
+_WS_RE = re.compile(r"\s+")
+
+
+def normalise_text(text: str) -> str:
+    """Lowercase, remove punctuation/special chars, and collapse whitespace."""
+    s = (text or "").lower()
+    s = _CLEAN_RE.sub(" ", s)
+    s = _WS_RE.sub(" ", s).strip()
+    return s
+
+
+def create_vocab(text: str, vocab_size: int = 50000) -> Dict[str, int]:
+    """Create a simple frequency-based vocabulary mapping.
+
+    Always includes:
+    - #PAD# -> 0
+    - #UNKN# -> 1
+    """
+    vocab: Dict[str, int] = {"#PAD#": 0, "#UNKN#": 1}
+    if vocab_size <= 0:
+        return vocab
+
+    tokens = normalise_text(text).split()
+    counts = Counter(tokens)
+
+    for word, _ in counts.most_common(max(0, vocab_size)):
+        if word in vocab:
+            continue
+        vocab[word] = len(vocab)
+
+    return vocab
+
+
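A usage sketch with made-up strings. Note that `#PAD#`/`#UNKN#` are added on top of the `vocab_size` most frequent words, so the returned mapping can hold up to `vocab_size + 2` keys:

from utils.text_processing import create_vocab, normalise_text

print(normalise_text("Курс доллара ВЫРОС!!!  Курс евро упал."))
# 'курс доллара вырос курс евро упал'

vocab = create_vocab("курс курс курс доллара доллара евро", vocab_size=2)
print(vocab)  # {'#PAD#': 0, '#UNKN#': 1, 'курс': 2, 'доллара': 3}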
utils/tokenization.py ADDED
@@ -0,0 +1,145 @@
+"""Tokenization utilities used for transformer models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+from transformers import AutoTokenizer
+
+
+@dataclass
+class RussianTextTokenizer:
+    """Thin wrapper around a HuggingFace tokenizer with sane defaults."""
+
+    model_name: str = "DeepPavlov/rubert-base-cased"
+    max_length: int = 128
+    padding: Union[bool, str] = "max_length"
+    truncation: bool = True
+
+    def __post_init__(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
+
+    def get_vocab_size(self) -> int:
+        return int(getattr(self.tokenizer, "vocab_size", len(self.tokenizer.get_vocab())))
+
+    def get_special_tokens(self) -> Dict[str, Optional[int]]:
+        return {
+            "pad_token_id": self.tokenizer.pad_token_id,
+            "cls_token_id": self.tokenizer.cls_token_id,
+            "sep_token_id": self.tokenizer.sep_token_id,
+            "unk_token_id": self.tokenizer.unk_token_id,
+        }
+
+    def tokenize(self, text: str, add_special_tokens: bool = True) -> List[str]:
+        return self.tokenizer.tokenize(text or "", add_special_tokens=add_special_tokens)
+
+    def encode(
+        self,
+        text: str,
+        *,
+        max_length: Optional[int] = None,
+        padding: Optional[Union[bool, str]] = None,
+        truncation: Optional[bool] = None,
+        return_tensors: Optional[str] = "pt",
+    ) -> Dict[str, Any]:
+        """Encode a single text.
+
+        Returns a dict containing `input_ids` and `attention_mask`.
+        """
+        max_length_eff = max_length or self.max_length
+        padding_eff = self.padding if padding is None else padding
+        truncation_eff = self.truncation if truncation is None else truncation
+
+        if return_tensors is None:
+            enc = self.tokenizer(
+                text or "",
+                max_length=max_length_eff,
+                padding=padding_eff,
+                truncation=truncation_eff,
+                return_attention_mask=True,
+                return_tensors=None,
+            )
+            # HuggingFace returns lists for a single example; standardize to batch-like shape.
+            return {
+                "input_ids": [enc["input_ids"]],
+                "attention_mask": [enc["attention_mask"]],
+            }
+
+        return self.tokenizer(
+            text or "",
+            max_length=max_length_eff,
+            padding=padding_eff,
+            truncation=truncation_eff,
+            return_attention_mask=True,
+            return_tensors=return_tensors,
+        )
+
+    def encode_batch(
+        self,
+        texts: List[str],
+        *,
+        max_length: Optional[int] = None,
+        padding: Optional[Union[bool, str]] = None,
+        truncation: Optional[bool] = None,
+        return_tensors: str = "pt",
+    ) -> Dict[str, Any]:
+        max_length_eff = max_length or self.max_length
+        padding_eff = self.padding if padding is None else padding
+        truncation_eff = self.truncation if truncation is None else truncation
+        return self.tokenizer(
+            [t or "" for t in texts],
+            max_length=max_length_eff,
+            padding=padding_eff,
+            truncation=truncation_eff,
+            return_attention_mask=True,
+            return_tensors=return_tensors,
+        )
+
+    def decode(self, token_ids: Union[List[int], Any], skip_special_tokens: bool = True) -> str:
+        # Avoid importing torch at module import time; handle torch tensors via duck-typing.
+        if hasattr(token_ids, "detach") and hasattr(token_ids, "cpu") and hasattr(token_ids, "tolist"):
+            token_ids = token_ids.detach().cpu().tolist()
+        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
+    def get_token_info(self, token_id: int) -> Dict[str, Any]:
+        tok = self.tokenizer.convert_ids_to_tokens(int(token_id))
+        specials = set(self.tokenizer.all_special_ids)
+        return {
+            "token_id": int(token_id),
+            "token": tok,
+            "is_special": int(token_id) in specials,
+        }
+
+
+def create_tokenizer(model_name: str = "DeepPavlov/rubert-base-cased", max_length: int = 128) -> RussianTextTokenizer:
+    return RussianTextTokenizer(model_name=model_name, max_length=max_length)
+
+
+def tokenize_text_pair(
+    *,
+    title: str,
+    snippet: Optional[str],
+    tokenizer: RussianTextTokenizer,
+    max_title_len: int = 128,
+    max_snippet_len: int = 256,
+) -> Dict[str, Any]:
+    """Tokenize (title, snippet) as two independent sequences (not a single pair encoding)."""
+    title_enc = tokenizer.encode(title or "", max_length=max_title_len, return_tensors="pt")
+    out: Dict[str, Any] = {
+        "title_input_ids": title_enc["input_ids"].squeeze(0),
+        "title_attention_mask": title_enc["attention_mask"].squeeze(0),
+    }
+
+    if snippet is not None:
+        snip_enc = tokenizer.encode(snippet or "", max_length=max_snippet_len, return_tensors="pt")
+        out.update(
+            {
+                "snippet_input_ids": snip_enc["input_ids"].squeeze(0),
+                "snippet_attention_mask": snip_enc["attention_mask"].squeeze(0),
+            }
+        )
+
+    return out
+
+
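A usage sketch for the wrapper, assuming `transformers` (plus `torch` for the "pt" tensors) is installed and the DeepPavlov/rubert-base-cased tokenizer can be downloaded or is already cached; the Russian strings and length limits are illustrative:

from utils.tokenization import create_tokenizer, tokenize_text_pair

tok = create_tokenizer()  # DeepPavlov/rubert-base-cased, max_length=128

enc = tok.encode("Курс доллара вырос")  # padded/truncated to max_length, returned as torch tensors
print(enc["input_ids"].shape)           # torch.Size([1, 128])

batch = tok.encode_batch(["Первый текст", "Второй текст"])
print(batch["input_ids"].shape)         # torch.Size([2, 128])

features = tokenize_text_pair(
    title="Курс доллара вырос",
    snippet="ЦБ объяснил причины роста курса",
    tokenizer=tok,
    max_title_len=64,
    max_snippet_len=128,
)
print(features["title_input_ids"].shape)    # torch.Size([64])
print(features["snippet_input_ids"].shape)  # torch.Size([128])

Because `encode` returns batch-shaped tensors of shape [1, seq_len], `tokenize_text_pair` squeezes the leading dimension so each field is a flat per-example tensor.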