"""Datu papildināšana (augmentation).""" from __future__ import annotations import math import random import re WORD_RE = re.compile(r"\b[\w'-]+\b", re.UNICODE) SYNONYM_MAP: dict[str, tuple[str, ...]] = { "ātrs": ("žigls", "straujš"), "attēls": ("bilde", "vizualizācija"), "bilde": ("attēls",), "big": ("large", "sizable"), "code": ("program", "source"), "create": ("build", "make"), "fast": ("quick", "rapid"), "generate": ("produce", "create"), "good": ("solid", "reliable"), "idea": ("concept", "approach"), "image": ("picture", "visual"), "intelligent": ("smart", "capable"), "liels": ("apjomīgs", "ievērojams"), "mazs": ("neliels", "kompakts"), "prompt": ("instruction", "request"), "quick": ("fast", "rapid"), "small": ("compact", "lightweight"), "smart": ("capable", "intelligent"), "strong": ("robust", "powerful"), "text": ("content", "message"), } BACK_TRANSLATION_MAP: dict[str, dict[str, str]] = { "en": { "because": "since", "create": "build", "fast": "quick", "good": "solid", "help": "assist", "plan": "outline", }, "lv": { "ātrs": "žigls", "izveidot": "radīt", "labs": "stabils", "palīdzēt": "atbalstīt", "plāns": "ieceres plāns", }, } def _match_case(source: str, replacement: str) -> str: if source.isupper(): return replacement.upper() if source[:1].isupper(): return replacement.capitalize() return replacement def _lookup_synonym(word: str) -> str | None: synonyms = SYNONYM_MAP.get(word.casefold()) if not synonyms: return None return _match_case(word, random.choice(synonyms)) def random_synonym_swap(text: str, rate: float = 0.1) -> str: """Aizstāj daļu vārdu ar iebūvētiem sinonīmiem bez ārējām atkarībām.""" if not text.strip(): return text words = list(WORD_RE.finditer(text)) if not words: return text candidates = [ (match.start(), match.end(), synonym) for match in words if (synonym := _lookup_synonym(match.group(0))) is not None ] if not candidates or rate <= 0: return text sample_size = min(len(candidates), max(1, math.ceil(len(words) * min(rate, 1.0)))) selected = { (start, end): replacement for start, end, replacement in random.sample(candidates, sample_size) } parts: list[str] = [] cursor = 0 for match in words: span = (match.start(), match.end()) replacement = selected.get(span) if replacement is None: continue parts.append(text[cursor : match.start()]) parts.append(replacement) cursor = match.end() if cursor == 0: return text parts.append(text[cursor:]) return "".join(parts) def back_translate(text: str, lang: str = "en") -> str: """Viegls offline paraphrase fallback vietā, kur nav tulkošanas API.""" if not text.strip(): return text replacements = BACK_TRANSLATION_MAP.get(lang.strip().lower(), {}) augmented = text for source, target in replacements.items(): augmented = re.sub( rf"\b{re.escape(source)}\b", lambda match, replacement=target: _match_case(match.group(0), replacement), augmented, flags=re.IGNORECASE, ) if augmented == text: return random_synonym_swap(text, rate=0.2) return augmented