| """Datu papildināšana (augmentation).""" |
|
|
| from __future__ import annotations |
|
|
| import math |
| import random |
| import re |
|
|
# Matches a single word: a run of word characters, apostrophes and hyphens
# that begins and ends on a word boundary (so "don't" and "well-known" each
# count as one word).
WORD_RE = re.compile(r"\b[\w'-]+\b", flags=re.UNICODE)

# Built-in synonym table. Keys are lowercase words (Latvian and English
# entries side by side); each value is a tuple of interchangeable
# alternatives from which one is picked at random.
SYNONYM_MAP: dict[str, tuple[str, ...]] = {
    "ātrs": ("žigls", "straujš"),
    "attēls": ("bilde", "vizualizācija"),
    "bilde": ("attēls",),
    "big": ("large", "sizable"),
    "code": ("program", "source"),
    "create": ("build", "make"),
    "fast": ("quick", "rapid"),
    "generate": ("produce", "create"),
    "good": ("solid", "reliable"),
    "idea": ("concept", "approach"),
    "image": ("picture", "visual"),
    "intelligent": ("smart", "capable"),
    "liels": ("apjomīgs", "ievērojams"),
    "mazs": ("neliels", "kompakts"),
    "prompt": ("instruction", "request"),
    "quick": ("fast", "rapid"),
    "small": ("compact", "lightweight"),
    "smart": ("capable", "intelligent"),
    "strong": ("robust", "powerful"),
    "text": ("content", "message"),
}

# Per-language paraphrase table used by the offline back-translation
# fallback: language code -> {source word: replacement}.
BACK_TRANSLATION_MAP: dict[str, dict[str, str]] = {
    "en": {
        "because": "since",
        "create": "build",
        "fast": "quick",
        "good": "solid",
        "help": "assist",
        "plan": "outline",
    },
    "lv": {
        "ātrs": "žigls",
        "izveidot": "radīt",
        "labs": "stabils",
        "palīdzēt": "atbalstīt",
        "plāns": "ieceres plāns",
    },
}
|
|
|
|
| def _match_case(source: str, replacement: str) -> str: |
| if source.isupper(): |
| return replacement.upper() |
| if source[:1].isupper(): |
| return replacement.capitalize() |
| return replacement |
|
|
|
|
def _lookup_synonym(word: str) -> str | None:
    """Pick a random synonym for *word*, recased to match it.

    The lookup key is ``word.casefold()``. Returns ``None`` when
    ``SYNONYM_MAP`` has no (non-empty) entry for that key; otherwise one
    entry is drawn with ``random.choice`` and passed through
    ``_match_case`` so it follows the casing of *word*.
    """
    options = SYNONYM_MAP.get(word.casefold())
    if options:
        return _match_case(word, random.choice(options))
    return None
|
|
|
|
def random_synonym_swap(text: str, rate: float = 0.1) -> str:
    """Replace a share of the words in *text* with built-in synonyms.

    Works purely from ``SYNONYM_MAP``; no external dependencies.

    Args:
        text: Input text. Blank / whitespace-only text is returned as is.
        rate: Fraction of all words (not only the replaceable ones) to swap.
            Values above 1.0 are clamped to 1.0; values ``<= 0`` disable
            swapping.

    Returns:
        The text with the sampled words replaced, everything between words
        left untouched. When swapping is enabled and at least one word has
        a synonym, at least one word is replaced. The original text is
        returned when there is nothing to replace.
    """
    if not text.strip():
        return text

    matches = list(WORD_RE.finditer(text))
    if not matches:
        return text

    # A synonym is drawn for every word that has one, in text order, before
    # sampling; this fixes how the random generator is consumed.
    swappable: list[tuple[int, int, str]] = []
    for found in matches:
        substitute = _lookup_synonym(found.group(0))
        if substitute is not None:
            swappable.append((found.start(), found.end(), substitute))
    if not swappable or rate <= 0:
        return text

    # Quota is based on the total word count, floored at one swap and capped
    # at the number of words that actually have a synonym.
    quota = math.ceil(len(matches) * min(rate, 1.0))
    quota = min(len(swappable), max(1, quota))
    chosen = random.sample(swappable, quota)

    # Spans never overlap and have distinct starts, so sorting the sampled
    # tuples restores left-to-right text order.
    pieces: list[str] = []
    position = 0
    for begin, finish, substitute in sorted(chosen):
        pieces.append(text[position:begin])
        pieces.append(substitute)
        position = finish
    pieces.append(text[position:])
    return "".join(pieces)
|
|
|
|
def back_translate(text: str, lang: str = "en") -> str:
    """Paraphrase *text* offline as a stand-in for real back-translation.

    Whole-word, case-insensitive occurrences of the keys in
    ``BACK_TRANSLATION_MAP[lang]`` are replaced by their mapped values,
    recased via ``_match_case`` to follow the matched word.

    All substitutions are applied in a single regex pass over the original
    text. A replacement is therefore never rewritten by another rule, and
    the result does not depend on the order of the table's entries.

    Args:
        text: Input text. Blank / whitespace-only text is returned as is.
        lang: Language code; stripped and lowercased before the table
            lookup. An unknown code means no table substitutions.

    Returns:
        The substituted text if any substitution changed it; otherwise the
        result of ``random_synonym_swap(text, rate=0.2)``.
    """
    if not text.strip():
        return text

    replacements = BACK_TRANSLATION_MAP.get(lang.strip().lower(), {})
    augmented = text
    if replacements:
        targets = list(replacements.values())
        # One capture group per source word: ``match.lastindex`` then tells
        # which rule fired without re-normalising the matched text's case.
        alternation = "|".join(f"({re.escape(source)})" for source in replacements)
        augmented = re.sub(
            rf"\b(?:{alternation})\b",
            lambda match: _match_case(match.group(0), targets[match.lastindex - 1]),
            text,
            flags=re.IGNORECASE,
        )

    if augmented == text:
        # Nothing in the table applied; fall back to a light synonym swap.
        return random_synonym_swap(text, rate=0.2)
    return augmented
|
|