# MarisUK's picture
# Maris AI model sync
# f440f03 verified
"""Datu papildināšana (augmentation)."""
from __future__ import annotations
import math
import random
import re
# Matches one word: a run of word characters, apostrophes, and hyphens that
# starts and ends on a word boundary (so it cannot begin or end with a
# dangling apostrophe or hyphen).
WORD_RE = re.compile(r"\b[\w'-]+\b", re.UNICODE)
# Built-in synonym table with mixed Latvian and English entries.
# Keys are lower-case words; each value is a non-empty tuple of alternatives
# from which one is picked at random.
SYNONYM_MAP: dict[str, tuple[str, ...]] = {
    "ātrs": ("žigls", "straujš"),
    "attēls": ("bilde", "vizualizācija"),
    "bilde": ("attēls",),
    "big": ("large", "sizable"),
    "code": ("program", "source"),
    "create": ("build", "make"),
    "fast": ("quick", "rapid"),
    "generate": ("produce", "create"),
    "good": ("solid", "reliable"),
    "idea": ("concept", "approach"),
    "image": ("picture", "visual"),
    "intelligent": ("smart", "capable"),
    "liels": ("apjomīgs", "ievērojams"),
    "mazs": ("neliels", "kompakts"),
    "prompt": ("instruction", "request"),
    "quick": ("fast", "rapid"),
    "small": ("compact", "lightweight"),
    "smart": ("capable", "intelligent"),
    "strong": ("robust", "powerful"),
    "text": ("content", "message"),
}
# Per-language paraphrase table: language code -> {source word: replacement}.
# Entry order is preserved because replacements are applied in table order.
BACK_TRANSLATION_MAP: dict[str, dict[str, str]] = {
    # English
    "en": {
        "because": "since",
        "create": "build",
        "fast": "quick",
        "good": "solid",
        "help": "assist",
        "plan": "outline",
    },
    # Latvian
    "lv": {
        "ātrs": "žigls",
        "izveidot": "radīt",
        "labs": "stabils",
        "palīdzēt": "atbalstīt",
        "plāns": "ieceres plāns",
    },
}
def _match_case(source: str, replacement: str) -> str:
if source.isupper():
return replacement.upper()
if source[:1].isupper():
return replacement.capitalize()
return replacement
def _lookup_synonym(word: str) -> str | None:
    """Pick a random built-in synonym for *word*, mirroring the word's casing.

    The table is queried with the case-folded word. Returns ``None`` when
    ``SYNONYM_MAP`` has no entry (or an empty entry) for it.
    """
    options = SYNONYM_MAP.get(word.casefold())
    return _match_case(word, random.choice(options)) if options else None
def random_synonym_swap(text: str, rate: float = 0.1) -> str:
    """Replace a share of the words in *text* with built-in synonyms.

    Only the in-module synonym table is used, so no external dependencies
    are required.

    Args:
        text: Input text. Returned unchanged when empty or whitespace-only.
        rate: Target fraction of the words in ``text`` to replace. Values
            above 1.0 are treated as 1.0; zero, negative, or NaN values
            disable replacement.

    Returns:
        ``text`` with the chosen words swapped for synonyms and everything
        between the words preserved, or ``text`` itself when replacement is
        disabled or no word has a synonym. When replacement is enabled and
        at least one word has a synonym, at least one word is replaced.
    """
    if not text.strip():
        return text
    # Decide this before any synonym is drawn so a disabled call does not
    # consume random state. NaN is rejected here because math.ceil(nan)
    # would raise further down.
    if math.isnan(rate) or rate <= 0:
        return text

    words = list(WORD_RE.finditer(text))
    candidates = [
        (match.start(), match.end(), synonym)
        for match in words
        if (synonym := _lookup_synonym(match.group(0))) is not None
    ]
    if not candidates:
        return text

    # The quota is derived from the total word count (not the number of
    # replaceable words), floored at one and capped by what is replaceable.
    quota = max(1, math.ceil(len(words) * min(rate, 1.0)))
    sample_size = min(len(candidates), quota)
    # Word spans never overlap, so their start offsets are unique and
    # sorting the sampled tuples restores document order.
    chosen = sorted(random.sample(candidates, sample_size))

    parts: list[str] = []
    cursor = 0
    for start, end, replacement in chosen:
        parts.append(text[cursor:start])
        parts.append(replacement)
        cursor = end
    parts.append(text[cursor:])
    return "".join(parts)
def back_translate(text: str, lang: str = "en") -> str:
    """Lightweight offline paraphrase fallback for when no translation API exists.

    Whole-word occurrences of the entries in ``BACK_TRANSLATION_MAP[lang]``
    are replaced case-insensitively, and each replacement is re-cased to
    match the word it replaces. If that leaves the text unchanged (unknown
    language or no matching word), the result of
    ``random_synonym_swap(text, rate=0.2)`` is returned instead.

    Args:
        text: Input text. Returned unchanged when empty or whitespace-only.
        lang: Language code; surrounding whitespace and letter case are
            ignored when looking up the replacement table.

    Returns:
        The paraphrased text.
    """
    if not text.strip():
        return text

    replacements = BACK_TRANSLATION_MAP.get(lang.strip().lower(), {})
    if replacements:
        targets = tuple(replacements.values())
        # One alternation with a capture group per source word, applied in a
        # single pass: text inserted by one replacement is never re-scanned,
        # so a target that contains another source word cannot cascade.
        alternatives = "|".join(f"({re.escape(source)})" for source in replacements)
        pattern = re.compile(rf"\b(?:{alternatives})\b", flags=re.IGNORECASE)
        augmented = pattern.sub(
            # lastindex is the 1-based number of the group that matched,
            # i.e. the position of the source word in the table.
            lambda match: _match_case(match.group(0), targets[match.lastindex - 1]),
            text,
        )
        if augmented != text:
            return augmented

    return random_synonym_swap(text, rate=0.2)