File size: 3,539 Bytes

f440f03

"""Datu papildināšana (augmentation)."""

from __future__ import annotations

import math
import random
import re

# Matches one word token: a run of word characters, apostrophes and hyphens
# delimited by word boundaries. re.UNICODE is already the default for str
# patterns in Python 3; it is spelled out here for explicitness.
WORD_RE = re.compile(r"\b[\w'-]+\b", re.UNICODE)
# Lower-case word -> tuple of candidate synonyms (Latvian and English entries
# share one table). _lookup_synonym() queries it with the case-folded word and
# picks one candidate at random.
SYNONYM_MAP: dict[str, tuple[str, ...]] = {
    "ātrs": ("žigls", "straujš"),
    "attēls": ("bilde", "vizualizācija"),
    "bilde": ("attēls",),
    "big": ("large", "sizable"),
    "code": ("program", "source"),
    "create": ("build", "make"),
    "fast": ("quick", "rapid"),
    "generate": ("produce", "create"),
    "good": ("solid", "reliable"),
    "idea": ("concept", "approach"),
    "image": ("picture", "visual"),
    "intelligent": ("smart", "capable"),
    "liels": ("apjomīgs", "ievērojams"),
    "mazs": ("neliels", "kompakts"),
    "prompt": ("instruction", "request"),
    "quick": ("fast", "rapid"),
    "small": ("compact", "lightweight"),
    "smart": ("capable", "intelligent"),
    "strong": ("robust", "powerful"),
    "text": ("content", "message"),
}
# Language code -> {source word: replacement}. back_translate() selects the
# inner table by language code and applies its entries one after another in
# insertion order, matching whole words case-insensitively, so the order of
# the entries below is significant if one rule's output ever matches a later
# rule's source word.
BACK_TRANSLATION_MAP: dict[str, dict[str, str]] = {
    "en": {
        "because": "since",
        "create": "build",
        "fast": "quick",
        "good": "solid",
        "help": "assist",
        "plan": "outline",
    },
    "lv": {
        "ātrs": "žigls",
        "izveidot": "radīt",
        "labs": "stabils",
        "palīdzēt": "atbalstīt",
        "plāns": "ieceres plāns",
    },
}


def _match_case(source: str, replacement: str) -> str:
    if source.isupper():
        return replacement.upper()
    if source[:1].isupper():
        return replacement.capitalize()
    return replacement


def _lookup_synonym(word: str) -> str | None:
    """Pick a random synonym for *word*, cased like *word*.

    The lookup in ``SYNONYM_MAP`` uses the case-folded word. Returns ``None``
    when the word has no entry (or an empty one); no random draw happens in
    that case.
    """
    options = SYNONYM_MAP.get(word.casefold())
    if options:
        picked = random.choice(options)
        return _match_case(word, picked)
    return None


def random_synonym_swap(text: str, rate: float = 0.1) -> str:
    """Replace a share of the words in *text* with built-in synonyms.

    Words are tokenised with ``WORD_RE``; every word for which
    ``_lookup_synonym`` returns a replacement is a candidate. A random subset
    of the candidates is swapped in place, leaving all other text (including
    whitespace and punctuation) untouched. No external dependencies are used.

    Args:
        text: Input text. Blank or whitespace-only text is returned unchanged.
        rate: Target share of *all* words to replace. Values above ``1.0`` are
            treated as ``1.0``; values ``<= 0`` disable replacement.

    Returns:
        The text with the selected words replaced, or *text* itself when there
        is nothing to replace. When replacement is enabled and at least one
        candidate exists, at least one word is always replaced; the number of
        replacements is ``ceil(word_count * rate)`` capped at the number of
        candidates.
    """
    # Decide the no-op cases up front so that a disabled call neither scans
    # the text nor advances the global random generator.
    if not text.strip() or rate <= 0:
        return text

    words = list(WORD_RE.finditer(text))
    candidates = [
        (match.span(), synonym)
        for match in words
        if (synonym := _lookup_synonym(match.group(0))) is not None
    ]
    if not candidates:
        return text

    wanted = math.ceil(len(words) * min(rate, 1.0))
    sample_size = min(len(candidates), max(1, wanted))

    # Spans come from non-overlapping regex matches, so sorting the sampled
    # entries restores left-to-right order for splicing.
    chosen = sorted(random.sample(candidates, sample_size))

    parts: list[str] = []
    cursor = 0
    for (start, end), replacement in chosen:
        parts.append(text[cursor:start])
        parts.append(replacement)
        cursor = end
    parts.append(text[cursor:])
    return "".join(parts)


def back_translate(text: str, lang: str = "en") -> str:
    """Lightweight offline paraphrase used where no translation API exists.

    Args:
        text: Input text. Blank or whitespace-only text is returned unchanged.
        lang: Language code selecting a table in ``BACK_TRANSLATION_MAP``.
            Surrounding whitespace and letter case are ignored; an unknown
            code selects an empty table.

    Returns:
        The text after applying every whole-word, case-insensitive replacement
        of the selected table, with each replacement re-cased via
        ``_match_case``. If no replacement changed the text, the result of
        ``random_synonym_swap(text, rate=0.2)`` is returned instead.
    """
    if not text.strip():
        return text

    table = BACK_TRANSLATION_MAP.get(lang.strip().lower(), {})

    result = text
    # Rules are applied sequentially, each one on the output of the previous.
    for phrase, substitute in table.items():
        pattern = re.compile(rf"\b{re.escape(phrase)}\b", re.IGNORECASE)

        # Bind the current substitute as a default to avoid late binding.
        def _swap(found, substitute=substitute):
            return _match_case(found.group(0), substitute)

        result = pattern.sub(_swap, result)

    # Fall back to synonym swapping when the table produced no change.
    return result if result != text else random_synonym_swap(text, rate=0.2)