File size: 5,730 Bytes
d4398e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""

Augmentation Module (Optional)

================================

Lightweight synthetic data expansion stubs.

These are pure-Python approximations. For production quality,

integrate with an LLM API or NLP library.

"""

import random
import re
from dataclasses import dataclass
from typing import List
import pandas as pd


@dataclass
class AugmentationConfig:
    """Configuration for data augmentation."""
    enabled: bool = False
    paraphrase: bool = False
    generate_variations: bool = False
    back_translate: bool = False
    tone_rewrite: bool = False
    augmentation_factor: int = 1  # how many extra copies per sample


# ---------------------------------------------------------------------------
# Synonym map for lightweight paraphrasing
# ---------------------------------------------------------------------------
_SYNONYMS = {
    'explain': ['describe', 'elaborate on', 'clarify', 'break down'],
    'create': ['generate', 'produce', 'make', 'build'],
    'write': ['compose', 'draft', 'author', 'pen'],
    'list': ['enumerate', 'outline', 'itemize', 'catalog'],
    'help': ['assist', 'aid', 'support', 'guide'],
    'show': ['demonstrate', 'display', 'present', 'illustrate'],
    'tell': ['inform', 'describe', 'narrate', 'share'],
    'give': ['provide', 'supply', 'offer', 'deliver'],
    'find': ['locate', 'discover', 'identify', 'search for'],
    'use': ['utilize', 'employ', 'apply', 'leverage'],
    'what': ['which', 'what exactly'],
    'how': ['in what way', 'by what method'],
    'important': ['crucial', 'essential', 'significant', 'vital'],
    'good': ['excellent', 'great', 'effective', 'beneficial'],
    'bad': ['poor', 'negative', 'harmful', 'detrimental'],
    'big': ['large', 'significant', 'substantial', 'major'],
    'small': ['minor', 'slight', 'modest', 'minimal'],
}


def paraphrase_instruction(text: str) -> str:
    """

    Simple synonym-based paraphrasing.

    Replaces one random word with a synonym.

    """
    if not isinstance(text, str) or len(text.strip()) < 5:
        return text

    words = text.split()
    candidates = []

    for i, word in enumerate(words):
        word_lower = word.lower().strip('.,!?;:')
        if word_lower in _SYNONYMS:
            candidates.append((i, word_lower))

    if not candidates:
        return text

    idx, orig_word = random.choice(candidates)
    replacement = random.choice(_SYNONYMS[orig_word])

    # Preserve original casing
    if words[idx][0].isupper():
        replacement = replacement.capitalize()

    # Preserve trailing punctuation
    trailing = ''
    if words[idx] and words[idx][-1] in '.,!?;:':
        trailing = words[idx][-1]
        words[idx] = replacement + trailing
    else:
        words[idx] = replacement

    return ' '.join(words)


def generate_variation(text: str) -> str:
    """

    Generate a minor variation of the text:

    - Random case changes

    - Add/remove trailing punctuation

    - Slight word reordering at clause boundaries

    """
    if not isinstance(text, str) or len(text.strip()) < 5:
        return text

    variations = [
        lambda t: t.rstrip('.!?') + random.choice(['.', '!', '?', '']),
        lambda t: t[0].upper() + t[1:] if len(t) > 1 else t,
        lambda t: re.sub(r'\s+', ' ', t).strip(),
        lambda t: t + ' Please be detailed.' if random.random() > 0.5 else t,
    ]

    variation = random.choice(variations)
    return variation(text)


def back_translate(text: str) -> str:
    """

    Stub for back-translation.

    In production, this would translate to another language and back.

    Here we just do a light paraphrase.

    """
    return paraphrase_instruction(text)


def rewrite_tone(text: str, tone: str = "formal") -> str:
    """

    Stub for tone rewriting.

    """
    tone_prefixes = {
        'formal': 'Please ',
        'casual': 'Hey, can you ',
        'academic': 'Kindly provide a detailed analysis of ',
        'friendly': 'I would really appreciate if you could ',
    }

    prefix = tone_prefixes.get(tone, '')

    # Don't double-prefix
    if text.lower().startswith(prefix.lower().strip()):
        return text

    # Simple approach: prepend tone prefix if the text starts with a verb-like word
    first_word = text.split()[0].lower() if text.split() else ''
    action_words = {'explain', 'describe', 'write', 'create', 'list', 'show', 'tell', 'give', 'find', 'help', 'make'}

    if first_word in action_words:
        return prefix + text[0].lower() + text[1:]

    return text


def augment_dataset(

    df: pd.DataFrame,

    col: str,

    config: AugmentationConfig,

) -> pd.DataFrame:
    """

    Apply augmentation to create additional samples.

    Returns the original + augmented samples.

    """
    if not config.enabled:
        return df

    methods = []
    if config.paraphrase:
        methods.append(paraphrase_instruction)
    if config.generate_variations:
        methods.append(generate_variation)
    if config.back_translate:
        methods.append(back_translate)
    if config.tone_rewrite:
        methods.append(lambda t: rewrite_tone(t, "formal"))

    if not methods:
        return df

    new_rows = []
    for _, row in df.iterrows():
        for _ in range(config.augmentation_factor):
            method = random.choice(methods)
            new_row = row.copy()
            new_row[col] = method(str(row[col]))
            new_rows.append(new_row)

    if new_rows:
        augmented = pd.DataFrame(new_rows)
        return pd.concat([df, augmented], ignore_index=True)

    return df