Spaces:
Configuration error
Configuration error
File size: 5,730 Bytes
"""
Augmentation Module (Optional)
================================
Lightweight synthetic data expansion stubs.
These are pure-Python approximations. For production quality,
integrate with an LLM API or NLP library.
"""
import random
import re
from dataclasses import dataclass
from typing import List
import pandas as pd
@dataclass
class AugmentationConfig:
    """Configuration for data augmentation.

    ``augment_dataset`` reads these fields: ``enabled`` is the master switch,
    the four per-technique flags select which transforms may be picked for
    each augmented sample, and ``augmentation_factor`` is the number of
    augmented copies produced per input row.
    """

    # Master switch; when False, augment_dataset returns its input unchanged.
    enabled: bool = False
    # Allow synonym-based paraphrasing (paraphrase_instruction).
    paraphrase: bool = False
    # Allow minor surface variations (generate_variation).
    generate_variations: bool = False
    # Allow the back-translation stub (back_translate).
    back_translate: bool = False
    # Allow tone rewriting (rewrite_tone, applied with the "formal" tone).
    tone_rewrite: bool = False
    augmentation_factor: int = 1  # how many extra copies per sample
# ---------------------------------------------------------------------------
# Synonym map for lightweight paraphrasing
# ---------------------------------------------------------------------------
# Keys are lowercase words; each value lists the replacements that
# paraphrase_instruction may substitute for that word.
_SYNONYMS = {
    'explain': ['describe', 'elaborate on', 'clarify', 'break down'],
    'create': ['generate', 'produce', 'make', 'build'],
    'write': ['compose', 'draft', 'author', 'pen'],
    'list': ['enumerate', 'outline', 'itemize', 'catalog'],
    'help': ['assist', 'aid', 'support', 'guide'],
    'show': ['demonstrate', 'display', 'present', 'illustrate'],
    'tell': ['inform', 'describe', 'narrate', 'share'],
    'give': ['provide', 'supply', 'offer', 'deliver'],
    'find': ['locate', 'discover', 'identify', 'search for'],
    'use': ['utilize', 'employ', 'apply', 'leverage'],
    'what': ['which', 'what exactly'],
    'how': ['in what way', 'by what method'],
    'important': ['crucial', 'essential', 'significant', 'vital'],
    'good': ['excellent', 'great', 'effective', 'beneficial'],
    'bad': ['poor', 'negative', 'harmful', 'detrimental'],
    'big': ['large', 'significant', 'substantial', 'major'],
    'small': ['minor', 'slight', 'modest', 'minimal'],
}


def paraphrase_instruction(text: str) -> str:
    """
    Simple synonym-based paraphrasing.

    Replaces one randomly chosen word that has an entry in ``_SYNONYMS``
    with a randomly chosen synonym. The rest of ``text`` is left untouched:
    whitespace between words (including newlines) and every ``.,!?;:``
    character attached to either side of the replaced word are preserved.

    Args:
        text: Instruction text to paraphrase.

    Returns:
        The paraphrased text. ``text`` is returned unchanged when it is not
        a string, is shorter than 5 characters once surrounding whitespace
        is stripped, or contains no word with a known synonym.
    """
    if not isinstance(text, str) or len(text.strip()) < 5:
        return text

    punctuation = '.,!?;:'

    # Track candidates by character span instead of split()/join() so the
    # original spacing and line breaks survive the substitution.
    candidates = []
    for match in re.finditer(r'\S+', text):
        core = match.group().strip(punctuation)
        if core.lower() in _SYNONYMS:
            candidates.append((match.start(), match.end(), core))
    if not candidates:
        return text

    start, end, core = random.choice(candidates)
    replacement = random.choice(_SYNONYMS[core.lower()])

    # Preserve original casing; look at the word itself, not at any
    # punctuation that happens to precede it.
    if core[0].isupper():
        replacement = replacement.capitalize()

    # Re-attach all surrounding punctuation (e.g. "explain..." keeps "...").
    token = text[start:end]
    lead_len = len(token) - len(token.lstrip(punctuation))
    leading = token[:lead_len]
    trailing = token[lead_len + len(core):]
    return text[:start] + leading + replacement + trailing + text[end:]
def generate_variation(text: str) -> str:
    """
    Return ``text`` with one randomly selected minor edit applied.

    One of four edits is picked uniformly at random:
    - swap any trailing ``.``/``!``/``?`` characters for a random one of
      ``.``, ``!``, ``?`` or nothing
    - upper-case the first character
    - collapse whitespace runs to single spaces and trim both ends
    - append " Please be detailed." half of the time, otherwise leave the
      text as it is

    Non-string input, or text shorter than 5 characters once stripped, is
    returned as-is.
    """
    if not (isinstance(text, str) and len(text.strip()) >= 5):
        return text

    # Pick the edit first, then draw any further randomness it needs.
    edit = random.choice(('punctuation', 'capitalize', 'whitespace', 'detail'))

    if edit == 'punctuation':
        ending = random.choice(['.', '!', '?', ''])
        return text.rstrip('.!?') + ending

    if edit == 'capitalize':
        if len(text) > 1:
            return text[0].upper() + text[1:]
        return text

    if edit == 'whitespace':
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    # Remaining case: optionally ask for more detail.
    if random.random() > 0.5:
        return text + ' Please be detailed.'
    return text
def back_translate(text: str) -> str:
    """
    Stub for back-translation.

    In production, this would translate to another language and back.
    Here we just do a light paraphrase: the call is delegated unchanged to
    ``paraphrase_instruction`` and its result is returned.
    """
    return paraphrase_instruction(text)
def rewrite_tone(text: str, tone: str = "formal") -> str:
    """
    Stub for tone rewriting.

    Prepends a tone-specific lead-in (for example ``"Please "`` for the
    ``"formal"`` tone) when the text opens with a recognised action verb,
    lower-casing that verb's first letter so the result reads as a single
    sentence. Leading whitespace in ``text`` is kept in front of the lead-in.

    Args:
        text: Instruction text to rewrite.
        tone: One of ``"formal"``, ``"casual"``, ``"academic"`` or
            ``"friendly"``.

    Returns:
        The rewritten text, or ``text`` unchanged when it is not a string,
        the tone is unknown, the text already starts with the lead-in, or
        its first word is not a recognised action verb.
    """
    tone_prefixes = {
        'formal': 'Please ',
        'casual': 'Hey, can you ',
        'academic': 'Kindly provide a detailed analysis of ',
        'friendly': 'I would really appreciate if you could ',
    }
    action_words = {
        'explain', 'describe', 'write', 'create', 'list', 'show',
        'tell', 'give', 'find', 'help', 'make',
    }

    # Mirror the sibling augmenters: non-string values pass through untouched.
    if not isinstance(text, str):
        return text

    prefix = tone_prefixes.get(tone)
    if prefix is None:
        return text

    # Work on the text without its leading whitespace so the first real
    # character is the one that gets lower-cased.
    body = text.lstrip()
    if not body:
        return text

    # Don't double-prefix
    if body.lower().startswith(prefix.lower().strip()):
        return text

    # Simple approach: only prepend when the text starts with a verb-like word
    first_word = body.split()[0].lower()
    if first_word not in action_words:
        return text

    leading = text[:len(text) - len(body)]
    return leading + prefix + body[0].lower() + body[1:]
def augment_dataset(
    df: pd.DataFrame,
    col: str,
    config: AugmentationConfig,
) -> pd.DataFrame:
    """
    Apply augmentation to create additional samples.

    For every row of ``df``, ``config.augmentation_factor`` copies are made
    and the value in ``col`` of each copy is rewritten by one augmentation
    method picked at random from those enabled in ``config``.

    Args:
        df: Source samples.
        col: Name of the text column to augment.
        config: Flags selecting the methods and the number of copies per row.

    Returns:
        ``df`` itself when augmentation is disabled, no method is enabled,
        ``df`` has no rows, or the factor is below 1. Otherwise a new frame
        holding the original rows followed by the augmented copies, with a
        fresh 0..n-1 index.

    Raises:
        KeyError: If ``col`` is not a column of ``df`` and there are rows
            to augment.
    """
    if not config.enabled:
        return df

    methods = []
    if config.paraphrase:
        methods.append(paraphrase_instruction)
    if config.generate_variations:
        methods.append(generate_variation)
    if config.back_translate:
        methods.append(back_translate)
    if config.tone_rewrite:
        methods.append(lambda t: rewrite_tone(t, "formal"))
    if not methods:
        return df

    factor = config.augmentation_factor
    if factor < 1 or len(df) == 0:
        return df

    # Repeat rows positionally (row 0 x factor, row 1 x factor, ...) instead
    # of walking iterrows(): this keeps every column's dtype and avoids
    # building one Series object per augmented sample.
    positions = [pos for pos in range(len(df)) for _ in range(factor)]
    augmented = df.iloc[positions].copy()

    new_values = []
    for value in augmented[col]:
        method = random.choice(methods)
        is_missing = (
            not isinstance(value, str)
            and pd.api.types.is_scalar(value)
            and pd.isna(value)
        )
        if is_missing:
            # Keep missing text missing rather than fabricating the literal
            # strings "nan" / "None" as training samples.
            new_values.append(value)
        else:
            new_values.append(method(str(value)))
    augmented[col] = new_values

    return pd.concat([df, augmented], ignore_index=True)
|