Spaces:
Configuration error
Configuration error
| """ | |
| Augmentation Module (Optional) | |
| ================================ | |
| Lightweight synthetic data expansion stubs. | |
| These are pure-Python approximations. For production quality, | |
| integrate with an LLM API or NLP library. | |
| """ | |
| import random | |
| import re | |
| from dataclasses import dataclass | |
| from typing import List | |
| import pandas as pd | |
@dataclass
class AugmentationConfig:
    """Configuration for data augmentation.

    Declared as a dataclass so the switches can be passed as keyword
    arguments, e.g. ``AugmentationConfig(enabled=True, paraphrase=True)``.
    Every field has a default, so ``AugmentationConfig()`` is still valid
    and yields a configuration with augmentation turned off.
    """

    # Master switch: when False, augment_dataset returns its input unchanged.
    enabled: bool = False
    # Allow synonym-based paraphrasing (paraphrase_instruction).
    paraphrase: bool = False
    # Allow small surface variations (generate_variation).
    generate_variations: bool = False
    # Allow the back-translation stub (back_translate).
    back_translate: bool = False
    # Allow the tone-rewriting stub (rewrite_tone).
    tone_rewrite: bool = False
    augmentation_factor: int = 1  # how many extra copies per sample
# ---------------------------------------------------------------------------
# Synonym map for lightweight paraphrasing
# ---------------------------------------------------------------------------
# Keys are lower-case words; values are the replacements one of which is
# picked at random by paraphrase_instruction.
_SYNONYMS = {
    'explain': ['describe', 'elaborate on', 'clarify', 'break down'],
    'create': ['generate', 'produce', 'make', 'build'],
    'write': ['compose', 'draft', 'author', 'pen'],
    'list': ['enumerate', 'outline', 'itemize', 'catalog'],
    'help': ['assist', 'aid', 'support', 'guide'],
    'show': ['demonstrate', 'display', 'present', 'illustrate'],
    'tell': ['inform', 'describe', 'narrate', 'share'],
    'give': ['provide', 'supply', 'offer', 'deliver'],
    'find': ['locate', 'discover', 'identify', 'search for'],
    'use': ['utilize', 'employ', 'apply', 'leverage'],
    'what': ['which', 'what exactly'],
    'how': ['in what way', 'by what method'],
    'important': ['crucial', 'essential', 'significant', 'vital'],
    'good': ['excellent', 'great', 'effective', 'beneficial'],
    'bad': ['poor', 'negative', 'harmful', 'detrimental'],
    'big': ['large', 'significant', 'substantial', 'major'],
    'small': ['minor', 'slight', 'modest', 'minimal'],
}


def paraphrase_instruction(text: str) -> str:
    """
    Simple synonym-based paraphrasing.

    Replaces one randomly chosen word that has an entry in ``_SYNONYMS``
    with a randomly chosen synonym. Punctuation from ``.,!?;:`` attached to
    either side of the word is kept in full, and a leading capital letter on
    the word is carried over to the replacement.

    Args:
        text: The instruction to paraphrase.

    Returns:
        The paraphrased text. ``text`` is returned unchanged when it is not
        a string, has fewer than 5 non-blank-padded characters, or contains
        no word found in ``_SYNONYMS``. When a replacement is made the
        result is rebuilt with single spaces between words.
    """
    if not isinstance(text, str) or len(text.strip()) < 5:
        return text

    edge_punct = '.,!?;:'
    words = text.split()

    # Collect (position, bare word) for every token we know a synonym for.
    candidates = []
    for i, word in enumerate(words):
        core = word.strip(edge_punct)
        if core.lower() in _SYNONYMS:
            candidates.append((i, core))
    if not candidates:
        return text

    idx, core = random.choice(candidates)
    replacement = random.choice(_SYNONYMS[core.lower()])

    # Preserve original casing. Look at the bare word, not the raw token,
    # so leading punctuation cannot hide a capital letter.
    if core[0].isupper():
        replacement = replacement.capitalize()

    # Preserve all surrounding punctuation, not just one trailing character.
    token = words[idx]
    leading = token[:len(token) - len(token.lstrip(edge_punct))]
    trailing = token[len(token.rstrip(edge_punct)):]
    words[idx] = leading + replacement + trailing
    return ' '.join(words)
def generate_variation(text: str) -> str:
    """
    Return a minor surface-level variation of ``text``.

    One of four transforms is picked at random and applied:
    - replace any trailing run of ``.``, ``!``, ``?`` with a single random
      choice of ``.``, ``!``, ``?`` or nothing
    - upper-case the first character
    - collapse every whitespace run to one space and trim the ends
    - with probability one half, append `` Please be detailed.``

    Non-string input, and strings shorter than 5 characters once stripped,
    are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    if len(text.strip()) < 5:
        return text

    def _swap_end_punctuation(t):
        ending = random.choice(['.', '!', '?', ''])
        return t.rstrip('.!?') + ending

    def _upper_first(t):
        if len(t) > 1:
            return t[0].upper() + t[1:]
        return t

    def _squeeze_whitespace(t):
        return re.sub(r'\s+', ' ', t).strip()

    def _maybe_ask_for_detail(t):
        if random.random() > 0.5:
            return t + ' Please be detailed.'
        return t

    transforms = [
        _swap_end_punctuation,
        _upper_first,
        _squeeze_whitespace,
        _maybe_ask_for_detail,
    ]
    chosen = random.choice(transforms)
    return chosen(text)
def back_translate(text: str) -> str:
    """
    Placeholder for back-translation.

    A production version would translate the text into another language and
    back again. This stub performs no translation: it hands the text to
    ``paraphrase_instruction`` and returns whatever that produces.
    """
    paraphrased = paraphrase_instruction(text)
    return paraphrased
def rewrite_tone(text: str, tone: str = "formal") -> str:
    """
    Stub for tone rewriting.

    Prepends a tone-specific prefix when the text opens with one of a small
    set of imperative verbs, lower-casing the first letter of the text so
    the result reads as one sentence.

    Args:
        text: The instruction to rewrite.
        tone: One of ``'formal'``, ``'casual'``, ``'academic'`` or
            ``'friendly'``. Any other value leaves the text unchanged.

    Returns:
        The prefixed text, or ``text`` unchanged when it is not a string,
        the tone is unknown, the text already starts with the prefix, or
        its first word is not a recognised action word. Leading whitespace
        is dropped when a prefix is added.
    """
    # Mirror the other helpers in this module: pass non-strings through.
    if not isinstance(text, str):
        return text

    tone_prefixes = {
        'formal': 'Please ',
        'casual': 'Hey, can you ',
        'academic': 'Kindly provide a detailed analysis of ',
        'friendly': 'I would really appreciate if you could ',
    }
    prefix = tone_prefixes.get(tone, '')
    if not prefix:
        # Unknown tone: nothing to prepend.
        return text

    # Work on the text without leading whitespace so the prefix attaches
    # directly to the first word.
    body = text.lstrip()

    # Don't double-prefix
    if body.lower().startswith(prefix.lower().strip()):
        return text

    # Simple approach: prepend tone prefix if the text starts with a verb-like word
    parts = body.split(None, 1)
    first_word = parts[0].lower() if parts else ''
    action_words = {
        'explain', 'describe', 'write', 'create', 'list', 'show',
        'tell', 'give', 'find', 'help', 'make',
    }
    if first_word in action_words:
        return prefix + body[0].lower() + body[1:]
    return text
def augment_dataset(
    df: pd.DataFrame,
    col: str,
    config: AugmentationConfig,
) -> pd.DataFrame:
    """
    Apply augmentation to create additional samples.

    For every row of ``df``, ``config.augmentation_factor`` copies are made.
    In each copy the value in ``col`` is converted to ``str`` and passed
    through one randomly chosen enabled augmentation method. Cells that are
    missing (``None``/NaN) are copied as-is rather than being turned into
    the literal text ``"None"`` or ``"nan"``.

    Args:
        df: Source samples.
        col: Name of the text column to augment.
        config: Switches selecting which methods may be used and how many
            copies to make per row.

    Returns:
        ``df`` itself when augmentation is disabled, no method is enabled,
        or no copies were produced; otherwise a new DataFrame holding the
        original rows followed by the augmented rows, with a fresh index.
    """
    if not config.enabled:
        return df

    methods = []
    if config.paraphrase:
        methods.append(paraphrase_instruction)
    if config.generate_variations:
        methods.append(generate_variation)
    if config.back_translate:
        methods.append(back_translate)
    if config.tone_rewrite:
        methods.append(lambda t: rewrite_tone(t, "formal"))
    if not methods:
        return df

    new_rows = []
    for _, row in df.iterrows():
        value = row[col]
        # is_scalar guards pd.isna, which returns an array for list-like cells.
        is_missing = pd.api.types.is_scalar(value) and pd.isna(value)
        for _copy in range(config.augmentation_factor):
            new_row = row.copy()
            if not is_missing:
                method = random.choice(methods)
                new_row[col] = method(str(value))
            new_rows.append(new_row)

    if not new_rows:
        return df
    augmented = pd.DataFrame(new_rows)
    return pd.concat([df, augmented], ignore_index=True)