Spaces:
Running
Running
| import pandas as pd | |
| import os | |
| import random | |
| import re | |
| import my_generator as gen | |
# --- Configuration ---
# Resolve all paths relative to this script so it runs from any working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Seed input: plain-text records in "text|label" form (see clean_and_load_seed_data).
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
# Output: final augmented dataset as CSV.
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# sets the target total size of the final dataset
TOTAL_ROWS = 5000
# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------
# defines common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}


def apply_slang(text):
    """Randomly replace known Taglish words with slang/text-speak variants.

    Each word whose (lowercased, punctuation-stripped) form appears in
    SLANG_MAP has a 50% chance of being swapped for a random variant.
    Trailing '.'/'!' punctuation is re-attached to the substitute — the
    previous implementation silently dropped it ("tulong!" became "help").

    Args:
        text: Source sentence to mutate.

    Returns:
        A new sentence string with zero or more words replaced.
    """
    new_words = []
    for word in text.split():
        core = word.rstrip(".!")      # strip trailing punctuation for lookup
        trailing = word[len(core):]   # remembered so it can be re-attached
        key = core.lower()
        if key in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[key]) + trailing)
        else:
            new_words.append(word)
    return " ".join(new_words)
# randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    """Return *text* with one randomly chosen adjacent word pair swapped.

    Sentences of three words or fewer are too short to scramble safely,
    so they come back unshuffled (whitespace is still normalised by the
    split/join round-trip).
    """
    tokens = text.split()
    if len(tokens) <= 3:
        return " ".join(tokens)
    pos = random.randrange(len(tokens) - 1)
    tokens[pos:pos + 2] = (tokens[pos + 1], tokens[pos])
    return " ".join(tokens)
# adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    """Apply random distress-style formatting noise to *text*.

    Three independent probabilistic mutations:
    1. casing  — ~30% chance of ALL CAPS, else ~30% of the remainder lowercased
    2. urgency — help-seeking posts may gain a trailing " TULONG!"
    3. spam    — a "!!" or "..." suffix
    """
    # 1. Random casing.
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()
    # 2. Urgent repetition for posts that already ask for help.
    lowered = text.lower()
    if ("help" in lowered or "tulong" in lowered) and random.random() > 0.7:
        text = text + " TULONG!"
    # 3. Punctuation spam.
    if random.random() > 0.6:
        text = text + ("!!" if random.random() > 0.5 else "...")
    return text
# creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    """Produce up to ``num_variations`` mutated copies of *text*.

    The original text is always kept (as the first entry). Each variation
    is slang-substituted, noised, and — rarely — word-shuffled.

    Returns:
        A de-duplicated list of strings. Uses ``dict.fromkeys`` instead of
        ``set`` so the output order is deterministic for a given random
        seed; ``list(set(...))`` ordering varies between runs under hash
        randomisation, which made the generated dataset irreproducible.
    """
    variations = [text]  # keep the original
    for _ in range(num_variations):
        var = apply_slang(text)        # Method A: slang substitution
        var = add_noise(var)           # Method B: formatting noise
        if random.random() > 0.8:      # Method C: rare adjacent-word shuffle
            var = shuffle_sentence(var)
        variations.append(var)
    return list(dict.fromkeys(variations))  # unique, order-preserving
# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------
# cleans and loads the initial seed data from the text file
def clean_and_load_seed_data(filepath):
    """Load labelled seed records from *filepath*.

    Format: each record is "text|0" or "text|1"; a record may span several
    physical lines, in which case the label-terminated line closes it.
    Bracketed annotations like "[source]" are stripped first.

    Args:
        filepath: Path to the seed text file.

    Returns:
        List of {'text': str, 'label': int} dicts; empty list if the file
        is missing. Trailing unlabelled lines are silently discarded.
    """
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # Remove internal tags or bracketed information.
    content = re.sub(r"\[.*?\]", "", content)
    clean_rows = []
    buffer = []  # continuation lines of a record not yet closed by a label
    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue
        # A classification label at the end of the line closes the record.
        if line.endswith('|0') or line.endswith('|1'):
            # Joining a list avoids the double space the old string-buffer
            # concatenation produced on multi-line records.
            full_line = " ".join(buffer + [line])
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                # Malformed record (shouldn't happen given the endswith
                # check); skip it rather than abort the whole load.
                pass
            buffer = []
        else:
            buffer.append(line)
    return clean_rows
# main function to orchestrate the data augmentation and saving process
def create_database():
    """Build the augmented dataset and write it to CSV_PATH.

    Pipeline:
    1. Load labelled seed rows from SEED_PATH.
    2. Expand each seed row into up to 8 mutated variants.
    3. Top up with synthetic rows from ``my_generator`` to reach TOTAL_ROWS.
    4. Shuffle, de-duplicate, and save.

    Fixes: duplicates are now removed *while* expanding, so the synthetic
    top-up count is exact (previously dedup happened after sizing, making
    the saved file undershoot TOTAL_ROWS), and an empty row set no longer
    crashes ``drop_duplicates`` with a missing-'text'-column KeyError.
    """
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")
    # 1. Load Seed Data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")
    # 2. Multiply seed data, deduping up front so `remaining` is accurate.
    final_rows = []
    seen_texts = set()
    print("🧬 Cloning and mutating seed data...")
    for row in seed_rows:
        # Generate 8 variations per real row.
        for var in generate_variations(row['text'], num_variations=8):
            if var not in seen_texts:
                seen_texts.add(var)
                final_rows.append({'text': var, 'label': row['label']})
    print(f" ↳ Expanded seed data to {len(final_rows)} rows.")
    # 3. Fill the rest with synthetic templates up to the TOTAL_ROWS target.
    remaining = TOTAL_ROWS - len(final_rows)
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        # One positive + one negative per iteration keeps classes balanced.
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")
    # 4. Save.
    if not final_rows:
        # Guard: DataFrame([]) has no 'text' column, so the dedup below would raise.
        print("⚠️ No rows generated; nothing to save.")
        return
    df = pd.DataFrame(final_rows)
    # Drop any remaining duplicates (synthetic templates can collide), then shuffle.
    final_df = (
        df.drop_duplicates(subset=['text'])
          .sample(frac=1)
          .reset_index(drop=True)
    )
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print(" (Note: Dataset is optimized for quality over quantity)")
# Run the pipeline only when executed as a script (safe to import as a module).
if __name__ == "__main__":
    create_database()