import os
import random
import re

import pandas as pd

import my_generator as gen

# Config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# Target total size of the final dataset
TOTAL_ROWS = 5000

# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------

# Common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}


# Replaces common words with text-speak or slang based on SLANG_MAP
def apply_slang(text):
    words = text.split()
    new_words = []
    for word in words:
        lower_word = word.lower().replace(".", "").replace("!", "")
        if lower_word in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[lower_word]))
        else:
            new_words.append(word)
    return " ".join(new_words)


# Randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)


# Adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    # 1. Random caps
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()

    # 2. Urgent repetition (for positives)
    if "help" in text.lower() or "tulong" in text.lower():
        if random.random() > 0.7:
            text = text + " TULONG!"

    # 3. Punctuation spam
    if random.random() > 0.6:
        text += "!!" if random.random() > 0.5 else "..."

    return text


# Creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    variations = [text]  # Keep original
    for _ in range(num_variations):
        # Method A: Slang
        var = apply_slang(text)
        # Method B: Noise
        var = add_noise(var)
        # Method C: Shuffle (rarely)
        if random.random() > 0.8:
            var = shuffle_sentence(var)
        variations.append(var)
    return list(set(variations))  # Unique only
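
# Illustrative example (an assumption about output shape, since results are random):
# generate_variations("Tulong po, baha na dito", num_variations=3) might return
# something like ["Tulong po, baha na dito", "tulong po, pagbaha na d2!!",
# "TULONG PO, BAHA NA DITO TULONG!"]: the original plus de-duplicated variants,
# so the list length varies between 1 and num_variations + 1.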
# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------

# Cleans and loads the initial seed data from the text file
def clean_and_load_seed_data(filepath):
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Remove internal tags or bracketed information
    pattern = re.compile(r"\[.*?\]")
    content = pattern.sub("", content)

    clean_rows = []
    raw_lines = content.split('\n')
    buffer = ""

    for line in raw_lines:
        line = line.strip()
        if not line:
            continue

        # Check for the classification label at the end of the line
        if line.endswith('|0') or line.endswith('|1'):
            full_line = (buffer + " " + line).strip()
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                # Skip malformed entries rather than aborting the whole load
                pass
            buffer = ""
        else:
            # Entry continues on the next line; keep accumulating
            buffer += line + " "

    return clean_rows


# Main function to orchestrate the data augmentation and saving process
def create_database():
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")

    # 1. Load seed data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")

    # 2. Multiply seed data
    final_rows = []
    print("🧬 Cloning and mutating seed data...")

    # Generate multiple variations for each original seed row
    for row in seed_rows:
        # Generate 8 variations per real row
        variations = generate_variations(row['text'], num_variations=8)
        for var in variations:
            final_rows.append({'text': var, 'label': row['label']})

    print(f"   ↳ Expanded seed data to {len(final_rows)} rows.")

    # 3. Fill the rest with synthetic templates
    # How many synthetic rows are needed to reach the TOTAL_ROWS target
    remaining = TOTAL_ROWS - len(final_rows)

    # Generate synthetic positive and negative posts using my_generator
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")

    # 4. Save
    df = pd.DataFrame(final_rows)

    # Shuffle the dataset and remove duplicates before saving
    final_df = df.sample(frac=1).reset_index(drop=True)
    final_df.drop_duplicates(subset=['text'], inplace=True)

    # Save the final dataset to a CSV file
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print("   (Note: Dataset is optimized for quality over quantity)")


# Execute the main function when the script is run
if __name__ == "__main__":
    create_database()
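
# Assumptions this script relies on (inferred from the code above, not verified):
# - seed_data.txt holds one labelled entry per line in the form "<text>|<label>",
#   where <label> is 0 or 1; bracketed tags like "[note]" are stripped before
#   parsing, and multi-line entries are joined until a line ending in |0 or |1.
# - my_generator exposes build_positive() and build_negative(), each returning a
#   single synthetic post string (label 1 = distress/disaster, label 0 = benign).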