import pandas as pd
import os
import random
import re
import my_generator as gen
# Config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# sets the target total size of the final dataset
TOTAL_ROWS = 5000
# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------
# defines common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}
# replaces common words with text-speak or slang based on SLANG_MAP
def apply_slang(text):
    words = text.split()
    new_words = []
    for word in words:
        lower_word = word.lower().replace(".", "").replace("!", "")
        if lower_word in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[lower_word]))
        else:
            new_words.append(word)
    return " ".join(new_words)
# randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)
# adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    # 1. Random Caps
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()
    # 2. Urgent Repetition (for positives)
    if "help" in text.lower() or "tulong" in text.lower():
        if random.random() > 0.7:
            text = text + " TULONG!"
    # 3. Punctuation Spam
    if random.random() > 0.6:
        text += "!!" if random.random() > 0.5 else "..."
    return text
# creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    variations = [text]  # Keep original
    for _ in range(num_variations):
        # Method A: Slang
        var = apply_slang(text)
        # Method B: Noise
        var = add_noise(var)
        # Method C: Shuffle (rarely)
        if random.random() > 0.8:
            var = shuffle_sentence(var)
        variations.append(var)
    return list(set(variations))  # Unique only
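# Illustrative example (not from the original file; exact output varies because
# every augmentation step is random):
#   generate_variations("Tulong po, baha na dito sa bahay namin", num_variations=2)
#   could return something like:
#   ["Tulong po, baha na dito sa bahay namin",
#    "help pls po, flood na d2 sa bahay namin!! TULONG!",
#    "TULONG PO, BAHA NA DITO SA BAHAY NAMIN..."]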
# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------
# cleans and loads the initial seed data from the text file
def clean_and_load_seed_data(filepath):
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # removes internal tags or bracketed information
    pattern = re.compile(r"\[.*?\]")
    content = pattern.sub("", content)
    clean_rows = []
    raw_lines = content.split('\n')
    buffer = ""
    for line in raw_lines:
        line = line.strip()
        if not line:
            continue
        # checks for the classification label at the end of the line
        if line.endswith('|0') or line.endswith('|1'):
            full_line = (buffer + " " + line).strip()
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                # skips malformed entries that don't split into text|label
                pass
            buffer = ""
        else:
            buffer += line + " "
    return clean_rows
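# Assumed seed_data.txt layout, inferred from the parser above (not confirmed
# against the actual data file): one "text|label" entry per logical line, where
# the label is 0 or 1 and an entry may wrap across several physical lines until
# a line ending in "|0" or "|1" closes it, e.g.:
#   Tulong po! Baha na sa amin, hindi kami makalabas|1
#   Ang lakas ng ulan kanina pero ok na kami dito|0
# Any bracketed tags (text inside [...]) are stripped before parsing.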
# main function to orchestrate the data augmentation and saving process
def create_database():
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")
    # 1. Load Seed Data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")
    # 2. MULTIPLY SEED DATA
    final_rows = []
    print("🧬 Cloning and mutating seed data...")
    # generates multiple variations for each original seed row
    for row in seed_rows:
        # Generate 8 variations per real row
        variations = generate_variations(row['text'], num_variations=8)
        for var in variations:
            final_rows.append({'text': var, 'label': row['label']})
    print(f" ↳ Expanded seed data to {len(final_rows)} rows.")
    # 3. Fill the rest with Synthetic Templates
    # calculates how many synthetic rows are needed to meet the TOTAL_ROWS target
    remaining = TOTAL_ROWS - len(final_rows)
    # generates synthetic positive and negative posts using my_generator
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")
    # 4. Save
    df = pd.DataFrame(final_rows)
    # shuffles the dataset and removes duplicates before saving
    final_df = df.sample(frac=1).reset_index(drop=True)
    final_df.drop_duplicates(subset=['text'], inplace=True)
    # saves the final dataset to a CSV file
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print(" (Note: Dataset is optimized for quality over quantity)")
# executes the main function when the script is run
if __name__ == "__main__":
    create_database()
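# Usage note (sketch): run this file directly with Python. It reads
# ../data/seed_data.txt relative to this script's location and writes the
# augmented dataset to ../data/reddit_disaster_posts.csv. The my_generator
# module providing build_positive() / build_negative() is assumed to be
# importable (e.g. located alongside this script).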