import pandas as pd
import os
import random
import re
import my_generator as gen
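# local helper module (not included here); assumed to expose build_positive() and
# build_negative(), each returning one synthetic template-based post string (used in step 3 below)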

# Config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# sets the target total size of the final dataset
TOTAL_ROWS = 5000 

# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------
# defines common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}

# replaces common words with text-speak or slang based on SLANG_MAP
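# e.g. "tulong kami dito" could come out as "help kmi d2" (substitutions are random, illustrative only)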
def apply_slang(text):
    words = text.split()
    new_words = []
    for word in words:
        lower_word = word.lower().replace(".", "").replace("!", "")
        if lower_word in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[lower_word]))
        else:
            new_words.append(word)
    return " ".join(new_words)

# randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 2)
        words[idx], words[idx+1] = words[idx+1], words[idx]
    return " ".join(words)

# adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    # 1. Random Caps
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()
        
    # 2. Urgent Repetition (for positives)
    if "help" in text.lower() or "tulong" in text.lower():
        if random.random() > 0.7:
            text = text + " TULONG!"
            
    # 3. Punctuation Spam
    if random.random() > 0.6:
        text += "!!" if random.random() > 0.5 else "..."
        
    return text

# creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    variations = [text] # Keep original
    
    for _ in range(num_variations):
        # Method A: Slang
        var = apply_slang(text)
        # Method B: Noise
        var = add_noise(var)
        # Method C: Shuffle (rarely)
        if random.random() > 0.8:
            var = shuffle_sentence(var)
            
        variations.append(var)
    
    return list(set(variations)) # Unique only

# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------
# cleans and loads the initial seed data from the text file
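# assumed seed format (based on the parsing below): one post per entry, ending with
# "|1" (disaster) or "|0" (not disaster), e.g. "Tulong, baha na dito|1" (illustrative)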
def clean_and_load_seed_data(filepath):
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # removes internal tags or bracketed information
    pattern = re.compile(r"\[.*?\]")
    content = pattern.sub("", content)

    clean_rows = []
    raw_lines = content.split('\n')
    buffer = ""

    for line in raw_lines:
        line = line.strip()
        if not line: continue

        # checks for the classification label at the end of the line
        if line.endswith('|0') or line.endswith('|1'):
            full_line = (buffer + " " + line).strip()
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                # skip malformed entries that do not parse into text|label
                pass
            buffer = ""
        else:
            buffer += line + " "
            
    return clean_rows

# main function to orchestrate the data augmentation and saving process
def create_database():
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")
    
    # 1. Load Seed Data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")

    # 2. MULTIPLY SEED DATA
    final_rows = []
    print("🧬 Cloning and mutating seed data...")
    # generates multiple variations for each original seed row
    for row in seed_rows:
        # Generate 8 variations per real row
        variations = generate_variations(row['text'], num_variations=8)
        for var in variations:
            final_rows.append({'text': var, 'label': row['label']})
            
    print(f"   ↳ Expanded seed data to {len(final_rows)} rows.")

    # 3. Fill the rest with Synthetic Templates
    # calculates how many synthetic rows are needed to meet the TOTAL_ROWS target
    remaining = TOTAL_ROWS - len(final_rows)
    
    # generates synthetic positive and negative posts using my_generator
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")

    # 4. Save
    df = pd.DataFrame(final_rows)
    # shuffles the dataset and removes duplicates before saving
    final_df = df.sample(frac=1).reset_index(drop=True)
    final_df.drop_duplicates(subset=['text'], inplace=True)
    
    # saves the final dataset to a CSV file
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print("   (Note: Dataset is optimized for quality over quantity)")

# executes the main function when the script is run
if __name__ == "__main__":
    create_database()