import pandas as pd
import os
import random
import re
import my_generator as gen
# Config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# sets the target total size of the final dataset
TOTAL_ROWS = 5000
# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------
# defines common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}
# replaces common words with text-speak or slang based on SLANG_MAP
def apply_slang(text):
    words = text.split()
    new_words = []
    for word in words:
        lower_word = word.lower().replace(".", "").replace("!", "")
        if lower_word in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[lower_word]))
        else:
            new_words.append(word)
    return " ".join(new_words)
# randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)
# adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    # 1. Random Caps
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()
    # 2. Urgent Repetition (for positives)
    if "help" in text.lower() or "tulong" in text.lower():
        if random.random() > 0.7:
            text = text + " TULONG!"
    # 3. Punctuation Spam
    if random.random() > 0.6:
        text += "!!" if random.random() > 0.5 else "..."
    return text
# creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    variations = [text]  # Keep original
    for _ in range(num_variations):
        # Method A: Slang
        var = apply_slang(text)
        # Method B: Noise
        var = add_noise(var)
        # Method C: Shuffle (rarely)
        if random.random() > 0.8:
            var = shuffle_sentence(var)
        variations.append(var)
    return list(set(variations))  # Unique only
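# Illustrative example (not from the original file; exact output varies because
# every augmentation step is random):
#   generate_variations("Tulong po, baha na dito sa bahay namin", num_variations=2)
#   could return something like:
#   ["Tulong po, baha na dito sa bahay namin",
#    "help pls po, flood na d2 sa bahay namin!! TULONG!",
#    "TULONG PO, BAHA NA DITO SA BAHAY NAMIN..."]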
# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------
# cleans and loads the initial seed data from the text file
def clean_and_load_seed_data(filepath):
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # removes internal tags or bracketed information
    pattern = re.compile(r"\[.*?\]")
    content = pattern.sub("", content)
    clean_rows = []
    raw_lines = content.split('\n')
    buffer = ""
    for line in raw_lines:
        line = line.strip()
        if not line:
            continue
        # checks for the classification label at the end of the line
        if line.endswith('|0') or line.endswith('|1'):
            full_line = (buffer + " " + line).strip()
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                # skips malformed entries that don't split into text|label
                pass
            buffer = ""
        else:
            buffer += line + " "
    return clean_rows
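# Assumed seed_data.txt layout, inferred from the parser above (not confirmed
# against the actual data file): one "text|label" entry per logical line, where
# the label is 0 or 1 and an entry may wrap across several physical lines until
# a line ending in "|0" or "|1" closes it, e.g.:
#   Tulong po! Baha na sa amin, hindi kami makalabas|1
#   Ang lakas ng ulan kanina pero ok na kami dito|0
# Any bracketed tags (text inside [...]) are stripped before parsing.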
# main function to orchestrate the data augmentation and saving process
def create_database():
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")
    # 1. Load Seed Data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")
    # 2. MULTIPLY SEED DATA
    final_rows = []
    print("🧬 Cloning and mutating seed data...")
    # generates multiple variations for each original seed row
    for row in seed_rows:
        # Generate 8 variations per real row
        variations = generate_variations(row['text'], num_variations=8)
        for var in variations:
            final_rows.append({'text': var, 'label': row['label']})
    print(f" ↳ Expanded seed data to {len(final_rows)} rows.")
    # 3. Fill the rest with Synthetic Templates
    # calculates how many synthetic rows are needed to meet the TOTAL_ROWS target
    remaining = TOTAL_ROWS - len(final_rows)
    # generates synthetic positive and negative posts using my_generator
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")
    # 4. Save
    df = pd.DataFrame(final_rows)
    # shuffles the dataset and removes duplicates before saving
    final_df = df.sample(frac=1).reset_index(drop=True)
    final_df.drop_duplicates(subset=['text'], inplace=True)
    # saves the final dataset to a CSV file
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print(" (Note: Dataset is optimized for quality over quantity)")
# executes the main function when the script is run
if __name__ == "__main__":
    create_database()
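# Usage note (sketch): run this file directly with Python. It reads
# ../data/seed_data.txt relative to this script's location and writes the
# augmented dataset to ../data/reddit_disaster_posts.csv. The my_generator
# module providing build_positive() / build_negative() is assumed to be
# importable (e.g. located alongside this script).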