import pandas as pd
import os
import random
import re
import my_generator as gen  # local module supplying build_positive() / build_negative() templates
# Config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# sets the target total size of the final dataset
TOTAL_ROWS = 5000
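# Directory layout implied by the relative paths above (the scripts/ folder
# name and the script filename are assumptions, not confirmed by this file):
#   <project>/
#     scripts/augment_data.py           <- this script
#     data/seed_data.txt                <- pipe-delimited seed examples
#     data/reddit_disaster_posts.csv    <- generated output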
# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------
# defines common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}
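# Lookup direction: keys are the canonical Taglish words found in seed text,
# values are the candidate replacements. Illustrative draw (random, so any of
# the listed values may come back):
#   random.choice(SLANG_MAP["dito"])  # -> "d2", "dto", or "here"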
# replaces common words with text-speak or slang based on SLANG_MAP
def apply_slang(text):
    words = text.split()
    new_words = []
    for word in words:
        lower_word = word.lower().replace(".", "").replace("!", "")
        if lower_word in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[lower_word]))
        else:
            new_words.append(word)
    return " ".join(new_words)
# randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)
# adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    # 1. Random Caps
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()
    # 2. Urgent Repetition (for positives)
    if "help" in text.lower() or "tulong" in text.lower():
        if random.random() > 0.7:
            text = text + " TULONG!"
    # 3. Punctuation Spam
    if random.random() > 0.6:
        text += "!!" if random.random() > 0.5 else "..."
    return text
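# Illustrative call (each noise step fires independently at random):
#   add_noise("help us please")
#   # -> e.g. 'HELP US PLEASE TULONG!!!'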
# creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    variations = [text]  # Keep original
    for _ in range(num_variations):
        # Method A: Slang
        var = apply_slang(text)
        # Method B: Noise
        var = add_noise(var)
        # Method C: Shuffle (rarely, ~20% of the time)
        if random.random() > 0.8:
            var = shuffle_sentence(var)
        variations.append(var)
    return list(set(variations))  # Unique only
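# Illustrative call (set() deduplication means fewer than num_variations + 1
# strings may come back, and order is arbitrary):
#   generate_variations("tulong kami dito", num_variations=3)
#   # -> e.g. ['tulong kami dito', 'help kmi d2!!', 'TULONG KME DITO TULONG!']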
# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------
# cleans and loads the initial seed data from the text file
def clean_and_load_seed_data(filepath):
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # removes internal tags or bracketed information
    pattern = re.compile(r"\[.*?\]")
    content = pattern.sub("", content)
    clean_rows = []
    raw_lines = content.split('\n')
    buffer = ""
    for line in raw_lines:
        line = line.strip()
        if not line:
            continue
        # checks for the classification label at the end of the line
        if line.endswith('|0') or line.endswith('|1'):
            full_line = (buffer + " " + line).strip()
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                pass  # skip malformed lines rather than abort
            buffer = ""
        else:
            # no trailing label: treat as a wrapped line and join with the next
            buffer += line + " "
    return clean_rows
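# The parser above expects pipe-delimited lines with the 0/1 label after the
# final '|'; unlabelled lines are treated as wrapped text and joined onto the
# next labelled line. Hypothetical sample of seed_data.txt:
#   Tulong po, lumalalim na ang baha sa amin!|1
#   Ang ganda ng sunset ngayon|0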
# main function to orchestrate the data augmentation and saving process
def create_database():
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")

    # 1. Load Seed Data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")

    # 2. Multiply seed data: generate multiple variations for each seed row
    final_rows = []
    print("🧬 Cloning and mutating seed data...")
    for row in seed_rows:
        # Generate 8 variations per real row
        variations = generate_variations(row['text'], num_variations=8)
        for var in variations:
            final_rows.append({'text': var, 'label': row['label']})
    print(f"   ↳ Expanded seed data to {len(final_rows)} rows.")

    # 3. Fill the rest with synthetic templates: calculate how many rows are
    #    needed to meet TOTAL_ROWS, then alternate positive/negative posts
    #    generated by my_generator
    remaining = TOTAL_ROWS - len(final_rows)
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")

    # 4. Shuffle the dataset, remove duplicate texts, and save to CSV
    df = pd.DataFrame(final_rows)
    final_df = df.sample(frac=1).reset_index(drop=True)
    final_df.drop_duplicates(subset=['text'], inplace=True)
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print("   (Note: Dataset is optimized for quality over quantity)")
# executes the main function when the script is run
if __name__ == "__main__":
create_database()