import pandas as pd
import os
import random
import re
import my_generator as gen  # local module supplying build_positive() / build_negative() templates
# Config
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SEED_PATH = os.path.join(BASE_DIR, '../data/seed_data.txt')
CSV_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')
# sets the target total size of the final dataset
TOTAL_ROWS = 5000
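# Directory layout implied by the relative paths above (the scripts/ folder
# name and the script filename are assumptions, not confirmed by this file):
#   <project>/
#     scripts/augment_data.py           <- this script
#     data/seed_data.txt                <- pipe-delimited seed examples
#     data/reddit_disaster_posts.csv    <- generated output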
# ---------------------------------------------------------
# 1. SLANG & AUGMENTATION LIBRARY
# ---------------------------------------------------------
# defines common Taglish words and their text-speak/slang equivalents
SLANG_MAP = {
    "tulong": ["help", "saklolo", "tulong po", "help pls"],
    "kami": ["kmi", "kme", "tayo"],
    "dito": ["d2", "dto", "here"],
    "baha": ["flood", "tubig", "pagbaha"],
    "rescue": ["save us", "pasundo", "saklolo"],
    "please": ["pls", "plz", "paki", "parang awa nyo na"],
    "wala": ["la", "wla", "zero"],
    "sa": ["s", "sa may"],
    "ang": ["ung", "yung", "ang"],
    "hindi": ["di", "d", "hndi"],
    "kayo": ["kau", "nyo"],
    "may": ["meron", "my"],
}
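# Lookup direction: keys are the canonical Taglish words found in seed text,
# values are the candidate replacements. Illustrative draw (random, so any of
# the listed values may come back):
#   random.choice(SLANG_MAP["dito"])  # -> "d2", "dto", or "here"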
# replaces common words with text-speak or slang based on SLANG_MAP
def apply_slang(text):
    words = text.split()
    new_words = []
    for word in words:
        lower_word = word.lower().replace(".", "").replace("!", "")
        if lower_word in SLANG_MAP and random.random() > 0.5:
            new_words.append(random.choice(SLANG_MAP[lower_word]))
        else:
            new_words.append(word)
    return " ".join(new_words)
# randomly swaps two adjacent words to simulate panic typing errors
def shuffle_sentence(text):
    words = text.split()
    if len(words) > 3:
        idx = random.randint(0, len(words) - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)
# adds formatting noise (random caps, repetition, punctuation spam) to simulate distress
def add_noise(text):
    # 1. Random Caps
    if random.random() > 0.7:
        text = text.upper()
    elif random.random() > 0.7:
        text = text.lower()
    # 2. Urgent Repetition (for positives)
    if "help" in text.lower() or "tulong" in text.lower():
        if random.random() > 0.7:
            text = text + " TULONG!"
    # 3. Punctuation Spam
    if random.random() > 0.6:
        text += "!!" if random.random() > 0.5 else "..."
    return text
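# Illustrative call (each noise step fires independently at random):
#   add_noise("help us please")
#   # -> e.g. 'HELP US PLEASE TULONG!!!'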
# creates multiple variations of a single input text using all augmentation methods
def generate_variations(text, num_variations=3):
    variations = [text]  # Keep original
    for _ in range(num_variations):
        # Method A: Slang
        var = apply_slang(text)
        # Method B: Noise
        var = add_noise(var)
        # Method C: Shuffle (rarely, ~20% of the time)
        if random.random() > 0.8:
            var = shuffle_sentence(var)
        variations.append(var)
    return list(set(variations))  # Unique only
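# Illustrative call (set() deduplication means fewer than num_variations + 1
# strings may come back, and order is arbitrary):
#   generate_variations("tulong kami dito", num_variations=3)
#   # -> e.g. ['tulong kami dito', 'help kmi d2!!', 'TULONG KME DITO TULONG!']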
# ---------------------------------------------------------
# 2. CORE LOGIC
# ---------------------------------------------------------
# cleans and loads the initial seed data from the text file
def clean_and_load_seed_data(filepath):
    if not os.path.exists(filepath):
        print(f"⚠️ Warning: {filepath} not found.")
        return []
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    # removes internal tags or bracketed information
    pattern = re.compile(r"\[.*?\]")
    content = pattern.sub("", content)
    clean_rows = []
    raw_lines = content.split('\n')
    buffer = ""
    for line in raw_lines:
        line = line.strip()
        if not line:
            continue
        # checks for the classification label at the end of the line
        if line.endswith('|0') or line.endswith('|1'):
            full_line = (buffer + " " + line).strip()
            try:
                text, label = full_line.rsplit('|', 1)
                clean_rows.append({'text': text.strip(), 'label': int(label)})
            except ValueError:
                pass  # skip malformed lines rather than abort
            buffer = ""
        else:
            # no trailing label: treat as a wrapped line and join with the next
            buffer += line + " "
    return clean_rows
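# The parser above expects pipe-delimited lines with the 0/1 label after the
# final '|'; unlabelled lines are treated as wrapped text and joined onto the
# next labelled line. Hypothetical sample of seed_data.txt:
#   Tulong po, lumalalim na ang baha sa amin!|1
#   Ang ganda ng sunset ngayon|0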
# main function to orchestrate the data augmentation and saving process
def create_database():
    print("--- ALISTO: Phase 2 Data Augmentation (Lean & Mean Mode) ---")

    # 1. Load Seed Data
    seed_rows = clean_and_load_seed_data(SEED_PATH)
    print(f"🌱 Loaded {len(seed_rows)} original seed rows.")

    # 2. Multiply seed data: generate multiple variations for each seed row
    final_rows = []
    print("🧬 Cloning and mutating seed data...")
    for row in seed_rows:
        # Generate 8 variations per real row
        variations = generate_variations(row['text'], num_variations=8)
        for var in variations:
            final_rows.append({'text': var, 'label': row['label']})
    print(f"   ↳ Expanded seed data to {len(final_rows)} rows.")

    # 3. Fill the rest with synthetic templates: calculate how many rows are
    #    needed to meet TOTAL_ROWS, then alternate positive/negative posts
    #    generated by my_generator
    remaining = TOTAL_ROWS - len(final_rows)
    if remaining > 0:
        print(f"🤖 Generating {remaining} TRICKY synthetic rows to fill dataset...")
        for _ in range(remaining // 2):
            final_rows.append({'text': add_noise(gen.build_positive()), 'label': 1})
            final_rows.append({'text': add_noise(gen.build_negative()), 'label': 0})
    else:
        print("🤖 Seed data expansion is sufficient. Skipping synthetic generation.")

    # 4. Shuffle the dataset, remove duplicate texts, and save to CSV
    df = pd.DataFrame(final_rows)
    final_df = df.sample(frac=1).reset_index(drop=True)
    final_df.drop_duplicates(subset=['text'], inplace=True)
    final_df.to_csv(CSV_PATH, index=False)
    print(f"✅ Success! Saved {len(final_df)} rows to {CSV_PATH}")
    print("   (Note: Dataset is optimized for quality over quantity)")
# executes the main function when the script is run
if __name__ == "__main__":
create_database()