"""Download, clean, label, and split Amazon review data for AspectBERT.

Pipeline:
  1. Stream `num_samples` reviews from McAuley-Lab/Amazon-Reviews-2023
     (raw_review_Electronics), or generate synthetic reviews with --test.
  2. Clean review text (strip HTML tags, decode HTML entities, strip URLs,
     normalize whitespace).
  3. Detect aspects per review via keyword matching (one review -> 0..N rows).
  4. Derive a sentiment label from the star rating.
  5. Save the full labeled set to data/amazon_reviews.jsonl and split it into
     train/val/test (70/15/15) jsonl files.
"""

import argparse
import html
import itertools
import json
import os
import random
import re
import sys
from collections import Counter

# Put this file's directory on sys.path so the sibling `constants` module is
# importable even when this file is loaded from a different working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from constants import detect_aspects, rating_to_label  # noqa: E402

HTML_TAG_RE = re.compile(r"<[^>]+>")
URL_RE = re.compile(r"http\S+|www\.\S+")
WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Return `text` with HTML tags and URLs removed and whitespace collapsed.

    HTML character references (e.g. ``&amp;``, ``&#34;``) are decoded to the
    characters they stand for. Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    text = HTML_TAG_RE.sub(" ", text)
    # Decode entities only after tag removal, so an escaped literal such as
    # "&lt;b&gt;" survives as visible text instead of being stripped as a tag.
    text = html.unescape(text)
    text = URL_RE.sub(" ", text)
    return WHITESPACE_RE.sub(" ", text).strip()


# --------------------------------------------------------------------------- #
# Synthetic data (for --test, no network access)
# --------------------------------------------------------------------------- #

# (review text, star rating) pairs sampled by generate_synthetic_reviews().
SYNTHETIC_TEMPLATES = [
    ("The battery life is amazing, lasts all day on a single charge!", 5),
    ("Battery drains within a few hours, very disappointing.", 1),
    ("Battery life is okay, nothing special but gets the job done.", 3),
    ("The display is gorgeous with vibrant colors and great brightness.", 5),
    ("Screen resolution is poor and hard to read in sunlight.", 2),
    ("The display is decent for the price.", 3),
    ("Camera takes stunning photos even in low light.", 5),
    ("Camera quality is blurry and pictures come out grainy.", 1),
    ("The camera is average, photos are fine for casual use.", 3),
    ("Great value for the money, very affordable for what you get.", 5),
    ("Way too expensive for the features it offers, overpriced.", 1),
    ("The price is reasonable, about what I expected.", 3),
    ("Performance is super fast and smooth, no lag at all.", 5),
    ("The processor is slow and the device freezes constantly.", 1),
    ("Performance is acceptable for everyday tasks.", 3),
    ("Beautiful sleek design, feels premium and well built.", 5),
    ("Build quality feels cheap and the material scratches easily.", 2),
    ("The design is fine, fairly compact and lightweight.", 3),
    ("The software is buggy and crashes constantly after the update.", 1),
    ("The app interface is intuitive and the firmware updates are smooth.", 5),
    ("Software works fine most of the time, occasional minor glitches.", 3),
    ("Customer service was unhelpful and slow to respond to my return.", 1),
    ("Customer support resolved my warranty issue quickly and politely.", 5),
    ("Customer service was okay, took a while but got a refund eventually.", 3),
    (
        "Battery life is excellent and the camera takes great photos, "
        "but the price feels a bit high.",
        4,
    ),
    (
        "Beautiful display and snappy performance, however customer service "
        "was rude when I asked for a refund.",
        2,
    ),
]


def generate_synthetic_reviews(n=200, seed=42):
    """Generate a small synthetic review set for pipeline verification.

    Args:
        n: Number of reviews to generate.
        seed: Seed for the private RNG, so output is reproducible.

    Returns:
        A list of `n` dicts with "title", "text" and "rating" keys, each
        drawn (with replacement) from SYNTHETIC_TEMPLATES.
    """
    rng = random.Random(seed)
    reviews = []
    for _ in range(n):
        text, rating = rng.choice(SYNTHETIC_TEMPLATES)
        reviews.append(
            {"title": "Great product overview", "text": text, "rating": rating}
        )
    return reviews


# --------------------------------------------------------------------------- #
# Real download via HuggingFace datasets
# --------------------------------------------------------------------------- #

def load_amazon_reviews(num_samples, seed=42):
    """Stream `num_samples` Amazon Electronics reviews from the HF Hub.

    Args:
        num_samples: Maximum number of reviews to collect from the stream.
        seed: Seed for the streaming shuffle buffer.

    Returns:
        A list of the raw examples yielded by the dataset (at most
        `num_samples` of them).
    """
    # Imported lazily so that --test runs do not need `datasets` installed.
    from datasets import load_dataset

    print("Loading McAuley-Lab/Amazon-Reviews-2023 (raw_review_Electronics) "
          "in streaming mode...")
    # NOTE(review): trust_remote_code=True lets the dataset repository run its
    # own loading script locally; confirm the installed `datasets` version
    # still needs and accepts this flag.
    ds = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        "raw_review_Electronics",
        streaming=True,
        trust_remote_code=True,
    )
    # Streaming shuffle is approximate: it only mixes within a 10k-item buffer.
    split = ds["full"].shuffle(seed=seed, buffer_size=10_000)

    reviews = []
    for example in itertools.islice(split, num_samples):
        reviews.append(example)
        if len(reviews) % 5000 == 0:
            print(f" ...collected {len(reviews)}/{num_samples} reviews")
    return reviews


# --------------------------------------------------------------------------- #
# Labeling and splitting
# --------------------------------------------------------------------------- #

def build_rows(raw_reviews):
    """Clean, aspect-label, and sentiment-label raw review dicts.

    Args:
        raw_reviews: Iterable of dicts read via the "title", "text" and
            "rating" keys (any of which may be missing or ``None``).

    Returns:
        A list of dicts with "text", "aspect", "rating" and "label" keys, one
        per (review, detected aspect) pair. Reviews with no usable text, no
        rating, or no detected aspect contribute no rows.
    """
    rows = []
    for review in raw_reviews:
        # Clean title and body separately and join only the non-empty parts.
        # Formatting "{title}. {body}" first would turn an empty review into
        # "." (defeating the emptiness check below) and would prefix
        # title-less reviews with a stray ". ".
        parts = (clean_text(review.get("title")), clean_text(review.get("text")))
        text = ". ".join(part for part in parts if part)
        rating = review.get("rating")
        if not text or rating is None:
            continue

        aspects = detect_aspects(text)
        if not aspects:
            continue

        label = rating_to_label(rating)
        for aspect in aspects:
            rows.append({
                "text": text,
                "aspect": aspect,
                "rating": rating,
                "label": label,
            })
    return rows


def split_data(rows, seed=42, ratios=(0.70, 0.15, 0.15)):
    """Shuffle `rows` deterministically and cut them into train/val/test.

    Args:
        rows: Sequence of rows; it is copied, not shuffled in place.
        seed: Seed for the private RNG used to shuffle.
        ratios: (train, val, test) fractions. Only the first two are used to
            size splits; the test split receives whatever remains, so rounding
            never drops a row.

    Returns:
        A ``(train, val, test)`` tuple of lists.

    NOTE(review): rows are shuffled individually, so rows that build_rows()
    derived from the same review (one per aspect) can end up in different
    splits.
    """
    rng = random.Random(seed)
    shuffled = list(rows)
    rng.shuffle(shuffled)

    n = len(shuffled)
    n_train = int(n * ratios[0])
    n_val = int(n * ratios[1])

    train = shuffled[:n_train]
    val = shuffled[n_train:n_train + n_val]
    test = shuffled[n_train + n_val:]
    return train, val, test


def save_jsonl(rows, path):
    """Write `rows` to `path` as UTF-8 JSON Lines, creating parent dirs."""
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )


def print_stats(rows, name):
    """Print the row count and per-aspect / per-label counts for `rows`."""
    aspect_counts = Counter(r["aspect"] for r in rows)
    label_counts = Counter(r["label"] for r in rows)
    print(f"\n{name}: {len(rows)} rows")
    print(f" by aspect: {dict(aspect_counts)}")
    print(f" by label: {dict(label_counts)}")


# --------------------------------------------------------------------------- #
# Main
# --------------------------------------------------------------------------- #

def parse_args():
    """Parse command-line options for the data preparation script."""
    parser = argparse.ArgumentParser(description="Prepare AspectBERT training data.")
    parser.add_argument("--test", action="store_true",
                        help="Use synthetic data instead of downloading (pipeline check).")
    parser.add_argument("--num-samples", type=int, default=25_000,
                        help="Number of reviews to sample from the dataset.")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--output", default="data/amazon_reviews.jsonl",
                        help="Path for the full labeled dataset (jsonl).")
    parser.add_argument("--data-dir", default="data",
                        help="Directory to write train/val/test splits into.")
    return parser.parse_args()


def main():
    """Run the pipeline: load reviews, label them, save full set and splits."""
    args = parse_args()

    if args.test:
        print("Running in --test mode: generating synthetic reviews (no download).")
        # The synthetic set has a fixed size; --num-samples applies only to
        # the real download.
        raw_reviews = generate_synthetic_reviews(n=300, seed=args.seed)
    else:
        raw_reviews = load_amazon_reviews(args.num_samples, seed=args.seed)
    print(f"\nFetched {len(raw_reviews)} raw reviews.")

    rows = build_rows(raw_reviews)
    print_stats(rows, "Full labeled dataset")

    save_jsonl(rows, args.output)
    print(f"\nSaved full dataset to {args.output}")

    train, val, test = split_data(rows, seed=args.seed)
    save_jsonl(train, os.path.join(args.data_dir, "train.jsonl"))
    save_jsonl(val, os.path.join(args.data_dir, "val.jsonl"))
    save_jsonl(test, os.path.join(args.data_dir, "test.jsonl"))

    print_stats(train, "Train split")
    print_stats(val, "Validation split")
    print_stats(test, "Test split")


if __name__ == "__main__":
    main()