Spaces:
Sleeping
Sleeping
| """Download, clean, label, and split Amazon review data for AspectBERT. | |
| Pipeline: | |
| 1. Stream `num_samples` reviews from McAuley-Lab/Amazon-Reviews-2023 | |
| (raw_review_Electronics), or generate synthetic reviews with --test. | |
| 2. Clean review text (strip HTML/URLs, normalize whitespace). | |
| 3. Detect aspects per review via keyword matching (one review -> 0..N rows). | |
| 4. Derive a sentiment label from the star rating. | |
| 5. Save the full labeled set to data/amazon_reviews.jsonl and split it | |
| into train/val/test (70/15/15) jsonl files. | |
| """ | |
| import argparse | |
| import itertools | |
| import json | |
| import os | |
| import random | |
| import re | |
| import sys | |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) | |
| from constants import detect_aspects, rating_to_label # noqa: E402 | |
# Anything that looks like an HTML/XML tag: "<" ... ">" with no ">" inside.
HTML_TAG_RE = re.compile(r"<[^>]+>")
# A run of non-space characters starting with "http" or "www.".
URL_RE = re.compile(r"http\S+|www\.\S+")
# Any run of whitespace (spaces, tabs, newlines).
WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Return `text` with HTML tags and URLs removed and whitespace normalized.

    Tags, URLs, and whitespace runs are each replaced by a single space, in
    that order, and the result is stripped of leading/trailing whitespace.
    Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    # Order matters: whitespace is collapsed last so the gaps left behind by
    # removed tags and URLs are merged into single spaces.
    for pattern in (HTML_TAG_RE, URL_RE, WHITESPACE_RE):
        text = pattern.sub(" ", text)
    return text.strip()
| # --------------------------------------------------------------------------- # | |
| # Synthetic data (for --test, no network access) | |
| # --------------------------------------------------------------------------- # | |
# Each entry is a (review text, star rating) pair; generate_synthetic_reviews
# samples from this list with replacement. The sentences are grouped by the
# topic they mention (battery, display, camera, price, performance, design,
# software, customer service); the last two entries mention several topics in
# one review.
# NOTE(review): presumably the wording is chosen to hit the keyword lists used
# by constants.detect_aspects -- confirm against that module.
SYNTHETIC_TEMPLATES = [
    ("The battery life is amazing, lasts all day on a single charge!", 5),
    ("Battery drains within a few hours, very disappointing.", 1),
    ("Battery life is okay, nothing special but gets the job done.", 3),
    ("The display is gorgeous with vibrant colors and great brightness.", 5),
    ("Screen resolution is poor and hard to read in sunlight.", 2),
    ("The display is decent for the price.", 3),
    ("Camera takes stunning photos even in low light.", 5),
    ("Camera quality is blurry and pictures come out grainy.", 1),
    ("The camera is average, photos are fine for casual use.", 3),
    ("Great value for the money, very affordable for what you get.", 5),
    ("Way too expensive for the features it offers, overpriced.", 1),
    ("The price is reasonable, about what I expected.", 3),
    ("Performance is super fast and smooth, no lag at all.", 5),
    ("The processor is slow and the device freezes constantly.", 1),
    ("Performance is acceptable for everyday tasks.", 3),
    ("Beautiful sleek design, feels premium and well built.", 5),
    ("Build quality feels cheap and the material scratches easily.", 2),
    ("The design is fine, fairly compact and lightweight.", 3),
    ("The software is buggy and crashes constantly after the update.", 1),
    ("The app interface is intuitive and the firmware updates are smooth.", 5),
    ("Software works fine most of the time, occasional minor glitches.", 3),
    ("Customer service was unhelpful and slow to respond to my return.", 1),
    ("Customer support resolved my warranty issue quickly and politely.", 5),
    ("Customer service was okay, took a while but got a refund eventually.", 3),
    # Multi-topic reviews: a single text that mentions several topics at once.
    (
        "Battery life is excellent and the camera takes great photos, "
        "but the price feels a bit high.",
        4,
    ),
    (
        "Beautiful display and snappy performance, however customer service "
        "was rude when I asked for a refund.",
        2,
    ),
]
def generate_synthetic_reviews(n=200, seed=42):
    """Build `n` synthetic review dicts for an offline pipeline check.

    Each review is drawn (with replacement) from SYNTHETIC_TEMPLATES using a
    private ``random.Random(seed)``, so the output is reproducible for a given
    seed. Every review carries the same fixed title.

    Returns:
        A list of dicts with keys ``title``, ``text``, and ``rating``.
    """
    rng = random.Random(seed)
    # Lazily draw one template per requested review; the comprehension below
    # consumes the draws in order, so the RNG sequence is one choice per row.
    picks = (rng.choice(SYNTHETIC_TEMPLATES) for _ in range(n))
    return [
        {"title": "Great product overview", "text": body, "rating": stars}
        for body, stars in picks
    ]
| # --------------------------------------------------------------------------- # | |
| # Real download via HuggingFace datasets | |
| # --------------------------------------------------------------------------- # | |
def load_amazon_reviews(num_samples, seed=42):
    """Collect up to `num_samples` Amazon Electronics reviews from the HF Hub.

    The dataset is opened in streaming mode, its ``full`` split is shuffled
    with a 10,000-example buffer seeded by `seed`, and the first
    `num_samples` examples of that shuffled stream are materialized.

    Returns:
        A list of the raw example objects yielded by the stream.
    """
    # Imported lazily so --test mode works without `datasets` installed.
    from datasets import load_dataset

    message = (
        "Loading McAuley-Lab/Amazon-Reviews-2023 (raw_review_Electronics) "
        "in streaming mode..."
    )
    print(message)

    # NOTE(review): trust_remote_code=True permits code shipped with the
    # dataset repository to run locally -- confirm this is still required and
    # accepted by the installed `datasets` version.
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        "raw_review_Electronics",
        streaming=True,
        trust_remote_code=True,
    )
    stream = dataset["full"].shuffle(seed=seed, buffer_size=10_000)

    collected = []
    limited = itertools.islice(stream, num_samples)
    for count, example in enumerate(limited, start=1):
        collected.append(example)
        # Progress line every 5000 reviews.
        if count % 5000 == 0:
            print(f" ...collected {count}/{num_samples} reviews")
    return collected
| # --------------------------------------------------------------------------- # | |
| # Labeling and splitting | |
| # --------------------------------------------------------------------------- # | |
def build_rows(raw_reviews):
    """Clean, aspect-label, and sentiment-label raw review dicts.

    For each review the title and body are cleaned with ``clean_text`` and
    joined as ``"<title>. <body>"``. Empty parts are left out of the join, so
    a review without a title yields just the body (no leading ". ") and a
    review with neither title nor body is skipped.

    Args:
        raw_reviews: Iterable of mappings providing ``title``, ``text``, and
            ``rating`` via ``.get``.

    Returns:
        A list of dicts with keys ``text``, ``aspect``, ``rating``, and
        ``label``: one row per aspect returned by ``detect_aspects`` for the
        review. Reviews with no rating, no text, or no detected aspect
        contribute no rows.
    """
    rows = []
    for review in raw_reviews:
        rating = review.get("rating")
        if rating is None:
            continue

        # Clean each part on its own and join only the non-empty ones.
        # Formatting first and cleaning afterwards always leaves at least "."
        # in the result, which made the empty-text check unreachable and
        # prefixed title-less reviews with a stray ". ".
        raw_parts = (review.get("title") or "", review.get("text") or "")
        cleaned_parts = [clean_text(str(part)) for part in raw_parts]
        text = ". ".join(part for part in cleaned_parts if part)
        if not text:
            continue

        aspects = detect_aspects(text)
        if not aspects:
            continue

        # The label comes from the star rating, so every aspect row of one
        # review shares the same label.
        label = rating_to_label(rating)
        rows.extend(
            {"text": text, "aspect": aspect, "rating": rating, "label": label}
            for aspect in aspects
        )
    return rows
def split_data(rows, seed=42, ratios=(0.70, 0.15, 0.15)):
    """Shuffle a copy of `rows` and cut it into train/val/test lists.

    The train and validation sizes are ``int(len(rows) * ratio)`` for the
    first two ratios; the test split receives everything left over, so the
    third ratio is not read. The input list is not modified.

    NOTE(review): the split is per row, so rows that share the same review
    text can land in different splits -- confirm this is acceptable.

    Returns:
        A ``(train, val, test)`` tuple of lists.
    """
    pool = rows[:]
    random.Random(seed).shuffle(pool)

    total = len(pool)
    train_end = int(total * ratios[0])
    val_end = train_end + int(total * ratios[1])
    return pool[:train_end], pool[train_end:val_end], pool[val_end:]
def save_jsonl(rows, path):
    """Write `rows` to `path` as JSON Lines (one JSON document per line).

    The parent directory is created if needed. The file is overwritten,
    encoded as UTF-8, and non-ASCII characters are written unescaped.
    """
    # dirname is "" for a bare filename; fall back to the current directory.
    parent = os.path.dirname(path) or "."
    os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )
def print_stats(rows, name):
    """Print the row count plus per-aspect and per-label tallies for `rows`.

    Args:
        rows: Iterable of dicts that each have ``aspect`` and ``label`` keys.
        name: Heading printed before the counts.
    """
    from collections import Counter

    per_aspect = Counter()
    per_label = Counter()
    for row in rows:
        per_aspect[row["aspect"]] += 1
        per_label[row["label"]] += 1

    # One print call with a newline separator emits the same three lines.
    print(
        f"\n{name}: {len(rows)} rows",
        f" by aspect: {dict(per_aspect)}",
        f" by label: {dict(per_label)}",
        sep="\n",
    )
| # --------------------------------------------------------------------------- # | |
| # Main | |
| # --------------------------------------------------------------------------- # | |
def parse_args():
    """Parse the command-line options for the data-preparation script.

    Returns:
        An ``argparse.Namespace`` with ``test``, ``num_samples``, ``seed``,
        ``output``, and ``data_dir`` attributes.
    """
    cli = argparse.ArgumentParser(description="Prepare AspectBERT training data.")
    # (flag, add_argument keyword settings), registered in this order so the
    # generated --help output keeps the same layout.
    option_specs = (
        ("--test", {
            "action": "store_true",
            "help": "Use synthetic data instead of downloading (pipeline check).",
        }),
        ("--num-samples", {
            "type": int,
            "default": 25_000,
            "help": "Number of reviews to sample from the dataset.",
        }),
        ("--seed", {"type": int, "default": 42}),
        ("--output", {
            "default": "data/amazon_reviews.jsonl",
            "help": "Path for the full labeled dataset (jsonl).",
        }),
        ("--data-dir", {
            "default": "data",
            "help": "Directory to write train/val/test splits into.",
        }),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def main():
    """Run the pipeline: load reviews, label them, save, split, and report.

    With ``--test`` 300 synthetic reviews are generated; otherwise
    ``--num-samples`` reviews are streamed from the Hub. The labeled rows are
    written to ``--output`` and the three splits to ``train.jsonl``,
    ``val.jsonl``, and ``test.jsonl`` inside ``--data-dir``.
    """
    opts = parse_args()

    if not opts.test:
        reviews = load_amazon_reviews(opts.num_samples, seed=opts.seed)
    else:
        print("Running in --test mode: generating synthetic reviews (no download).")
        reviews = generate_synthetic_reviews(n=300, seed=opts.seed)
    print(f"\nFetched {len(reviews)} raw reviews.")

    labeled = build_rows(reviews)
    print_stats(labeled, "Full labeled dataset")
    save_jsonl(labeled, opts.output)
    print(f"\nSaved full dataset to {opts.output}")

    parts = split_data(labeled, seed=opts.seed)
    filenames = ("train.jsonl", "val.jsonl", "test.jsonl")
    headings = ("Train split", "Validation split", "Test split")

    # Write all three files first, then print the three summaries.
    for part, filename in zip(parts, filenames):
        save_jsonl(part, os.path.join(opts.data_dir, filename))
    for part, heading in zip(parts, headings):
        print_stats(part, heading)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()