import json
import re

import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords', quiet=True)

# Define a set of English stopwords for filtering out common words
STOPWORDS = set(stopwords.words('english'))

# Patterns are compiled once at import time so the helpers below do not
# have to look them up on every call.
# Anything that is not a lowercase ASCII letter, a digit, or whitespace.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
# A rating line: "[<digit>.0★]" followed by the rest of that line.
_RATING_LINE_RE = re.compile(r'\[\d\.0★\].*')
# Splits a rating line into (rating, title, text); title and text are
# separated by an em dash.
_RATING_FIELDS_RE = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')


# Tokenizer
def simple_tokenize(text):
    """Lowercase *text*, strip punctuation, and drop English stopwords.

    Hyphens are replaced by spaces so hyphenated words split into their
    parts. Every character other than ``a-z``, ``0-9`` and whitespace is
    then removed (this includes non-ASCII letters), and the result is
    split on whitespace.

    Args:
        text: The string to tokenize. Falsy values (``None``, ``""``)
            are accepted.

    Returns:
        A list of tokens that are not in ``STOPWORDS``; an empty list
        when *text* is falsy.
    """
    if not text:
        return []

    cleaned = text.lower().replace("-", " ")
    cleaned = _NON_ALNUM_RE.sub("", cleaned)
    return [token for token in cleaned.split() if token not in STOPWORDS]


def extract_image(row):
    """Return the first large image URL from the HF dataset row, or None.

    Expected structure (adjust key names to match your dataset):
        row["images"] = {"large": ["https://...", ...], ...}
    or a JSON-encoded string of the same shape.

    Args:
        row: A mapping-like object exposing ``.get``.

    Returns:
        The first element of the ``"large"`` list, or ``None`` when the
        ``"images"`` entry is missing, is not valid JSON, is not a dict,
        or has no non-empty ``"large"`` list.
    """
    images = row.get("images")
    if images is None:
        return None

    # Some datasets store this column as a JSON string
    if isinstance(images, str):
        try:
            images = json.loads(images)
        except json.JSONDecodeError:
            return None

    if not isinstance(images, dict):
        return None

    large = images.get("large")
    # A non-empty list is truthy, so this covers both the type check and
    # the emptiness check.
    if isinstance(large, list) and large:
        return large[0]
    return None


def decode_ratings(page_content):
    """Extract up to 3 ratings from the page content string.

    A rating is a line fragment of the form ``[N.0★] title — text``,
    where ``N`` is a single digit. Only the first three such fragments
    are considered; a fragment without the em-dash separator is skipped
    but still counts toward that limit.

    Args:
        page_content: The text to scan. Falsy values (``None``, ``""``)
            are accepted.

    Returns:
        A list of dicts with keys ``'rating'`` (float), ``'title'`` and
        ``'text'`` (both stripped strings). The list is empty when no
        rating is found, so the return type is always a list.
    """
    if not page_content:
        return []

    parsed = []
    for line in _RATING_LINE_RE.findall(page_content)[:3]:
        fields = _RATING_FIELDS_RE.match(line)
        if fields is None:
            continue
        rating, title, text = fields.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })
    return parsed