Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| import nltk | |
| from nltk.corpus import stopwords | |
# Fetch the NLTK "stopwords" corpus when this module is imported so that the
# lookup below can succeed; quiet=True suppresses the downloader's console
# output.
nltk.download('stopwords', quiet=True)
# English stopwords held in a set so the per-token membership test in the
# tokenizer is a constant-time lookup.
STOPWORDS = set(stopwords.words('english'))
| # Tokenizer | |
def simple_tokenize(text):
    """Turn *text* into a list of lowercase tokens with stopwords removed.

    Processing steps, in order:
      1. lowercase the input;
      2. replace hyphens with spaces so hyphenated words split apart;
      3. drop every character that is not ``a-z``, ``0-9`` or whitespace;
      4. split on whitespace and discard tokens found in ``STOPWORDS``.

    A falsy input (``None``, ``""``) yields an empty list.
    """
    if not text:
        return []

    normalized = re.sub(r"-", " ", text.lower())
    normalized = re.sub(r"[^a-z0-9\s]", "", normalized)

    return [word for word in normalized.split() if word not in STOPWORDS]
def extract_image(row):
    """Pick the first "large" image entry out of a dataset row.

    The ``"images"`` value of *row* may be either a dict such as
    ``{"large": ["https://...", ...], ...}`` or a JSON-encoded string of the
    same shape (adjust the key names if your dataset differs).

    Returns the first element of the ``"large"`` list, or ``None`` when the
    column is missing, is not valid JSON, does not decode to a dict, or has
    no non-empty ``"large"`` list.
    """
    payload = row.get("images")

    # Some datasets store this column as a JSON string; decode it first.
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            return None

    # Covers a missing column (None) as well as any other non-dict shape.
    if not isinstance(payload, dict):
        return None

    candidates = payload.get("large")
    if isinstance(candidates, list) and candidates:
        return candidates[0]
    return None
def decode_ratings(page_content):
    """Extract up to three ratings from a page-content string.

    A rating entry starts at a bracketed marker (a digit followed by ``.0``
    and the marker characters in the patterns below) and runs to the end of
    that line. Within the entry, the title and the review text are split on
    the separator character embedded in the detailed pattern.

    Args:
        page_content: Text to scan. ``None`` or an empty string yields no
            ratings.

    Returns:
        A list of at most three dicts with keys ``'rating'`` (float),
        ``'title'`` and ``'text'`` (both whitespace-stripped). The list is
        empty when nothing matches, so the return type is always a list.
    """
    if not page_content:
        return []

    # NOTE(review): the "β " marker and "β" separator in these patterns look
    # like mis-encoded symbols (presumably a star and a dash). They are kept
    # exactly as found; confirm against the real page_content format.
    block_pattern = r'\[\d\.0β \].*'
    entry_pattern = r'\[(\d\.0)β \]\s*(.*?)\s*β\s*(.*)'

    parsed = []
    # Only the first three rating lines are considered. A line that fails the
    # detailed pattern is skipped, not replaced by a later line.
    for line in re.findall(block_pattern, page_content)[:3]:
        match = re.match(entry_pattern, line)
        if match:
            rating, title, text = match.groups()
            parsed.append({
                'rating': float(rating),
                'title': title.strip(),
                'text': text.strip(),
            })
    return parsed