Spaces:
Sleeping
Sleeping
File size: 2,124 Bytes
2bf862f 05b9d1b 2bf862f 251d75e 2bf862f 05b9d1b 251d75e 05b9d1b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | import re
import json
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK "stopwords" corpus is available locally. This call runs
# at import time; quiet=True suppresses the downloader's console output.
nltk.download('stopwords', quiet=True)
# English stopwords held in a set, so the membership test performed per token
# in simple_tokenize is a hash lookup rather than a list scan.
STOPWORDS = set(stopwords.words('english'))
# Tokenizer
def simple_tokenize(text):
    """Turn raw text into a list of lowercase, stopword-free tokens.

    Processing steps, in order: lowercase the text, turn hyphens into
    spaces (so hyphenated words split into their parts), delete every
    character that is not ``a-z``, ``0-9`` or whitespace, split on
    whitespace, and drop any token found in ``STOPWORDS``.

    Args:
        text: The string to tokenize. Falsy values (``None``, ``""``)
            produce an empty list.

    Returns:
        The remaining tokens, in their original order.
    """
    if not text:
        return []

    # Hyphens are replaced before the character filter runs; otherwise
    # "well-known" would collapse into the single token "wellknown".
    cleaned = re.sub(r"[^a-z0-9\s]", "", re.sub(r"-", " ", text.lower()))

    kept = []
    for word in cleaned.split():
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept
def extract_image(row):
    """Pick the first "large" image URL out of a dataset row.

    The ``"images"`` entry of ``row`` is expected to be a mapping such as
    ``{"large": ["https://...", ...], ...}``, or a JSON-encoded string of
    that same shape (adjust the key names to match your dataset).

    Args:
        row: A mapping-like object exposing ``.get``.

    Returns:
        The first element of the ``"large"`` list, or ``None`` when the
        column is missing, is not valid JSON, is not a dict, or has no
        non-empty ``"large"`` list.
    """
    raw = row.get("images")
    if raw is None:
        return None

    parsed = raw
    if isinstance(raw, str):
        # Some datasets serialize this column as a JSON string.
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            return None

    if not isinstance(parsed, dict):
        return None

    candidates = parsed.get("large")
    if not candidates or not isinstance(candidates, list) or len(candidates) == 0:
        return None
    return candidates[0]
def decode_ratings(page_content):
    """Extract up to three review ratings from a page-content string.

    A rating line has the form ``[N.0★] <title> — <text>``: a single-digit
    whole-star rating in brackets, then a title and a body separated by an
    em dash. Only the first three rating lines are considered; one of those
    that lacks the em dash separator is skipped rather than replaced by a
    later line.

    Args:
        page_content: Text to scan. ``None`` or an empty string yields no
            ratings.

    Returns:
        A list (possibly empty) of dicts with the keys ``'rating'``
        (float), ``'title'`` (str) and ``'text'`` (str).
    """
    if not page_content:
        return []

    # A rating block runs from the "[N.0★]" marker to the end of its line
    # ("." does not match a newline by default).
    blocks = re.findall(r'\[\d\.0★\].*', page_content)

    entry_pattern = re.compile(r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)')
    parsed = []
    for block in blocks[:3]:
        match = entry_pattern.match(block)
        if match is None:
            continue
        rating, title, text = match.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })

    # Always a list, including when nothing was found, so callers get one
    # consistent return type.
    return parsed