File size: 2,124 Bytes
2bf862f
05b9d1b
2bf862f
 
 
 
 
 
 
 
 
 
 
251d75e
2bf862f
 
 
 
 
 
 
05b9d1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251d75e
05b9d1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
import json
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus at import time so the lookup below can
# succeed; quiet=True keeps the downloader from printing progress output.
# NOTE(review): this may touch the network on first import -- confirm that is
# acceptable for every environment that imports this module.
nltk.download('stopwords', quiet=True)

# English stopwords held in a set so simple_tokenize() can do constant-time
# membership checks when filtering out common words.
STOPWORDS = set(stopwords.words('english'))

# Tokenizer
def simple_tokenize(text):
    """Split *text* into lowercase tokens with punctuation and stopwords removed.

    Hyphens become spaces first, so a hyphenated word yields separate tokens.
    Every remaining character other than ``a-z``, ``0-9`` and whitespace is
    dropped before splitting on whitespace. Falsy input (``None`` or ``""``)
    yields an empty list.
    """
    if not text:
        return []

    lowered = text.lower()
    # Replace hyphens before stripping punctuation; otherwise the two halves
    # of a hyphenated word would be glued together.
    dehyphenated = re.sub(r"-", " ", lowered)
    cleaned = re.sub(r"[^a-z0-9\s]", "", dehyphenated)

    return [word for word in cleaned.split() if word not in STOPWORDS]

def extract_image(row):
    """
    Return the first entry of the row's "large" image list, or None.

    The "images" value of *row* may be a dict such as
        {"large": ["https://...", ...], ...}
    or a JSON-encoded string of that same shape. None is returned when the
    value is missing, is not valid JSON, does not decode to a dict, or has
    no non-empty list under "large".
    """
    payload = row.get("images")

    # Some datasets store this column as JSON text; decode it before use.
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            return None

    # A missing value (None) or any other non-dict shape has no image.
    if not isinstance(payload, dict):
        return None

    candidates = payload.get("large")
    if isinstance(candidates, list) and candidates:
        return candidates[0]
    return None

def decode_ratings(page_content):
    """Extract up to three ratings from a page-content string.

    A rating entry is expected to look like ``[5.0★] Title — Review text``:
    a single-digit ``.0`` star rating in brackets, a title, an em dash, and
    the review text running to the end of the line.

    Args:
        page_content: Text to scan. ``None`` or an empty string yields no
            ratings.

    Returns:
        A list of dicts with keys ``rating`` (float), ``title`` (str) and
        ``text`` (str). The list is empty when no entry is found, so the
        return type is always a list.
    """
    if not page_content:
        return []

    block_pattern = r'\[\d\.0★\].*'
    entry_pattern = r'\[(\d\.0)★\]\s*(.*?)\s*—\s*(.*)'

    parsed = []
    # Only the first three candidate blocks are considered. A candidate that
    # lacks the em-dash separator is skipped, not replaced by a later one.
    for block in re.findall(block_pattern, page_content)[:3]:
        match = re.match(entry_pattern, block)
        if match is None:
            continue
        rating, title, text = match.groups()
        parsed.append({
            'rating': float(rating),
            'title': title.strip(),
            'text': text.strip(),
        })

    return parsed