# amazon_retriever / src / utils.py
# github-actions[bot]
# chore: sync app/ and src/ from GitHub
# 251d75e
import re
import json
import nltk
from nltk.corpus import stopwords
# Build the set of English stopwords used to filter out common words.
# The corpus is loaded first and only downloaded when NLTK reports it as
# missing (LookupError), so importing this module does not contact the
# download server when the data is already installed locally.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))
# Tokenizer
def simple_tokenize(text):
    """Split text into lowercase tokens with punctuation and stopwords removed.

    Hyphens are turned into spaces, every character other than ``a-z``,
    ``0-9`` and whitespace is dropped, and the remainder is split on
    whitespace. Tokens found in the module-level ``STOPWORDS`` set are
    discarded. Falsy input (``None``, ``""``) yields an empty list.
    """
    if text:
        # Hyphenated words become separate tokens instead of being fused.
        normalized = text.lower().replace("-", " ")
        normalized = re.sub(r"[^a-z0-9\s]", "", normalized)
        return [word for word in normalized.split() if word not in STOPWORDS]
    return []
def extract_image(row):
    """
    Pick the first entry of the "large" image list stored in a dataset row.

    The row's "images" value may be either a dict such as
        {"large": ["https://...", ...], ...}
    or a JSON-encoded string that decodes to that shape.

    Returns None when the key is absent, the JSON cannot be decoded, the
    decoded value is not a dict, or "large" is not a non-empty list.
    """
    payload = row.get("images")

    # The column is sometimes serialized as a JSON string; decode it first.
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            return None

    # Covers a missing column (None) as well as any unexpected type.
    if not isinstance(payload, dict):
        return None

    candidates = payload.get("large")
    if not candidates or not isinstance(candidates, list):
        return None
    return candidates[0]
def decode_ratings(page_content):
    """Extract up to three review entries from a page-content string.

    Each review is expected on its own line in the form
    ``[<d>.0<star>] <title> <dash> <text>``, where ``<star>`` and ``<dash>``
    are the literal marker sequences embedded in the patterns below.

    Args:
        page_content: Text to scan. ``None`` or an empty string is accepted.

    Returns:
        A list of dicts with keys ``'rating'`` (float), ``'title'`` and
        ``'text'`` (both stripped). Only the first three rating lines are
        considered; lines lacking the title/text separator are skipped.
        An empty list is returned when nothing is found, so the return
        type is always a list.
    """
    if not page_content:
        return []

    # NOTE(review): the marker sequences in these patterns look like
    # mis-decoded UTF-8 for a star and an em dash - confirm against the
    # code that produces page_content before changing them.
    block_pattern = r'\[\d\.0β˜…\].*'
    entry_pattern = re.compile(r'\[(\d\.0)β˜…\]\s*(.*?)\s*β€”\s*(.*)')

    parsed = []
    for line in re.findall(block_pattern, page_content)[:3]:
        match = entry_pattern.match(line)
        if match:
            rating, title, text = match.groups()
            parsed.append({
                'rating': float(rating),
                'title': title.strip(),
                'text': text.strip(),
            })
    return parsed