Spaces:

rishadaz
/

amazon_retriever

Sleeping

github-actions[bot]

chore: sync app/ and src/ from GitHub

251d75e about 1 month ago

2.12 kB

	import re
	import json
	import nltk
	from nltk.corpus import stopwords

	# Fetch the stopword corpus only when it cannot be found locally, so that
	# importing this module does not invoke the NLTK downloader on every start-up.
	try:
	    nltk.data.find('corpora/stopwords')
	except LookupError:
	    nltk.download('stopwords', quiet=True)

	# English stopwords, held in a set for fast membership tests when filtering tokens
	STOPWORDS = set(stopwords.words('english'))

	# Tokenizer
	def simple_tokenize(text):
	    """Split text into lowercase alphanumeric tokens with stopwords removed.

	    The text is lowercased, hyphens are turned into spaces, and every other
	    character outside ``a-z``, ``0-9`` and whitespace is deleted. The result
	    is split on whitespace and tokens found in ``STOPWORDS`` are dropped.
	    Falsy input (``None``, ``""``) yields an empty list.
	    """
	    if text:
	        # Hyphens separate words; all other punctuation is simply removed.
	        spaced = re.sub(r"-", " ", text.lower())
	        cleaned = re.sub(r"[^a-z0-9\s]", "", spaced)
	        return [word for word in cleaned.split() if word not in STOPWORDS]
	    return []

	def extract_image(row):
	    """
	    Look up the first entry of the "large" image list in a dataset row.

	    The row's "images" value is expected to be either a dict such as
	    {"large": ["https://...", ...], ...} or a JSON-encoded string of the
	    same shape (adjust key names to match your dataset).

	    Returns the first element of the "large" list, or None when the column
	    is missing, is not valid JSON, is not a dict, or holds no non-empty
	    "large" list.
	    """
	    payload = row.get("images")

	    # The column may arrive JSON-encoded; decode it before inspecting it.
	    if isinstance(payload, str):
	        try:
	            payload = json.loads(payload)
	        except json.JSONDecodeError:
	            return None

	    # A missing column (None) and any other non-dict shape are both rejected here.
	    if not isinstance(payload, dict):
	        return None

	    candidates = payload.get("large")
	    if candidates and isinstance(candidates, list):
	        return candidates[0]
	    return None

	def decode_ratings(page_content):
	    """Extract up to three reviews from a page-content string.

	    A review is recognised as a fragment shaped like
	    ``[5.0★] Title — Review text``: a single-digit ``N.0`` star rating in
	    square brackets, a title, an em dash surrounded by whitespace, and the
	    review text running to the end of the line.

	    Only the first three rating markers in ``page_content`` are considered;
	    a marker that does not fit the full shape is skipped rather than
	    replaced by a later one.

	    Args:
	        page_content: Text to scan. ``None`` or an empty string is treated
	            as containing no reviews.

	    Returns:
	        A list of dicts with keys ``'rating'`` (float), ``'title'`` (str)
	        and ``'text'`` (str), in order of appearance. The list is empty
	        when no review is found.
	    """
	    if not page_content:
	        return []

	    # Each candidate runs from its "[N.0★]" marker to the end of that line
	    # ("." does not match a newline).
	    candidates = re.findall(r'\[\d\.0★\].*', page_content)

	    # The title is matched lazily so it stops at the first " — " separator;
	    # everything after that separator is the review text.
	    review_pattern = re.compile(r'\[(\d\.0)★\]\s(.*?)\s—\s(.*)')

	    parsed = []
	    for candidate in candidates[:3]:
	        match = review_pattern.match(candidate)
	        if match is None:
	            continue
	        rating, title, text = match.groups()
	        parsed.append({
	            'rating': float(rating),
	            'title': title.strip(),
	            'text': text.strip(),
	        })

	    return parsed