Spaces:

codingwithadi
/

OpenMark

Running

App Files Files Community

OpenMark / openmark /pipeline /normalize.py

codingwithadi

Upload folder using huggingface_hub

81598c5 verified 1 day ago

raw

history blame contribute delete

2.94 kB

	"""
	Normalize, clean, and deduplicate bookmark items.
	"""

	import re
	from openmark import config


	def clean_title(title: str) -> str:
	    """
	    Decode HTML entities in a bookmark title, trim it, and cap its length.

	    Args:
	        title: Raw title text; may be empty or None.

	    Returns:
	        The decoded title with surrounding whitespace removed, truncated
	        to at most 300 characters. Falsy input yields "".
	    """
	    # Imported locally because the module-level imports do not include html.
	    import html

	    if not title:
	        return ""
	    # html.unescape decodes named and numeric character references in a
	    # single pass, so "&amp;lt;" becomes "&lt;" rather than being decoded twice.
	    title = html.unescape(title)
	    # Trim first so the 300-character budget is not spent on padding.
	    return title.strip()[:300]


	def fix_category(cat: str \| None) -> str:
	if not cat:
	return "News & Articles"
	# Apply known remapping
	cat = config.CATEGORY_MAP.get(cat, cat)
	# If still unknown, fallback
	if cat not in config.CATEGORIES:
	return "News & Articles"
	return cat


	def build_document_text(item: dict) -> str:
	    """
	    Build a single rich text string for embedding.

	    Joins, in order: title, category, space-separated tags, the first 200
	    characters of content (or of excerpt when content is missing or empty),
	    channel, and author. Missing or empty fields are skipped.

	    Args:
	        item: Bookmark dict; every key read here is optional.

	    Returns:
	        The non-empty parts joined with " | ", or "" when none are present.
	    """
	    # Prefer content; fall back to excerpt. Either way only 200 chars are used.
	    body = item.get("content") or item.get("excerpt") or ""
	    tags = item.get("tags") or []
	    parts = [
	        item.get("title"),
	        item.get("category"),
	        " ".join(tags),
	        body[:200],
	        item.get("channel"),
	        item.get("author"),
	    ]
	    # Plain " | " separator: a backslash-escaped pipe is not a valid escape
	    # sequence and would put a literal backslash into the embedded text.
	    return " | ".join(p for p in parts if p)


	def normalize_item(item: dict) -> dict:
	    """
	    Clean and normalize a single bookmark item.

	    Args:
	        item: Raw bookmark dict. Keys read: url, title, category, tags,
	            score, source, folder, plus the optional text fields content,
	            excerpt, author, channel, and description.

	    Returns:
	        A new dict with url, title, category, tags, score, source, folder,
	        any non-empty optional text fields (each cut to 300 characters),
	        and a "doc_text" entry built from the normalized values.
	    """
	    # `or` (rather than a .get default) also covers keys that are present
	    # but explicitly None, which would otherwise crash on .strip()/iteration.
	    url = (item.get("url") or "").strip()
	    title = clean_title(item.get("title", ""))
	    cat = fix_category(item.get("category"))
	    # Lowercase and trim tags, skip falsy entries, keep at most five.
	    tags = [t.lower().strip() for t in (item.get("tags") or []) if t][:5]
	    score = item.get("score", 5)
	    if not isinstance(score, (int, float)):
	        score = 5

	    normalized = {
	        "url": url,
	        "title": title,
	        "category": cat,
	        "tags": tags,
	        "score": score,
	        "source": item.get("source", "unknown"),
	        "folder": item.get("folder", ""),
	    }

	    # Preserve optional fields, capped at 300 characters each.
	    for field in ("content", "excerpt", "author", "channel", "description"):
	        if item.get(field):
	            normalized[field] = item[field][:300]

	    # Build the document text for embedding from the cleaned values.
	    normalized["doc_text"] = build_document_text(normalized)

	    return normalized


	def dedupe(items: list[dict]) -> list[dict]:
	    """
	    Remove duplicates by URL (case-insensitive, trailing slashes stripped).

	    Args:
	        items: Bookmark dicts; each is matched on its "url" value.

	    Returns:
	        A new list holding the first item seen for each distinct URL key,
	        in input order. Items whose URL is missing, None, or empty are
	        dropped.
	    """
	    seen = set()
	    unique = []
	    for item in items:
	        # `or ""` treats an explicit None URL like a missing one instead of
	        # raising AttributeError on .rstrip().
	        key = (item.get("url") or "").rstrip("/").lower()
	        if not key or key in seen:
	            continue
	        seen.add(key)
	        unique.append(item)
	    return unique