"""
Normalize, clean, and deduplicate bookmark items.
"""
import re
from openmark import config
def clean_title(title: str) -> str:
    """Decode common HTML entities in *title*, trim it, and cap it at 300 chars.

    Returns "" for a falsy (None/empty) title.
    """
    if not title:
        return ""
    # Decode the entities bookmark sources commonly leave in titles.
    # "&amp;" is decoded LAST so a double-escaped sequence like "&amp;lt;"
    # decodes exactly once (to "&lt;", not all the way to "<").
    for entity, char in (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&#39;", "'"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    ):
        title = title.replace(entity, char)
    # Strip leading/trailing whitespace and truncate to a bounded length
    return title.strip()[:300]
def fix_category(cat: str | None) -> str:
    """Resolve *cat* to a known category, defaulting to "News & Articles".

    Unknown or empty categories fall back to the default; known aliases are
    remapped via config.CATEGORY_MAP first.
    """
    if not cat:
        return "News & Articles"
    # Remap known aliases, then validate against the canonical category set
    mapped = config.CATEGORY_MAP.get(cat, cat)
    return mapped if mapped in config.CATEGORIES else "News & Articles"
def build_document_text(item: dict) -> str:
    """
    Assemble one rich text string per item for embedding.

    Joins title, category, tags, a content/excerpt snippet (first 200 chars),
    channel, and author with " | " for better semantic matching.
    """
    pieces: list[str] = []
    for key in ("title", "category"):
        value = item.get(key)
        if value:
            pieces.append(value)
    if item.get("tags"):
        pieces.append(" ".join(item["tags"]))
    # Prefer full content; fall back to the excerpt when content is absent/empty
    snippet = item.get("content") or item.get("excerpt")
    if snippet:
        pieces.append(snippet[:200])
    for key in ("channel", "author"):
        value = item.get(key)
        if value:
            pieces.append(value)
    return " | ".join(piece for piece in pieces if piece)
def normalize_item(item: dict) -> dict:
    """Clean and normalize a single bookmark item.

    Returns a new dict with a cleaned title, a validated category, up to five
    lowercase non-empty tags, a numeric score (default 5), any optional text
    fields truncated to 300 chars, and a "doc_text" string for embedding.
    """
    # `or ""` tolerates an explicit None url, not just a missing key
    url = (item.get("url") or "").strip()
    title = clean_title(item.get("title", ""))
    cat = fix_category(item.get("category"))
    # Trim BEFORE the truthiness check so whitespace-only tags are dropped
    # instead of being kept as "", then cap at five tags.
    tags = [t.strip().lower() for t in item.get("tags", []) if t and t.strip()][:5]
    score = item.get("score", 5)
    # bool is a subclass of int, so exclude it explicitly from valid scores
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        score = 5
    normalized = {
        "url": url,
        "title": title,
        "category": cat,
        "tags": tags,
        "score": score,
        "source": item.get("source", "unknown"),
        "folder": item.get("folder", ""),
    }
    # Preserve optional free-text fields, truncated to keep storage bounded
    for field in ("content", "excerpt", "author", "channel", "description"):
        if item.get(field):
            normalized[field] = item[field][:300]
    # Build the combined document text used for embedding
    normalized["doc_text"] = build_document_text(normalized)
    return normalized
def dedupe(items: list[dict]) -> list[dict]:
    """Remove duplicates by URL (case-insensitive, trailing slash stripped).

    Items with an empty URL are dropped; the first occurrence of each URL wins.
    """
    seen_urls: set[str] = set()
    result: list[dict] = []
    for entry in items:
        # Normalize for comparison only — the stored item keeps its original URL
        key = entry.get("url", "").rstrip("/").lower()
        if key and key not in seen_urls:
            seen_urls.add(key)
            result.append(entry)
    return result