Spaces:
Running
Running
| """ | |
| Normalize, clean, and deduplicate bookmark items. | |
| """ | |
"""
import html
import re

from openmark import config
def clean_title(title: str) -> str:
    """
    Normalize a bookmark title for storage.

    Decodes HTML entities (&amp;amp;, &amp;lt;, &amp;gt;, &amp;#39;, &amp;quot;, ...),
    strips surrounding whitespace, and truncates to 300 characters.

    Returns "" for empty or falsy input.
    """
    if not title:
        return ""
    # Decode entities in one correct pass. (The previous chained re.sub
    # calls had been reduced to no-ops and would double-decode nested
    # entities like "&amp;lt;" anyway; html.unescape handles all of them.)
    title = html.unescape(title)
    # Strip leading/trailing whitespace and cap overly long titles.
    return title.strip()[:300]
def fix_category(cat: str | None) -> str:
    """
    Resolve a raw category string to a known category name.

    Missing/empty input and categories that remain unknown after the
    remapping both fall back to "News & Articles".
    """
    if not cat:
        return "News & Articles"
    # Translate legacy/alias names via the configured remapping table.
    remapped = config.CATEGORY_MAP.get(cat, cat)
    return remapped if remapped in config.CATEGORIES else "News & Articles"
def build_document_text(item: dict) -> str:
    """
    Build a single rich text string for embedding.

    Concatenates title, category, tags, a content/excerpt snippet, channel,
    and author (in that order) with " | " separators, skipping empty parts.
    """
    pieces: list[str] = []
    for key in ("title", "category"):
        value = item.get(key)
        if value:
            pieces.append(value)
    tags = item.get("tags")
    if tags:
        pieces.append(" ".join(tags))
    # Prefer full content over the excerpt; either way keep only a snippet.
    body = item.get("content") or item.get("excerpt")
    if body:
        pieces.append(body[:200])
    for key in ("channel", "author"):
        value = item.get(key)
        if value:
            pieces.append(value)
    return " | ".join(piece for piece in pieces if piece)
def normalize_item(item: dict) -> dict:
    """
    Clean and normalize a single bookmark item.

    Produces a dict with url/title/category/tags/score/source/folder keys,
    carries over optional text fields (truncated to 300 chars), and attaches
    a "doc_text" field built for embedding.
    """
    raw_score = item.get("score", 5)
    # Non-numeric scores are replaced with the default of 5.
    if not isinstance(raw_score, (int, float)):
        raw_score = 5

    cleaned = {
        "url": item.get("url", "").strip(),
        "title": clean_title(item.get("title", "")),
        "category": fix_category(item.get("category")),
        # Lowercased, trimmed tags; keep at most the first five.
        "tags": [tag.lower().strip() for tag in item.get("tags", []) if tag][:5],
        "score": raw_score,
        "source": item.get("source", "unknown"),
        "folder": item.get("folder", ""),
    }

    # Carry over optional fields when present, truncated to 300 chars.
    for key in ("content", "excerpt", "author", "channel", "description"):
        value = item.get(key)
        if value:
            cleaned[key] = value[:300]

    # Build the combined text used for embedding.
    cleaned["doc_text"] = build_document_text(cleaned)
    return cleaned
def dedupe(items: list[dict]) -> list[dict]:
    """
    Drop duplicate bookmark items, keeping the first occurrence.

    Two items are duplicates when their URLs match after lowercasing and
    stripping trailing slashes; items with no URL are dropped entirely.
    """
    kept: list[dict] = []
    seen_keys: set[str] = set()
    for entry in items:
        key = entry.get("url", "").rstrip("/").lower()
        if key and key not in seen_keys:
            seen_keys.add(key)
            kept.append(entry)
    return kept