Spaces:

codingwithadi
/

OpenMark

Running

App Files Files Community

OpenMark / openmark /pipeline /merge.py

codingwithadi

Upload folder using huggingface_hub

81598c5 verified 1 day ago

raw

history blame contribute delete

3.34 kB

	"""
	Merge ALL data sources into one clean list:
	- CATEGORIZED.json (Edge + old Raindrop + daily.dev — already categorized)
	- linkedin_saved.json (1,260 LinkedIn posts)
	- youtube_MASTER.json (liked + watch_later + playlists)
	- Fresh Raindrop pull (new items not yet in CATEGORIZED)

	Deduplicates by URL. Normalizes categories.
	"""

	import json
	import os
	from openmark import config
	from openmark.pipeline.normalize import normalize_item, dedupe


	def load_categorized() -> list[dict]:
	path = os.path.join(config.RAINDROP_MISSION_DIR, "CATEGORIZED.json")
	with open(path, encoding="utf-8") as f:
	items = json.load(f)
	print(f"CATEGORIZED.json: {len(items)} items")
	return items


	def load_linkedin() -> list[dict]:
	path = os.path.join(config.RAINDROP_MISSION_DIR, "linkedin_saved.json")
	if not os.path.exists(path):
	print("LinkedIn: file not found, skipping")
	return []
	with open(path, encoding="utf-8") as f:
	posts = json.load(f)
	items = []
	for p in posts:
	content = p.get("content", "")
	author = p.get("author", "")
	items.append({
	"url": p.get("url", ""),
	"title": f"{author} — {content[:80]}" if author else content[:100],
	"content": content[:300],
	"author": author,
	"folder": "LinkedIn Saved",
	"source": "linkedin",
	"tags": [],
	"category": None, # will be assigned by normalize
	"score": 6,
	})
	print(f"LinkedIn: {len(items)} posts")
	return items


	def load_youtube() -> list[dict]:
	path = os.path.join(config.RAINDROP_MISSION_DIR, "youtube_MASTER.json")
	if not os.path.exists(path):
	print("YouTube: file not found, skipping")
	return []
	with open(path, encoding="utf-8") as f:
	yt = json.load(f)
	items = []
	for section in ["liked_videos", "watch_later", "playlists"]:
	for v in yt.get(section, []):
	items.append({
	"url": v.get("url", ""),
	"title": v.get("title", ""),
	"channel": v.get("channel", ""),
	"folder": f"YouTube / {section}",
	"source": f"youtube_{section}",
	"tags": v.get("tags", [])[:5],
	"category": "YouTube & Video",
	"score": 7,
	})
	print(f"YouTube: {len(items)} videos (liked + watch_later + playlists)")
	return items


	def merge_all(include_fresh_raindrop: bool = False) -> list[dict]:
	"""
	Merge all sources. Returns deduplicated, normalized list.
	Set include_fresh_raindrop=True to also pull live from Raindrop API.
	"""
	all_items = []

	all_items.extend(load_categorized())
	all_items.extend(load_linkedin())
	all_items.extend(load_youtube())

	if include_fresh_raindrop:
	from openmark.pipeline.raindrop import pull_all
	fresh = pull_all()
	all_items.extend(fresh)

	# Normalize each item
	normalized = [normalize_item(i) for i in all_items]

	# Deduplicate by URL
	unique = dedupe(normalized)
	print(f"\nTotal after merge + dedup: {len(unique)} items")
	return unique