Spaces:

codingwithadi
/

OpenMark

Running

App Files Files Community

OpenMark / scripts /ingest.py

codingwithadi

Upload scripts/ingest.py with huggingface_hub

be820ad verified 1 day ago

raw

history blame contribute delete

4.23 kB

	"""
	OpenMark Full Ingest Pipeline
	Run this once (or again to update) to:
	1. Merge all data sources (CATEGORIZED.json + LinkedIn + YouTube)
	2. Embed everything with chosen provider (local pplx-embed or Azure)
	3. Store in ChromaDB (semantic search)
	4. Store in Neo4j (knowledge graph)
	5. Compute SIMILAR_TO edges (top-5 neighbors per bookmark → graph edges)

	Usage:
	C:\\Python313\\python scripts/ingest.py
	C:\\Python313\\python scripts/ingest.py --provider azure
	C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
	C:\\Python313\\python scripts/ingest.py --skip-neo4j (ChromaDB only, no Neo4j required)
	C:\\Python313\\python scripts/ingest.py --skip-similar (skip SIMILAR_TO edge computation)
	"""

	import sys
	import os
	import argparse

	# Put the parent of this script's directory at the front of sys.path so the
	# `openmark` imports below resolve when the script is run directly
	# (presumably that parent is the repository root -- confirm layout).
	sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
	# Force UTF-8 on stdout so printed text does not depend on the console's
	# default encoding.
	sys.stdout.reconfigure(encoding="utf-8")

	from openmark.pipeline.merge import merge_all
	from openmark.embeddings.factory import get_embedder
	from openmark.stores import chroma as chroma_store
	from openmark.stores import neo4j_store
	from openmark import config


	def build_similar_to_edges(
	    items: list[dict],
	    embedder,
	    top_k: int = 5,
	    min_similarity: float = 0.5,
	):
	    """
	    Link every bookmark to its nearest semantic neighbors in Neo4j.

	    For each item, ChromaDB is queried with the item's ``doc_text`` and up
	    to ``top_k`` hits (excluding the item itself) whose similarity is
	    strictly greater than ``min_similarity`` are collected.  All collected
	    ``(source_url, neighbor_url, similarity)`` tuples are then handed to
	    ``neo4j_store.add_similar_to_edges`` in a single call.

	    Args:
	        items: Bookmark dicts; each is read for its "url" and "doc_text" keys.
	        embedder: Embedder forwarded unchanged to ``chroma_store.search``.
	        top_k: Maximum number of neighbors recorded per bookmark.
	        min_similarity: Exclusive lower bound on a hit's "similarity".

	    Raises:
	        KeyError: If an item has no "url" key.

	    A lookup failure for one item is reported and counted, but it does not
	    abort the run; that item simply contributes no edges.
	    """
	    max_reported_failures = 5  # keep the console readable on systemic errors

	    print(f"\nBuilding SIMILAR_TO edges (top-{top_k} per bookmark)...")
	    pairs = []
	    failures = 0
	    total = len(items)

	    for done, item in enumerate(items, start=1):
	        url = item["url"]
	        try:
	            # Ask for one extra hit because the item usually matches itself.
	            results = chroma_store.search(
	                item["doc_text"], embedder, n=top_k + 1
	            )
	            neighbors = [
	                (url, r["url"], r["similarity"])
	                for r in results
	                if r["url"] != url and r["similarity"] > min_similarity
	            ]
	        except Exception as exc:
	            # Best-effort: keep going, but never hide the failure.
	            failures += 1
	            if failures <= max_reported_failures:
	                print(f" Warning: SIMILAR_TO lookup failed for {url}: {exc!r}")
	        else:
	            # If the self-match was absent, top_k + 1 hits survive the filter;
	            # keep only the first top_k in the order the store returned them
	            # (presumably best-first -- confirm against chroma_store.search).
	            pairs.extend(neighbors[:top_k])

	        if done % 500 == 0:
	            print(f" Processed {done}/{total} for SIMILAR_TO")

	    if failures:
	        print(f" Warning: {failures}/{total} SIMILAR_TO lookups failed and were skipped")

	    print(f" Writing {len(pairs)} SIMILAR_TO edges to Neo4j...")
	    neo4j_store.add_similar_to_edges(pairs)
	    print(" SIMILAR_TO done.")


	def main():
	    """
	    Parse command-line options and run the ingest pipeline end to end.

	    Steps, in order: merge all data sources, load the embedder, ingest into
	    ChromaDB, then (unless ``--skip-neo4j``) ingest into Neo4j and (unless
	    ``--skip-similar``) build SIMILAR_TO edges.  Finishes by printing store
	    statistics and hints for what to run next.
	    """
	    parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
	    parser.add_argument("--provider", default=None, help="Embedding provider: local or azure")
	    parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
	    parser.add_argument("--skip-neo4j", action="store_true", help="Skip Neo4j entirely (ChromaDB only)")
	    parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
	    args = parser.parse_args()

	    if args.provider:
	        os.environ["EMBEDDING_PROVIDER"] = args.provider
	        # `config` was imported when this module loaded, i.e. before the
	        # variable above was set.  NOTE(review): if openmark.config reads
	        # EMBEDDING_PROVIDER at import time, the environment override alone
	        # is never seen by the attribute this function reads below, so the
	        # override is mirrored onto it -- confirm against openmark.config.
	        config.EMBEDDING_PROVIDER = args.provider

	    print("=" * 60)
	    print("OPENMARK INGEST PIPELINE")
	    print(f"Embedding: {config.EMBEDDING_PROVIDER}")
	    print("=" * 60)

	    # Step 1: Merge all sources
	    print("\n[1/4] Merging data sources...")
	    items = merge_all(include_fresh_raindrop=args.fresh_raindrop)

	    # Step 2: Load embedder
	    print(f"\n[2/4] Loading {config.EMBEDDING_PROVIDER} embedder...")
	    embedder = get_embedder()

	    # Step 3: ChromaDB
	    print("\n[3/4] Ingesting into ChromaDB...")
	    chroma_store.ingest(items, embedder)

	    # Step 4: Neo4j (optional)
	    if not args.skip_neo4j:
	        print("\n[4/4] Ingesting into Neo4j...")
	        neo4j_store.ingest(items)

	        # Step 5: SIMILAR_TO edges -- written to Neo4j, so this only runs
	        # when the Neo4j step itself was not skipped.
	        if not args.skip_similar:
	            build_similar_to_edges(items, embedder, top_k=5)
	    else:
	        print("\n[4/4] Neo4j skipped.")

	    print("\n" + "=" * 60)
	    print("INGEST COMPLETE")
	    chroma = chroma_store.get_stats()
	    print(f" ChromaDB: {chroma.get('total', 0)} vectors")
	    if not args.skip_neo4j:
	        neo4j = neo4j_store.get_stats()
	        print(f" Neo4j: {neo4j.get('bookmarks', 0)} bookmarks, {neo4j.get('tags', 0)} tags")
	    print("=" * 60)
	    print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
	    print(" or: C:\\Python313\\python -m openmark.ui.app")


	# Entry point: run the pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()