Spaces:
Running
Running
File size: 4,230 Bytes
81598c5 be820ad 81598c5 be820ad 81598c5 be820ad 81598c5 be820ad 81598c5 be820ad 81598c5 be820ad 81598c5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | """
OpenMark Full Ingest Pipeline
Run this once (or again to update) to:
1. Merge all data sources (CATEGORIZED.json + LinkedIn + YouTube)
2. Embed everything with chosen provider (local pplx-embed or Azure)
3. Store in ChromaDB (semantic search)
4. Store in Neo4j (knowledge graph)
5. Compute SIMILAR_TO edges (top-5 neighbors per bookmark → graph edges)
Usage:
C:\\Python313\\python scripts/ingest.py
C:\\Python313\\python scripts/ingest.py --provider azure
C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
C:\\Python313\\python scripts/ingest.py --skip-neo4j (ChromaDB only, no Neo4j required)
C:\\Python313\\python scripts/ingest.py --skip-similar (skip SIMILAR_TO edge computation)
"""
import sys
import os
import argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
sys.stdout.reconfigure(encoding="utf-8")
from openmark.pipeline.merge import merge_all
from openmark.embeddings.factory import get_embedder
from openmark.stores import chroma as chroma_store
from openmark.stores import neo4j_store
from openmark import config
def build_similar_to_edges(items: list[dict], embedder, top_k: int = 5):
"""
For each item, find its top-k nearest neighbors in ChromaDB
and write SIMILAR_TO edges in Neo4j.
This creates the semantic web inside the graph.
"""
print(f"\nBuilding SIMILAR_TO edges (top-{top_k} per bookmark)...")
pairs = []
total = len(items)
for i, item in enumerate(items):
url = item["url"]
try:
results = chroma_store.search(
item["doc_text"], embedder, n=top_k + 1
)
for r in results:
if r["url"] != url and r["similarity"] > 0.5:
pairs.append((url, r["url"], r["similarity"]))
except Exception:
pass
if (i + 1) % 500 == 0:
print(f" Processed {i+1}/{total} for SIMILAR_TO")
print(f" Writing {len(pairs)} SIMILAR_TO edges to Neo4j...")
neo4j_store.add_similar_to_edges(pairs)
print(" SIMILAR_TO done.")
def main():
parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
parser.add_argument("--provider", default=None, help="Embedding provider: local or azure")
parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
parser.add_argument("--skip-neo4j", action="store_true", help="Skip Neo4j entirely (ChromaDB only)")
parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
args = parser.parse_args()
if args.provider:
os.environ["EMBEDDING_PROVIDER"] = args.provider
print("=" * 60)
print("OPENMARK INGEST PIPELINE")
print(f"Embedding: {config.EMBEDDING_PROVIDER}")
print("=" * 60)
# Step 1: Merge all sources
print("\n[1/4] Merging data sources...")
items = merge_all(include_fresh_raindrop=args.fresh_raindrop)
# Step 2: Load embedder
print(f"\n[2/4] Loading {config.EMBEDDING_PROVIDER} embedder...")
embedder = get_embedder()
# Step 3: ChromaDB
print("\n[3/4] Ingesting into ChromaDB...")
chroma_store.ingest(items, embedder)
# Step 4: Neo4j (optional)
if not args.skip_neo4j:
print("\n[4/4] Ingesting into Neo4j...")
neo4j_store.ingest(items)
# Step 5: SIMILAR_TO edges
if not args.skip_similar:
build_similar_to_edges(items, embedder, top_k=5)
else:
print("\n[4/4] Neo4j skipped.")
print("\n" + "=" * 60)
print("INGEST COMPLETE")
chroma = chroma_store.get_stats()
print(f" ChromaDB: {chroma.get('total', 0)} vectors")
if not args.skip_neo4j:
neo4j = neo4j_store.get_stats()
print(f" Neo4j: {neo4j.get('bookmarks', 0)} bookmarks, {neo4j.get('tags', 0)} tags")
print("=" * 60)
print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
print(" or: C:\\Python313\\python -m openmark.ui.app")
if __name__ == "__main__":
main()
|