Spaces:
Running
Running
Upload scripts/ingest.py with huggingface_hub
Browse files - scripts/ingest.py +16 -9
scripts/ingest.py
CHANGED
|
@@ -11,6 +11,8 @@ Usage:
|
|
| 11 |
C:\\Python313\\python scripts/ingest.py
|
| 12 |
C:\\Python313\\python scripts/ingest.py --provider azure
|
| 13 |
C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
import sys
|
|
@@ -59,8 +61,9 @@ def build_similar_to_edges(items: list[dict], embedder, top_k: int = 5):
|
|
| 59 |
|
| 60 |
def main():
|
| 61 |
parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
|
| 62 |
-
parser.add_argument("--provider", default=None,
|
| 63 |
parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
|
|
|
|
| 64 |
parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
|
| 65 |
args = parser.parse_args()
|
| 66 |
|
|
@@ -84,20 +87,24 @@ def main():
|
|
| 84 |
print("\n[3/4] Ingesting into ChromaDB...")
|
| 85 |
chroma_store.ingest(items, embedder)
|
| 86 |
|
| 87 |
-
# Step 4: Neo4j
|
| 88 |
-
|
| 89 |
-
|
|
|
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
|
| 95 |
print("\n" + "=" * 60)
|
| 96 |
print("INGEST COMPLETE")
|
| 97 |
chroma = chroma_store.get_stats()
|
| 98 |
-
neo4j = neo4j_store.get_stats()
|
| 99 |
print(f" ChromaDB: {chroma.get('total', 0)} vectors")
|
| 100 |
-
|
|
|
|
|
|
|
| 101 |
print("=" * 60)
|
| 102 |
print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
|
| 103 |
print(" or: C:\\Python313\\python -m openmark.ui.app")
|
|
|
|
| 11 |
C:\\Python313\\python scripts/ingest.py
|
| 12 |
C:\\Python313\\python scripts/ingest.py --provider azure
|
| 13 |
C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
|
| 14 |
+
C:\\Python313\\python scripts/ingest.py --skip-neo4j (ChromaDB only, no Neo4j required)
|
| 15 |
+
C:\\Python313\\python scripts/ingest.py --skip-similar (skip SIMILAR_TO edge computation)
|
| 16 |
"""
|
| 17 |
|
| 18 |
import sys
|
|
|
|
| 61 |
|
| 62 |
def main():
|
| 63 |
parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
|
| 64 |
+
parser.add_argument("--provider", default=None, help="Embedding provider: local or azure")
|
| 65 |
parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
|
| 66 |
+
parser.add_argument("--skip-neo4j", action="store_true", help="Skip Neo4j entirely (ChromaDB only)")
|
| 67 |
parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
|
| 68 |
args = parser.parse_args()
|
| 69 |
|
|
|
|
| 87 |
print("\n[3/4] Ingesting into ChromaDB...")
|
| 88 |
chroma_store.ingest(items, embedder)
|
| 89 |
|
| 90 |
+
# Step 4: Neo4j (optional)
|
| 91 |
+
if not args.skip_neo4j:
|
| 92 |
+
print("\n[4/4] Ingesting into Neo4j...")
|
| 93 |
+
neo4j_store.ingest(items)
|
| 94 |
|
| 95 |
+
# Step 5: SIMILAR_TO edges
|
| 96 |
+
if not args.skip_similar:
|
| 97 |
+
build_similar_to_edges(items, embedder, top_k=5)
|
| 98 |
+
else:
|
| 99 |
+
print("\n[4/4] Neo4j skipped.")
|
| 100 |
|
| 101 |
print("\n" + "=" * 60)
|
| 102 |
print("INGEST COMPLETE")
|
| 103 |
chroma = chroma_store.get_stats()
|
|
|
|
| 104 |
print(f" ChromaDB: {chroma.get('total', 0)} vectors")
|
| 105 |
+
if not args.skip_neo4j:
|
| 106 |
+
neo4j = neo4j_store.get_stats()
|
| 107 |
+
print(f" Neo4j: {neo4j.get('bookmarks', 0)} bookmarks, {neo4j.get('tags', 0)} tags")
|
| 108 |
print("=" * 60)
|
| 109 |
print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
|
| 110 |
print(" or: C:\\Python313\\python -m openmark.ui.app")
|