Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- .env.example +26 -0
- .gitattributes +8 -35
- .gitignore +45 -0
- LICENSE +21 -0
- README.md +267 -31
- app.py +8 -0
- data/.gitkeep +0 -0
- docs/architecture.md +188 -0
- docs/data-collection.md +144 -0
- docs/huggingface.md +129 -0
- docs/ingest.md +154 -0
- docs/troubleshooting.md +127 -0
- openmark/__init__.py +0 -0
- openmark/agent/__init__.py +0 -0
- openmark/agent/graph.py +70 -0
- openmark/agent/tools.py +138 -0
- openmark/config.py +73 -0
- openmark/embeddings/__init__.py +0 -0
- openmark/embeddings/azure.py +42 -0
- openmark/embeddings/base.py +21 -0
- openmark/embeddings/factory.py +15 -0
- openmark/embeddings/local.py +54 -0
- openmark/pipeline/__init__.py +0 -0
- openmark/pipeline/merge.py +97 -0
- openmark/pipeline/normalize.py +98 -0
- openmark/pipeline/raindrop.py +83 -0
- openmark/stores/__init__.py +0 -0
- openmark/stores/chroma.py +127 -0
- openmark/stores/neo4j_store.py +213 -0
- openmark/ui/__init__.py +0 -0
- openmark/ui/app.py +165 -0
- requirements.txt +14 -0
- scripts/ingest.py +107 -0
- scripts/search.py +85 -0
.env.example
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Embedding Provider: "local" or "azure" ───────────────────
|
| 2 |
+
EMBEDDING_PROVIDER=local
|
| 3 |
+
|
| 4 |
+
# ── Local pplx-embed ─────────────────────────────────────────
|
| 5 |
+
PPLX_QUERY_MODEL=perplexity-ai/pplx-embed-v1-0.6b
|
| 6 |
+
PPLX_DOC_MODEL=perplexity-ai/pplx-embed-context-v1-0.6b
|
| 7 |
+
|
| 8 |
+
# ── Azure AI Foundry ──────────────────────────────────────────
|
| 9 |
+
AZURE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
| 10 |
+
AZURE_API_KEY=your-azure-api-key
|
| 11 |
+
AZURE_DEPLOYMENT_LLM=gpt-4o-mini
|
| 12 |
+
AZURE_DEPLOYMENT_EMBED=text-embedding-ada-002
|
| 13 |
+
AZURE_API_VERSION=2024-05-01-preview
|
| 14 |
+
|
| 15 |
+
# ── Neo4j ─────────────────────────────────────────────────────
|
| 16 |
+
NEO4J_URI=bolt://127.0.0.1:7687
|
| 17 |
+
NEO4J_USER=neo4j
|
| 18 |
+
NEO4J_PASSWORD=your-neo4j-password
|
| 19 |
+
NEO4J_DATABASE=db1
|
| 20 |
+
|
| 21 |
+
# ── Raindrop ──────────────────────────────────────────────────
|
| 22 |
+
RAINDROP_TOKEN=your-raindrop-test-token
|
| 23 |
+
|
| 24 |
+
# ── Data paths ────────────────────────────────────────────────
|
| 25 |
+
RAINDROP_MISSION_DIR=C:\path\to\raindrop-mission
|
| 26 |
+
CHROMA_PATH=C:\path\to\OpenMark\data\chroma_db
|
.gitattributes
CHANGED
|
@@ -1,35 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
*
|
| 3 |
-
*.
|
| 4 |
-
*.
|
| 5 |
-
*.
|
| 6 |
-
*.
|
| 7 |
-
*.
|
| 8 |
-
*.
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Normalize line endings
|
| 2 |
+
* text=auto
|
| 3 |
+
*.py text eol=lf
|
| 4 |
+
*.md text eol=lf
|
| 5 |
+
*.txt text eol=lf
|
| 6 |
+
*.json text eol=lf
|
| 7 |
+
*.env text eol=lf
|
| 8 |
+
*.gitignore text eol=lf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Credentials — NEVER commit ───────────────────────────────
|
| 2 |
+
.env
|
| 3 |
+
|
| 4 |
+
# ── Personal data — your bookmark vectors ────────────────────
|
| 5 |
+
data/chroma_db/
|
| 6 |
+
|
| 7 |
+
# ── Python ────────────────────────────────────────────────────
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.py[cod]
|
| 10 |
+
*.pyo
|
| 11 |
+
*.pyd
|
| 12 |
+
.Python
|
| 13 |
+
*.egg-info/
|
| 14 |
+
dist/
|
| 15 |
+
build/
|
| 16 |
+
*.egg
|
| 17 |
+
.eggs/
|
| 18 |
+
|
| 19 |
+
# ── Virtual environments ──────────────────────────────────────
|
| 20 |
+
venv/
|
| 21 |
+
.venv/
|
| 22 |
+
env/
|
| 23 |
+
ENV/
|
| 24 |
+
|
| 25 |
+
# ── IDE ───────────────────────────────────────────────────────
|
| 26 |
+
.idea/
|
| 27 |
+
.vscode/
|
| 28 |
+
*.swp
|
| 29 |
+
*.swo
|
| 30 |
+
.DS_Store
|
| 31 |
+
Thumbs.db
|
| 32 |
+
|
| 33 |
+
# ── Logs & temp ───────────────────────────────────────────────
|
| 34 |
+
*.log
|
| 35 |
+
*.tmp
|
| 36 |
+
*.bak
|
| 37 |
+
|
| 38 |
+
# ── HuggingFace cache (large model files) ────────────────────
|
| 39 |
+
.cache/
|
| 40 |
+
|
| 41 |
+
# ── Raw data exports — personal, not for the repo ────────────
|
| 42 |
+
raindrop-mission/
|
| 43 |
+
data/linkedin_saved.json
|
| 44 |
+
data/youtube_MASTER.json
|
| 45 |
+
data/CATEGORIZED.json
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Ahmad Othman Ammar Adi
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,31 +1,267 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
---
|
| 22 |
-
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenMark
|
| 2 |
+
|
| 3 |
+
**Your personal knowledge graph — built from everything you've ever saved.**
|
| 4 |
+
|
| 5 |
+
OpenMark ingests your bookmarks, LinkedIn saved posts, and YouTube videos into a dual-store knowledge system: **ChromaDB** for semantic vector search and **Neo4j** for graph-based connection discovery. A LangGraph agent sits on top, letting you query everything in natural language.
|
| 6 |
+
|
| 7 |
+
Built by [Ahmad Othman Ammar Adi](https://github.com/OthmanAdi).
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## What it does
|
| 12 |
+
|
| 13 |
+
- Pulls all your saved content from multiple sources into one place
|
| 14 |
+
- Embeds everything using [pplx-embed](https://huggingface.co/collections/perplexity-ai/pplx-embed) (local, free) or Azure AI Foundry (fast, cheap)
|
| 15 |
+
- Stores vectors in **ChromaDB** — find things by *meaning*, not keywords
|
| 16 |
+
- Builds a **Neo4j knowledge graph** — discover how topics connect
|
| 17 |
+
- Runs a **LangGraph agent** (powered by gpt-4o-mini) that searches both stores intelligently
|
| 18 |
+
- Serves a **Gradio UI** with Chat, Search, and Stats tabs
|
| 19 |
+
- Also works as a **CLI** — `python scripts/search.py "RAG tools"`
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Data Sources
|
| 24 |
+
|
| 25 |
+
### 1. Raindrop.io
|
| 26 |
+
|
| 27 |
+
Create a test token at [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations).
|
| 28 |
+
OpenMark pulls **all collections** automatically via the Raindrop REST API.
|
| 29 |
+
|
| 30 |
+
### 2. Browser Bookmarks
|
| 31 |
+
|
| 32 |
+
Export your bookmarks as an HTML file from Edge, Chrome, or Firefox:
|
| 33 |
+
- **Edge:** `Settings → Favourites → ··· → Export favourites` → save as `favorites.html`
|
| 34 |
+
- **Chrome/Firefox:** `Bookmarks Manager → Export`
|
| 35 |
+
|
| 36 |
+
Point `RAINDROP_MISSION_DIR` in your `.env` to the folder containing the exported HTML files.
|
| 37 |
+
The pipeline parses the Netscape bookmark format automatically.
|
| 38 |
+
|
| 39 |
+
### 3. LinkedIn Saved Posts
|
| 40 |
+
|
| 41 |
+
LinkedIn does not provide a public API for saved posts. The included `linkedin_fetch.py` script uses your browser session cookie to call LinkedIn's internal Voyager GraphQL API.
|
| 42 |
+
|
| 43 |
+
**Steps:**
|
| 44 |
+
1. Log into LinkedIn in your browser
|
| 45 |
+
2. Open DevTools → Application → Cookies → copy the value of `li_at`
|
| 46 |
+
3. Run:
|
| 47 |
+
```bash
|
| 48 |
+
python raindrop-mission/linkedin_fetch.py
|
| 49 |
+
```
|
| 50 |
+
Paste your `li_at` cookie when prompted. The script fetches all saved posts and writes `linkedin_saved.json`.
|
| 51 |
+
|
| 52 |
+
> **Personal use only.** This uses LinkedIn's internal API which is not publicly documented or officially supported. Use responsibly.
|
| 53 |
+
|
| 54 |
+
### 4. YouTube
|
| 55 |
+
|
| 56 |
+
Uses the official [YouTube Data API v3](https://developers.google.com/youtube/v3) via OAuth 2.0.
|
| 57 |
+
|
| 58 |
+
**Steps:**
|
| 59 |
+
1. Go to [Google Cloud Console](https://console.cloud.google.com/) → Create a project
|
| 60 |
+
2. Enable the **YouTube Data API v3**
|
| 61 |
+
3. Create OAuth 2.0 credentials → Download as `client_secret.json`
|
| 62 |
+
4. Add your Google account as a test user (OAuth consent screen → Test users)
|
| 63 |
+
5. Run:
|
| 64 |
+
```bash
|
| 65 |
+
python raindrop-mission/youtube_fetch.py
|
| 66 |
+
```
|
| 67 |
+
A browser window opens for auth. After that, `youtube_MASTER.json` is written with liked videos, watch later, and playlists.
|
| 68 |
+
|
| 69 |
+
---
|
| 70 |
+
|
| 71 |
+
## How it works
|
| 72 |
+
|
| 73 |
+
```
|
| 74 |
+
Your saved content
|
| 75 |
+
│
|
| 76 |
+
▼
|
| 77 |
+
normalize.py ← clean titles, dedupe by URL, fix categories
|
| 78 |
+
│
|
| 79 |
+
▼
|
| 80 |
+
EmbeddingProvider ← LOCAL: pplx-embed-context-v1-0.6b (documents)
|
| 81 |
+
pplx-embed-v1-0.6b (queries)
|
| 82 |
+
AZURE: text-embedding-ada-002
|
| 83 |
+
│
|
| 84 |
+
├──────────────────────────────────┐
|
| 85 |
+
▼ ▼
|
| 86 |
+
ChromaDB Neo4j
|
| 87 |
+
(vector store) (knowledge graph)
|
| 88 |
+
find by meaning find by connection
|
| 89 |
+
|
| 90 |
+
"show me RAG tools" "what connects LangGraph
|
| 91 |
+
to my Neo4j saves?"
|
| 92 |
+
│ │
|
| 93 |
+
└──────────────┬───────────────────┘
|
| 94 |
+
▼
|
| 95 |
+
LangGraph Agent
|
| 96 |
+
(gpt-4o-mini)
|
| 97 |
+
│
|
| 98 |
+
▼
|
| 99 |
+
Gradio UI / CLI
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
### Why embeddings?
|
| 103 |
+
|
| 104 |
+
An embedding is a list of numbers that represents the *meaning* of a piece of text. Two pieces of text with similar meaning will have similar numbers — even if they use completely different words. This is how OpenMark finds "retrieval augmented generation tutorials" when you search "RAG tools."
|
| 105 |
+
|
| 106 |
+
### Why ChromaDB?
|
| 107 |
+
|
| 108 |
+
ChromaDB stores those embedding vectors locally on your disk. It's a persistent vector database — no server, no cloud, no API key. When you search, it compares your query's embedding against all stored embeddings and returns the closest matches.
|
| 109 |
+
|
| 110 |
+
### Why Neo4j?
|
| 111 |
+
|
| 112 |
+
Embeddings answer "what's similar?" — Neo4j answers "how are these connected?" Every bookmark is a node. Tags, categories, domains, and sources are also nodes. Edges connect them. After ingestion, OpenMark also writes `SIMILAR_TO` edges derived from embedding neighbors — so the graph contains semantic connections you never manually created. You can then traverse: *"start from this LangChain article, walk similar-to 2 hops, what clusters emerge?"*
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## Requirements
|
| 117 |
+
|
| 118 |
+
- Python 3.13
|
| 119 |
+
- Neo4j Desktop (local) or AuraDB (cloud) — [neo4j.com/download](https://neo4j.com/download/)
|
| 120 |
+
- **Either** Azure AI Foundry account **or** enough disk space for local pplx-embed (~1.2 GB)
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## Setup
|
| 125 |
+
|
| 126 |
+
### 1. Clone and install
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
git clone https://github.com/OthmanAdi/OpenMark.git
|
| 130 |
+
cd OpenMark
|
| 131 |
+
pip install -r requirements.txt
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
### 2. Configure
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
cp .env.example .env
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
Edit `.env` with your values:
|
| 141 |
+
|
| 142 |
+
```env
|
| 143 |
+
# Choose your embedding provider
|
| 144 |
+
EMBEDDING_PROVIDER=local # or: azure
|
| 145 |
+
|
| 146 |
+
# Azure AI Foundry (required if EMBEDDING_PROVIDER=azure, also used for the LLM agent)
|
| 147 |
+
AZURE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
| 148 |
+
AZURE_API_KEY=your-key
|
| 149 |
+
AZURE_DEPLOYMENT_LLM=gpt-4o-mini
|
| 150 |
+
AZURE_DEPLOYMENT_EMBED=text-embedding-ada-002
|
| 151 |
+
|
| 152 |
+
# Neo4j
|
| 153 |
+
NEO4J_URI=bolt://127.0.0.1:7687
|
| 154 |
+
NEO4J_USER=neo4j
|
| 155 |
+
NEO4J_PASSWORD=your-password
|
| 156 |
+
NEO4J_DATABASE=neo4j
|
| 157 |
+
|
| 158 |
+
# Raindrop (get token at app.raindrop.io/settings/integrations)
|
| 159 |
+
RAINDROP_TOKEN=your-token
|
| 160 |
+
|
| 161 |
+
# Path to your raindrop-mission data folder
|
| 162 |
+
RAINDROP_MISSION_DIR=C:\path\to\raindrop-mission
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
### 3. Ingest
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
# Local embeddings (free, ~20 min for 8K items on CPU)
|
| 169 |
+
python scripts/ingest.py
|
| 170 |
+
|
| 171 |
+
# Azure embeddings (fast, ~5 min, costs ~€0.30 for 8K items)
|
| 172 |
+
python scripts/ingest.py --provider azure
|
| 173 |
+
|
| 174 |
+
# Also pull fresh from Raindrop API during ingest
|
| 175 |
+
python scripts/ingest.py --fresh-raindrop
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### 4. Search (CLI)
|
| 179 |
+
|
| 180 |
+
```bash
|
| 181 |
+
python scripts/search.py "RAG tools"
|
| 182 |
+
python scripts/search.py "LangGraph" --category "Agent Development"
|
| 183 |
+
python scripts/search.py --tag "rag"
|
| 184 |
+
python scripts/search.py --stats
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
### 5. Launch UI
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
python openmark/ui/app.py
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
Open [http://localhost:7860](http://localhost:7860)
|
| 194 |
+
|
| 195 |
+
---
|
| 196 |
+
|
| 197 |
+
## Required API Keys
|
| 198 |
+
|
| 199 |
+
| Key | Where to get it | Required? |
|
| 200 |
+
|-----|----------------|-----------|
|
| 201 |
+
| `RAINDROP_TOKEN` | [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations) | Yes |
|
| 202 |
+
| `AZURE_API_KEY` | Azure Portal → your AI Foundry resource | Only if `EMBEDDING_PROVIDER=azure` |
|
| 203 |
+
| `NEO4J_PASSWORD` | Set when creating your Neo4j database | Yes |
|
| 204 |
+
| YouTube OAuth | Google Cloud Console → YouTube Data API v3 | Only if ingesting YouTube |
|
| 205 |
+
|
| 206 |
+
No HuggingFace token is needed for local pplx-embed. The models are open weights and download automatically. You will see a warning `"You are sending unauthenticated requests to the HF Hub"` — this is harmless and can be silenced by setting `HF_TOKEN` in your `.env` if you want higher rate limits.
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
## Project Structure
|
| 211 |
+
|
| 212 |
+
```
|
| 213 |
+
OpenMark/
|
| 214 |
+
├── openmark/
|
| 215 |
+
│ ├── config.py ← all settings loaded from .env
|
| 216 |
+
│ ├── pipeline/
|
| 217 |
+
│ │ ├── raindrop.py ← pull all Raindrop collections via API
|
| 218 |
+
│ │ ├── normalize.py ← clean, dedupe, build embedding text
|
| 219 |
+
│ │ └── merge.py ← combine all sources
|
| 220 |
+
│ ├── embeddings/
|
| 221 |
+
│ │ ├── base.py ← abstract EmbeddingProvider interface
|
| 222 |
+
│ │ ├── local.py ← pplx-embed (local, free)
|
| 223 |
+
│ │ ├── azure.py ← Azure AI Foundry
|
| 224 |
+
│ │ └── factory.py ← returns provider based on .env
|
| 225 |
+
│ ├── stores/
|
| 226 |
+
│ │ ├── chroma.py ← ChromaDB: ingest + semantic search
|
| 227 |
+
│ │ └── neo4j_store.py ← Neo4j: graph nodes, edges, traversal
|
| 228 |
+
│ ├── agent/
|
| 229 |
+
│ │ ├── tools.py ← LangGraph tools (search, tag, graph)
|
| 230 |
+
│ │ └── graph.py ← create_react_agent with gpt-4o-mini
|
| 231 |
+
│ └── ui/
|
| 232 |
+
│ └── app.py ← Gradio UI (Chat / Search / Stats)
|
| 233 |
+
└── scripts/
|
| 234 |
+
├── ingest.py ← full pipeline runner
|
| 235 |
+
└── search.py ← CLI search
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
---
|
| 239 |
+
|
| 240 |
+
## Roadmap
|
| 241 |
+
|
| 242 |
+
- [ ] OpenAI embeddings integration
|
| 243 |
+
- [ ] Ollama local LLM support
|
| 244 |
+
- [ ] Pinecone vector store option
|
| 245 |
+
- [ ] Web scraping — fetch full page content for richer embeddings
|
| 246 |
+
- [ ] Browser extension for real-time saving to OpenMark
|
| 247 |
+
- [ ] Comet / Arc browser bookmark import
|
| 248 |
+
- [ ] Automatic re-ingestion on schedule
|
| 249 |
+
- [ ] Export to Obsidian / Notion
|
| 250 |
+
- [ ] Multi-user support
|
| 251 |
+
|
| 252 |
+
---
|
| 253 |
+
|
| 254 |
+
## Documentation
|
| 255 |
+
|
| 256 |
+
| Doc | What's in it |
|
| 257 |
+
|-----|-------------|
|
| 258 |
+
| [docs/data-collection.md](docs/data-collection.md) | Full guide for each data source — Raindrop, Edge, LinkedIn cookie method, YouTube OAuth, daily.dev console script |
|
| 259 |
+
| [docs/ingest.md](docs/ingest.md) | All ingest flags, timing for each step, how SIMILAR_TO edges work, re-run behavior |
|
| 260 |
+
| [docs/architecture.md](docs/architecture.md) | Dual-store design, Neo4j graph schema, embedding patches, Cypher query examples, agent tools |
|
| 261 |
+
| [docs/troubleshooting.md](docs/troubleshooting.md) | pplx-embed compatibility fixes, LinkedIn queryId changes, Neo4j connection issues, Windows encoding |
|
| 262 |
+
|
| 263 |
+
---
|
| 264 |
+
|
| 265 |
+
## License
|
| 266 |
+
|
| 267 |
+
MIT
|
app.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""HuggingFace Space entry point — launches the OpenMark Gradio UI."""
import os
import sys

# Make the repo root importable so `openmark` resolves even when the Space
# (or a user) runs this file from a different working directory.
# os.path.dirname(__file__) alone can be "" for a relative invocation like
# `python app.py`, which would silently depend on the cwd — abspath fixes that.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from openmark.ui.app import build_ui

if __name__ == "__main__":
    # Build the Gradio Blocks app and serve it (default: http://localhost:7860).
    ui = build_ui()
    ui.launch()
|
data/.gitkeep
ADDED
|
File without changes
|
docs/architecture.md
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
OpenMark uses a **dual-store architecture** — two databases working together, each doing what it's best at.
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
User Query
|
| 9 |
+
│
|
| 10 |
+
LangGraph Agent
|
| 11 |
+
(gpt-4o-mini)
|
| 12 |
+
/ \
|
| 13 |
+
ChromaDB Neo4j
|
| 14 |
+
(vector store) (graph store)
|
| 15 |
+
|
| 16 |
+
"find by meaning" "find by connection"
|
| 17 |
+
"what's similar?" "how are things linked?"
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Embedding Layer
|
| 23 |
+
|
| 24 |
+
The embedding layer is **provider-agnostic** — swap between local and cloud with one env var.
|
| 25 |
+
|
| 26 |
+
```
|
| 27 |
+
EMBEDDING_PROVIDER=local → LocalEmbedder (pplx-embed, runs on your machine)
|
| 28 |
+
EMBEDDING_PROVIDER=azure → AzureEmbedder (Azure AI Foundry, API call)
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
**Why two pplx-embed models?**
|
| 32 |
+
|
| 33 |
+
Perplexity AI ships two variants:
|
| 34 |
+
- `pplx-embed-v1-0.6b` — for encoding **queries** (what the user types)
|
| 35 |
+
- `pplx-embed-context-v1-0.6b` — for encoding **documents** (the bookmarks, surrounding context matters)
|
| 36 |
+
|
| 37 |
+
Using the correct model for each role improves retrieval quality. Most implementations take the shortcut of using one model for both; the dual-model setup used here is the correct production pattern.
|
| 38 |
+
|
| 39 |
+
**The compatibility patches:**
|
| 40 |
+
|
| 41 |
+
pplx-embed models ship with custom Python code (`st_quantize.py`) that has two incompatibilities with modern libraries:
|
| 42 |
+
|
| 43 |
+
1. **`sentence_transformers 4.x` removed the `Module` base class** — pplx-embed's code imports it. Fixed by installing `torch.nn.Module` under the `sentence_transformers.models.Module` name before the model code is imported.
|
| 44 |
+
|
| 45 |
+
2. **`transformers 4.57` added `list_repo_templates()`** — it looks for an `additional_chat_templates` folder in every model repo. pplx-embed doesn't have one, causing a hard 404 crash. Fixed by monkey-patching the function to return an empty list on exception.
|
| 46 |
+
|
| 47 |
+
Both patches are applied in `openmark/embeddings/local.py` before any model loading.
|
| 48 |
+
|
| 49 |
+
**Why `sentence-transformers==3.3.1` specifically?**
|
| 50 |
+
|
| 51 |
+
Version 4.x removed the `Module` base class that pplx-embed depends on. Pin to 3.3.1.
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## ChromaDB
|
| 56 |
+
|
| 57 |
+
Local, file-based vector database. No server, no API key, no cloud.
|
| 58 |
+
|
| 59 |
+
**Collection:** `openmark_bookmarks`
|
| 60 |
+
**Similarity metric:** cosine
|
| 61 |
+
**Data path:** `CHROMA_PATH` in `.env` (default: `OpenMark/data/chroma_db/`)
|
| 62 |
+
|
| 63 |
+
**What's stored per item:**
|
| 64 |
+
```python
|
| 65 |
+
{
|
| 66 |
+
"id": url, # primary key
|
| 67 |
+
"document": doc_text, # rich text used for embedding
|
| 68 |
+
"metadata": {
|
| 69 |
+
"title": str,
|
| 70 |
+
"category": str,
|
| 71 |
+
"source": str, # raindrop, linkedin, youtube_liked, edge, etc.
|
| 72 |
+
"score": float, # quality score 1-10
|
| 73 |
+
"tags": str, # comma-separated
|
| 74 |
+
"folder": str,
|
| 75 |
+
},
|
| 76 |
+
"embedding": [float x 1024] # or 1536 for Azure
|
| 77 |
+
}
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
**Querying:**
|
| 81 |
+
```python
|
| 82 |
+
collection.query(
|
| 83 |
+
query_embeddings=[embedder.embed_query("RAG tools")],
|
| 84 |
+
n_results=10,
|
| 85 |
+
where={"category": {"$eq": "RAG & Vector Search"}}, # optional filter
|
| 86 |
+
)
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## Neo4j Graph Schema
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
(:Bookmark {url, title, score})
|
| 95 |
+
-[:IN_CATEGORY]-> (:Category {name})
|
| 96 |
+
-[:TAGGED]-> (:Tag {name})
|
| 97 |
+
-[:FROM_SOURCE]-> (:Source {name})
|
| 98 |
+
-[:FROM_DOMAIN]-> (:Domain {name})
|
| 99 |
+
-[:SIMILAR_TO {score}]-> (:Bookmark) ← from embeddings
|
| 100 |
+
|
| 101 |
+
(:Tag)-[:CO_OCCURS_WITH {count}]-(:Tag) ← tags that appear together
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Useful Cypher queries:**
|
| 105 |
+
|
| 106 |
+
```cypher
|
| 107 |
+
// Count everything
|
| 108 |
+
MATCH (b:Bookmark) RETURN count(b) AS bookmarks
|
| 109 |
+
MATCH (t:Tag) RETURN count(t) AS tags
|
| 110 |
+
|
| 111 |
+
// Top categories
|
| 112 |
+
MATCH (b:Bookmark)-[:IN_CATEGORY]->(c:Category)
|
| 113 |
+
RETURN c.name, count(b) AS count ORDER BY count DESC
|
| 114 |
+
|
| 115 |
+
// All bookmarks tagged 'rag'
|
| 116 |
+
MATCH (b:Bookmark)-[:TAGGED]->(t:Tag {name: 'rag'})
|
| 117 |
+
RETURN b.title, b.url ORDER BY b.score DESC
|
| 118 |
+
|
| 119 |
+
// Find what connects to 'langchain' tag (2 hops)
|
| 120 |
+
MATCH (t:Tag {name: 'langchain'})-[:CO_OCCURS_WITH*1..2]-(related:Tag)
|
| 121 |
+
RETURN related.name, count(*) AS strength ORDER BY strength DESC
|
| 122 |
+
|
| 123 |
+
// Similar bookmarks to a URL
|
| 124 |
+
MATCH (b:Bookmark {url: 'https://...'})-[r:SIMILAR_TO]-(other)
|
| 125 |
+
RETURN other.title, other.url, r.score ORDER BY r.score DESC
|
| 126 |
+
|
| 127 |
+
// Most connected domains
|
| 128 |
+
MATCH (b:Bookmark)-[:FROM_DOMAIN]->(d:Domain)
|
| 129 |
+
RETURN d.name, count(b) AS saved ORDER BY saved DESC LIMIT 20
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
---
|
| 133 |
+
|
| 134 |
+
## LangGraph Agent
|
| 135 |
+
|
| 136 |
+
Built with `create_react_agent` from LangGraph 1.0.x.
|
| 137 |
+
|
| 138 |
+
**Model:** Azure gpt-4o-mini (streaming enabled)
|
| 139 |
+
**Memory:** `MemorySaver` — conversation history persists per `thread_id` within a session
|
| 140 |
+
|
| 141 |
+
**Tools:**
|
| 142 |
+
|
| 143 |
+
| Tool | Store | Description |
|
| 144 |
+
|------|-------|-------------|
|
| 145 |
+
| `search_semantic` | ChromaDB | Natural language vector search |
|
| 146 |
+
| `search_by_category` | ChromaDB | Filter by category + optional query |
|
| 147 |
+
| `find_by_tag` | Neo4j | Exact tag lookup |
|
| 148 |
+
| `find_similar_bookmarks` | Neo4j | SIMILAR_TO edge traversal |
|
| 149 |
+
| `explore_tag_cluster` | Neo4j | CO_OCCURS_WITH traversal (2 hops) |
|
| 150 |
+
| `get_stats` | Both | Count totals |
|
| 151 |
+
| `run_cypher` | Neo4j | Raw Cypher for power users |
|
| 152 |
+
|
| 153 |
+
**Agent routing:** The LLM decides which tool(s) to call based on the query. For "what do I know about RAG" it will call `search_semantic` + `search_by_category` + `find_by_tag`. For "how does LangGraph connect to my Neo4j saves" it will call `explore_tag_cluster` and `run_cypher`.
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
## Gradio UI
|
| 158 |
+
|
| 159 |
+
Three tabs:
|
| 160 |
+
|
| 161 |
+
| Tab | What it does |
|
| 162 |
+
|-----|-------------|
|
| 163 |
+
| Chat | Full LangGraph agent conversation. Remembers context within session. |
|
| 164 |
+
| Search | Direct ChromaDB search with category filter, min score slider, result count. |
|
| 165 |
+
| Stats | Neo4j category breakdown + top tags. Loads on startup. |
|
| 166 |
+
|
| 167 |
+
Run: `python openmark/ui/app.py` → `http://localhost:7860`
|
| 168 |
+
|
| 169 |
+
---
|
| 170 |
+
|
| 171 |
+
## Data Flow Summary
|
| 172 |
+
|
| 173 |
+
```
|
| 174 |
+
Source files (JSON, HTML)
|
| 175 |
+
│
|
| 176 |
+
merge.py → normalize.py
|
| 177 |
+
│
|
| 178 |
+
8,007 items with doc_text
|
| 179 |
+
│
|
| 180 |
+
EmbeddingProvider.embed_documents()
|
| 181 |
+
│
|
| 182 |
+
┌────┴────┐
|
| 183 |
+
│ │
|
| 184 |
+
ChromaDB Neo4j
|
| 185 |
+
add() MERGE nodes + relationships
|
| 186 |
+
CO_OCCURS_WITH edges
|
| 187 |
+
SIMILAR_TO edges (from ChromaDB top-5 per item)
|
| 188 |
+
```
|
docs/data-collection.md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Collection Guide
|
| 2 |
+
|
| 3 |
+
Everything you need to collect your saved content from each source before running the ingest pipeline.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 1. Raindrop.io
|
| 8 |
+
|
| 9 |
+
OpenMark pulls **all your Raindrop collections automatically** via the official REST API. You just need a token.
|
| 10 |
+
|
| 11 |
+
**Steps:**
|
| 12 |
+
1. Go to [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations)
|
| 13 |
+
2. Under "For Developers" → click **Create new app**
|
| 14 |
+
3. Copy the **Test token** (permanent, no expiry)
|
| 15 |
+
4. Add to `.env`:
|
| 16 |
+
```
|
| 17 |
+
RAINDROP_TOKEN=your-token-here
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
The pipeline fetches every collection, every sub-collection, and every unsorted raindrop automatically. No manual export needed.
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## 2. Browser Bookmarks (Edge / Chrome / Firefox)
|
| 25 |
+
|
| 26 |
+
Export your bookmarks as an HTML file in the Netscape bookmark format (all browsers support this).
|
| 27 |
+
|
| 28 |
+
**Edge:**
|
| 29 |
+
`Settings → Favourites → ··· (three dots) → Export favourites` → save as `favorites.html`
|
| 30 |
+
|
| 31 |
+
**Chrome:**
|
| 32 |
+
`Bookmarks Manager (Ctrl+Shift+O) → ··· → Export bookmarks` → save as `bookmarks.html`
|
| 33 |
+
|
| 34 |
+
**Firefox:**
|
| 35 |
+
`Bookmarks → Manage Bookmarks → Import and Backup → Export Bookmarks to HTML`
|
| 36 |
+
|
| 37 |
+
**After exporting:**
|
| 38 |
+
- Place the HTML file(s) in your `raindrop-mission` folder (or wherever `RAINDROP_MISSION_DIR` points)
|
| 39 |
+
- The pipeline (`merge.py`) looks for `favorites_*.html` and `bookmarks_*.html` patterns
|
| 40 |
+
- It parses the Netscape format and extracts URLs + titles + folder structure
|
| 41 |
+
|
| 42 |
+
> **Tip:** Export fresh before every ingest to capture new bookmarks.
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## 3. LinkedIn Saved Posts
|
| 47 |
+
|
| 48 |
+
LinkedIn has no public API for saved posts. OpenMark uses LinkedIn's internal **Voyager GraphQL API** — the same API the LinkedIn web app uses internally.
|
| 49 |
+
|
| 50 |
+
**This is the exact endpoint used:**
|
| 51 |
+
```
|
| 52 |
+
https://www.linkedin.com/voyager/api/graphql
|
| 53 |
+
?variables=(start:0,count:10,paginationToken:null,
|
| 54 |
+
query:(flagshipSearchIntent:SEARCH_MY_ITEMS_SAVED_POSTS))
|
| 55 |
+
&queryId=voyagerSearchDashClusters.05111e1b90ee7fea15bebe9f9410ced9
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
**How to get your session cookie:**
|
| 59 |
+
|
| 60 |
+
1. Log into LinkedIn in your browser
|
| 61 |
+
2. Open DevTools (`F12`) → **Application** tab → **Cookies** → `https://www.linkedin.com`
|
| 62 |
+
3. Find the cookie named `li_at` — copy its value
|
| 63 |
+
4. Also find `JSESSIONID` — copy its value (used as CSRF token, format: `ajax:XXXXXXXXXXXXXXXXXX`)
|
| 64 |
+
|
| 65 |
+
**Run the fetch script:**
|
| 66 |
+
```bash
|
| 67 |
+
python raindrop-mission/linkedin_fetch.py
|
| 68 |
+
```
|
| 69 |
+
Paste your `li_at` value when prompted.
|
| 70 |
+
|
| 71 |
+
**Output:** `raindrop-mission/linkedin_saved.json` — 1,260 saved posts with author, content, and URL.
|
| 72 |
+
|
| 73 |
+
**Pagination:** LinkedIn returns 10 posts per page. The script detects end of results when no `nextPageToken` is returned. With 1,260 posts that's ~126 pages.
|
| 74 |
+
|
| 75 |
+
> **Important:** The `queryId` (`voyagerSearchDashClusters.05111e1b90ee7fea15bebe9f9410ced9`) is hardcoded in LinkedIn's JavaScript bundle and can change with LinkedIn deployments. If the script returns 0 results, intercept a fresh request from your browser's Network tab — filter for `voyagerSearchDashClusters`, copy the new `queryId`.
|
| 76 |
+
|
| 77 |
+
> **Personal use only.** This method is not officially supported by LinkedIn. Do not use for scraping at scale.
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## 4. YouTube
|
| 82 |
+
|
| 83 |
+
Uses the official **YouTube Data API v3** via OAuth 2.0. Collects liked videos, watch later playlist, and any saved playlists.
|
| 84 |
+
|
| 85 |
+
**One-time setup:**
|
| 86 |
+
|
| 87 |
+
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
|
| 88 |
+
2. Create a new project (e.g. "OpenMark")
|
| 89 |
+
3. Enable **YouTube Data API v3** (APIs & Services → Enable APIs)
|
| 90 |
+
4. Create credentials: **OAuth 2.0 Client ID** → Desktop App
|
| 91 |
+
5. Download the JSON file — rename it to `client_secret.json` and place it in `raindrop-mission/`
|
| 92 |
+
6. Go to **OAuth consent screen** → Test users → add your Google account email
|
| 93 |
+
|
| 94 |
+
**Run the fetch script:**
|
| 95 |
+
```bash
|
| 96 |
+
python raindrop-mission/youtube_fetch.py
|
| 97 |
+
```
|
| 98 |
+
A browser window opens for Google sign-in. After auth, a token is cached locally — you won't need to auth again.
|
| 99 |
+
|
| 100 |
+
**Output:** `raindrop-mission/youtube_MASTER.json` with:
|
| 101 |
+
- `liked_videos` — videos you've liked (up to ~3,200 via API limit)
|
| 102 |
+
- `watch_later` — requires Google Takeout (see below)
|
| 103 |
+
- `playlists` — saved playlists
|
| 104 |
+
|
| 105 |
+
**Watch Later via Google Takeout:**
|
| 106 |
+
YouTube's API does not expose Watch Later directly. Export it via [takeout.google.com](https://takeout.google.com):
|
| 107 |
+
- Select only **YouTube** → **Playlists** → Download
|
| 108 |
+
- Extract the CSV file named `Watch later-videos.csv`
|
| 109 |
+
- Place it in `raindrop-mission/`
|
| 110 |
+
- The `youtube_organize.py` script fetches video titles via API and includes them in `youtube_MASTER.json`
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## 5. daily.dev Bookmarks
|
| 115 |
+
|
| 116 |
+
daily.dev does not provide a public API. Use the included browser console script to extract bookmarks directly from the page.
|
| 117 |
+
|
| 118 |
+
**Steps:**
|
| 119 |
+
1. Go to [app.daily.dev](https://app.daily.dev) → **Bookmarks**
|
| 120 |
+
2. Scroll all the way down to load all bookmarks
|
| 121 |
+
3. Open DevTools → **Console** tab
|
| 122 |
+
4. Paste and run `raindrop-mission/dailydev_console_script.js`
|
| 123 |
+
5. The script copies a JSON array to your clipboard
|
| 124 |
+
6. Paste into a file named `dailydev_bookmarks.json` in `raindrop-mission/`
|
| 125 |
+
|
| 126 |
+
> The script filters for `/posts/` URLs only — it ignores profile links, squad links, and other noise.
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## Summary
|
| 131 |
+
|
| 132 |
+
| Source | Method | Output file |
|
| 133 |
+
|--------|--------|-------------|
|
| 134 |
+
| Raindrop | REST API (auto) | pulled live |
|
| 135 |
+
| Edge/Chrome bookmarks | HTML export | `favorites.html` / `bookmarks.html` |
|
| 136 |
+
| LinkedIn saved posts | Voyager GraphQL + session cookie | `linkedin_saved.json` |
|
| 137 |
+
| YouTube liked/playlists | YouTube Data API v3 + OAuth | `youtube_MASTER.json` |
|
| 138 |
+
| YouTube watch later | Google Takeout CSV | included in `youtube_MASTER.json` |
|
| 139 |
+
| daily.dev bookmarks | Browser console script | `dailydev_bookmarks.json` |
|
| 140 |
+
|
| 141 |
+
Once all files are in place, run:
|
| 142 |
+
```bash
|
| 143 |
+
python scripts/ingest.py
|
| 144 |
+
```
|
docs/huggingface.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace Publishing Guide
|
| 2 |
+
|
| 3 |
+
OpenMark publishes two things on HuggingFace:
|
| 4 |
+
1. **Space** — live Gradio demo at `OthmanAdi/OpenMark`
|
| 5 |
+
2. **Dataset** — the categorized bookmarks at `OthmanAdi/openmark-bookmarks`
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Prerequisites
|
| 10 |
+
|
| 11 |
+
You need a HuggingFace account and a **write-access token**:
|
| 12 |
+
1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
| 13 |
+
2. Create a new token → **Write** access
|
| 14 |
+
3. Add to your `.env`:
|
| 15 |
+
```
|
| 16 |
+
HF_TOKEN=hf_your_token_here
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 1. HuggingFace Space (Gradio Demo)
|
| 22 |
+
|
| 23 |
+
The Space hosts the Gradio UI publicly (or privately until you're ready).
|
| 24 |
+
|
| 25 |
+
**Create the Space:**
|
| 26 |
+
```bash
|
| 27 |
+
pip install huggingface_hub
|
| 28 |
+
python -c "
|
| 29 |
+
from huggingface_hub import HfApi
|
| 30 |
+
import os
|
| 31 |
+
from dotenv import load_dotenv
|
| 32 |
+
load_dotenv()
|
| 33 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 34 |
+
api.create_repo(
|
| 35 |
+
repo_id='OthmanAdi/OpenMark',
|
| 36 |
+
repo_type='space',
|
| 37 |
+
space_sdk='gradio',
|
| 38 |
+
private=True,
|
| 39 |
+
)
|
| 40 |
+
print('Space created: https://huggingface.co/spaces/OthmanAdi/OpenMark')
|
| 41 |
+
"
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
**Push the code to the Space:**
|
| 45 |
+
```bash
|
| 46 |
+
python -c "
|
| 47 |
+
from huggingface_hub import HfApi
|
| 48 |
+
import os
|
| 49 |
+
from dotenv import load_dotenv
|
| 50 |
+
load_dotenv()
|
| 51 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 52 |
+
api.upload_folder(
|
| 53 |
+
folder_path='.',
|
| 54 |
+
repo_id='OthmanAdi/OpenMark',
|
| 55 |
+
repo_type='space',
|
| 56 |
+
ignore_patterns=['.env', 'data/chroma_db/*', '__pycache__/*', '.git/*'],
|
| 57 |
+
)
|
| 58 |
+
"
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
> **Note:** The Space version requires your ChromaDB and Neo4j data to be pre-loaded. For a public demo, you would host a sample dataset. For private use, the full local setup is better.
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 2. HuggingFace Dataset
|
| 66 |
+
|
| 67 |
+
The dataset card publishes your 8,000+ categorized bookmarks as a reusable dataset for RAG experiments.
|
| 68 |
+
|
| 69 |
+
**What's in the dataset:**
|
| 70 |
+
- URL, title, category (19 categories), tags, score (1-10), source
|
| 71 |
+
- Sources: Raindrop, Edge browser, LinkedIn, YouTube, daily.dev
|
| 72 |
+
- ~8,007 unique items after deduplication
|
| 73 |
+
|
| 74 |
+
**Create the dataset repo:**
|
| 75 |
+
```bash
|
| 76 |
+
python -c "
|
| 77 |
+
from huggingface_hub import HfApi
|
| 78 |
+
import os, json
|
| 79 |
+
from dotenv import load_dotenv
|
| 80 |
+
load_dotenv()
|
| 81 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 82 |
+
|
| 83 |
+
# Create private dataset repo
|
| 84 |
+
api.create_repo(
|
| 85 |
+
repo_id='OthmanAdi/openmark-bookmarks',
|
| 86 |
+
repo_type='dataset',
|
| 87 |
+
private=True,
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# Upload dataset card
|
| 91 |
+
api.upload_file(
|
| 92 |
+
path_or_fileobj='docs/dataset_card.md',
|
| 93 |
+
path_in_repo='README.md',
|
| 94 |
+
repo_id='OthmanAdi/openmark-bookmarks',
|
| 95 |
+
repo_type='dataset',
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# Upload the data (RAINDROP_MISSION_DIR/CATEGORIZED.json)
|
| 99 |
+
api.upload_file(
|
| 100 |
+
path_or_fileobj=os.path.join(os.getenv('RAINDROP_MISSION_DIR'), 'CATEGORIZED.json'),
|
| 101 |
+
path_in_repo='data/bookmarks.json',
|
| 102 |
+
repo_id='OthmanAdi/openmark-bookmarks',
|
| 103 |
+
repo_type='dataset',
|
| 104 |
+
)
|
| 105 |
+
print('Dataset created: https://huggingface.co/datasets/OthmanAdi/openmark-bookmarks')
|
| 106 |
+
"
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## Making Public
|
| 112 |
+
|
| 113 |
+
When you're ready to go public, flip visibility:
|
| 114 |
+
```bash
|
| 115 |
+
python -c "
|
| 116 |
+
from huggingface_hub import HfApi
|
| 117 |
+
import os
|
| 118 |
+
from dotenv import load_dotenv
|
| 119 |
+
load_dotenv()
|
| 120 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 121 |
+
|
| 122 |
+
# Make Space public
|
| 123 |
+
api.update_repo_visibility('OthmanAdi/OpenMark', private=False, repo_type='space')
|
| 124 |
+
|
| 125 |
+
# Make Dataset public
|
| 126 |
+
api.update_repo_visibility('OthmanAdi/openmark-bookmarks', private=False, repo_type='dataset')
|
| 127 |
+
print('Both are now public.')
|
| 128 |
+
"
|
| 129 |
+
```
|
docs/ingest.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ingest Pipeline
|
| 2 |
+
|
| 3 |
+
The ingest pipeline is the heart of OpenMark. It merges all your data, embeds everything, and writes to both ChromaDB and Neo4j.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Command
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
python scripts/ingest.py [options]
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
| Flag | Default | Description |
|
| 14 |
+
|------|---------|-------------|
|
| 15 |
+
| `--provider local` | from `.env` | Use local pplx-embed models |
|
| 16 |
+
| `--provider azure` | from `.env` | Use Azure AI Foundry embeddings |
|
| 17 |
+
| `--fresh-raindrop` | off | Also pull live from Raindrop API during merge |
|
| 18 |
+
| `--skip-similar` | off | Skip SIMILAR_TO edge computation (saves ~30 min) |
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Pipeline Steps
|
| 23 |
+
|
| 24 |
+
### Step 1 — Merge
|
| 25 |
+
|
| 26 |
+
Loads and deduplicates all sources:
|
| 27 |
+
- `CATEGORIZED.json` — pre-categorized bookmarks from Edge + Raindrop + daily.dev
|
| 28 |
+
- `linkedin_saved.json` — LinkedIn saved posts
|
| 29 |
+
- `youtube_MASTER.json` — liked videos, watch later, playlists (not subscriptions)
|
| 30 |
+
|
| 31 |
+
Deduplication is URL-based (case-insensitive, trailing slash stripped). If the same URL appears in multiple sources, the first occurrence wins.
|
| 32 |
+
|
| 33 |
+
Each item gets a `doc_text` field built for embedding:
|
| 34 |
+
```
|
| 35 |
+
{title} | {category} | {tag1 tag2 tag3} | {content/excerpt/channel}
|
| 36 |
+
```
|
| 37 |
+
This rich text is what gets embedded — not just the title.
|
| 38 |
+
|
| 39 |
+
**Output:** ~8,000 normalized items in memory.
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
### Step 2 — Embedding
|
| 44 |
+
|
| 45 |
+
Loads the embedding provider specified by `EMBEDDING_PROVIDER` in `.env` (or `--provider` flag).
|
| 46 |
+
|
| 47 |
+
**Local (pplx-embed):**
|
| 48 |
+
- Query model: `perplexity-ai/pplx-embed-v1-0.6b` — used for user search queries
|
| 49 |
+
- Document model: `perplexity-ai/pplx-embed-context-v1-0.6b` — used for bookmark documents
|
| 50 |
+
- Output dimension: 1024
|
| 51 |
+
- Downloaded once to HuggingFace cache (~1.2 GB total), free on every subsequent run
|
| 52 |
+
- **Known compatibility issue:** pplx-embed requires `sentence-transformers==3.3.1` and two runtime patches (applied automatically in `local.py`). See [troubleshooting.md](troubleshooting.md) for details.
|
| 53 |
+
|
| 54 |
+
**Azure:**
|
| 55 |
+
- Uses `text-embedding-ada-002` (or configured `AZURE_DEPLOYMENT_EMBED`)
|
| 56 |
+
- Output dimension: 1536
|
| 57 |
+
- Cost: ~€0.30 for 8,000 items (as of 2026)
|
| 58 |
+
- Batched in groups of 100 with progress logging
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
### Step 3 — ChromaDB Ingest
|
| 63 |
+
|
| 64 |
+
Embeds all documents in batches of 100 and stores in ChromaDB.
|
| 65 |
+
|
| 66 |
+
- Skips items already in ChromaDB (resumable — safe to re-run)
|
| 67 |
+
- Stores: URL (as ID), embedding vector, title, category, source, score, tags
|
| 68 |
+
- Uses cosine similarity for the vector index (`hnsw:space` set to `cosine`)
|
| 69 |
+
- Database written to disk at `CHROMA_PATH` (default: `OpenMark/data/chroma_db/`)
|
| 70 |
+
|
| 71 |
+
**Timing:**
|
| 72 |
+
| Provider | 8K items | Notes |
|
| 73 |
+
|----------|----------|-------|
|
| 74 |
+
| Local pplx-embed (CPU) | ~20 min | No GPU detected = CPU inference |
|
| 75 |
+
| Local pplx-embed (GPU) | ~3 min | NVIDIA GPU with CUDA |
|
| 76 |
+
| Azure AI Foundry | ~5 min | Network bound |
|
| 77 |
+
|
| 78 |
+
---
|
| 79 |
+
|
| 80 |
+
### Step 4 — Neo4j Ingest
|
| 81 |
+
|
| 82 |
+
Creates nodes and relationships in batches of 200.
|
| 83 |
+
|
| 84 |
+
**Nodes created:**
|
| 85 |
+
- `Bookmark` — url, title, score
|
| 86 |
+
- `Category` — name
|
| 87 |
+
- `Tag` — name
|
| 88 |
+
- `Source` — name (raindrop, linkedin, youtube_liked, edge, dailydev, etc.)
|
| 89 |
+
- `Domain` — extracted from URL (e.g. `github.com`, `medium.com`)
|
| 90 |
+
|
| 91 |
+
**Relationships created:**
|
| 92 |
+
- `(Bookmark)-[:IN_CATEGORY]->(Category)`
|
| 93 |
+
- `(Bookmark)-[:TAGGED]->(Tag)`
|
| 94 |
+
- `(Bookmark)-[:FROM_SOURCE]->(Source)`
|
| 95 |
+
- `(Bookmark)-[:FROM_DOMAIN]->(Domain)`
|
| 96 |
+
- `(Tag)-[:CO_OCCURS_WITH {count}]-(Tag)` — built after all nodes are written
|
| 97 |
+
|
| 98 |
+
**Timing:** ~3-5 minutes for 8K items.
|
| 99 |
+
|
| 100 |
+
**Idempotent:** Uses `MERGE` everywhere — safe to re-run, won't create duplicates.
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
### Step 5 — SIMILAR_TO Edges
|
| 105 |
+
|
| 106 |
+
This is the most powerful and most time-consuming step.
|
| 107 |
+
|
| 108 |
+
For each of the 8K bookmarks, OpenMark queries ChromaDB for its top-5 nearest semantic neighbors and writes those as `SIMILAR_TO` edges in Neo4j with a similarity score.
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
(Bookmark {url: "...langchain-docs..."})-[:SIMILAR_TO {score: 0.94}]->(Bookmark {url: "...langgraph-tutorial..."})
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
These edges encode **semantic connections you never manually created**. The knowledge graph becomes a web of meaning, not just a web of tags.
|
| 115 |
+
|
| 116 |
+
**Timing:** ~25-40 minutes on CPU for 8K items. This is the longest step.
|
| 117 |
+
|
| 118 |
+
**Skip it if you're in a hurry:**
|
| 119 |
+
```bash
|
| 120 |
+
python scripts/ingest.py --skip-similar
|
| 121 |
+
```
|
| 122 |
+
Everything else works without SIMILAR_TO edges. You only lose the `find_similar_bookmarks` tool in the agent and the graph traversal from those edges.
|
| 123 |
+
|
| 124 |
+
**Only edges with similarity > 0.5 are written.** Low-quality connections are discarded.
|
| 125 |
+
|
| 126 |
+
---
|
| 127 |
+
|
| 128 |
+
## Re-running the Pipeline
|
| 129 |
+
|
| 130 |
+
The pipeline is safe to re-run at any time:
|
| 131 |
+
|
| 132 |
+
- **ChromaDB:** skips already-ingested URLs automatically
|
| 133 |
+
- **Neo4j:** uses `MERGE` — no duplicates created
|
| 134 |
+
- **SIMILAR_TO:** edges are overwritten (not duplicated) via `MERGE`
|
| 135 |
+
|
| 136 |
+
To add new bookmarks after the first run:
|
| 137 |
+
1. Update your source files (fresh Raindrop pull, new LinkedIn export, etc.)
|
| 138 |
+
2. Run `python scripts/ingest.py` — only new items get embedded and stored
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
## Checking What's Ingested
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
# Quick stats
|
| 146 |
+
python scripts/search.py --stats
|
| 147 |
+
|
| 148 |
+
# Search to verify
|
| 149 |
+
python scripts/search.py "RAG tools"
|
| 150 |
+
|
| 151 |
+
# Neo4j — open browser
|
| 152 |
+
# http://localhost:7474
|
| 153 |
+
# Run: MATCH (b:Bookmark) RETURN count(b)
|
| 154 |
+
```
|
docs/troubleshooting.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Troubleshooting
|
| 2 |
+
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
## pplx-embed fails to load
|
| 6 |
+
|
| 7 |
+
**Error:** `ImportError: cannot import name 'Module' from 'sentence_transformers.models'`
|
| 8 |
+
|
| 9 |
+
**Cause:** pplx-embed's custom `st_quantize.py` imports `Module` from `sentence_transformers.models`, which was removed in version 4.x.
|
| 10 |
+
|
| 11 |
+
**Fix:** Pin to the correct version:
|
| 12 |
+
```bash
|
| 13 |
+
pip install "sentence-transformers==3.3.1"
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## pplx-embed crashes with 404 on chat templates
|
| 19 |
+
|
| 20 |
+
**Error:** `RemoteEntryNotFoundError: 404 ... additional_chat_templates does not exist`
|
| 21 |
+
|
| 22 |
+
**Cause:** `transformers 4.57+` added `list_repo_templates()` which looks for an `additional_chat_templates` folder in every model repo. pplx-embed predates this feature and doesn't have the folder.
|
| 23 |
+
|
| 24 |
+
**Fix:** Already handled automatically in `openmark/embeddings/local.py` via a monkey-patch applied before model loading. If you see this error outside of OpenMark, apply:
|
| 25 |
+
```python
|
| 26 |
+
from transformers.utils import hub as _hub
|
| 27 |
+
import transformers.tokenization_utils_base as _tub
|
| 28 |
+
_orig = _hub.list_repo_templates
|
| 29 |
+
def _safe(*a, **kw):
|
| 30 |
+
try: return _orig(*a, **kw)
|
| 31 |
+
except Exception: return []
|
| 32 |
+
_hub.list_repo_templates = _safe
|
| 33 |
+
_tub.list_repo_templates = _safe
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Neo4j connection error: "Unable to retrieve routing information"
|
| 39 |
+
|
| 40 |
+
**Cause:** Using `neo4j://` URI (routing protocol) with a single local Neo4j instance.
|
| 41 |
+
|
| 42 |
+
**Fix:** Use `bolt://` instead:
|
| 43 |
+
```env
|
| 44 |
+
NEO4J_URI=bolt://127.0.0.1:7687
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## Neo4j error: "Database does not exist"
|
| 50 |
+
|
| 51 |
+
**Cause:** The database name in `.env` doesn't match what's in Neo4j Desktop.
|
| 52 |
+
|
| 53 |
+
**Fix:** Open `http://localhost:7474`, check what databases exist:
|
| 54 |
+
```cypher
|
| 55 |
+
SHOW DATABASES
|
| 56 |
+
```
|
| 57 |
+
Update `NEO4J_DATABASE` in `.env` to match.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## LinkedIn script returns 0 results or 404
|
| 62 |
+
|
| 63 |
+
**Cause:** LinkedIn's internal `queryId` changes when they deploy new JavaScript bundles.
|
| 64 |
+
|
| 65 |
+
**Fix:**
|
| 66 |
+
1. Open LinkedIn in your browser → go to Saved Posts
|
| 67 |
+
2. Open DevTools → Network tab → filter for `voyagerSearchDashClusters`
|
| 68 |
+
3. Click one of the requests → copy the full URL
|
| 69 |
+
4. Extract the new `queryId` value
|
| 70 |
+
5. Update `linkedin_fetch.py` with the new `queryId`
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## YouTube OAuth "Access Blocked: App not verified"
|
| 75 |
+
|
| 76 |
+
**Cause:** Your Google Cloud app is in testing mode and your account isn't listed as a test user.
|
| 77 |
+
|
| 78 |
+
**Fix:**
|
| 79 |
+
1. Google Cloud Console → OAuth consent screen
|
| 80 |
+
2. Scroll to "Test users" → Add users → add your Google account email
|
| 81 |
+
3. Re-run `youtube_fetch.py`
|
| 82 |
+
|
| 83 |
+
---
|
| 84 |
+
|
| 85 |
+
## ChromaDB ingest is slow
|
| 86 |
+
|
| 87 |
+
On CPU with local pplx-embed, embedding 8K items takes ~20 minutes. This is normal.
|
| 88 |
+
|
| 89 |
+
**Options:**
|
| 90 |
+
- Use Azure instead: `python scripts/ingest.py --provider azure` (~5 min, ~€0.30)
|
| 91 |
+
- The ingest is resumable — if interrupted, re-run and it skips already-ingested items
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## SIMILAR_TO step takes too long
|
| 96 |
+
|
| 97 |
+
Building SIMILAR_TO edges queries ChromaDB for every bookmark's top-5 neighbors, then writes to Neo4j. For 8K items on CPU this takes ~25-40 minutes.
|
| 98 |
+
|
| 99 |
+
**Skip it:**
|
| 100 |
+
```bash
|
| 101 |
+
python scripts/ingest.py --skip-similar
|
| 102 |
+
```
|
| 103 |
+
The app works without SIMILAR_TO edges. You only lose the `find_similar_bookmarks` agent tool and cross-topic graph traversal.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## Windows UnicodeEncodeError in terminal
|
| 108 |
+
|
| 109 |
+
**Error:** `UnicodeEncodeError: 'charmap' codec can't encode character`
|
| 110 |
+
|
| 111 |
+
**Cause:** Windows terminal (cmd/PowerShell) defaults to cp1252 encoding which can't handle emoji or some Unicode characters in bookmark titles.
|
| 112 |
+
|
| 113 |
+
**Fix:** Run from Windows Terminal (supports UTF-8) or add to the top of the script:
|
| 114 |
+
```python
|
| 115 |
+
import sys
|
| 116 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
| 117 |
+
```
|
| 118 |
+
All OpenMark scripts already include this.
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
## gradio not found on Python 3.13
|
| 123 |
+
|
| 124 |
+
gradio may have been installed under a different Python version (e.g. Python 3.14). If you are running Python 3.13, install it for that interpreter explicitly:
|
| 125 |
+
```bash
|
| 126 |
+
C:\Python313\python -m pip install gradio
|
| 127 |
+
```
|
openmark/__init__.py
ADDED
|
File without changes
|
openmark/agent/__init__.py
ADDED
|
File without changes
|
openmark/agent/graph.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LangGraph ReAct agent for OpenMark.
|
| 3 |
+
Uses Azure gpt-4o-mini as the LLM.
|
| 4 |
+
Has access to all OpenMark tools (ChromaDB + Neo4j).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from langchain_openai import AzureChatOpenAI
|
| 8 |
+
from langgraph.prebuilt import create_react_agent
|
| 9 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 10 |
+
from openmark import config
|
| 11 |
+
from openmark.agent.tools import ALL_TOOLS
|
| 12 |
+
|
| 13 |
+
SYSTEM_PROMPT = """You are OpenMark — Ahmad's personal AI knowledge assistant.
|
| 14 |
+
|
| 15 |
+
You have access to his entire curated knowledge base of 7,000+ saved bookmarks,
|
| 16 |
+
LinkedIn posts, and YouTube videos — all categorized, tagged, and connected in a
|
| 17 |
+
knowledge graph.
|
| 18 |
+
|
| 19 |
+
Your job:
|
| 20 |
+
- Help Ahmad find exactly what he saved and can't remember
|
| 21 |
+
- Discover connections between topics he didn't know existed
|
| 22 |
+
- Answer questions by searching his real saved content (not your training data)
|
| 23 |
+
- Be direct and useful — no filler
|
| 24 |
+
|
| 25 |
+
When answering:
|
| 26 |
+
- Always use tools to search first before responding
|
| 27 |
+
- Show the actual URLs and titles from results
|
| 28 |
+
- Group results by relevance
|
| 29 |
+
- If one search doesn't find enough, try a different angle (by tag, by category, by similarity)
|
| 30 |
+
|
| 31 |
+
Available search modes:
|
| 32 |
+
- search_semantic: natural language search (most useful for general queries)
|
| 33 |
+
- search_by_category: filter by topic category
|
| 34 |
+
- find_by_tag: exact tag lookup in the knowledge graph
|
| 35 |
+
- find_similar_bookmarks: find related content to a specific URL
|
| 36 |
+
- explore_tag_cluster: discover what else connects to a topic
|
| 37 |
+
- get_stats: see what's in the knowledge base
|
| 38 |
+
- run_cypher: advanced graph queries (for power users)
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def build_agent():
    """Construct and return the OpenMark ReAct agent.

    Wires an Azure-hosted chat model (deterministic: temperature 0, with
    streaming enabled) to the full OpenMark tool set, and attaches an
    in-memory checkpointer so conversation state persists per thread_id
    for the lifetime of the process.
    """
    model = AzureChatOpenAI(
        azure_endpoint=config.AZURE_ENDPOINT,
        api_key=config.AZURE_API_KEY,
        azure_deployment=config.AZURE_DEPLOYMENT_LLM,
        api_version=config.AZURE_API_VERSION,
        temperature=0,
        streaming=True,
    )
    # MemorySaver keeps per-thread conversation history in RAM only;
    # state is lost when the process exits.
    return create_react_agent(
        model=model,
        tools=ALL_TOOLS,
        prompt=SYSTEM_PROMPT,
        checkpointer=MemorySaver(),
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def ask(agent, question: str, thread_id: str = "default") -> str:
    """Send *question* to *agent* and return its final text reply.

    The thread_id scopes conversation memory: calls sharing a thread_id
    continue the same conversation via the agent's checkpointer.
    """
    run_config = {"configurable": {"thread_id": thread_id}}
    payload = {"messages": [{"role": "user", "content": question}]}
    response = agent.invoke(payload, config=run_config)
    # The last message in the returned state is the agent's final answer.
    final_message = response["messages"][-1]
    return final_message.content
|
openmark/agent/tools.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LangGraph tools for the OpenMark agent.
|
| 3 |
+
Each tool hits either ChromaDB (semantic) or Neo4j (graph) or both.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from langchain_core.tools import tool
|
| 7 |
+
from openmark.embeddings.factory import get_embedder
|
| 8 |
+
from openmark.stores import chroma as chroma_store
|
| 9 |
+
from openmark.stores import neo4j_store
|
| 10 |
+
|
| 11 |
+
# Embedder is loaded once and reused
# (module-level lazy singleton — loading the local models is expensive)
_embedder = None

def _get_embedder():
    """Return the shared embedding provider, creating it on first use."""
    global _embedder
    if _embedder is None:
        _embedder = get_embedder()
    return _embedder
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@tool
def search_semantic(query: str, n: int = 10) -> str:
    """
    Search bookmarks by semantic meaning using vector similarity.
    Use this for natural language queries like 'RAG tools', 'LangGraph tutorials', etc.
    Returns top N most relevant bookmarks.
    """
    # NOTE: the docstring above is the tool description shown to the LLM —
    # kept verbatim so agent behavior is unchanged.
    hits = chroma_store.search(query, _get_embedder(), n=n)
    if not hits:
        return "No results found."
    formatted = []
    for hit in hits:
        formatted.append(
            f"{hit['rank']}. [{hit['category']}] {hit['title']}\n {hit['url']} (similarity: {hit['similarity']}, score: {hit['score']})"
        )
    return "\n".join(formatted)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@tool
def search_by_category(category: str, query: str = "", n: int = 15) -> str:
    """
    Find bookmarks in a specific category, optionally filtered by semantic query.
    Categories: RAG & Vector Search, Agent Development, LangChain / LangGraph,
    MCP & Tool Use, Context Engineering, AI Tools & Platforms, GitHub Repos & OSS,
    Learning & Courses, YouTube & Video, Web Development, Cloud & Infrastructure,
    Data Science & ML, Knowledge Graphs & Neo4j, Career & Jobs, LLM Fine-tuning,
    Finance & Crypto, Design & UI/UX, News & Articles, Entertainment & Other
    """
    # With no explicit query, the category name itself serves as the semantic
    # query so results within the category are still ranked.
    search_text = query if query else category
    matches = chroma_store.search(search_text, _get_embedder(), n=n, category=category)
    if not matches:
        return f"No bookmarks found in category '{category}'."
    listing = "\n".join(f"{m['rank']}. {m['title']}\n {m['url']}" for m in matches)
    return f"Category '{category}' — top results:\n" + listing
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@tool
def find_by_tag(tag: str) -> str:
    """
    Find all bookmarks tagged with a specific tag using the knowledge graph.
    Returns bookmarks ordered by quality score.
    """
    matches = neo4j_store.find_by_tag(tag, limit=20)
    if not matches:
        return f"No bookmarks found with tag '{tag}'."
    body = "\n".join(
        f"- {m['title']}\n {m['url']} (score: {m['score']})" for m in matches
    )
    return f"Bookmarks tagged '{tag}':\n" + body
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@tool
def find_similar_bookmarks(url: str) -> str:
    """
    Find bookmarks semantically similar to a given URL.
    Uses SIMILAR_TO edges in the knowledge graph (built from embedding neighbors).
    """
    neighbors = neo4j_store.find_similar(url, limit=10)
    if not neighbors:
        return f"No similar bookmarks found for {url}."
    body = "\n".join(
        f"- {nb['title']}\n {nb['url']} (similarity: {nb['similarity']:.3f})"
        for nb in neighbors
    )
    return "Similar bookmarks:\n" + body
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@tool
def explore_tag_cluster(tag: str) -> str:
    """
    Explore the knowledge graph around a tag — find related tags and their bookmarks.
    Traverses CO_OCCURS_WITH edges (2 hops) to discover connected topics.
    Great for discovering what else you know about a topic.
    """
    cluster = neo4j_store.find_tag_cluster(tag, hops=2, limit=25)
    if not cluster:
        return f"No cluster found for tag '{tag}'."
    body = "\n".join(
        f"- [{c['via_tag']}] {c['title']}\n {c['url']}" for c in cluster
    )
    return f"Knowledge cluster around '{tag}':\n" + body
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@tool
def get_stats() -> str:
    """
    Get statistics about the OpenMark knowledge base.
    Shows total bookmarks, tags, categories in both ChromaDB and Neo4j.
    """
    vector_stats = chroma_store.get_stats()
    graph_stats = neo4j_store.get_stats()
    summary = [
        "OpenMark Knowledge Base Stats:",
        f" ChromaDB vectors: {vector_stats.get('total', 0)}",
        f" Neo4j bookmarks: {graph_stats.get('bookmarks', 0)}",
        f" Neo4j tags: {graph_stats.get('tags', 0)}",
        f" Neo4j categories: {graph_stats.get('categories', 0)}",
    ]
    return "\n".join(summary)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@tool
def run_cypher(cypher: str) -> str:
    """
    Run a raw Cypher query against the Neo4j knowledge graph.
    Use for advanced graph traversals. Example:
    MATCH (b:Bookmark)-[:TAGGED]->(t:Tag) WHERE t.name='rag' RETURN b.title, b.url LIMIT 10
    """
    # NOTE(review): this executes agent-generated Cypher verbatim — nothing
    # here blocks writes/DELETEs, so the Neo4j credentials used should be
    # read-only. Confirm before exposing this tool publicly.
    try:
        rows = neo4j_store.query(cypher)
        if not rows:
            return "Query returned no results."
        # Cap output at 20 rows to keep the tool response LLM-sized.
        lines = [str(r) for r in rows[:20]]
        return "\n".join(lines)
    except Exception as e:
        # Surface the error as text so the agent can correct its query.
        return f"Cypher error: {e}"
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# Registry of tools handed to the agent (graph.py passes this to
# create_react_agent via tools=ALL_TOOLS).
ALL_TOOLS = [
    search_semantic,
    search_by_category,
    find_by_tag,
    find_similar_bookmarks,
    explore_tag_cluster,
    get_stats,
    run_cypher,
]
|
openmark/config.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Load .env from the repository root (one directory above this package).
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"))

# Embedding
EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", "local")  # "local" or "azure"
PPLX_QUERY_MODEL = os.getenv("PPLX_QUERY_MODEL", "perplexity-ai/pplx-embed-v1-0.6b")
PPLX_DOC_MODEL = os.getenv("PPLX_DOC_MODEL", "perplexity-ai/pplx-embed-context-v1-0.6b")

# Azure
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_API_KEY = os.getenv("AZURE_API_KEY")
AZURE_DEPLOYMENT_LLM = os.getenv("AZURE_DEPLOYMENT_LLM", "gpt-4o-mini")
AZURE_DEPLOYMENT_EMBED = os.getenv("AZURE_DEPLOYMENT_EMBED", "text-embedding-ada-002")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION", "2024-05-01-preview")

# Neo4j
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

# Raindrop
RAINDROP_TOKEN = os.getenv("RAINDROP_TOKEN")

# Paths
# NOTE(review): these defaults are machine-specific Windows paths; set the
# RAINDROP_MISSION_DIR / CHROMA_PATH env vars on any other host (e.g. the
# HF Space) or these fallbacks will not exist.
RAINDROP_MISSION_DIR = os.getenv("RAINDROP_MISSION_DIR", r"C:\Users\oasrvadmin\Documents\raindrop-mission")
CHROMA_PATH = os.getenv("CHROMA_PATH", r"C:\Users\oasrvadmin\Documents\OpenMark\data\chroma_db")

# Canonical categories
# Single source of truth: normalize.fix_category rejects anything not listed
# here, and the search_by_category tool docstring mirrors this list.
CATEGORIES = [
    "RAG & Vector Search",
    "LLM Fine-tuning",
    "Agent Development",
    "LangChain / LangGraph",
    "MCP & Tool Use",
    "Context Engineering",
    "AI Tools & Platforms",
    "GitHub Repos & OSS",
    "Learning & Courses",
    "YouTube & Video",
    "Web Development",
    "Cloud & Infrastructure",
    "Data Science & ML",
    "Knowledge Graphs & Neo4j",
    "Career & Jobs",
    "Finance & Crypto",
    "Design & UI/UX",
    "News & Articles",
    "Entertainment & Other",
]

# Maps legacy/variant category names onto canonical CATEGORIES above; any
# category still unknown after this remap falls back to "News & Articles"
# (see pipeline/normalize.fix_category).
CATEGORY_MAP = {
    "UI/UX Design": "Design & UI/UX",
    "UI/UX": "Design & UI/UX",
    "Real_Estate": "Finance & Crypto",
    "Real Estate": "Finance & Crypto",
    "Social_Media": "News & Articles",
    "Social/Community": "News & Articles",
    "Social": "News & Articles",
    "E-commerce & Marketplaces": "News & Articles",
    "Research & Articles": "News & Articles",
    "Blogs & Articles": "News & Articles",
    "Research": "News & Articles",
    "AI Thought Leaders & Media": "News & Articles",
    "Debugging & Tools": "AI Tools & Platforms",
    "Health & Wellness": "Entertainment & Other",
    "Email & Productivity": "AI Tools & Platforms",
    "Legal": "Entertainment & Other",
    "NoCode - LowCode": "AI Tools & Platforms",
    "Security": "AI Tools & Platforms",
}
|
openmark/embeddings/__init__.py
ADDED
|
File without changes
|
openmark/embeddings/azure.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Azure AI Foundry embedding provider.
|
| 3 |
+
Uses text-embedding-ada-002 (or whatever deployment is configured).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from openai import AzureOpenAI
|
| 7 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 8 |
+
from openmark import config
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AzureEmbedder(EmbeddingProvider):
    """Embeddings via an Azure OpenAI deployment (ada-002 by default)."""

    # Documents are sent to the API in chunks of this size.
    _BATCH = 100

    def __init__(self):
        self._client = AzureOpenAI(
            azure_endpoint=config.AZURE_ENDPOINT,
            api_key=config.AZURE_API_KEY,
            api_version=config.AZURE_API_VERSION,
        )
        self._deployment = config.AZURE_DEPLOYMENT_EMBED
        print(f"Azure embedder ready — deployment: {self._deployment}")

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """One API round trip; returns one vector per input text."""
        reply = self._client.embeddings.create(
            input=texts,
            model=self._deployment,
        )
        return [row.embedding for row in reply.data]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents in batches, printing progress after each batch."""
        vectors: list[list[float]] = []
        for offset in range(0, len(texts), self._BATCH):
            vectors.extend(self._embed(texts[offset:offset + self._BATCH]))
            print(f" Azure embedded {min(offset + self._BATCH, len(texts))}/{len(texts)}")
        return vectors

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string."""
        return self._embed([text])[0]

    @property
    def dimension(self) -> int:
        return 1536  # ada-002 dimension
|
openmark/embeddings/base.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class EmbeddingProvider(ABC):
    """Abstract base — swap local pplx-embed or Azure without changing any other code."""

    @abstractmethod
    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of document strings; one vector per input."""
        ...

    @abstractmethod
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string."""
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Output embedding dimension."""
        ...
|
openmark/embeddings/factory.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from openmark import config
|
| 2 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def get_embedder() -> EmbeddingProvider:
    """Return the configured embedding provider based on EMBEDDING_PROVIDER env var.

    Imports are deliberately kept inside the branches so only the selected
    backend (and its heavy dependencies) is loaded.
    """
    choice = config.EMBEDDING_PROVIDER.lower()
    if choice == "azure":
        from openmark.embeddings.azure import AzureEmbedder
        return AzureEmbedder()
    if choice == "local":
        from openmark.embeddings.local import LocalEmbedder
        return LocalEmbedder()
    raise ValueError(f"Unknown EMBEDDING_PROVIDER: '{choice}'. Use 'local' or 'azure'.")
|
openmark/embeddings/local.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Local pplx-embed embedding provider.
|
| 3 |
+
Uses:
|
| 4 |
+
- perplexity-ai/pplx-embed-v1-0.6b for queries
|
| 5 |
+
- perplexity-ai/pplx-embed-context-v1-0.6b for documents
|
| 6 |
+
|
| 7 |
+
Two patches applied at import time:
|
| 8 |
+
1. transformers 4.57 crashes on models without additional_chat_templates folder → catch 404
|
| 9 |
+
2. pplx-embed's st_quantize.py imports sentence_transformers.models.Module (removed in 3.x) → add it back
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
# ── Patch 1: transformers 4.57 list_repo_templates 404 crash ─
# transformers 4.57 crashes when loading models that lack an
# additional_chat_templates folder; wrapping list_repo_templates to return
# [] on any error lets pplx-embed load normally.
from transformers.utils import hub as _hub
import transformers.tokenization_utils_base as _tub
_orig_lrt = _hub.list_repo_templates
def _safe_lrt(*a, **kw):
    try:
        return _orig_lrt(*a, **kw)
    except Exception:
        return []
# Patch both the defining module and the module that imported the name.
_hub.list_repo_templates = _safe_lrt
_tub.list_repo_templates = _safe_lrt

# ── Patch 2: sentence_transformers.models.Module missing ─────
# pplx-embed's st_quantize.py imports sentence_transformers.models.Module,
# removed in sentence-transformers 3.x; torch.nn.Module stands in for it.
import torch.nn as _nn
import sentence_transformers.models as _st_models
if not hasattr(_st_models, "Module"):
    _st_models.Module = _nn.Module
|
| 29 |
+
|
| 30 |
+
from sentence_transformers import SentenceTransformer
|
| 31 |
+
import numpy as np
|
| 32 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 33 |
+
from openmark import config
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class LocalEmbedder(EmbeddingProvider):
    """Two-model local embedder: one pplx-embed model for queries, one for documents."""

    def __init__(self):
        print("Loading pplx-embed query model...")
        self._query_model = SentenceTransformer(config.PPLX_QUERY_MODEL, trust_remote_code=True)
        print("Loading pplx-embed document model...")
        self._doc_model = SentenceTransformer(config.PPLX_DOC_MODEL, trust_remote_code=True)
        print("Local embedder ready.")

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode documents with the context model (batched, with progress bar)."""
        matrix = self._doc_model.encode(texts, batch_size=32, show_progress_bar=True)
        return matrix.astype(float).tolist()

    def embed_query(self, text: str) -> list[float]:
        """Encode a single query with the query model."""
        row = self._query_model.encode([text])[0]
        return row.astype(float).tolist()

    @property
    def dimension(self) -> int:
        # Hard-coded output size for pplx-embed-v1-0.6b — TODO confirm against model card.
        return 1024
|
openmark/pipeline/__init__.py
ADDED
|
File without changes
|
openmark/pipeline/merge.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Merge ALL data sources into one clean list:
|
| 3 |
+
- CATEGORIZED.json (Edge + old Raindrop + daily.dev — already categorized)
|
| 4 |
+
- linkedin_saved.json (1,260 LinkedIn posts)
|
| 5 |
+
- youtube_MASTER.json (liked + watch_later + playlists)
|
| 6 |
+
- Fresh Raindrop pull (new items not yet in CATEGORIZED)
|
| 7 |
+
|
| 8 |
+
Deduplicates by URL. Normalizes categories.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
from openmark import config
|
| 14 |
+
from openmark.pipeline.normalize import normalize_item, dedupe
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_categorized() -> list[dict]:
    """Load the pre-categorized bookmark dump (CATEGORIZED.json)."""
    categorized_path = os.path.join(config.RAINDROP_MISSION_DIR, "CATEGORIZED.json")
    with open(categorized_path, encoding="utf-8") as fh:
        loaded = json.load(fh)
    print(f"CATEGORIZED.json: {len(loaded)} items")
    return loaded
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_linkedin() -> list[dict]:
    """Convert saved LinkedIn posts into bookmark items; [] if the file is absent."""
    linkedin_path = os.path.join(config.RAINDROP_MISSION_DIR, "linkedin_saved.json")
    if not os.path.exists(linkedin_path):
        print("LinkedIn: file not found, skipping")
        return []
    with open(linkedin_path, encoding="utf-8") as fh:
        posts = json.load(fh)

    converted = []
    for post in posts:
        body = post.get("content", "")
        who = post.get("author", "")
        # Title prefers "author — snippet"; falls back to a bare content snippet.
        heading = f"{who} — {body[:80]}" if who else body[:100]
        converted.append({
            "url": post.get("url", ""),
            "title": heading,
            "content": body[:300],
            "author": who,
            "folder": "LinkedIn Saved",
            "source": "linkedin",
            "tags": [],
            "category": None,  # will be assigned by normalize
            "score": 6,
        })
    print(f"LinkedIn: {len(converted)} posts")
    return converted
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_youtube() -> list[dict]:
    """Flatten youtube_MASTER.json (liked + watch_later + playlists) into items."""
    yt_path = os.path.join(config.RAINDROP_MISSION_DIR, "youtube_MASTER.json")
    if not os.path.exists(yt_path):
        print("YouTube: file not found, skipping")
        return []
    with open(yt_path, encoding="utf-8") as fh:
        master = json.load(fh)

    videos = []
    for section in ("liked_videos", "watch_later", "playlists"):
        for video in master.get(section, []):
            videos.append({
                "url": video.get("url", ""),
                "title": video.get("title", ""),
                "channel": video.get("channel", ""),
                "folder": f"YouTube / {section}",
                "source": f"youtube_{section}",
                "tags": video.get("tags", [])[:5],
                "category": "YouTube & Video",
                "score": 7,
            })
    print(f"YouTube: {len(videos)} videos (liked + watch_later + playlists)")
    return videos
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def merge_all(include_fresh_raindrop: bool = False) -> list[dict]:
    """
    Merge all sources. Returns deduplicated, normalized list.
    Set include_fresh_raindrop=True to also pull live from Raindrop API.
    """
    combined: list[dict] = []
    combined += load_categorized()
    combined += load_linkedin()
    combined += load_youtube()

    if include_fresh_raindrop:
        # Imported lazily so offline runs never touch the Raindrop client.
        from openmark.pipeline.raindrop import pull_all
        combined += pull_all()

    # Normalize first so dedupe sees cleaned URLs.
    cleaned = [normalize_item(entry) for entry in combined]
    deduped = dedupe(cleaned)
    print(f"\nTotal after merge + dedup: {len(deduped)} items")
    return deduped
|
openmark/pipeline/normalize.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Normalize, clean, and deduplicate bookmark items.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from openmark import config
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def clean_title(title: str) -> str:
    """Decode HTML entities, trim whitespace, and cap the title at 300 chars.

    Returns "" for falsy input. The previous version tried to strip a
    hand-picked set of entities with re.sub, but the patterns had degraded
    into identity substitutions (e.g. replacing "&" with "&"), so entities
    were never actually decoded; html.unescape handles every named and
    numeric entity correctly.
    """
    if not title:
        return ""
    import html  # stdlib; local import keeps module import cost unchanged
    title = html.unescape(title)
    # Strip leading/trailing whitespace and truncate
    return title.strip()[:300]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def fix_category(cat: str | None) -> str:
    """Map a raw category onto the canonical list, defaulting to 'News & Articles'."""
    fallback = "News & Articles"
    if not cat:
        return fallback
    # Apply known remapping, then validate against the canonical list.
    remapped = config.CATEGORY_MAP.get(cat, cat)
    return remapped if remapped in config.CATEGORIES else fallback
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def build_document_text(item: dict) -> str:
    """
    Build a single rich text string for embedding.
    Combines title + category + tags + content/excerpt snippet + channel +
    author, joined with " | ", so the embedder sees every signal at once.
    """
    content = item.get("content") or ""
    excerpt = item.get("excerpt") or ""
    # Content wins over excerpt; either way only the first 200 chars are used.
    snippet = content[:200] if content else excerpt[:200]
    fields = [
        item.get("title") or "",
        item.get("category") or "",
        " ".join(item["tags"]) if item.get("tags") else "",
        snippet,
        item.get("channel") or "",
        item.get("author") or "",
    ]
    return " | ".join(field for field in fields if field)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def normalize_item(item: dict) -> dict:
    """Clean and normalize a single bookmark item.

    Builds a dict with url/title/category/tags/score/source/folder, carries
    over optional text fields capped at 300 chars, and attaches the combined
    "doc_text" used for embedding.
    """
    raw_score = item.get("score", 5)
    if not isinstance(raw_score, (int, float)):
        raw_score = 5  # non-numeric scores fall back to the neutral default

    normalized = {
        "url": item.get("url", "").strip(),
        "title": clean_title(item.get("title", "")),
        "category": fix_category(item.get("category")),
        # Tags lowercased/stripped; at most five kept.
        "tags": [t.lower().strip() for t in item.get("tags", []) if t][:5],
        "score": raw_score,
        "source": item.get("source", "unknown"),
        "folder": item.get("folder", ""),
    }

    # Preserve optional fields
    for optional in ("content", "excerpt", "author", "channel", "description"):
        value = item.get(optional)
        if value:
            normalized[optional] = value[:300]

    # Build the document text for embedding
    normalized["doc_text"] = build_document_text(normalized)
    return normalized
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def dedupe(items: list[dict]) -> list[dict]:
    """Remove duplicates by URL (case-insensitive, trailing slash stripped).

    Items with an empty URL are dropped. First occurrence wins; input order
    is otherwise preserved.
    """
    kept: list[dict] = []
    seen_keys: set[str] = set()
    for entry in items:
        key = entry.get("url", "").rstrip("/").lower()
        if key and key not in seen_keys:
            seen_keys.add(key)
            kept.append(entry)
    return kept
|
openmark/pipeline/raindrop.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fresh pull of ALL Raindrop bookmarks via API.
|
| 3 |
+
Fetches every collection and every raindrop inside it, paginated.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import time
|
| 7 |
+
import requests
|
| 8 |
+
from openmark import config
|
| 9 |
+
|
| 10 |
+
HEADERS = {"Authorization": f"Bearer {config.RAINDROP_TOKEN}"}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def fetch_all_collections() -> list[dict]:
    """Return all collections (top-level and nested).

    Hits /collections for root collections, then /collections/childrens for
    nested ones ("childrens" is the literal Raindrop endpoint name). A
    request timeout is set so a stalled API call cannot hang the pipeline —
    the previous version had no timeout and could block forever.
    """
    resp = requests.get(
        "https://api.raindrop.io/rest/v1/collections",
        headers=HEADERS,
        timeout=30,
    )
    resp.raise_for_status()
    collections = resp.json().get("items", [])

    # Also fetch children; failure here is tolerated (nested collections are optional).
    resp2 = requests.get(
        "https://api.raindrop.io/rest/v1/collections/childrens",
        headers=HEADERS,
        timeout=30,
    )
    if resp2.status_code == 200:
        collections += resp2.json().get("items", [])

    return collections
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def fetch_raindrops_for_collection(collection_id: int, title: str) -> list[dict]:
    """Fetch all raindrops in a collection, paginated 50 per page.

    Stops on a non-200 response, an empty page, or a short (final) page, and
    sleeps briefly between pages to stay under Raindrop's rate limits. A
    request timeout is set so one stalled page cannot hang the whole pull —
    the previous version had no timeout.

    Args:
        collection_id: Raindrop collection id (-1 means "Unsorted").
        title: human-readable collection name, stored as each item's folder.
    """
    items = []
    page = 0
    while True:
        resp = requests.get(
            f"https://api.raindrop.io/rest/v1/raindrops/{collection_id}",
            headers=HEADERS,
            params={"perpage": 50, "page": page},
            timeout=30,
        )
        if resp.status_code != 200:
            break
        batch = resp.json().get("items", [])
        if not batch:
            break
        for item in batch:
            items.append({
                "url": item.get("link", ""),
                "title": item.get("title", ""),
                "excerpt": item.get("excerpt", "")[:200],
                "tags": item.get("tags", [])[:5],
                "folder": title,
                "source": "raindrop",
            })
        if len(batch) < 50:
            # Short page means we've reached the end.
            break
        page += 1
        time.sleep(0.2)  # be polite to the API between pages
    return items
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def fetch_unsorted() -> list[dict]:
    """Fetch raindrops not in any collection (unsorted)."""
    # Raindrop uses the special collection id -1 for "Unsorted".
    return fetch_raindrops_for_collection(-1, "Unsorted")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def pull_all() -> list[dict]:
    """Pull every raindrop from every collection. Returns flat list."""
    print("Fetching Raindrop collections...")
    collections = fetch_all_collections()
    print(f" Found {len(collections)} collections")

    gathered: list[dict] = []
    for collection in collections:
        name = collection.get("title", "Unknown")
        drops = fetch_raindrops_for_collection(collection["_id"], name)
        print(f" [{name}] {len(drops)} items")
        gathered.extend(drops)
        time.sleep(0.1)  # small pause between collections for rate limiting

    unsorted_drops = fetch_unsorted()
    print(f" [Unsorted] {len(unsorted_drops)} items")
    gathered.extend(unsorted_drops)

    print(f"Raindrop total: {len(gathered)}")
    return gathered
|
openmark/stores/__init__.py
ADDED
|
File without changes
|
openmark/stores/chroma.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ChromaDB store — semantic vector search.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import chromadb
|
| 6 |
+
from openmark import config
|
| 7 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 8 |
+
|
| 9 |
+
COLLECTION_NAME = "openmark_bookmarks"
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_client() -> chromadb.PersistentClient:
    """Return a ChromaDB client persisted at config.CHROMA_PATH."""
    return chromadb.PersistentClient(path=config.CHROMA_PATH)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_collection(client: chromadb.PersistentClient, embedder: EmbeddingProvider):
    """Get or create the bookmarks collection.

    NOTE(review): `embedder` is currently unused — callers compute embeddings
    themselves and pass them to add/query; get_stats even passes None here.
    Kept in the signature for interface symmetry.
    """
    return client.get_or_create_collection(
        name=COLLECTION_NAME,
        # Cosine distance, so similarity is reported as 1 - distance in search().
        metadata={"hnsw:space": "cosine"},
    )
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def ingest(items: list[dict], embedder: EmbeddingProvider, batch_size: int = 100):
    """Embed all items and store in ChromaDB.

    Idempotent: the bookmark URL is the Chroma id, and items whose id is
    already present are skipped — re-running only embeds new bookmarks.

    Args:
        items: normalized bookmark dicts (require url/doc_text/title/category/
            source/score/tags; folder optional).
        embedder: provider used for document embeddings.
        batch_size: items embedded and added per round trip.
    """
    client = get_client()
    collection = get_collection(client, embedder)

    # Check already ingested
    existing = set(collection.get(include=[])["ids"])
    new_items = [i for i in items if i["url"] not in existing]
    print(f"ChromaDB: {len(existing)} already ingested, {len(new_items)} new")

    if not new_items:
        return

    total = 0
    for start in range(0, len(new_items), batch_size):
        batch = new_items[start:start + batch_size]

        texts = [i["doc_text"] for i in batch]
        ids = [i["url"] for i in batch]
        metas = [
            {
                "title": i["title"][:500],
                "category": i["category"],
                "source": i["source"],
                "score": float(i["score"]),
                # Tags are joined to a string because metadata values are scalar.
                "tags": ",".join(i["tags"]),
                "folder": i.get("folder", ""),
            }
            for i in batch
        ]

        embeddings = embedder.embed_documents(texts)

        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metas,
        )
        total += len(batch)
        print(f" ChromaDB ingested {total}/{len(new_items)}")

    print(f"ChromaDB total: {collection.count()} items")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def search(
    query: str,
    embedder: EmbeddingProvider,
    n: int = 10,
    category: str | None = None,
    source: str | None = None,
    min_score: float | None = None,
) -> list[dict]:
    """Semantic search with optional metadata filters.

    Returns ranked hits as dicts with url/title/category/source/score/tags
    and a cosine-derived `similarity` (1 - distance, rounded to 4 places).
    """
    client = get_client()
    collection = get_collection(client, embedder)

    query_vec = embedder.embed_query(query)

    # Assemble the Chroma `where` clause from whichever filters were given.
    clauses = []
    if category:
        clauses.append({"category": {"$eq": category}})
    if source:
        clauses.append({"source": {"$eq": source}})
    if min_score is not None:
        clauses.append({"score": {"$gte": min_score}})

    if not clauses:
        where = None
    elif len(clauses) == 1:
        where = clauses[0]
    else:
        where = {"$and": clauses}

    raw = collection.query(
        query_embeddings=[query_vec],
        n_results=n,
        where=where,
        include=["metadatas", "documents", "distances"],
    )

    ids = raw["ids"][0]
    triples = zip(raw["metadatas"][0], raw["documents"][0], raw["distances"][0])

    hits = []
    for idx, (meta, _doc, dist) in enumerate(triples):
        hits.append({
            "rank": idx + 1,
            "url": ids[idx],
            "title": meta.get("title", ""),
            "category": meta.get("category", ""),
            "source": meta.get("source", ""),
            "score": meta.get("score", 0),
            "tags": meta.get("tags", "").split(","),
            "similarity": round(1 - dist, 4),
        })
    return hits
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_stats() -> dict:
    """Return the total number of vectors stored in the collection."""
    collection = get_collection(get_client(), None)
    return {"total": collection.count()}
|
openmark/stores/neo4j_store.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Neo4j store — knowledge graph.
|
| 3 |
+
|
| 4 |
+
Nodes: Bookmark, Tag, Category, Source, Domain
|
| 5 |
+
Edges: TAGGED, IN_CATEGORY, FROM_SOURCE, FROM_DOMAIN, SIMILAR_TO, CO_OCCURS_WITH
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from urllib.parse import urlparse
|
| 10 |
+
from neo4j import GraphDatabase
|
| 11 |
+
from openmark import config
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_driver():
    """Create a new Neo4j driver from config credentials.

    Caller is responsible for closing it (driver.close()).
    """
    return GraphDatabase.driver(
        config.NEO4J_URI,
        auth=(config.NEO4J_USER, config.NEO4J_PASSWORD),
    )
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def setup_constraints(driver):
    """Create uniqueness constraints once."""
    # One uniqueness constraint per node label / key property.
    stmts = (
        "CREATE CONSTRAINT bookmark_url IF NOT EXISTS FOR (b:Bookmark) REQUIRE b.url IS UNIQUE",
        "CREATE CONSTRAINT tag_name IF NOT EXISTS FOR (t:Tag) REQUIRE t.name IS UNIQUE",
        "CREATE CONSTRAINT category_name IF NOT EXISTS FOR (c:Category) REQUIRE c.name IS UNIQUE",
        "CREATE CONSTRAINT source_name IF NOT EXISTS FOR (s:Source) REQUIRE s.name IS UNIQUE",
        "CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE",
    )
    with driver.session(database=config.NEO4J_DATABASE) as session:
        for stmt in stmts:
            try:
                session.run(stmt)
            except Exception as e:
                # Best-effort: the constraint may already exist on re-runs.
                print(f" Constraint (already exists or error): {e}")
    print("Constraints ready.")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def extract_domain(url: str) -> str:
    """Return the URL's host with a leading "www." stripped.

    Falls back to "unknown" when the URL cannot be parsed.
    """
    try:
        # removeprefix only strips a *leading* "www." — the previous
        # str.replace("www.", "") also mangled hosts that merely contain
        # "www." elsewhere (e.g. "mywww.example.com").
        return urlparse(url).netloc.removeprefix("www.")
    except Exception:
        return "unknown"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def ingest(items: list[dict], driver=None):
    """Write all nodes and relationships to Neo4j.

    Args:
        items: normalized bookmark dicts (url, title, category, tags,
            score, source).
        driver: optional existing Neo4j driver; when omitted one is
            created here and closed before returning — even on failure
            (the original leaked the driver if a batch raised).
    """
    own_driver = driver is None
    if own_driver:
        driver = get_driver()

    try:
        setup_constraints(driver)

        total = len(items)
        batch_size = 200

        print(f"Neo4j ingesting {total} items...")

        for start in range(0, total, batch_size):
            batch = items[start:start + batch_size]

            # One write transaction per batch keeps transactions small.
            with driver.session(database=config.NEO4J_DATABASE) as session:
                session.execute_write(_write_batch, batch)

            print(f" Neo4j wrote {min(start + batch_size, total)}/{total}")

        print("Building tag co-occurrence edges...")
        _build_tag_cooccurrence(driver)

        print("Neo4j ingestion complete.")
    finally:
        # Only close drivers we opened ourselves.
        if own_driver:
            driver.close()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _write_batch(tx, batch: list[dict]):
    """Write one batch of bookmarks (nodes + edges) in a single transaction.

    Creates/updates Bookmark, Category, Source, Domain and Tag nodes and
    the relationships connecting them. MERGE keeps the writes idempotent
    across re-runs.
    """
    for item in batch:
        url = item["url"]
        # Cap stored title length to match the ChromaDB metadata cap.
        title = item["title"][:500]
        category = item["category"]
        tags = item["tags"]
        score = float(item["score"])
        source = item["source"]
        domain = extract_domain(url)

        # Bookmark node
        tx.run("""
            MERGE (b:Bookmark {url: $url})
            SET b.title = $title, b.score = $score
        """, url=url, title=title, score=score)

        # Category node + relationship
        tx.run("""
            MERGE (c:Category {name: $cat})
            WITH c
            MATCH (b:Bookmark {url: $url})
            MERGE (b)-[:IN_CATEGORY]->(c)
        """, cat=category, url=url)

        # Source node + relationship
        tx.run("""
            MERGE (s:Source {name: $src})
            WITH s
            MATCH (b:Bookmark {url: $url})
            MERGE (b)-[:FROM_SOURCE]->(s)
        """, src=source, url=url)

        # Domain node + relationship (skipped when parsing failed)
        if domain and domain != "unknown":
            tx.run("""
                MERGE (d:Domain {name: $domain})
                WITH d
                MATCH (b:Bookmark {url: $url})
                MERGE (b)-[:FROM_DOMAIN]->(d)
            """, domain=domain, url=url)

        # Tag nodes + relationships
        for tag in tags:
            if not tag:
                # Skip empty strings (e.g. from splitting "a,,b").
                continue
            tx.run("""
                MERGE (t:Tag {name: $tag})
                WITH t
                MATCH (b:Bookmark {url: $url})
                MERGE (b)-[:TAGGED]->(t)
            """, tag=tag, url=url)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _build_tag_cooccurrence(driver):
    """
    For each bookmark with multiple tags, create CO_OCCURS_WITH edges between tags.
    Weight = number of bookmarks where both tags appear together.

    The weight is recomputed from scratch via an aggregate, so re-running
    the ingest does not inflate previously written counts (the original
    ON CREATE/ON MATCH increment double-counted on every re-run).
    """
    with driver.session(database=config.NEO4J_DATABASE) as session:
        session.run("""
            MATCH (b:Bookmark)-[:TAGGED]->(t1:Tag)
            MATCH (b)-[:TAGGED]->(t2:Tag)
            WHERE t1.name < t2.name
            WITH t1, t2, count(b) AS pairs
            MERGE (t1)-[r:CO_OCCURS_WITH]-(t2)
            SET r.count = pairs
        """)
    print(" Tag co-occurrence edges built.")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def add_similar_to_edges(similar_pairs: list[tuple[str, str, float]], driver=None):
    """
    Write SIMILAR_TO edges derived from ChromaDB nearest-neighbor search.
    similar_pairs = [(url_a, url_b, similarity_score), ...]

    When no driver is supplied, one is created here and closed before
    returning — even on failure (the original leaked the driver if a
    write raised).
    """
    own_driver = driver is None
    if own_driver:
        driver = get_driver()

    try:
        with driver.session(database=config.NEO4J_DATABASE) as session:
            for url_a, url_b, score in similar_pairs:
                session.run("""
                    MATCH (a:Bookmark {url: $url_a})
                    MATCH (b:Bookmark {url: $url_b})
                    MERGE (a)-[r:SIMILAR_TO]-(b)
                    SET r.score = $score
                """, url_a=url_a, url_b=url_b, score=score)

        print(f" SIMILAR_TO: {len(similar_pairs)} edges written.")
    finally:
        if own_driver:
            driver.close()
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def query(cypher: str, params: dict | None = None) -> list[dict]:
    """Run arbitrary Cypher and return results as a list of dicts.

    Opens a fresh driver per call and guarantees it is closed even when
    the query raises (the original leaked the driver on error).
    """
    driver = get_driver()
    try:
        with driver.session(database=config.NEO4J_DATABASE) as session:
            result = session.run(cypher, params or {})
            # Consume inside the session — records are invalid after close.
            return [dict(r) for r in result]
    finally:
        driver.close()
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def get_stats() -> dict:
    """Return node counts (bookmarks, tags, categories) from the graph."""
    rows = query("""
        MATCH (b:Bookmark) WITH count(b) AS bookmarks
        MATCH (t:Tag) WITH bookmarks, count(t) AS tags
        MATCH (c:Category) WITH bookmarks, tags, count(c) AS categories
        RETURN bookmarks, tags, categories
    """)
    # Single-row result; empty dict when the graph is empty of any label.
    return rows[0] if rows else {}
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def find_similar(url: str, limit: int = 10) -> list[dict]:
    """Return bookmarks linked to *url* via SIMILAR_TO, best match first."""
    cypher = """
        MATCH (b:Bookmark {url: $url})-[r:SIMILAR_TO]-(other:Bookmark)
        RETURN other.url AS url, other.title AS title, r.score AS similarity
        ORDER BY r.score DESC LIMIT $limit
    """
    return query(cypher, {"url": url, "limit": limit})
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def find_by_tag(tag: str, limit: int = 20) -> list[dict]:
    """Return bookmarks carrying *tag* (lowercased), highest score first."""
    params = {"tag": tag.lower(), "limit": limit}
    cypher = """
        MATCH (b:Bookmark)-[:TAGGED]->(t:Tag {name: $tag})
        RETURN b.url AS url, b.title AS title, b.score AS score
        ORDER BY b.score DESC LIMIT $limit
    """
    return query(cypher, params)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def find_tag_cluster(tag: str, hops: int = 2, limit: int = 30) -> list[dict]:
    """Follow CO_OCCURS_WITH edges to find related tags and their bookmarks.

    `hops` is interpolated into the Cypher pattern (variable-length path
    bounds cannot be parameterized), so it is validated as a positive int
    first to keep the query safe from injection via a non-int argument.
    """
    hops = int(hops)
    if hops < 1:
        raise ValueError("hops must be >= 1")
    return query(f"""
        MATCH (t:Tag {{name: $tag}})-[:CO_OCCURS_WITH*1..{hops}]-(related:Tag)
        MATCH (b:Bookmark)-[:TAGGED]->(related)
        RETURN DISTINCT b.url AS url, b.title AS title, b.score AS score, related.name AS via_tag
        ORDER BY b.score DESC LIMIT $limit
    """, {"tag": tag.lower(), "limit": limit})
|
openmark/ui/__init__.py
ADDED
|
File without changes
|
openmark/ui/app.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenMark Gradio UI — 3 tabs:
|
| 3 |
+
1. Chat — talk to the LangGraph agent
|
| 4 |
+
2. Search — instant semantic search with filters
|
| 5 |
+
3. Stats — knowledge base overview
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 11 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
from openmark.agent.graph import build_agent, ask
|
| 15 |
+
from openmark.embeddings.factory import get_embedder
|
| 16 |
+
from openmark.stores import chroma as chroma_store
|
| 17 |
+
from openmark.stores import neo4j_store
|
| 18 |
+
from openmark import config
|
| 19 |
+
|
| 20 |
+
# Load once at startup
|
| 21 |
+
print("Loading OpenMark...")
|
| 22 |
+
_embedder = get_embedder()
|
| 23 |
+
_agent = build_agent()
|
| 24 |
+
print("OpenMark ready.")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ── Chat tab ──────────────────────────────────────────────────
|
| 28 |
+
|
| 29 |
+
def chat_fn(message: str, history: list, thread_id: str):
    """Handle one chat turn: query the agent, append both turns to history.

    Returns (updated_history, "") — the empty string clears the input box.
    """
    if not message.strip():
        return history, ""
    reply = ask(_agent, message, thread_id=thread_id or "default")
    history.extend([
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply},
    ])
    return history, ""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── Search tab ────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
def search_fn(query: str, category: str, min_score: float, n_results: int):
    """Run a semantic search and render the hits as Markdown."""
    if not query.strip():
        return "Enter a search query."

    # "All" and a 0 score slider mean "no filter".
    hits = chroma_store.search(
        query,
        _embedder,
        n=int(n_results),
        category=None if category == "All" else category,
        min_score=min_score if min_score > 0 else None,
    )

    if not hits:
        return "No results found."

    blocks = []
    for hit in hits:
        tag_list = ", ".join(t for t in hit["tags"] if t)
        blocks.append(
            f"**{hit['rank']}. {hit['title'] or hit['url']}**\n"
            f"🔗 {hit['url']}\n"
            f"📁 {hit['category']} | 📌 {tag_list} | "
            f"⭐ {hit['score']} | 🎯 {hit['similarity']:.3f} similarity\n"
        )
    return "\n---\n".join(blocks)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ── Stats tab ─────────────────────────────────────────────────
|
| 67 |
+
|
| 68 |
+
def stats_fn():
    """Build the Markdown knowledge-base overview shown in the Stats tab."""
    chroma = chroma_store.get_stats()
    neo4j = neo4j_store.get_stats()

    # Category breakdown from Neo4j
    cat_rows = neo4j_store.query("""
        MATCH (b:Bookmark)-[:IN_CATEGORY]->(c:Category)
        RETURN c.name AS category, count(b) AS count
        ORDER BY count DESC
    """)
    cat_lines = "\n".join(f" {r['category']:<35} {r['count']:>5}" for r in cat_rows)

    # Top tags
    tag_rows = neo4j_store.query("""
        MATCH (b:Bookmark)-[:TAGGED]->(t:Tag)
        RETURN t.name AS tag, count(b) AS count
        ORDER BY count DESC LIMIT 20
    """)
    tag_lines = ", ".join(f"{r['tag']} ({r['count']})" for r in tag_rows)

    return (
        f"## OpenMark Knowledge Base\n\n"
        f"**ChromaDB vectors:** {chroma.get('total', 0)}\n"
        f"**Neo4j bookmarks:** {neo4j.get('bookmarks', 0)}\n"
        f"**Neo4j tags:** {neo4j.get('tags', 0)}\n"
        f"**Neo4j categories:** {neo4j.get('categories', 0)}\n\n"
        f"### By Category\n```\n{cat_lines}\n```\n\n"
        f"### Top Tags\n{tag_lines}"
    )
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ── Build UI ──────────────────────────────────────────────────
|
| 100 |
+
|
| 101 |
+
def build_ui():
    """Assemble the 3-tab Gradio Blocks app (Chat / Search / Stats)."""
    categories = ["All"] + config.CATEGORIES

    with gr.Blocks(title="OpenMark", theme=gr.themes.Soft()) as app:
        gr.Markdown("# OpenMark — Your Personal Knowledge Graph")

        with gr.Tabs():

            # Tab 1: Chat
            with gr.Tab("Chat"):
                thread = gr.Textbox(value="default", label="Session ID", scale=1)
                chatbot = gr.Chatbot(type="messages", height=500)
                msg_box = gr.Textbox(
                    placeholder="Ask anything about your saved bookmarks...",
                    label="Message", lines=2,
                )
                send_btn = gr.Button("Send", variant="primary")

                # Button click and Enter-submit share the same handler.
                send_btn.click(
                    chat_fn,
                    inputs=[msg_box, chatbot, thread],
                    outputs=[chatbot, msg_box],
                )
                msg_box.submit(
                    chat_fn,
                    inputs=[msg_box, chatbot, thread],
                    outputs=[chatbot, msg_box],
                )

            # Tab 2: Search
            with gr.Tab("Search"):
                with gr.Row():
                    q_input = gr.Textbox(placeholder="Search your knowledge base...", label="Query", scale=3)
                    cat_input = gr.Dropdown(categories, value="All", label="Category")
                with gr.Row():
                    score_input = gr.Slider(0, 10, value=0, step=1, label="Min Quality Score")
                    n_input = gr.Slider(5, 50, value=10, step=5, label="Results")
                search_btn = gr.Button("Search", variant="primary")
                search_output = gr.Markdown()

                search_btn.click(
                    search_fn,
                    inputs=[q_input, cat_input, score_input, n_input],
                    outputs=search_output,
                )
                q_input.submit(
                    search_fn,
                    inputs=[q_input, cat_input, score_input, n_input],
                    outputs=search_output,
                )

            # Tab 3: Stats
            with gr.Tab("Stats"):
                refresh_btn = gr.Button("Refresh Stats")
                stats_output = gr.Markdown()

                refresh_btn.click(stats_fn, outputs=stats_output)
                # Populate stats automatically on page load.
                app.load(stats_fn, outputs=stats_output)

    return app
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
if __name__ == "__main__":
|
| 164 |
+
ui = build_ui()
|
| 165 |
+
ui.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
chromadb>=1.5.4
|
| 2 |
+
langchain>=0.3.25
|
| 3 |
+
langgraph>=1.0.1
|
| 4 |
+
langchain-openai>=0.3.23
|
| 5 |
+
langchain-neo4j>=0.4.0
|
| 6 |
+
sentence-transformers==3.3.1
|
| 7 |
+
transformers>=4.57.0
|
| 8 |
+
huggingface_hub>=1.6.0
|
| 9 |
+
torch>=2.0.0
|
| 10 |
+
neo4j>=5.28.1
|
| 11 |
+
gradio>=6.6.0
|
| 12 |
+
requests>=2.31.0
|
| 13 |
+
python-dotenv>=1.0.0
|
| 14 |
+
numpy>=1.24.0
|
scripts/ingest.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenMark Full Ingest Pipeline
|
| 3 |
+
Run this once (or again to update) to:
|
| 4 |
+
1. Merge all data sources (CATEGORIZED.json + LinkedIn + YouTube)
|
| 5 |
+
2. Embed everything with chosen provider (local pplx-embed or Azure)
|
| 6 |
+
3. Store in ChromaDB (semantic search)
|
| 7 |
+
4. Store in Neo4j (knowledge graph)
|
| 8 |
+
5. Compute SIMILAR_TO edges (top-5 neighbors per bookmark → graph edges)
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
C:\\Python313\\python scripts/ingest.py
|
| 12 |
+
C:\\Python313\\python scripts/ingest.py --provider azure
|
| 13 |
+
C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import sys
|
| 17 |
+
import os
|
| 18 |
+
import argparse
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 21 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 22 |
+
|
| 23 |
+
from openmark.pipeline.merge import merge_all
|
| 24 |
+
from openmark.embeddings.factory import get_embedder
|
| 25 |
+
from openmark.stores import chroma as chroma_store
|
| 26 |
+
from openmark.stores import neo4j_store
|
| 27 |
+
from openmark import config
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def build_similar_to_edges(items: list[dict], embedder, top_k: int = 5):
    """
    For each item, find its top-k nearest neighbors in ChromaDB
    and write SIMILAR_TO edges in Neo4j.
    This creates the semantic web inside the graph.

    Lookups are best-effort: a failing query for one bookmark is counted
    and reported at the end instead of silently swallowed, but never
    aborts the run.
    """
    print(f"\nBuilding SIMILAR_TO edges (top-{top_k} per bookmark)...")
    pairs = []
    failures = 0
    total = len(items)

    for i, item in enumerate(items):
        url = item["url"]
        try:
            # top_k + 1 because the item itself is usually its own best match.
            results = chroma_store.search(
                item["doc_text"], embedder, n=top_k + 1
            )
            for r in results:
                # 0.5 similarity floor keeps weak links out of the graph.
                if r["url"] != url and r["similarity"] > 0.5:
                    pairs.append((url, r["url"], r["similarity"]))
        except Exception:
            failures += 1  # best-effort: skip items whose lookup fails

        if (i + 1) % 500 == 0:
            print(f" Processed {i+1}/{total} for SIMILAR_TO")

    if failures:
        print(f" Warning: {failures} neighbor lookups failed and were skipped.")
    print(f" Writing {len(pairs)} SIMILAR_TO edges to Neo4j...")
    neo4j_store.add_similar_to_edges(pairs)
    print(" SIMILAR_TO done.")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def main():
    """Run the full ingest: merge sources, embed, store in Chroma + Neo4j."""
    parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
    parser.add_argument("--provider", default=None, help="Embedding provider: local or azure")
    parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
    parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
    args = parser.parse_args()

    # Override the provider via env var so config picks it up lazily.
    if args.provider:
        os.environ["EMBEDDING_PROVIDER"] = args.provider

    print("=" * 60)
    print("OPENMARK INGEST PIPELINE")
    print(f"Embedding: {config.EMBEDDING_PROVIDER}")
    print("=" * 60)

    # Step 1: Merge all sources
    print("\n[1/4] Merging data sources...")
    items = merge_all(include_fresh_raindrop=args.fresh_raindrop)

    # Step 2: Load embedder
    print(f"\n[2/4] Loading {config.EMBEDDING_PROVIDER} embedder...")
    embedder = get_embedder()

    # Step 3: ChromaDB
    print("\n[3/4] Ingesting into ChromaDB...")
    chroma_store.ingest(items, embedder)

    # Step 4: Neo4j
    print("\n[4/4] Ingesting into Neo4j...")
    neo4j_store.ingest(items)

    # Step 5: SIMILAR_TO edges
    # NOTE(review): labels above say [1/4] but this is a fifth step —
    # consider renumbering the progress labels to [n/5].
    if not args.skip_similar:
        build_similar_to_edges(items, embedder, top_k=5)

    print("\n" + "=" * 60)
    print("INGEST COMPLETE")
    chroma = chroma_store.get_stats()
    neo4j = neo4j_store.get_stats()
    print(f" ChromaDB: {chroma.get('total', 0)} vectors")
    print(f" Neo4j: {neo4j.get('bookmarks', 0)} bookmarks, {neo4j.get('tags', 0)} tags")
    print("=" * 60)
    print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
    print("    or: C:\\Python313\\python -m openmark.ui.app")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
if __name__ == "__main__":
|
| 107 |
+
main()
|
scripts/search.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenMark CLI Search — instant search from terminal.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
C:\\Python313\\python scripts/search.py "RAG tools"
|
| 6 |
+
C:\\Python313\\python scripts/search.py "LangGraph" --category "Agent Development"
|
| 7 |
+
C:\\Python313\\python scripts/search.py "embeddings" --n 20
|
| 8 |
+
C:\\Python313\\python scripts/search.py --tag "rag"
|
| 9 |
+
C:\\Python313\\python scripts/search.py --stats
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
import argparse
|
| 15 |
+
|
| 16 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 17 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 18 |
+
|
| 19 |
+
from openmark.embeddings.factory import get_embedder
|
| 20 |
+
from openmark.stores import chroma as chroma_store
|
| 21 |
+
from openmark.stores import neo4j_store
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def print_results(results: list[dict]):
    """Pretty-print search results to the terminal.

    Score and similarity use an empty-string sentinel when missing; they
    are compared against "" (not truthiness) so a legitimate value of 0
    is still printed — the original `if score:` hid zero scores.
    """
    if not results:
        print("No results found.")
        return
    for r in results:
        title = r.get("title") or r.get("url")
        url = r.get("url", "")
        cat = r.get("category", "")
        sim = r.get("similarity", "")
        score = r.get("score", "")
        tags = ", ".join(t for t in r.get("tags", []) if t)
        print(f"\n {r.get('rank', '-')}. {title}")
        print(f" {url}")
        if cat:
            print(f" Category: {cat}")
        if tags:
            print(f" Tags: {tags}")
        if score != "":
            print(f" Score: {score}")
        if sim != "":
            print(f" Similarity: {sim}")
|
| 42 |
+
|
| 43 |
+
def main():
    """CLI entry point: stats, tag lookup (graph), or semantic search."""
    parser = argparse.ArgumentParser(description="OpenMark CLI Search")
    parser.add_argument("query", nargs="?", default=None, help="Search query")
    parser.add_argument("--category", default=None, help="Filter by category")
    parser.add_argument("--tag", default=None, help="Search by tag (graph lookup)")
    parser.add_argument("--n", type=int, default=10, help="Number of results")
    parser.add_argument("--stats", action="store_true", help="Show knowledge base stats")
    args = parser.parse_args()

    # --stats short-circuits: no embedder load needed.
    if args.stats:
        chroma = chroma_store.get_stats()
        neo4j = neo4j_store.get_stats()
        print("\nOpenMark Stats:")
        print(f" ChromaDB vectors: {chroma.get('total', 0)}")
        print(f" Neo4j bookmarks: {neo4j.get('bookmarks', 0)}")
        print(f" Neo4j tags: {neo4j.get('tags', 0)}")
        return

    # --tag goes straight to the Neo4j graph, no embedding involved.
    if args.tag:
        print(f"\nSearching by tag: '{args.tag}'")
        results = neo4j_store.find_by_tag(args.tag, limit=args.n)
        for r in results:
            print(f"\n - {r.get('title', '')}")
            print(f"   {r.get('url', '')} (score: {r.get('score', '')})")
        return

    if not args.query:
        parser.print_help()
        return

    print(f"\nSearching: '{args.query}'")
    if args.category:
        print(f"Category filter: {args.category}")

    # Semantic path: load embedder lazily, only when actually searching.
    embedder = get_embedder()
    results = chroma_store.search(
        args.query, embedder, n=args.n, category=args.category
    )
    print_results(results)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
main()
|