codingwithadi commited on
Commit
81598c5
·
verified ·
1 Parent(s): 39cff07

Upload folder using huggingface_hub

Browse files
.env.example ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Embedding Provider: "local" or "azure" ───────────────────
2
+ EMBEDDING_PROVIDER=local
3
+
4
+ # ── Local pplx-embed ─────────────────────────────────────────
5
+ PPLX_QUERY_MODEL=perplexity-ai/pplx-embed-v1-0.6b
6
+ PPLX_DOC_MODEL=perplexity-ai/pplx-embed-context-v1-0.6b
7
+
8
+ # ── Azure AI Foundry ──────────────────────────────────────────
9
+ AZURE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
10
+ AZURE_API_KEY=your-azure-api-key
11
+ AZURE_DEPLOYMENT_LLM=gpt-4o-mini
12
+ AZURE_DEPLOYMENT_EMBED=text-embedding-ada-002
13
+ AZURE_API_VERSION=2024-05-01-preview
14
+
15
+ # ── Neo4j ─────────────────────────────────────────────────────
16
+ NEO4J_URI=bolt://127.0.0.1:7687
17
+ NEO4J_USER=neo4j
18
+ NEO4J_PASSWORD=your-neo4j-password
19
+ NEO4J_DATABASE=db1
20
+
21
+ # ── Raindrop ──────────────────────────────────────────────────
22
+ RAINDROP_TOKEN=your-raindrop-test-token
23
+
24
+ # ── Data paths ────────────────────────────────────────────────
25
+ RAINDROP_MISSION_DIR=C:\path\to\raindrop-mission
26
+ CHROMA_PATH=C:\path\to\OpenMark\data\chroma_db
.gitattributes CHANGED
@@ -1,35 +1,8 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Normalize line endings
2
+ * text=auto
3
+ *.py text eol=lf
4
+ *.md text eol=lf
5
+ *.txt text eol=lf
6
+ *.json text eol=lf
7
+ *.env text eol=lf
8
+ *.gitignore text eol=lf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Credentials — NEVER commit ───────────────────────────────
2
+ .env
3
+
4
+ # ── Personal data — your bookmark vectors ────────────────────
5
+ data/chroma_db/
6
+
7
+ # ── Python ────────────────────────────────────────────────────
8
+ __pycache__/
9
+ *.py[cod]
10
+ *.pyo
11
+ *.pyd
12
+ .Python
13
+ *.egg-info/
14
+ dist/
15
+ build/
16
+ *.egg
17
+ .eggs/
18
+
19
+ # ── Virtual environments ──────────────────────────────────────
20
+ venv/
21
+ .venv/
22
+ env/
23
+ ENV/
24
+
25
+ # ── IDE ───────────────────────────────────────────────────────
26
+ .idea/
27
+ .vscode/
28
+ *.swp
29
+ *.swo
30
+ .DS_Store
31
+ Thumbs.db
32
+
33
+ # ── Logs & temp ───────────────────────────────────────────────
34
+ *.log
35
+ *.tmp
36
+ *.bak
37
+
38
+ # ── HuggingFace cache (large model files) ────────────────────
39
+ .cache/
40
+
41
+ # ── Raw data exports — personal, not for the repo ────────────
42
+ raindrop-mission/
43
+ data/linkedin_saved.json
44
+ data/youtube_MASTER.json
45
+ data/CATEGORIZED.json
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ahmad Othman Ammar Adi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,31 +1,267 @@
1
- ---
2
- title: OpenMark
3
- emoji: 🔖
4
- colorFrom: blue
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: "6.6.0"
8
- app_file: app.py
9
- pinned: true
10
- license: mit
11
- short_description: Personal knowledge graph — search 8K+ bookmarks with AI
12
- tags:
13
- - rag
14
- - knowledge-graph
15
- - neo4j
16
- - chromadb
17
- - langraph
18
- - pplx-embed
19
- - second-brain
20
- - bookmarks
21
- ---
22
-
23
- # OpenMark
24
-
25
- **Personal knowledge graph** — 8,000+ bookmarks, LinkedIn saves, and YouTube videos indexed with pplx-embed, searchable with ChromaDB and Neo4j, queryable via a LangGraph agent.
26
-
27
- Built by [Ahmad Othman Ammar Adi](https://github.com/OthmanAdi) · [GitHub](https://github.com/OthmanAdi/OpenMark)
28
-
29
- ## Setup required
30
-
31
- This Space requires your own credentials (Neo4j, Azure, Raindrop). See the [GitHub repo](https://github.com/OthmanAdi/OpenMark) for full setup instructions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenMark
2
+
3
+ **Your personal knowledge graph — built from everything you've ever saved.**
4
+
5
+ OpenMark ingests your bookmarks, LinkedIn saved posts, and YouTube videos into a dual-store knowledge system: **ChromaDB** for semantic vector search and **Neo4j** for graph-based connection discovery. A LangGraph agent sits on top, letting you query everything in natural language.
6
+
7
+ Built by [Ahmad Othman Ammar Adi](https://github.com/OthmanAdi).
8
+
9
+ ---
10
+
11
+ ## What it does
12
+
13
+ - Pulls all your saved content from multiple sources into one place
14
+ - Embeds everything using [pplx-embed](https://huggingface.co/collections/perplexity-ai/pplx-embed) (local, free) or Azure AI Foundry (fast, cheap)
15
+ - Stores vectors in **ChromaDB** — find things by *meaning*, not keywords
16
+ - Builds a **Neo4j knowledge graph** — discover how topics connect
17
+ - Runs a **LangGraph agent** (powered by gpt-4o-mini) that searches both stores intelligently
18
+ - Serves a **Gradio UI** with Chat, Search, and Stats tabs
19
+ - Also works as a **CLI** — `python scripts/search.py "RAG tools"`
20
+
21
+ ---
22
+
23
+ ## Data Sources
24
+
25
+ ### 1. Raindrop.io
26
+
27
+ Create a test token at [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations).
28
+ OpenMark pulls **all collections** automatically via the Raindrop REST API.
29
+
30
+ ### 2. Browser Bookmarks
31
+
32
+ Export your bookmarks as an HTML file from Edge, Chrome, or Firefox:
33
+ - **Edge:** `Settings → Favourites → ··· → Export favourites` → save as `favorites.html`
34
+ - **Chrome/Firefox:** `Bookmarks Manager → Export`
35
+
36
+ Point `RAINDROP_MISSION_DIR` in your `.env` to the folder containing the exported HTML files.
37
+ The pipeline parses the Netscape bookmark format automatically.
38
+
39
+ ### 3. LinkedIn Saved Posts
40
+
41
+ LinkedIn does not provide a public API for saved posts. The included `linkedin_fetch.py` script uses your browser session cookie to call LinkedIn's internal Voyager GraphQL API.
42
+
43
+ **Steps:**
44
+ 1. Log into LinkedIn in your browser
45
+ 2. Open DevTools → Application → Cookies → copy the value of `li_at`
46
+ 3. Run:
47
+ ```bash
48
+ python raindrop-mission/linkedin_fetch.py
49
+ ```
50
+ Paste your `li_at` cookie when prompted. The script fetches all saved posts and writes `linkedin_saved.json`.
51
+
52
+ > **Personal use only.** This uses LinkedIn's internal API which is not publicly documented or officially supported. Use responsibly.
53
+
54
+ ### 4. YouTube
55
+
56
+ Uses the official [YouTube Data API v3](https://developers.google.com/youtube/v3) via OAuth 2.0.
57
+
58
+ **Steps:**
59
+ 1. Go to [Google Cloud Console](https://console.cloud.google.com/) → Create a project
60
+ 2. Enable the **YouTube Data API v3**
61
+ 3. Create OAuth 2.0 credentials → Download as `client_secret.json`
62
+ 4. Add your Google account as a test user (OAuth consent screen → Test users)
63
+ 5. Run:
64
+ ```bash
65
+ python raindrop-mission/youtube_fetch.py
66
+ ```
67
+ A browser window opens for auth. After that, `youtube_MASTER.json` is written with liked videos, watch later, and playlists.
68
+
69
+ ---
70
+
71
+ ## How it works
72
+
73
+ ```
74
+ Your saved content
75
+
76
+
77
+ normalize.py ← clean titles, dedupe by URL, fix categories
78
+
79
+
80
+ EmbeddingProvider ← LOCAL: pplx-embed-context-v1-0.6b (documents)
81
+ pplx-embed-v1-0.6b (queries)
82
+ AZURE: text-embedding-ada-002
83
+
84
+ ├──────────────────────────────────┐
85
+ ▼ ▼
86
+ ChromaDB Neo4j
87
+ (vector store) (knowledge graph)
88
+ find by meaning find by connection
89
+
90
+ "show me RAG tools" "what connects LangGraph
91
+ to my Neo4j saves?"
92
+ │ │
93
+ └──────────────┬───────────────────┘
94
+
95
+ LangGraph Agent
96
+ (gpt-4o-mini)
97
+
98
+
99
+ Gradio UI / CLI
100
+ ```
101
+
102
+ ### Why embeddings?
103
+
104
+ An embedding is a list of numbers that represents the *meaning* of a piece of text. Two pieces of text with similar meaning will have similar numbers — even if they use completely different words. This is how OpenMark finds "retrieval augmented generation tutorials" when you search "RAG tools."
105
+
106
+ ### Why ChromaDB?
107
+
108
+ ChromaDB stores those embedding vectors locally on your disk. It's a persistent vector database — no server, no cloud, no API key. When you search, it compares your query's embedding against all stored embeddings and returns the closest matches.
109
+
110
+ ### Why Neo4j?
111
+
112
+ Embeddings answer "what's similar?" — Neo4j answers "how are these connected?" Every bookmark is a node. Tags, categories, domains, and sources are also nodes. Edges connect them. After ingestion, OpenMark also writes `SIMILAR_TO` edges derived from embedding neighbors — so the graph contains semantic connections you never manually created. You can then traverse: *"start from this LangChain article, walk similar-to 2 hops, what clusters emerge?"*
113
+
114
+ ---
115
+
116
+ ## Requirements
117
+
118
+ - Python 3.13
119
+ - Neo4j Desktop (local) or AuraDB (cloud) — [neo4j.com/download](https://neo4j.com/download/)
120
+ - **Either** Azure AI Foundry account **or** enough disk space for local pplx-embed (~1.2 GB)
121
+
122
+ ---
123
+
124
+ ## Setup
125
+
126
+ ### 1. Clone and install
127
+
128
+ ```bash
129
+ git clone https://github.com/OthmanAdi/OpenMark.git
130
+ cd OpenMark
131
+ pip install -r requirements.txt
132
+ ```
133
+
134
+ ### 2. Configure
135
+
136
+ ```bash
137
+ cp .env.example .env
138
+ ```
139
+
140
+ Edit `.env` with your values:
141
+
142
+ ```env
143
+ # Choose your embedding provider
144
+ EMBEDDING_PROVIDER=local # or: azure
145
+
146
+ # Azure AI Foundry (required if EMBEDDING_PROVIDER=azure, also used for the LLM agent)
147
+ AZURE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
148
+ AZURE_API_KEY=your-key
149
+ AZURE_DEPLOYMENT_LLM=gpt-4o-mini
150
+ AZURE_DEPLOYMENT_EMBED=text-embedding-ada-002
151
+
152
+ # Neo4j
153
+ NEO4J_URI=bolt://127.0.0.1:7687
154
+ NEO4J_USER=neo4j
155
+ NEO4J_PASSWORD=your-password
156
+ NEO4J_DATABASE=neo4j
157
+
158
+ # Raindrop (get token at app.raindrop.io/settings/integrations)
159
+ RAINDROP_TOKEN=your-token
160
+
161
+ # Path to your raindrop-mission data folder
162
+ RAINDROP_MISSION_DIR=C:\path\to\raindrop-mission
163
+ ```
164
+
165
+ ### 3. Ingest
166
+
167
+ ```bash
168
+ # Local embeddings (free, ~20 min for 8K items on CPU)
169
+ python scripts/ingest.py
170
+
171
+ # Azure embeddings (fast, ~5 min, costs ~€0.30 for 8K items)
172
+ python scripts/ingest.py --provider azure
173
+
174
+ # Also pull fresh from Raindrop API during ingest
175
+ python scripts/ingest.py --fresh-raindrop
176
+ ```
177
+
178
+ ### 4. Search (CLI)
179
+
180
+ ```bash
181
+ python scripts/search.py "RAG tools"
182
+ python scripts/search.py "LangGraph" --category "Agent Development"
183
+ python scripts/search.py --tag "rag"
184
+ python scripts/search.py --stats
185
+ ```
186
+
187
+ ### 5. Launch UI
188
+
189
+ ```bash
190
+ python openmark/ui/app.py
191
+ ```
192
+
193
+ Open [http://localhost:7860](http://localhost:7860)
194
+
195
+ ---
196
+
197
+ ## Required API Keys
198
+
199
+ | Key | Where to get it | Required? |
200
+ |-----|----------------|-----------|
201
+ | `RAINDROP_TOKEN` | [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations) | Yes |
202
+ | `AZURE_API_KEY` | Azure Portal → your AI Foundry resource | Only if `EMBEDDING_PROVIDER=azure` |
203
+ | `NEO4J_PASSWORD` | Set when creating your Neo4j database | Yes |
204
+ | YouTube OAuth | Google Cloud Console → YouTube Data API v3 | Only if ingesting YouTube |
205
+
206
+ No HuggingFace token is needed for local pplx-embed. The models are open weights and download automatically. You will see a warning `"You are sending unauthenticated requests to the HF Hub"` — this is harmless and can be silenced by setting `HF_TOKEN` in your `.env` if you want higher rate limits.
207
+
208
+ ---
209
+
210
+ ## Project Structure
211
+
212
+ ```
213
+ OpenMark/
214
+ ├── openmark/
215
+ │ ├── config.py ← all settings loaded from .env
216
+ │ ├── pipeline/
217
+ │ │ ├── raindrop.py ← pull all Raindrop collections via API
218
+ │ │ ├── normalize.py ← clean, dedupe, build embedding text
219
+ │ │ └── merge.py ← combine all sources
220
+ │ ├── embeddings/
221
+ │ │ ├── base.py ← abstract EmbeddingProvider interface
222
+ │ │ ├── local.py ← pplx-embed (local, free)
223
+ │ │ ├── azure.py ← Azure AI Foundry
224
+ │ │ └── factory.py ← returns provider based on .env
225
+ │ ├── stores/
226
+ │ │ ├── chroma.py ← ChromaDB: ingest + semantic search
227
+ │ │ └── neo4j_store.py ← Neo4j: graph nodes, edges, traversal
228
+ │ ├── agent/
229
+ │ │ ├── tools.py ← LangGraph tools (search, tag, graph)
230
+ │ │ └── graph.py ← create_react_agent with gpt-4o-mini
231
+ │ └── ui/
232
+ │ └── app.py ← Gradio UI (Chat / Search / Stats)
233
+ └── scripts/
234
+ ├── ingest.py ← full pipeline runner
235
+ └── search.py ← CLI search
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Roadmap
241
+
242
+ - [ ] OpenAI embeddings integration
243
+ - [ ] Ollama local LLM support
244
+ - [ ] Pinecone vector store option
245
+ - [ ] Web scraping — fetch full page content for richer embeddings
246
+ - [ ] Browser extension for real-time saving to OpenMark
247
+ - [ ] Comet / Arc browser bookmark import
248
+ - [ ] Automatic re-ingestion on schedule
249
+ - [ ] Export to Obsidian / Notion
250
+ - [ ] Multi-user support
251
+
252
+ ---
253
+
254
+ ## Documentation
255
+
256
+ | Doc | What's in it |
257
+ |-----|-------------|
258
+ | [docs/data-collection.md](docs/data-collection.md) | Full guide for each data source — Raindrop, Edge, LinkedIn cookie method, YouTube OAuth, daily.dev console script |
259
+ | [docs/ingest.md](docs/ingest.md) | All ingest flags, timing for each step, how SIMILAR_TO edges work, re-run behavior |
260
+ | [docs/architecture.md](docs/architecture.md) | Dual-store design, Neo4j graph schema, embedding patches, Cypher query examples, agent tools |
261
+ | [docs/troubleshooting.md](docs/troubleshooting.md) | pplx-embed compatibility fixes, LinkedIn queryId changes, Neo4j connection issues, Windows encoding |
262
+
263
+ ---
264
+
265
+ ## License
266
+
267
+ MIT
app.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """HuggingFace Space entry point — launches the OpenMark Gradio UI."""
2
+ import sys, os
3
+ sys.path.insert(0, os.path.dirname(__file__))
4
+ from openmark.ui.app import build_ui
5
+
6
+ if __name__ == "__main__":
7
+ ui = build_ui()
8
+ ui.launch()
data/.gitkeep ADDED
File without changes
docs/architecture.md ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture
2
+
3
+ ## Overview
4
+
5
+ OpenMark uses a **dual-store architecture** — two databases working together, each doing what it's best at.
6
+
7
+ ```
8
+ User Query
9
+
10
+ LangGraph Agent
11
+ (gpt-4o-mini)
12
+ / \
13
+ ChromaDB Neo4j
14
+ (vector store) (graph store)
15
+
16
+ "find by meaning" "find by connection"
17
+ "what's similar?" "how are things linked?"
18
+ ```
19
+
20
+ ---
21
+
22
+ ## Embedding Layer
23
+
24
+ The embedding layer is **provider-agnostic** — swap between local and cloud with one env var.
25
+
26
+ ```
27
+ EMBEDDING_PROVIDER=local → LocalEmbedder (pplx-embed, runs on your machine)
28
+ EMBEDDING_PROVIDER=azure → AzureEmbedder (Azure AI Foundry, API call)
29
+ ```
30
+
31
+ **Why two pplx-embed models?**
32
+
33
+ Perplexity AI ships two variants:
34
+ - `pplx-embed-v1-0.6b` — for encoding **queries** (what the user types)
35
+ - `pplx-embed-context-v1-0.6b` — for encoding **documents** (the bookmarks, surrounding context matters)
36
+
37
+ Using the correct model for each role improves retrieval quality. Many implementations take a shortcut and use a single model for both roles; using the dedicated query and document models, as OpenMark does, is the recommended production pattern.
38
+
39
+ **The compatibility patches:**
40
+
41
+ pplx-embed models ship with custom Python code (`st_quantize.py`) that has two incompatibilities with modern libraries:
42
+
43
+ 1. **`sentence_transformers 4.x` removed the `Module` base class** — pplx-embed's code imports it. Fixed by aliasing `torch.nn.Module` to `sentence_transformers.models.Module` before import.
44
+
45
+ 2. **`transformers 4.57` added `list_repo_templates()`** — it looks for an `additional_chat_templates` folder in every model repo. pplx-embed doesn't have one, causing a hard 404 crash. Fixed by monkey-patching the function to return an empty list on exception.
46
+
47
+ Both patches are applied in `openmark/embeddings/local.py` before any model loading.
48
+
49
+ **Why `sentence-transformers==3.3.1` specifically?**
50
+
51
+ Version 4.x removed the `Module` base class that pplx-embed depends on. Pin to 3.3.1.
52
+
53
+ ---
54
+
55
+ ## ChromaDB
56
+
57
+ Local, file-based vector database. No server, no API key, no cloud.
58
+
59
+ **Collection:** `openmark_bookmarks`
60
+ **Similarity metric:** cosine
61
+ **Data path:** `CHROMA_PATH` in `.env` (default: `OpenMark/data/chroma_db/`)
62
+
63
+ **What's stored per item:**
64
+ ```python
65
+ {
66
+ "id": url, # primary key
67
+ "document": doc_text, # rich text used for embedding
68
+ "metadata": {
69
+ "title": str,
70
+ "category": str,
71
+ "source": str, # raindrop, linkedin, youtube_liked, edge, etc.
72
+ "score": float, # quality score 1-10
73
+ "tags": str, # comma-separated
74
+ "folder": str,
75
+ },
76
+ "embedding": [float x 1024] # or 1536 for Azure
77
+ }
78
+ ```
79
+
80
+ **Querying:**
81
+ ```python
82
+ collection.query(
83
+ query_embeddings=[embedder.embed_query("RAG tools")],
84
+ n_results=10,
85
+ where={"category": {"$eq": "RAG & Vector Search"}}, # optional filter
86
+ )
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Neo4j Graph Schema
92
+
93
+ ```
94
+ (:Bookmark {url, title, score})
95
+ -[:IN_CATEGORY]-> (:Category {name})
96
+ -[:TAGGED]-> (:Tag {name})
97
+ -[:FROM_SOURCE]-> (:Source {name})
98
+ -[:FROM_DOMAIN]-> (:Domain {name})
99
+ -[:SIMILAR_TO {score}]-> (:Bookmark) ← from embeddings
100
+
101
+ (:Tag)-[:CO_OCCURS_WITH {count}]-(:Tag) ← tags that appear together
102
+ ```
103
+
104
+ **Useful Cypher queries:**
105
+
106
+ ```cypher
107
+ // Count everything
108
+ MATCH (b:Bookmark) RETURN count(b) AS bookmarks
109
+ MATCH (t:Tag) RETURN count(t) AS tags
110
+
111
+ // Top categories
112
+ MATCH (b:Bookmark)-[:IN_CATEGORY]->(c:Category)
113
+ RETURN c.name, count(b) AS count ORDER BY count DESC
114
+
115
+ // All bookmarks tagged 'rag'
116
+ MATCH (b:Bookmark)-[:TAGGED]->(t:Tag {name: 'rag'})
117
+ RETURN b.title, b.url ORDER BY b.score DESC
118
+
119
+ // Find what connects to 'langchain' tag (2 hops)
120
+ MATCH (t:Tag {name: 'langchain'})-[:CO_OCCURS_WITH*1..2]-(related:Tag)
121
+ RETURN related.name, count(*) AS strength ORDER BY strength DESC
122
+
123
+ // Similar bookmarks to a URL
124
+ MATCH (b:Bookmark {url: 'https://...'})-[r:SIMILAR_TO]-(other)
125
+ RETURN other.title, other.url, r.score ORDER BY r.score DESC
126
+
127
+ // Most connected domains
128
+ MATCH (b:Bookmark)-[:FROM_DOMAIN]->(d:Domain)
129
+ RETURN d.name, count(b) AS saved ORDER BY saved DESC LIMIT 20
130
+ ```
131
+
132
+ ---
133
+
134
+ ## LangGraph Agent
135
+
136
+ Built with `create_react_agent` from LangGraph 1.0.x.
137
+
138
+ **Model:** Azure gpt-4o-mini (streaming enabled)
139
+ **Memory:** `MemorySaver` — conversation history persists per `thread_id` within a session
140
+
141
+ **Tools:**
142
+
143
+ | Tool | Store | Description |
144
+ |------|-------|-------------|
145
+ | `search_semantic` | ChromaDB | Natural language vector search |
146
+ | `search_by_category` | ChromaDB | Filter by category + optional query |
147
+ | `find_by_tag` | Neo4j | Exact tag lookup |
148
+ | `find_similar_bookmarks` | Neo4j | SIMILAR_TO edge traversal |
149
+ | `explore_tag_cluster` | Neo4j | CO_OCCURS_WITH traversal (2 hops) |
150
+ | `get_stats` | Both | Count totals |
151
+ | `run_cypher` | Neo4j | Raw Cypher for power users |
152
+
153
+ **Agent routing:** The LLM decides which tool(s) to call based on the query. For "what do I know about RAG" it will call `search_semantic` + `search_by_category` + `find_by_tag`. For "how does LangGraph connect to my Neo4j saves" it will call `explore_tag_cluster` and `run_cypher`.
154
+
155
+ ---
156
+
157
+ ## Gradio UI
158
+
159
+ Three tabs:
160
+
161
+ | Tab | What it does |
162
+ |-----|-------------|
163
+ | Chat | Full LangGraph agent conversation. Remembers context within session. |
164
+ | Search | Direct ChromaDB search with category filter, min score slider, result count. |
165
+ | Stats | Neo4j category breakdown + top tags. Loads on startup. |
166
+
167
+ Run: `python openmark/ui/app.py` → `http://localhost:7860`
168
+
169
+ ---
170
+
171
+ ## Data Flow Summary
172
+
173
+ ```
174
+ Source files (JSON, HTML)
175
+
176
+ merge.py → normalize.py
177
+
178
+ 8,007 items with doc_text
179
+
180
+ EmbeddingProvider.embed_documents()
181
+
182
+ ┌────┴────┐
183
+ │ │
184
+ ChromaDB Neo4j
185
+ add() MERGE nodes + relationships
186
+ CO_OCCURS_WITH edges
187
+ SIMILAR_TO edges (from ChromaDB top-5 per item)
188
+ ```
docs/data-collection.md ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Collection Guide
2
+
3
+ Everything you need to collect your saved content from each source before running the ingest pipeline.
4
+
5
+ ---
6
+
7
+ ## 1. Raindrop.io
8
+
9
+ OpenMark pulls **all your Raindrop collections automatically** via the official REST API. You just need a token.
10
+
11
+ **Steps:**
12
+ 1. Go to [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations)
13
+ 2. Under "For Developers" → click **Create new app**
14
+ 3. Copy the **Test token** (permanent, no expiry)
15
+ 4. Add to `.env`:
16
+ ```
17
+ RAINDROP_TOKEN=your-token-here
18
+ ```
19
+
20
+ The pipeline fetches every collection, every sub-collection, and every unsorted raindrop automatically. No manual export needed.
21
+
22
+ ---
23
+
24
+ ## 2. Browser Bookmarks (Edge / Chrome / Firefox)
25
+
26
+ Export your bookmarks as an HTML file in the Netscape bookmark format (all browsers support this).
27
+
28
+ **Edge:**
29
+ `Settings → Favourites → ··· (three dots) → Export favourites` → save as `favorites.html`
30
+
31
+ **Chrome:**
32
+ `Bookmarks Manager (Ctrl+Shift+O) → ··· → Export bookmarks` → save as `bookmarks.html`
33
+
34
+ **Firefox:**
35
+ `Bookmarks → Manage Bookmarks → Import and Backup → Export Bookmarks to HTML`
36
+
37
+ **After exporting:**
38
+ - Place the HTML file(s) in your `raindrop-mission` folder (or wherever `RAINDROP_MISSION_DIR` points)
39
+ - The pipeline (`merge.py`) looks for `favorites_*.html` and `bookmarks_*.html` patterns
40
+ - It parses the Netscape format and extracts URLs + titles + folder structure
41
+
42
+ > **Tip:** Export fresh before every ingest to capture new bookmarks.
43
+
44
+ ---
45
+
46
+ ## 3. LinkedIn Saved Posts
47
+
48
+ LinkedIn has no public API for saved posts. OpenMark uses LinkedIn's internal **Voyager GraphQL API** — the same API the LinkedIn web app uses internally.
49
+
50
+ **This is the exact endpoint used:**
51
+ ```
52
+ https://www.linkedin.com/voyager/api/graphql
53
+ ?variables=(start:0,count:10,paginationToken:null,
54
+ query:(flagshipSearchIntent:SEARCH_MY_ITEMS_SAVED_POSTS))
55
+ &queryId=voyagerSearchDashClusters.05111e1b90ee7fea15bebe9f9410ced9
56
+ ```
57
+
58
+ **How to get your session cookie:**
59
+
60
+ 1. Log into LinkedIn in your browser
61
+ 2. Open DevTools (`F12`) → **Application** tab → **Cookies** → `https://www.linkedin.com`
62
+ 3. Find the cookie named `li_at` — copy its value
63
+ 4. Also find `JSESSIONID` — copy its value (used as CSRF token, format: `ajax:XXXXXXXXXXXXXXXXXX`)
64
+
65
+ **Run the fetch script:**
66
+ ```bash
67
+ python raindrop-mission/linkedin_fetch.py
68
+ ```
69
+ Paste your `li_at` value when prompted.
70
+
71
+ **Output:** `raindrop-mission/linkedin_saved.json` — 1,260 saved posts with author, content, and URL.
72
+
73
+ **Pagination:** LinkedIn returns 10 posts per page. The script detects end of results when no `nextPageToken` is returned. With 1,260 posts that's ~133 pages.
74
+
75
+ > **Important:** The `queryId` (`voyagerSearchDashClusters.05111e1b90ee7fea15bebe9f9410ced9`) is hardcoded in LinkedIn's JavaScript bundle and can change with LinkedIn deployments. If the script returns 0 results, intercept a fresh request from your browser's Network tab — filter for `voyagerSearchDashClusters`, copy the new `queryId`.
76
+
77
+ > **Personal use only.** This method is not officially supported by LinkedIn. Do not use for scraping at scale.
78
+
79
+ ---
80
+
81
+ ## 4. YouTube
82
+
83
+ Uses the official **YouTube Data API v3** via OAuth 2.0. Collects liked videos, watch later playlist, and any saved playlists.
84
+
85
+ **One-time setup:**
86
+
87
+ 1. Go to [Google Cloud Console](https://console.cloud.google.com/)
88
+ 2. Create a new project (e.g. "OpenMark")
89
+ 3. Enable **YouTube Data API v3** (APIs & Services → Enable APIs)
90
+ 4. Create credentials: **OAuth 2.0 Client ID** → Desktop App
91
+ 5. Download the JSON file — rename it to `client_secret.json` and place it in `raindrop-mission/`
92
+ 6. Go to **OAuth consent screen** → Test users → add your Google account email
93
+
94
+ **Run the fetch script:**
95
+ ```bash
96
+ python raindrop-mission/youtube_fetch.py
97
+ ```
98
+ A browser window opens for Google sign-in. After auth, a token is cached locally — you won't need to auth again.
99
+
100
+ **Output:** `raindrop-mission/youtube_MASTER.json` with:
101
+ - `liked_videos` — videos you've liked (up to ~3,200 via API limit)
102
+ - `watch_later` — requires Google Takeout (see below)
103
+ - `playlists` — saved playlists
104
+
105
+ **Watch Later via Google Takeout:**
106
+ YouTube's API does not expose Watch Later directly. Export it via [takeout.google.com](https://takeout.google.com):
107
+ - Select only **YouTube** → **Playlists** → Download
108
+ - Extract the CSV file named `Watch later-videos.csv`
109
+ - Place it in `raindrop-mission/`
110
+ - The `youtube_organize.py` script fetches video titles via API and includes them in `youtube_MASTER.json`
111
+
112
+ ---
113
+
114
+ ## 5. daily.dev Bookmarks
115
+
116
+ daily.dev does not provide a public API. Use the included browser console script to extract bookmarks directly from the page.
117
+
118
+ **Steps:**
119
+ 1. Go to [app.daily.dev](https://app.daily.dev) → **Bookmarks**
120
+ 2. Scroll all the way down to load all bookmarks
121
+ 3. Open DevTools → **Console** tab
122
+ 4. Paste and run `raindrop-mission/dailydev_console_script.js`
123
+ 5. The script copies a JSON array to your clipboard
124
+ 6. Paste into a file named `dailydev_bookmarks.json` in `raindrop-mission/`
125
+
126
+ > The script filters for `/posts/` URLs only — it ignores profile links, squad links, and other noise.
127
+
128
+ ---
129
+
130
+ ## Summary
131
+
132
+ | Source | Method | Output file |
133
+ |--------|--------|-------------|
134
+ | Raindrop | REST API (auto) | pulled live |
135
+ | Edge/Chrome bookmarks | HTML export | `favorites.html` / `bookmarks.html` |
136
+ | LinkedIn saved posts | Voyager GraphQL + session cookie | `linkedin_saved.json` |
137
+ | YouTube liked/playlists | YouTube Data API v3 + OAuth | `youtube_MASTER.json` |
138
+ | YouTube watch later | Google Takeout CSV | included in `youtube_MASTER.json` |
139
+ | daily.dev bookmarks | Browser console script | `dailydev_bookmarks.json` |
140
+
141
+ Once all files are in place, run:
142
+ ```bash
143
+ python scripts/ingest.py
144
+ ```
docs/huggingface.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HuggingFace Publishing Guide
2
+
3
+ OpenMark publishes two things on HuggingFace:
4
+ 1. **Space** — live Gradio demo at `OthmanAdi/OpenMark`
5
+ 2. **Dataset** — the categorized bookmarks at `OthmanAdi/openmark-bookmarks`
6
+
7
+ ---
8
+
9
+ ## Prerequisites
10
+
11
+ You need a HuggingFace account and a **write-access token**:
12
+ 1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
13
+ 2. Create a new token → **Write** access
14
+ 3. Add to your `.env`:
15
+ ```
16
+ HF_TOKEN=hf_your_token_here
17
+ ```
18
+
19
+ ---
20
+
21
+ ## 1. HuggingFace Space (Gradio Demo)
22
+
23
+ The Space hosts the Gradio UI publicly (or privately until you're ready).
24
+
25
+ **Create the Space:**
26
+ ```bash
27
+ pip install huggingface_hub
28
+ python -c "
29
+ from huggingface_hub import HfApi
30
+ import os
31
+ from dotenv import load_dotenv
32
+ load_dotenv()
33
+ api = HfApi(token=os.getenv('HF_TOKEN'))
34
+ api.create_repo(
35
+ repo_id='OthmanAdi/OpenMark',
36
+ repo_type='space',
37
+ space_sdk='gradio',
38
+ private=True,
39
+ )
40
+ print('Space created: https://huggingface.co/spaces/OthmanAdi/OpenMark')
41
+ "
42
+ ```
43
+
44
+ **Push the code to the Space:**
45
+ ```bash
46
+ python -c "
47
+ from huggingface_hub import HfApi
48
+ import os
49
+ from dotenv import load_dotenv
50
+ load_dotenv()
51
+ api = HfApi(token=os.getenv('HF_TOKEN'))
52
+ api.upload_folder(
53
+ folder_path='.',
54
+ repo_id='OthmanAdi/OpenMark',
55
+ repo_type='space',
56
+ ignore_patterns=['.env', 'data/chroma_db/*', '__pycache__/*', '.git/*'],
57
+ )
58
+ "
59
+ ```
60
+
61
+ > **Note:** The Space version requires your ChromaDB and Neo4j data to be pre-loaded. For a public demo, you would host a sample dataset. For private use, the full local setup is better.
62
+
63
+ ---
64
+
65
+ ## 2. HuggingFace Dataset
66
+
67
+ The dataset card publishes your 8,000+ categorized bookmarks as a reusable dataset for RAG experiments.
68
+
69
+ **What's in the dataset:**
70
+ - URL, title, category (19 categories), tags, score (1-10), source
71
+ - Sources: Raindrop, Edge browser, LinkedIn, YouTube, daily.dev
72
+ - ~8,007 unique items after deduplication
73
+
74
+ **Create the dataset repo:**
75
+ ```bash
76
+ python -c "
77
+ from huggingface_hub import HfApi
78
+ import os, json
79
+ from dotenv import load_dotenv
80
+ load_dotenv()
81
+ api = HfApi(token=os.getenv('HF_TOKEN'))
82
+
83
+ # Create private dataset repo
84
+ api.create_repo(
85
+ repo_id='OthmanAdi/openmark-bookmarks',
86
+ repo_type='dataset',
87
+ private=True,
88
+ )
89
+
90
+ # Upload dataset card
91
+ api.upload_file(
92
+ path_or_fileobj='docs/dataset_card.md',
93
+ path_in_repo='README.md',
94
+ repo_id='OthmanAdi/openmark-bookmarks',
95
+ repo_type='dataset',
96
+ )
97
+
98
+ # Upload the data (RAINDROP_MISSION_DIR/CATEGORIZED.json)
99
+ api.upload_file(
100
+ path_or_fileobj=os.path.join(os.getenv('RAINDROP_MISSION_DIR'), 'CATEGORIZED.json'),
101
+ path_in_repo='data/bookmarks.json',
102
+ repo_id='OthmanAdi/openmark-bookmarks',
103
+ repo_type='dataset',
104
+ )
105
+ print('Dataset created: https://huggingface.co/datasets/OthmanAdi/openmark-bookmarks')
106
+ "
107
+ ```
108
+
109
+ ---
110
+
111
+ ## Making Public
112
+
113
+ When you're ready to go public, flip visibility:
114
+ ```bash
115
+ python -c "
116
+ from huggingface_hub import HfApi
117
+ import os
118
+ from dotenv import load_dotenv
119
+ load_dotenv()
120
+ api = HfApi(token=os.getenv('HF_TOKEN'))
121
+
122
+ # Make Space public
123
+ api.update_repo_visibility('OthmanAdi/OpenMark', private=False, repo_type='space')
124
+
125
+ # Make Dataset public
126
+ api.update_repo_visibility('OthmanAdi/openmark-bookmarks', private=False, repo_type='dataset')
127
+ print('Both are now public.')
128
+ "
129
+ ```
docs/ingest.md ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ingest Pipeline
2
+
3
+ The ingest pipeline is the heart of OpenMark. It merges all your data, embeds everything, and writes to both ChromaDB and Neo4j.
4
+
5
+ ---
6
+
7
+ ## Command
8
+
9
+ ```bash
10
+ python scripts/ingest.py [options]
11
+ ```
12
+
13
+ | Flag | Default | Description |
14
+ |------|---------|-------------|
15
+ | `--provider local` | from `.env` | Use local pplx-embed models |
16
+ | `--provider azure` | from `.env` | Use Azure AI Foundry embeddings |
17
+ | `--fresh-raindrop` | off | Also pull live from Raindrop API during merge |
18
+ | `--skip-similar` | off | Skip SIMILAR_TO edge computation (saves ~30 min) |
19
+
20
+ ---
21
+
22
+ ## Pipeline Steps
23
+
24
+ ### Step 1 — Merge
25
+
26
+ Loads and deduplicates all sources:
27
+ - `CATEGORIZED.json` — pre-categorized bookmarks from Edge + Raindrop + daily.dev
28
+ - `linkedin_saved.json` — LinkedIn saved posts
29
+ - `youtube_MASTER.json` — liked videos, watch later, playlists (not subscriptions)
30
+
31
+ Deduplication is URL-based (case-insensitive, trailing slash stripped). If the same URL appears in multiple sources, the first occurrence wins.
32
+
33
+ Each item gets a `doc_text` field built for embedding:
34
+ ```
35
+ {title} | {category} | {tag1 tag2 tag3} | {content/excerpt/channel}
36
+ ```
37
+ This rich text is what gets embedded — not just the title.
38
+
39
+ **Output:** ~8,000 normalized items in memory.
40
+
41
+ ---
42
+
43
+ ### Step 2 — Embedding
44
+
45
+ Loads the embedding provider specified by `EMBEDDING_PROVIDER` in `.env` (or `--provider` flag).
46
+
47
+ **Local (pplx-embed):**
48
+ - Query model: `perplexity-ai/pplx-embed-v1-0.6b` — used for user search queries
49
+ - Document model: `perplexity-ai/pplx-embed-context-v1-0.6b` — used for bookmark documents
50
+ - Output dimension: 1024
51
+ - Downloaded once to HuggingFace cache (~1.2 GB total), free on every subsequent run
52
+ - **Known compatibility issue:** pplx-embed requires `sentence-transformers==3.3.1` and two runtime patches (applied automatically in `local.py`). See [troubleshooting.md](troubleshooting.md) for details.
53
+
54
+ **Azure:**
55
+ - Uses `text-embedding-ada-002` (or configured `AZURE_DEPLOYMENT_EMBED`)
56
+ - Output dimension: 1536
57
+ - Cost: ~€0.30 for 8,000 items (as of 2026)
58
+ - Batched in groups of 100 with progress logging
59
+
60
+ ---
61
+
62
+ ### Step 3 — ChromaDB Ingest
63
+
64
+ Embeds all documents in batches of 100 and stores in ChromaDB.
65
+
66
+ - Skips items already in ChromaDB (resumable — safe to re-run)
67
+ - Stores: URL (as ID), embedding vector, title, category, source, score, tags
68
+ - Uses cosine similarity space (collection metadata `{"hnsw:space": "cosine"}`)
69
+ - Database written to disk at `CHROMA_PATH` (default: `OpenMark/data/chroma_db/`)
70
+
71
+ **Timing:**
72
+ | Provider | 8K items | Notes |
73
+ |----------|----------|-------|
74
+ | Local pplx-embed (CPU) | ~20 min | No GPU detected = CPU inference |
75
+ | Local pplx-embed (GPU) | ~3 min | NVIDIA GPU with CUDA |
76
+ | Azure AI Foundry | ~5 min | Network bound |
77
+
78
+ ---
79
+
80
+ ### Step 4 — Neo4j Ingest
81
+
82
+ Creates nodes and relationships in batches of 200.
83
+
84
+ **Nodes created:**
85
+ - `Bookmark` — url, title, score
86
+ - `Category` — name
87
+ - `Tag` — name
88
+ - `Source` — name (raindrop, linkedin, youtube_liked, edge, dailydev, etc.)
89
+ - `Domain` — extracted from URL (e.g. `github.com`, `medium.com`)
90
+
91
+ **Relationships created:**
92
+ - `(Bookmark)-[:IN_CATEGORY]->(Category)`
93
+ - `(Bookmark)-[:TAGGED]->(Tag)`
94
+ - `(Bookmark)-[:FROM_SOURCE]->(Source)`
95
+ - `(Bookmark)-[:FROM_DOMAIN]->(Domain)`
96
+ - `(Tag)-[:CO_OCCURS_WITH {count}]-(Tag)` — built after all nodes are written
97
+
98
+ **Timing:** ~3-5 minutes for 8K items.
99
+
100
+ **Idempotent:** Uses `MERGE` everywhere — safe to re-run, won't create duplicates.
101
+
102
+ ---
103
+
104
+ ### Step 5 — SIMILAR_TO Edges
105
+
106
+ This is the most powerful and most time-consuming step.
107
+
108
+ For each of the 8K bookmarks, OpenMark queries ChromaDB for its top-5 nearest semantic neighbors and writes those as `SIMILAR_TO` edges in Neo4j with a similarity score.
109
+
110
+ ```
111
+ (Bookmark {url: "...langchain-docs..."})-[:SIMILAR_TO {score: 0.94}]->(Bookmark {url: "...langgraph-tutorial..."})
112
+ ```
113
+
114
+ These edges encode **semantic connections you never manually created**. The knowledge graph becomes a web of meaning, not just a web of tags.
115
+
116
+ **Timing:** ~25-40 minutes on CPU for 8K items. This is the longest step.
117
+
118
+ **Skip it if you're in a hurry:**
119
+ ```bash
120
+ python scripts/ingest.py --skip-similar
121
+ ```
122
+ Everything else works without SIMILAR_TO edges. You only lose the `find_similar_bookmarks` tool in the agent and the graph traversal from those edges.
123
+
124
+ **Only edges with similarity > 0.5 are written.** Low-quality connections are discarded.
125
+
126
+ ---
127
+
128
+ ## Re-running the Pipeline
129
+
130
+ The pipeline is safe to re-run at any time:
131
+
132
+ - **ChromaDB:** skips already-ingested URLs automatically
133
+ - **Neo4j:** uses `MERGE` — no duplicates created
134
+ - **SIMILAR_TO:** edges are overwritten (not duplicated) via `MERGE`
135
+
136
+ To add new bookmarks after the first run:
137
+ 1. Update your source files (fresh Raindrop pull, new LinkedIn export, etc.)
138
+ 2. Run `python scripts/ingest.py` — only new items get embedded and stored
139
+
140
+ ---
141
+
142
+ ## Checking What's Ingested
143
+
144
+ ```bash
145
+ # Quick stats
146
+ python scripts/search.py --stats
147
+
148
+ # Search to verify
149
+ python scripts/search.py "RAG tools"
150
+
151
+ # Neo4j — open browser
152
+ # http://localhost:7474
153
+ # Run: MATCH (b:Bookmark) RETURN count(b)
154
+ ```
docs/troubleshooting.md ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Troubleshooting
2
+
3
+ ---
4
+
5
+ ## pplx-embed fails to load
6
+
7
+ **Error:** `ImportError: cannot import name 'Module' from 'sentence_transformers.models'`
8
+
9
+ **Cause:** pplx-embed's custom `st_quantize.py` imports `Module` from `sentence_transformers.models`, which was removed in version 4.x.
10
+
11
+ **Fix:** Pin to the correct version:
12
+ ```bash
13
+ pip install "sentence-transformers==3.3.1"
14
+ ```
15
+
16
+ ---
17
+
18
+ ## pplx-embed crashes with 404 on chat templates
19
+
20
+ **Error:** `RemoteEntryNotFoundError: 404 ... additional_chat_templates does not exist`
21
+
22
+ **Cause:** `transformers 4.57+` added `list_repo_templates()` which looks for an `additional_chat_templates` folder in every model repo. pplx-embed predates this feature and doesn't have the folder.
23
+
24
+ **Fix:** Already handled automatically in `openmark/embeddings/local.py` via a monkey-patch applied before model loading. If you see this error outside of OpenMark, apply:
25
+ ```python
26
+ from transformers.utils import hub as _hub
27
+ import transformers.tokenization_utils_base as _tub
28
+ _orig = _hub.list_repo_templates
29
+ def _safe(*a, **kw):
30
+ try: return _orig(*a, **kw)
31
+ except Exception: return []
32
+ _hub.list_repo_templates = _safe
33
+ _tub.list_repo_templates = _safe
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Neo4j connection error: "Unable to retrieve routing information"
39
+
40
+ **Cause:** Using `neo4j://` URI (routing protocol) with a single local Neo4j instance.
41
+
42
+ **Fix:** Use `bolt://` instead:
43
+ ```env
44
+ NEO4J_URI=bolt://127.0.0.1:7687
45
+ ```
46
+
47
+ ---
48
+
49
+ ## Neo4j error: "Database does not exist"
50
+
51
+ **Cause:** The database name in `.env` doesn't match what's in Neo4j Desktop.
52
+
53
+ **Fix:** Open `http://localhost:7474`, check what databases exist:
54
+ ```cypher
55
+ SHOW DATABASES
56
+ ```
57
+ Update `NEO4J_DATABASE` in `.env` to match.
58
+
59
+ ---
60
+
61
+ ## LinkedIn script returns 0 results or 404
62
+
63
+ **Cause:** LinkedIn's internal `queryId` changes when they deploy new JavaScript bundles.
64
+
65
+ **Fix:**
66
+ 1. Open LinkedIn in your browser → go to Saved Posts
67
+ 2. Open DevTools → Network tab → filter for `voyagerSearchDashClusters`
68
+ 3. Click one of the requests → copy the full URL
69
+ 4. Extract the new `queryId` value
70
+ 5. Update `linkedin_fetch.py` with the new `queryId`
71
+
72
+ ---
73
+
74
+ ## YouTube OAuth "Access Blocked: App not verified"
75
+
76
+ **Cause:** Your Google Cloud app is in testing mode and your account isn't listed as a test user.
77
+
78
+ **Fix:**
79
+ 1. Google Cloud Console → OAuth consent screen
80
+ 2. Scroll to "Test users" → Add users → add your Google account email
81
+ 3. Re-run `youtube_fetch.py`
82
+
83
+ ---
84
+
85
+ ## ChromaDB ingest is slow
86
+
87
+ On CPU with local pplx-embed, embedding 8K items takes ~20 minutes. This is normal.
88
+
89
+ **Options:**
90
+ - Use Azure instead: `python scripts/ingest.py --provider azure` (~5 min, ~€0.30)
91
+ - The ingest is resumable — if interrupted, re-run and it skips already-ingested items
92
+
93
+ ---
94
+
95
+ ## SIMILAR_TO step takes too long
96
+
97
+ Building SIMILAR_TO edges queries ChromaDB for every bookmark's top-5 neighbors, then writes to Neo4j. For 8K items on CPU this takes ~25-40 minutes.
98
+
99
+ **Skip it:**
100
+ ```bash
101
+ python scripts/ingest.py --skip-similar
102
+ ```
103
+ The app works without SIMILAR_TO edges. You only lose the `find_similar_bookmarks` agent tool and cross-topic graph traversal.
104
+
105
+ ---
106
+
107
+ ## Windows UnicodeEncodeError in terminal
108
+
109
+ **Error:** `UnicodeEncodeError: 'charmap' codec can't encode character`
110
+
111
+ **Cause:** Windows terminal (cmd/PowerShell) defaults to cp1252 encoding which can't handle emoji or some Unicode characters in bookmark titles.
112
+
113
+ **Fix:** Run from Windows Terminal (supports UTF-8) or add to the top of the script:
114
+ ```python
115
+ import sys
116
+ sys.stdout.reconfigure(encoding='utf-8')
117
+ ```
118
+ All OpenMark scripts already include this.
119
+
120
+ ---
121
+
122
+ ## gradio not found on Python 3.13
123
+
124
+ On this machine, gradio 6.6.0 is installed under the default Python 3.14 interpreter. If you are running Python 3.13, install gradio for that interpreter explicitly:
125
+ ```bash
126
+ C:\Python313\python -m pip install gradio
127
+ ```
openmark/__init__.py ADDED
File without changes
openmark/agent/__init__.py ADDED
File without changes
openmark/agent/graph.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph ReAct agent for OpenMark.
3
+ Uses Azure gpt-4o-mini as the LLM.
4
+ Has access to all OpenMark tools (ChromaDB + Neo4j).
5
+ """
6
+
7
+ from langchain_openai import AzureChatOpenAI
8
+ from langgraph.prebuilt import create_react_agent
9
+ from langgraph.checkpoint.memory import MemorySaver
10
+ from openmark import config
11
+ from openmark.agent.tools import ALL_TOOLS
12
+
13
# System prompt injected into every agent run via create_react_agent(prompt=...).
# NOTE: this string is runtime behavior, not documentation — it defines the
# agent's persona, its tool-usage policy, and the tool roster it believes it
# has. Keep the tool names below in sync with ALL_TOOLS in openmark/agent/tools.py.
SYSTEM_PROMPT = """You are OpenMark — Ahmad's personal AI knowledge assistant.

You have access to his entire curated knowledge base of 7,000+ saved bookmarks,
LinkedIn posts, and YouTube videos — all categorized, tagged, and connected in a
knowledge graph.

Your job:
- Help Ahmad find exactly what he saved and can't remember
- Discover connections between topics he didn't know existed
- Answer questions by searching his real saved content (not your training data)
- Be direct and useful — no filler

When answering:
- Always use tools to search first before responding
- Show the actual URLs and titles from results
- Group results by relevance
- If one search doesn't find enough, try a different angle (by tag, by category, by similarity)

Available search modes:
- search_semantic: natural language search (most useful for general queries)
- search_by_category: filter by topic category
- find_by_tag: exact tag lookup in the knowledge graph
- find_similar_bookmarks: find related content to a specific URL
- explore_tag_cluster: discover what else connects to a topic
- get_stats: see what's in the knowledge base
- run_cypher: advanced graph queries (for power users)
"""
40
+
41
+
42
def build_agent():
    """Assemble and return the OpenMark ReAct agent.

    Wires the Azure-hosted chat model (deployment names come from config)
    together with every OpenMark tool and an in-memory checkpointer, then
    returns the compiled LangGraph agent.
    """
    chat_model = AzureChatOpenAI(
        azure_endpoint=config.AZURE_ENDPOINT,
        api_key=config.AZURE_API_KEY,
        azure_deployment=config.AZURE_DEPLOYMENT_LLM,
        api_version=config.AZURE_API_VERSION,
        temperature=0,   # deterministic answers for retrieval-style questions
        streaming=True,  # lets the UI render tokens as they arrive
    )

    # MemorySaver keeps per-thread conversation state for the life of the
    # process only — nothing is persisted to disk.
    return create_react_agent(
        model=chat_model,
        tools=ALL_TOOLS,
        prompt=SYSTEM_PROMPT,
        checkpointer=MemorySaver(),
    )
61
+
62
+
63
def ask(agent, question: str, thread_id: str = "default") -> str:
    """Send *question* through *agent* and return the final assistant text.

    *thread_id* selects the conversation thread in the agent's checkpointer,
    so repeated calls with the same id share conversational memory.
    """
    payload = {"messages": [{"role": "user", "content": question}]}
    run_config = {"configurable": {"thread_id": thread_id}}
    outcome = agent.invoke(payload, config=run_config)
    final_message = outcome["messages"][-1]
    return final_message.content
openmark/agent/tools.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph tools for the OpenMark agent.
3
+ Each tool hits either ChromaDB (semantic) or Neo4j (graph) or both.
4
+ """
5
+
6
+ from langchain_core.tools import tool
7
+ from openmark.embeddings.factory import get_embedder
8
+ from openmark.stores import chroma as chroma_store
9
+ from openmark.stores import neo4j_store
10
+
11
# Embedder is loaded once and reused — model load is expensive (the local
# pplx-embed provider pulls ~1.2 GB of weights per the repo docs), so all
# tools share one provider instance for the life of the process.
_embedder = None

def _get_embedder():
    """Return the shared embedding provider, constructing it on first call."""
    global _embedder
    if _embedder is None:
        _embedder = get_embedder()
    return _embedder
19
+
20
+
21
@tool
def search_semantic(query: str, n: int = 10) -> str:
    """
    Search bookmarks by semantic meaning using vector similarity.
    Use this for natural language queries like 'RAG tools', 'LangGraph tutorials', etc.
    Returns top N most relevant bookmarks.
    """
    # Vector search against ChromaDB using the shared lazily-loaded embedder.
    hits = chroma_store.search(query, _get_embedder(), n=n)
    if not hits:
        return "No results found."
    formatted = []
    for hit in hits:
        formatted.append(
            f"{hit['rank']}. [{hit['category']}] {hit['title']}\n   {hit['url']} (similarity: {hit['similarity']}, score: {hit['score']})"
        )
    return "\n".join(formatted)
34
+
35
+
36
@tool
def search_by_category(category: str, query: str = "", n: int = 15) -> str:
    """
    Find bookmarks in a specific category, optionally filtered by semantic query.
    Categories: RAG & Vector Search, Agent Development, LangChain / LangGraph,
    MCP & Tool Use, Context Engineering, AI Tools & Platforms, GitHub Repos & OSS,
    Learning & Courses, YouTube & Video, Web Development, Cloud & Infrastructure,
    Data Science & ML, Knowledge Graphs & Neo4j, Career & Jobs, LLM Fine-tuning,
    Finance & Crypto, Design & UI/UX, News & Articles, Entertainment & Other
    """
    # Without an explicit query, search on the category name itself so the
    # vector search still surfaces the most representative items.
    search_text = query if query else category
    results = chroma_store.search(search_text, _get_embedder(), n=n, category=category)
    if not results:
        return f"No bookmarks found in category '{category}'."
    body = "\n".join(f"{r['rank']}. {r['title']}\n   {r['url']}" for r in results)
    return f"Category '{category}' — top results:\n" + body
54
+
55
+
56
@tool
def find_by_tag(tag: str) -> str:
    """
    Find all bookmarks tagged with a specific tag using the knowledge graph.
    Returns bookmarks ordered by quality score.
    """
    matches = neo4j_store.find_by_tag(tag, limit=20)
    if not matches:
        return f"No bookmarks found with tag '{tag}'."
    body = "\n".join(
        f"- {m['title']}\n   {m['url']} (score: {m['score']})" for m in matches
    )
    return f"Bookmarks tagged '{tag}':\n" + body
67
+
68
+
69
@tool
def find_similar_bookmarks(url: str) -> str:
    """
    Find bookmarks semantically similar to a given URL.
    Uses SIMILAR_TO edges in the knowledge graph (built from embedding neighbors).
    """
    neighbors = neo4j_store.find_similar(url, limit=10)
    if not neighbors:
        return f"No similar bookmarks found for {url}."
    body = "\n".join(
        f"- {nb['title']}\n   {nb['url']} (similarity: {nb['similarity']:.3f})"
        for nb in neighbors
    )
    return "Similar bookmarks:\n" + body
80
+
81
+
82
@tool
def explore_tag_cluster(tag: str) -> str:
    """
    Explore the knowledge graph around a tag — find related tags and their bookmarks.
    Traverses CO_OCCURS_WITH edges (2 hops) to discover connected topics.
    Great for discovering what else you know about a topic.
    """
    cluster = neo4j_store.find_tag_cluster(tag, hops=2, limit=25)
    if not cluster:
        return f"No cluster found for tag '{tag}'."
    body = "\n".join(
        f"- [{item['via_tag']}] {item['title']}\n   {item['url']}" for item in cluster
    )
    return f"Knowledge cluster around '{tag}':\n" + body
94
+
95
+
96
@tool
def get_stats() -> str:
    """
    Get statistics about the OpenMark knowledge base.
    Shows total bookmarks, tags, categories in both ChromaDB and Neo4j.
    """
    vector_stats = chroma_store.get_stats()
    graph_stats = neo4j_store.get_stats()
    # .get(..., 0) keeps the report printable even if a store returns a
    # partial stats dict.
    report = [
        "OpenMark Knowledge Base Stats:",
        f"  ChromaDB vectors: {vector_stats.get('total', 0)}",
        f"  Neo4j bookmarks: {graph_stats.get('bookmarks', 0)}",
        f"  Neo4j tags: {graph_stats.get('tags', 0)}",
        f"  Neo4j categories: {graph_stats.get('categories', 0)}",
    ]
    return "\n".join(report)
111
+
112
+
113
@tool
def run_cypher(cypher: str) -> str:
    """
    Run a raw Cypher query against the Neo4j knowledge graph.
    Use for advanced graph traversals. Example:
    MATCH (b:Bookmark)-[:TAGGED]->(t:Tag) WHERE t.name='rag' RETURN b.title, b.url LIMIT 10
    """
    try:
        rows = neo4j_store.query(cypher)
    except Exception as e:  # surface driver/syntax errors to the agent instead of crashing
        return f"Cypher error: {e}"
    if not rows:
        return "Query returned no results."
    # Cap the output so a huge result set doesn't flood the LLM context window.
    return "\n".join(str(row) for row in rows[:20])
128
+
129
+
130
# Tool registry handed to create_react_agent — the agent can call anything
# listed here. Keep in sync with the tool roster described in SYSTEM_PROMPT.
ALL_TOOLS = [
    search_semantic,
    search_by_category,
    find_by_tag,
    find_similar_bookmarks,
    explore_tag_cluster,
    get_stats,
    run_cypher,
]
openmark/config.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

# Resolve the .env that sits one directory above this package so the module
# behaves the same regardless of the current working directory.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"))

# Embedding — "local" (pplx-embed) or "azure"; dispatched in embeddings/factory.py
EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", "local")
PPLX_QUERY_MODEL = os.getenv("PPLX_QUERY_MODEL", "perplexity-ai/pplx-embed-v1-0.6b")
PPLX_DOC_MODEL = os.getenv("PPLX_DOC_MODEL", "perplexity-ai/pplx-embed-context-v1-0.6b")

# Azure AI Foundry — chat LLM deployment plus optional embedding deployment
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_API_KEY = os.getenv("AZURE_API_KEY")
AZURE_DEPLOYMENT_LLM = os.getenv("AZURE_DEPLOYMENT_LLM", "gpt-4o-mini")
AZURE_DEPLOYMENT_EMBED = os.getenv("AZURE_DEPLOYMENT_EMBED", "text-embedding-ada-002")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION", "2024-05-01-preview")

# Neo4j — bolt:// is the right scheme for a single local instance
# (neo4j:// implies cluster routing and fails against Neo4j Desktop)
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

# Raindrop REST API token
RAINDROP_TOKEN = os.getenv("RAINDROP_TOKEN")

# Paths — defaults match the original dev machine; override via .env elsewhere
RAINDROP_MISSION_DIR = os.getenv("RAINDROP_MISSION_DIR", r"C:\Users\oasrvadmin\Documents\raindrop-mission")
CHROMA_PATH = os.getenv("CHROMA_PATH", r"C:\Users\oasrvadmin\Documents\OpenMark\data\chroma_db")

# Canonical categories — every ingested bookmark ends up in exactly one of these
CATEGORIES = [
    "RAG & Vector Search",
    "LLM Fine-tuning",
    "Agent Development",
    "LangChain / LangGraph",
    "MCP & Tool Use",
    "Context Engineering",
    "AI Tools & Platforms",
    "GitHub Repos & OSS",
    "Learning & Courses",
    "YouTube & Video",
    "Web Development",
    "Cloud & Infrastructure",
    "Data Science & ML",
    "Knowledge Graphs & Neo4j",
    "Career & Jobs",
    "Finance & Crypto",
    "Design & UI/UX",
    "News & Articles",
    "Entertainment & Other",
]

# Legacy/variant category names from older exports, folded into the canonical
# set above during normalization.
CATEGORY_MAP = {
    "UI/UX Design": "Design & UI/UX",
    "UI/UX": "Design & UI/UX",
    "Real_Estate": "Finance & Crypto",
    "Real Estate": "Finance & Crypto",
    "Social_Media": "News & Articles",
    "Social/Community": "News & Articles",
    "Social": "News & Articles",
    "E-commerce & Marketplaces": "News & Articles",
    "Research & Articles": "News & Articles",
    "Blogs & Articles": "News & Articles",
    "Research": "News & Articles",
    "AI Thought Leaders & Media": "News & Articles",
    "Debugging & Tools": "AI Tools & Platforms",
    "Health & Wellness": "Entertainment & Other",
    "Email & Productivity": "AI Tools & Platforms",
    "Legal": "Entertainment & Other",
    "NoCode - LowCode": "AI Tools & Platforms",
    "Security": "AI Tools & Platforms",
}
openmark/embeddings/__init__.py ADDED
File without changes
openmark/embeddings/azure.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Azure AI Foundry embedding provider.
3
+ Uses text-embedding-ada-002 (or whatever deployment is configured).
4
+ """
5
+
6
+ from openai import AzureOpenAI
7
+ from openmark.embeddings.base import EmbeddingProvider
8
+ from openmark import config
9
+
10
+
11
+ class AzureEmbedder(EmbeddingProvider):
12
+ def __init__(self):
13
+ self._client = AzureOpenAI(
14
+ azure_endpoint=config.AZURE_ENDPOINT,
15
+ api_key=config.AZURE_API_KEY,
16
+ api_version=config.AZURE_API_VERSION,
17
+ )
18
+ self._deployment = config.AZURE_DEPLOYMENT_EMBED
19
+ print(f"Azure embedder ready — deployment: {self._deployment}")
20
+
21
+ def _embed(self, texts: list[str]) -> list[list[float]]:
22
+ response = self._client.embeddings.create(
23
+ input=texts,
24
+ model=self._deployment,
25
+ )
26
+ return [item.embedding for item in response.data]
27
+
28
+ def embed_documents(self, texts: list[str]) -> list[list[float]]:
29
+ results = []
30
+ batch_size = 100
31
+ for i in range(0, len(texts), batch_size):
32
+ batch = texts[i:i + batch_size]
33
+ results.extend(self._embed(batch))
34
+ print(f" Azure embedded {min(i + batch_size, len(texts))}/{len(texts)}")
35
+ return results
36
+
37
+ def embed_query(self, text: str) -> list[float]:
38
+ return self._embed([text])[0]
39
+
40
+ @property
41
+ def dimension(self) -> int:
42
+ return 1536 # ada-002 dimension
openmark/embeddings/base.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
class EmbeddingProvider(ABC):
    """Abstract base — swap local pplx-embed or Azure without changing any other code.

    Concrete providers must supply batched document embedding, single-query
    embedding, and report their output vector width.
    """

    @abstractmethod
    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of document strings, one vector per input."""
        ...

    @abstractmethod
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string."""
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Output embedding dimension."""
        ...
openmark/embeddings/factory.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openmark import config
2
+ from openmark.embeddings.base import EmbeddingProvider
3
+
4
+
5
def get_embedder() -> EmbeddingProvider:
    """Return the configured embedding provider based on EMBEDDING_PROVIDER env var.

    Provider imports are deferred so heavy dependencies (torch,
    sentence-transformers) load only when the local provider is selected.

    Raises:
        ValueError: if EMBEDDING_PROVIDER is neither 'local' nor 'azure'.
    """
    # .strip() tolerates stray whitespace from hand-edited .env files,
    # which would otherwise fail both comparisons below.
    provider = config.EMBEDDING_PROVIDER.strip().lower()
    if provider == "local":
        from openmark.embeddings.local import LocalEmbedder
        return LocalEmbedder()
    if provider == "azure":
        from openmark.embeddings.azure import AzureEmbedder
        return AzureEmbedder()
    raise ValueError(f"Unknown EMBEDDING_PROVIDER: '{provider}'. Use 'local' or 'azure'.")
openmark/embeddings/local.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Local pplx-embed embedding provider.
3
+ Uses:
4
+ - perplexity-ai/pplx-embed-v1-0.6b for queries
5
+ - perplexity-ai/pplx-embed-context-v1-0.6b for documents
6
+
7
+ Two patches applied at import time:
8
+ 1. transformers 4.57 crashes on models without additional_chat_templates folder → catch 404
9
+ 2. pplx-embed's st_quantize.py imports sentence_transformers.models.Module (not exported by the pinned sentence-transformers 3.3.1) → alias torch.nn.Module in its place
10
+ """
11
+
12
# ── Patch 1: transformers 4.57 list_repo_templates 404 crash ─
# transformers >= 4.57 calls list_repo_templates() during tokenizer loading;
# model repos published before that feature (like pplx-embed) lack the
# additional_chat_templates folder and the lookup raises. Wrap it so any
# failure degrades to "no extra templates" instead of crashing the load.
from transformers.utils import hub as _hub
import transformers.tokenization_utils_base as _tub
_orig_lrt = _hub.list_repo_templates
def _safe_lrt(*a, **kw):
    # Best-effort: fall back to an empty template list on any error (incl. 404).
    try:
        return _orig_lrt(*a, **kw)
    except Exception:
        return []
_hub.list_repo_templates = _safe_lrt
# tokenization_utils_base binds its own reference at import time, so patch it too.
_tub.list_repo_templates = _safe_lrt

# ── Patch 2: sentence_transformers.models.Module missing ─────
# pplx-embed's custom st_quantize.py does
# `from sentence_transformers.models import Module`; when the installed
# sentence-transformers version doesn't export that name, alias
# torch.nn.Module under it before the model code is imported.
import torch.nn as _nn
import sentence_transformers.models as _st_models
if not hasattr(_st_models, "Module"):
    _st_models.Module = _nn.Module
29
+
30
+ from sentence_transformers import SentenceTransformer
31
+ import numpy as np
32
+ from openmark.embeddings.base import EmbeddingProvider
33
+ from openmark import config
34
+
35
+
36
class LocalEmbedder(EmbeddingProvider):
    """EmbeddingProvider running pplx-embed locally via sentence-transformers.

    Two separate checkpoints are loaded: one encoder for queries and a
    context encoder for documents, both configured in openmark.config.
    """

    def __init__(self):
        print("Loading pplx-embed query model...")
        self._query_model = SentenceTransformer(config.PPLX_QUERY_MODEL, trust_remote_code=True)
        print("Loading pplx-embed document model...")
        self._doc_model = SentenceTransformer(config.PPLX_DOC_MODEL, trust_remote_code=True)
        print("Local embedder ready.")

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents with the context model; returns plain Python floats."""
        matrix = self._doc_model.encode(texts, batch_size=32, show_progress_bar=True)
        # encode() yields a numpy array; cast so downstream JSON/Chroma get floats.
        return matrix.astype(float).tolist()

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string with the query model."""
        vector = self._query_model.encode([text])[0]
        return vector.astype(float).tolist()

    @property
    def dimension(self) -> int:
        return 1024
openmark/pipeline/__init__.py ADDED
File without changes
openmark/pipeline/merge.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Merge ALL data sources into one clean list:
3
+ - CATEGORIZED.json (Edge + old Raindrop + daily.dev — already categorized)
4
+ - linkedin_saved.json (1,260 LinkedIn posts)
5
+ - youtube_MASTER.json (liked + watch_later + playlists)
6
+ - Fresh Raindrop pull (new items not yet in CATEGORIZED)
7
+
8
+ Deduplicates by URL. Normalizes categories.
9
+ """
10
+
11
+ import json
12
+ import os
13
+ from openmark import config
14
+ from openmark.pipeline.normalize import normalize_item, dedupe
15
+
16
+
17
def load_categorized() -> list[dict]:
    """Load the pre-categorized dump (Edge + old Raindrop + daily.dev)."""
    path = os.path.join(config.RAINDROP_MISSION_DIR, "CATEGORIZED.json")
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(f"CATEGORIZED.json: {len(data)} items")
    return data
23
+
24
+
25
def load_linkedin() -> list[dict]:
    """Load saved LinkedIn posts, reshaping each into the common bookmark schema."""
    path = os.path.join(config.RAINDROP_MISSION_DIR, "linkedin_saved.json")
    if not os.path.exists(path):
        print("LinkedIn: file not found, skipping")
        return []
    with open(path, encoding="utf-8") as fh:
        posts = json.load(fh)

    def _as_item(post: dict) -> dict:
        # One post → one bookmark-shaped dict.
        body = post.get("content", "")
        who = post.get("author", "")
        return {
            "url": post.get("url", ""),
            "title": f"{who} — {body[:80]}" if who else body[:100],
            "content": body[:300],
            "author": who,
            "folder": "LinkedIn Saved",
            "source": "linkedin",
            "tags": [],
            "category": None,  # will be assigned by normalize
            "score": 6,
        }

    items = [_as_item(p) for p in posts]
    print(f"LinkedIn: {len(items)} posts")
    return items
49
+
50
+
51
def load_youtube() -> list[dict]:
    """Load the merged YouTube export (liked + watch_later + playlists)."""
    path = os.path.join(config.RAINDROP_MISSION_DIR, "youtube_MASTER.json")
    if not os.path.exists(path):
        print("YouTube: file not found, skipping")
        return []
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)

    # Flatten the three sections into one list of bookmark-shaped dicts.
    items = [
        {
            "url": video.get("url", ""),
            "title": video.get("title", ""),
            "channel": video.get("channel", ""),
            "folder": f"YouTube / {section}",
            "source": f"youtube_{section}",
            "tags": video.get("tags", [])[:5],
            "category": "YouTube & Video",
            "score": 7,
        }
        for section in ["liked_videos", "watch_later", "playlists"]
        for video in data.get(section, [])
    ]
    print(f"YouTube: {len(items)} videos (liked + watch_later + playlists)")
    return items
73
+
74
+
75
def merge_all(include_fresh_raindrop: bool = False) -> list[dict]:
    """
    Merge all sources into one deduplicated, normalized list.

    Set include_fresh_raindrop=True to also pull live from the Raindrop API.
    """
    combined: list[dict] = []
    combined += load_categorized()
    combined += load_linkedin()
    combined += load_youtube()

    if include_fresh_raindrop:
        # Imported lazily so the requests-based module is only needed when used.
        from openmark.pipeline.raindrop import pull_all
        combined += pull_all()

    # Normalize, then drop URL duplicates (first occurrence wins).
    cleaned = [normalize_item(entry) for entry in combined]
    unique = dedupe(cleaned)
    print(f"\nTotal after merge + dedup: {len(unique)} items")
    return unique
openmark/pipeline/normalize.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Normalize, clean, and deduplicate bookmark items.
3
+ """
4
+
5
+ import re
6
+ from openmark import config
7
+
8
+
9
def clean_title(title: str) -> str:
    """Decode common HTML entities, trim whitespace, and cap at 300 chars.

    Returns "" for falsy input.

    Bug fix: the previous first substitution replaced "&" with "&" — a no-op
    left behind when the intended "&amp;" entity was itself HTML-rendered
    away — so ampersand entities were never decoded.
    """
    if not title:
        return ""
    # Decode the specific entities seen in the exports. "&amp;" is decoded
    # LAST so that e.g. "&amp;lt;" becomes "&lt;" (one level of decoding)
    # instead of being double-decoded to "<".
    for entity, char in (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&#39;", "'"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    ):
        title = title.replace(entity, char)
    # Trim and truncate for storage.
    return title.strip()[:300]
21
+
22
+
23
def fix_category(cat: str | None) -> str:
    """Map a raw category onto the canonical set; unknown/missing → fallback."""
    fallback = "News & Articles"
    if not cat:
        return fallback
    # Apply the known remapping, then collapse anything still outside the
    # canonical category list onto the fallback.
    remapped = config.CATEGORY_MAP.get(cat, cat)
    return remapped if remapped in config.CATEGORIES else fallback
32
+
33
+
34
def build_document_text(item: dict) -> str:
    """
    Build a single rich text string for embedding.

    Combines title + category + tags + content/excerpt (+ channel/author)
    so semantic matching sees more than just the title.
    """
    pieces = [
        item.get("title"),
        item.get("category"),
        " ".join(item["tags"]) if item.get("tags") else None,
    ]
    # Prefer full content; fall back to the excerpt. Both are capped at 200.
    body = item.get("content") or item.get("excerpt")
    pieces.append(body[:200] if body else None)
    pieces.append(item.get("channel"))
    pieces.append(item.get("author"))
    # Drop empty/missing pieces and join with a visible separator.
    return " | ".join(p for p in pieces if p)
55
+
56
+
57
def normalize_item(item: dict) -> dict:
    """Clean and normalize a single bookmark item into the canonical schema."""
    raw_score = item.get("score", 5)
    result = {
        "url": item.get("url", "").strip(),
        "title": clean_title(item.get("title", "")),
        "category": fix_category(item.get("category")),
        # lowercase, trimmed, empties dropped, capped at five tags
        "tags": [t.lower().strip() for t in item.get("tags", []) if t][:5],
        "score": raw_score if isinstance(raw_score, (int, float)) else 5,
        "source": item.get("source", "unknown"),
        "folder": item.get("folder", ""),
    }

    # Carry over optional free-text fields, truncated for storage.
    for key in ("content", "excerpt", "author", "channel", "description"):
        value = item.get(key)
        if value:
            result[key] = value[:300]

    # Single rich string used as the embedding input.
    result["doc_text"] = build_document_text(result)
    return result
86
+
87
+
88
def dedupe(items: list[dict]) -> list[dict]:
    """Drop duplicate bookmarks keyed by URL (lowercased, trailing '/' stripped).

    Keeps the first occurrence of each URL; entries with an empty URL are
    discarded entirely.
    """
    kept: list[dict] = []
    seen_urls: set[str] = set()
    for entry in items:
        key = entry.get("url", "").rstrip("/").lower()
        if key and key not in seen_urls:
            seen_urls.add(key)
            kept.append(entry)
    return kept
openmark/pipeline/raindrop.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fresh pull of ALL Raindrop bookmarks via API.
3
+ Fetches every collection and every raindrop inside it, paginated.
4
+ """
5
+
6
+ import time
7
+ import requests
8
+ from openmark import config
9
+
10
+ HEADERS = {"Authorization": f"Bearer {config.RAINDROP_TOKEN}"}
11
+
12
+
13
def fetch_all_collections() -> list[dict]:
    """Return all collections — top-level plus nested children."""
    resp = requests.get("https://api.raindrop.io/rest/v1/collections", headers=HEADERS)
    resp.raise_for_status()
    collections = resp.json().get("items", [])

    # "childrens" is the actual Raindrop endpoint name for nested collections;
    # a failure here is non-fatal (we just return the top level).
    child_resp = requests.get("https://api.raindrop.io/rest/v1/collections/childrens", headers=HEADERS)
    if child_resp.status_code == 200:
        collections.extend(child_resp.json().get("items", []))

    return collections
25
+
26
+
27
def fetch_raindrops_for_collection(collection_id: int, title: str) -> list[dict]:
    """Fetch every raindrop in one collection, walking pages of 50."""
    collected: list[dict] = []
    page = 0
    while True:
        resp = requests.get(
            f"https://api.raindrop.io/rest/v1/raindrops/{collection_id}",
            headers=HEADERS,
            params={"perpage": 50, "page": page},
        )
        if resp.status_code != 200:
            break  # stop quietly on HTTP errors (best-effort pull)
        batch = resp.json().get("items", [])
        if not batch:
            break
        collected.extend(
            {
                "url": entry.get("link", ""),
                "title": entry.get("title", ""),
                "excerpt": entry.get("excerpt", "")[:200],
                "tags": entry.get("tags", [])[:5],
                "folder": title,
                "source": "raindrop",
            }
            for entry in batch
        )
        if len(batch) < 50:
            break  # short page ⇒ this was the last page
        page += 1
        time.sleep(0.2)  # be polite to the API between pages
    return collected
56
+
57
+
58
def fetch_unsorted() -> list[dict]:
    """Fetch raindrops outside any collection (Raindrop uses id -1 for these)."""
    return fetch_raindrops_for_collection(-1, "Unsorted")
61
+
62
+
63
def pull_all() -> list[dict]:
    """Pull every raindrop from every collection (plus unsorted) as a flat list."""
    print("Fetching Raindrop collections...")
    collections = fetch_all_collections()
    print(f" Found {len(collections)} collections")

    everything: list[dict] = []
    for collection in collections:
        name = collection.get("title", "Unknown")
        batch = fetch_raindrops_for_collection(collection["_id"], name)
        print(f" [{name}] {len(batch)} items")
        everything.extend(batch)
        time.sleep(0.1)  # small pause between collections

    unsorted_items = fetch_unsorted()
    print(f" [Unsorted] {len(unsorted_items)} items")
    everything.extend(unsorted_items)

    print(f"Raindrop total: {len(everything)}")
    return everything
openmark/stores/__init__.py ADDED
File without changes
openmark/stores/chroma.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ChromaDB store — semantic vector search.
3
+ """
4
+
5
+ import chromadb
6
+ from openmark import config
7
+ from openmark.embeddings.base import EmbeddingProvider
8
+
9
+ COLLECTION_NAME = "openmark_bookmarks"
10
+
11
+
12
def get_client() -> chromadb.PersistentClient:
    """Open (or create) the on-disk Chroma client at the configured path."""
    return chromadb.PersistentClient(path=config.CHROMA_PATH)
14
+
15
+
16
def get_collection(client: chromadb.PersistentClient, embedder: EmbeddingProvider):
    """Get or create the bookmarks collection.

    Note: *embedder* is currently unused — embeddings are supplied explicitly
    at add/query time — but the parameter is kept for interface stability.
    """
    return client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},  # cosine distance for text vectors
    )
22
+
23
+
24
def ingest(items: list[dict], embedder: EmbeddingProvider, batch_size: int = 100):
    """Embed all items and store in ChromaDB.

    Re-run safe: items whose URL is already present are skipped, so only new
    bookmarks get embedded and added.

    Args:
        items: normalized bookmarks; each must carry "url" and "doc_text".
        embedder: provider used to embed the document texts.
        batch_size: number of items embedded and added per round trip.
    """
    client = get_client()
    collection = get_collection(client, embedder)

    # Check already ingested — URLs double as the Chroma ids.
    # include=[] keeps the response minimal: only ids come back.
    existing = set(collection.get(include=[])["ids"])
    new_items = [i for i in items if i["url"] not in existing]
    print(f"ChromaDB: {len(existing)} already ingested, {len(new_items)} new")

    if not new_items:
        return

    total = 0
    for start in range(0, len(new_items), batch_size):
        batch = new_items[start:start + batch_size]

        texts = [i["doc_text"] for i in batch]
        ids = [i["url"] for i in batch]
        metas = [
            {
                "title": i["title"][:500],
                "category": i["category"],
                "source": i["source"],
                "score": float(i["score"]),
                # Chroma metadata values must be scalars, so tags are joined.
                "tags": ",".join(i["tags"]),
                "folder": i.get("folder", ""),
            }
            for i in batch
        ]

        embeddings = embedder.embed_documents(texts)

        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metas,
        )
        total += len(batch)
        print(f" ChromaDB ingested {total}/{len(new_items)}")

    print(f"ChromaDB total: {collection.count()} items")
67
+
68
+
69
def search(
    query: str,
    embedder: EmbeddingProvider,
    n: int = 10,
    category: str | None = None,
    source: str | None = None,
    min_score: float | None = None,
) -> list[dict]:
    """Semantic search with optional metadata filters.

    Args:
        query: free-text query, embedded with the query-side model.
        embedder: provider used to embed the query.
        n: maximum number of results.
        category / source: exact-match metadata filters (None = no filter).
        min_score: minimum stored quality score (None = no filter).

    Returns:
        Ranked result dicts (rank/url/title/category/source/score/tags) with
        "similarity" computed as 1 - distance, rounded to 4 places.
    """
    client = get_client()
    collection = get_collection(client, embedder)

    q_embedding = embedder.embed_query(query)

    # Build filters — Chroma's `where` takes either one clause or an
    # explicit $and of several, hence the shape-dependent assembly below.
    filters = []
    if category:
        filters.append({"category": {"$eq": category}})
    if source:
        filters.append({"source": {"$eq": source}})
    if min_score is not None:
        filters.append({"score": {"$gte": min_score}})

    where = None
    if len(filters) == 1:
        where = filters[0]
    elif len(filters) > 1:
        where = {"$and": filters}

    results = collection.query(
        query_embeddings=[q_embedding],
        n_results=n,
        where=where,
        include=["metadatas", "documents", "distances"],
    )

    output = []
    # Index [0] everywhere: we issued exactly one query embedding.
    for i, (meta, doc, dist) in enumerate(zip(
        results["metadatas"][0],
        results["documents"][0],
        results["distances"][0],
    )):
        output.append({
            "rank": i + 1,
            "url": results["ids"][0][i],  # ids are the bookmark URLs
            "title": meta.get("title", ""),
            "category": meta.get("category", ""),
            "source": meta.get("source", ""),
            "score": meta.get("score", 0),
            # Stored comma-joined; note "" splits to [""] — callers filter empties.
            "tags": meta.get("tags", "").split(","),
            "similarity": round(1 - dist, 4),
        })
    return output
122
+
123
+
124
def get_stats() -> dict:
    """Return {"total": <number of stored vectors>}."""
    client = get_client()
    # Passing None is safe only because get_collection ignores its embedder
    # argument — NOTE(review): revisit if get_collection ever starts using it.
    collection = get_collection(client, None)
    return {"total": collection.count()}
openmark/stores/neo4j_store.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Neo4j store — knowledge graph.
3
+
4
+ Nodes: Bookmark, Tag, Category, Source, Domain
5
+ Edges: TAGGED, IN_CATEGORY, FROM_SOURCE, FROM_DOMAIN, SIMILAR_TO, CO_OCCURS_WITH
6
+ """
7
+
8
+ import re
9
+ from urllib.parse import urlparse
10
+ from neo4j import GraphDatabase
11
+ from openmark import config
12
+
13
+
14
def get_driver():
    """Create a Neo4j driver from the configured URI and credentials."""
    auth_pair = (config.NEO4J_USER, config.NEO4J_PASSWORD)
    return GraphDatabase.driver(config.NEO4J_URI, auth=auth_pair)
19
+
20
+
21
def setup_constraints(driver):
    """Ensure uniqueness constraints exist for every node label we use."""
    specs = [
        ("bookmark_url", "b:Bookmark", "b.url"),
        ("tag_name", "t:Tag", "t.name"),
        ("category_name", "c:Category", "c.name"),
        ("source_name", "s:Source", "s.name"),
        ("domain_name", "d:Domain", "d.name"),
    ]
    with driver.session(database=config.NEO4J_DATABASE) as session:
        for constraint_name, pattern, prop in specs:
            cypher = (
                f"CREATE CONSTRAINT {constraint_name} IF NOT EXISTS "
                f"FOR ({pattern}) REQUIRE {prop} IS UNIQUE"
            )
            try:
                session.run(cypher)
            except Exception as e:
                # IF NOT EXISTS makes re-creation harmless; report anything else.
                print(f" Constraint (already exists or error): {e}")
    print("Constraints ready.")
37
+
38
+
39
def extract_domain(url: str) -> str:
    """Return the URL's host with a leading "www." stripped.

    Falls back to "unknown" if the URL cannot be parsed.

    Bug fix: the previous `netloc.replace("www.", "")` removed "www."
    anywhere in the host, mangling domains like "sub.www.example.com".
    `removeprefix` only strips a leading occurrence.
    """
    try:
        netloc = urlparse(url).netloc
    except Exception:
        return "unknown"
    return netloc.removeprefix("www.")
44
+
45
+
46
def ingest(items: list[dict], driver=None):
    """Write all nodes and relationships to Neo4j.

    Args:
        items: normalized bookmark dicts (url/title/category/tags/score/source).
        driver: optional externally managed driver; when omitted, one is
            created here and guaranteed to be closed even if ingestion fails.
    """
    own_driver = driver is None
    if own_driver:
        driver = get_driver()

    try:
        setup_constraints(driver)

        total = len(items)
        batch_size = 200
        print(f"Neo4j ingesting {total} items...")

        # Each batch runs in its own write transaction, so a failure only
        # rolls back that batch.
        for start in range(0, total, batch_size):
            batch = items[start:start + batch_size]
            with driver.session(database=config.NEO4J_DATABASE) as session:
                session.execute_write(_write_batch, batch)
            print(f" Neo4j wrote {min(start + batch_size, total)}/{total}")

        print("Building tag co-occurrence edges...")
        _build_tag_cooccurrence(driver)

        print("Neo4j ingestion complete.")
    finally:
        # Fix: previously the locally created driver leaked on any exception.
        if own_driver:
            driver.close()
74
+
75
+
76
def _write_batch(tx, batch: list[dict]):
    """Write one batch of bookmarks (nodes + relationships) in a single tx.

    Runs several small MERGE statements per item; MERGE keeps the whole
    function idempotent across re-ingests.
    """
    for item in batch:
        url = item["url"]
        title = item["title"][:500]
        category = item["category"]
        tags = item["tags"]
        score = float(item["score"])
        source = item["source"]
        domain = extract_domain(url)

        # Bookmark node — keyed by URL; title/score are refreshed on re-ingest.
        tx.run("""
            MERGE (b:Bookmark {url: $url})
            SET b.title = $title, b.score = $score
        """, url=url, title=title, score=score)

        # Category node + relationship
        tx.run("""
            MERGE (c:Category {name: $cat})
            WITH c
            MATCH (b:Bookmark {url: $url})
            MERGE (b)-[:IN_CATEGORY]->(c)
        """, cat=category, url=url)

        # Source node + relationship
        tx.run("""
            MERGE (s:Source {name: $src})
            WITH s
            MATCH (b:Bookmark {url: $url})
            MERGE (b)-[:FROM_SOURCE]->(s)
        """, src=source, url=url)

        # Domain node + relationship (skipped when the URL has no usable host)
        if domain and domain != "unknown":
            tx.run("""
                MERGE (d:Domain {name: $domain})
                WITH d
                MATCH (b:Bookmark {url: $url})
                MERGE (b)-[:FROM_DOMAIN]->(d)
            """, domain=domain, url=url)

        # Tag nodes + relationships — empty tag strings are skipped.
        for tag in tags:
            if not tag:
                continue
            tx.run("""
                MERGE (t:Tag {name: $tag})
                WITH t
                MATCH (b:Bookmark {url: $url})
                MERGE (b)-[:TAGGED]->(t)
            """, tag=tag, url=url)
127
+
128
+
129
def _build_tag_cooccurrence(driver):
    """
    For each bookmark with multiple tags, create CO_OCCURS_WITH edges between tags.
    Weight = number of bookmarks where both tags appear together.

    NOTE(review): re-running this over an existing graph increments r.count
    again for every bookmark, inflating weights — confirm whether counts
    should be reset before a re-ingest.
    """
    with driver.session(database=config.NEO4J_DATABASE) as session:
        # t1.name < t2.name yields each unordered tag pair exactly once.
        session.run("""
            MATCH (b:Bookmark)-[:TAGGED]->(t1:Tag)
            MATCH (b)-[:TAGGED]->(t2:Tag)
            WHERE t1.name < t2.name
            MERGE (t1)-[r:CO_OCCURS_WITH]-(t2)
            ON CREATE SET r.count = 1
            ON MATCH SET r.count = r.count + 1
        """)
    print(" Tag co-occurrence edges built.")
144
+
145
+
146
def add_similar_to_edges(similar_pairs: list[tuple[str, str, float]], driver=None):
    """
    Write SIMILAR_TO edges derived from ChromaDB nearest-neighbor search.

    Args:
        similar_pairs: [(url_a, url_b, similarity_score), ...]
        driver: optional externally managed driver; when omitted, one is
            created here and always closed, even on failure.
    """
    own_driver = driver is None
    if own_driver:
        driver = get_driver()

    try:
        with driver.session(database=config.NEO4J_DATABASE) as session:
            for url_a, url_b, score in similar_pairs:
                # Undirected MERGE: one edge per pair; score refreshed each run.
                session.run("""
                    MATCH (a:Bookmark {url: $url_a})
                    MATCH (b:Bookmark {url: $url_b})
                    MERGE (a)-[r:SIMILAR_TO]-(b)
                    SET r.score = $score
                """, url_a=url_a, url_b=url_b, score=score)

        print(f" SIMILAR_TO: {len(similar_pairs)} edges written.")
    finally:
        # Fix: previously the locally created driver leaked on any exception.
        if own_driver:
            driver.close()
168
+
169
+
170
def query(cypher: str, params: dict | None = None) -> list[dict]:
    """Run arbitrary Cypher and return the results as a list of dicts.

    Fix: the driver is now always closed — previously it leaked whenever
    session.run or result consumption raised.
    """
    driver = get_driver()
    try:
        with driver.session(database=config.NEO4J_DATABASE) as session:
            result = session.run(cypher, params or {})
            # Materialize inside the session: records cannot be consumed
            # after the session closes.
            return [dict(r) for r in result]
    finally:
        driver.close()
178
+
179
+
180
def get_stats() -> dict:
    """Return node counts: {"bookmarks": int, "tags": int, "categories": int}.

    Fix: the previous single chained-MATCH query produced NO rows (so the
    caller saw {}) whenever any one of the three labels had zero nodes;
    counting each label separately always yields all three keys.
    """
    stats: dict = {}
    for key, label in (("bookmarks", "Bookmark"), ("tags", "Tag"), ("categories", "Category")):
        rows = query(f"MATCH (n:{label}) RETURN count(n) AS c")
        stats[key] = rows[0]["c"] if rows else 0
    return stats
188
+
189
+
190
def find_similar(url: str, limit: int = 10) -> list[dict]:
    """Return bookmarks linked to *url* by SIMILAR_TO edges, best first."""
    cypher = """
        MATCH (b:Bookmark {url: $url})-[r:SIMILAR_TO]-(other:Bookmark)
        RETURN other.url AS url, other.title AS title, r.score AS similarity
        ORDER BY r.score DESC LIMIT $limit
    """
    return query(cypher, {"url": url, "limit": limit})
196
+
197
+
198
def find_by_tag(tag: str, limit: int = 20) -> list[dict]:
    """Return bookmarks carrying *tag* (tags are stored lowercase), best first."""
    cypher = """
        MATCH (b:Bookmark)-[:TAGGED]->(t:Tag {name: $tag})
        RETURN b.url AS url, b.title AS title, b.score AS score
        ORDER BY b.score DESC LIMIT $limit
    """
    return query(cypher, {"tag": tag.lower(), "limit": limit})
204
+
205
+
206
def find_tag_cluster(tag: str, hops: int = 2, limit: int = 30) -> list[dict]:
    """Follow CO_OCCURS_WITH edges up to *hops* away and return the bookmarks
    tagged by any related tag, best-scored first.

    *hops* is interpolated into the Cypher pattern (variable-length bounds
    cannot be parameterized), so it is coerced to a positive int first —
    previously any value was pasted into the query string verbatim.
    """
    hops = max(1, int(hops))  # guards the f-string interpolation below
    return query(f"""
        MATCH (t:Tag {{name: $tag}})-[:CO_OCCURS_WITH*1..{hops}]-(related:Tag)
        MATCH (b:Bookmark)-[:TAGGED]->(related)
        RETURN DISTINCT b.url AS url, b.title AS title, b.score AS score, related.name AS via_tag
        ORDER BY b.score DESC LIMIT $limit
    """, {"tag": tag.lower(), "limit": limit})
openmark/ui/__init__.py ADDED
File without changes
openmark/ui/app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenMark Gradio UI — 3 tabs:
3
+ 1. Chat — talk to the LangGraph agent
4
+ 2. Search — instant semantic search with filters
5
+ 3. Stats — knowledge base overview
6
+ """
7
+
8
import sys
import os
# Make the repo root importable when this file is run directly
# (…/openmark/ui/app.py → repo root is three dirname() calls up).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
# Force UTF-8 stdout so the emoji in UI strings print cleanly (e.g. on Windows).
sys.stdout.reconfigure(encoding="utf-8")

import gradio as gr
from openmark.agent.graph import build_agent, ask
from openmark.embeddings.factory import get_embedder
from openmark.stores import chroma as chroma_store
from openmark.stores import neo4j_store
from openmark import config

# Load once at startup — the embedder and agent are heavyweight, so they are
# module-level singletons shared by all UI callbacks.
print("Loading OpenMark...")
_embedder = get_embedder()
_agent = build_agent()
print("OpenMark ready.")
25
+
26
+
27
+ # ── Chat tab ──────────────────────────────────────────────────
28
+
29
def chat_fn(message: str, history: list, thread_id: str):
    """Handle one chat turn: query the agent, append both sides to history.

    Returns (updated history, "") — the empty string clears the input box.
    """
    if not message.strip():
        return history, ""
    reply = ask(_agent, message, thread_id=thread_id or "default")
    history += [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply},
    ]
    return history, ""
36
+
37
+
38
+ # ── Search tab ────────────────────────────────────────────────
39
+
40
def search_fn(query: str, category: str, min_score: float, n_results: int):
    """Run a semantic search and render the hits as one markdown string."""
    if not query.strip():
        return "Enter a search query."

    results = chroma_store.search(
        query,
        _embedder,
        n=int(n_results),
        category=None if category == "All" else category,
        min_score=None if min_score <= 0 else min_score,
    )

    if not results:
        return "No results found."

    rendered = [
        f"**{r['rank']}. {r['title'] or r['url']}**\n"
        f"🔗 {r['url']}\n"
        f"📁 {r['category']} | 📌 {', '.join(t for t in r['tags'] if t)} | "
        f"⭐ {r['score']} | 🎯 {r['similarity']:.3f} similarity\n"
        for r in results
    ]
    return "\n---\n".join(rendered)
64
+
65
+
66
+ # ── Stats tab ─────────────────────────────────────────────────
67
+
68
def stats_fn():
    """Render a markdown overview of the knowledge base (Chroma + Neo4j)."""
    chroma = chroma_store.get_stats()
    neo4j = neo4j_store.get_stats()

    # Category breakdown from Neo4j
    cat_rows = neo4j_store.query("""
        MATCH (b:Bookmark)-[:IN_CATEGORY]->(c:Category)
        RETURN c.name AS category, count(b) AS count
        ORDER BY count DESC
    """)
    # Fixed-width columns so the block aligns inside the ``` fence below.
    cat_lines = "\n".join(f" {r['category']:<35} {r['count']:>5}" for r in cat_rows)

    # Top tags
    tag_rows = neo4j_store.query("""
        MATCH (b:Bookmark)-[:TAGGED]->(t:Tag)
        RETURN t.name AS tag, count(b) AS count
        ORDER BY count DESC LIMIT 20
    """)
    tag_lines = ", ".join(f"{r['tag']} ({r['count']})" for r in tag_rows)

    return (
        f"## OpenMark Knowledge Base\n\n"
        f"**ChromaDB vectors:** {chroma.get('total', 0)}\n"
        f"**Neo4j bookmarks:** {neo4j.get('bookmarks', 0)}\n"
        f"**Neo4j tags:** {neo4j.get('tags', 0)}\n"
        f"**Neo4j categories:** {neo4j.get('categories', 0)}\n\n"
        f"### By Category\n```\n{cat_lines}\n```\n\n"
        f"### Top Tags\n{tag_lines}"
    )
97
+
98
+
99
+ # ── Build UI ──────────────────────────────────────────────────
100
+
101
def build_ui():
    """Assemble the 3-tab Gradio Blocks app (Chat / Search / Stats)."""
    categories = ["All"] + config.CATEGORIES

    with gr.Blocks(title="OpenMark", theme=gr.themes.Soft()) as app:
        gr.Markdown("# OpenMark — Your Personal Knowledge Graph")

        with gr.Tabs():

            # Tab 1: Chat — both the Send button and Enter submit the message.
            with gr.Tab("Chat"):
                thread = gr.Textbox(value="default", label="Session ID", scale=1)
                chatbot = gr.Chatbot(type="messages", height=500)
                msg_box = gr.Textbox(
                    placeholder="Ask anything about your saved bookmarks...",
                    label="Message", lines=2,
                )
                send_btn = gr.Button("Send", variant="primary")

                send_btn.click(
                    chat_fn,
                    inputs=[msg_box, chatbot, thread],
                    outputs=[chatbot, msg_box],
                )
                msg_box.submit(
                    chat_fn,
                    inputs=[msg_box, chatbot, thread],
                    outputs=[chatbot, msg_box],
                )

            # Tab 2: Search — query + filters feed search_fn; output is markdown.
            with gr.Tab("Search"):
                with gr.Row():
                    q_input = gr.Textbox(placeholder="Search your knowledge base...", label="Query", scale=3)
                    cat_input = gr.Dropdown(categories, value="All", label="Category")
                with gr.Row():
                    score_input = gr.Slider(0, 10, value=0, step=1, label="Min Quality Score")
                    n_input = gr.Slider(5, 50, value=10, step=5, label="Results")
                search_btn = gr.Button("Search", variant="primary")
                search_output = gr.Markdown()

                search_btn.click(
                    search_fn,
                    inputs=[q_input, cat_input, score_input, n_input],
                    outputs=search_output,
                )
                q_input.submit(
                    search_fn,
                    inputs=[q_input, cat_input, score_input, n_input],
                    outputs=search_output,
                )

            # Tab 3: Stats — refreshed on demand, and once on initial page load.
            with gr.Tab("Stats"):
                refresh_btn = gr.Button("Refresh Stats")
                stats_output = gr.Markdown()

                refresh_btn.click(stats_fn, outputs=stats_output)
                app.load(stats_fn, outputs=stats_output)

    return app
161
+
162
+
163
if __name__ == "__main__":
    # Bind to all interfaces so the UI is reachable on the LAN;
    # no public Gradio share link is created.
    ui = build_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860, share=False)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chromadb>=1.5.4
2
+ langchain>=0.3.25
3
+ langgraph>=1.0.1
4
+ langchain-openai>=0.3.23
5
+ langchain-neo4j>=0.4.0
6
+ sentence-transformers==3.3.1
7
+ transformers>=4.57.0
8
+ huggingface_hub>=1.6.0
9
+ torch>=2.0.0
10
+ neo4j>=5.28.1
11
+ gradio>=6.6.0
12
+ requests>=2.31.0
13
+ python-dotenv>=1.0.0
14
+ numpy>=1.24.0
scripts/ingest.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenMark Full Ingest Pipeline
3
+ Run this once (or again to update) to:
4
+ 1. Merge all data sources (CATEGORIZED.json + LinkedIn + YouTube)
5
+ 2. Embed everything with chosen provider (local pplx-embed or Azure)
6
+ 3. Store in ChromaDB (semantic search)
7
+ 4. Store in Neo4j (knowledge graph)
8
+ 5. Compute SIMILAR_TO edges (top-5 neighbors per bookmark → graph edges)
9
+
10
+ Usage:
11
+ C:\\Python313\\python scripts/ingest.py
12
+ C:\\Python313\\python scripts/ingest.py --provider azure
13
+ C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
14
+ """
15
+
16
+ import sys
17
+ import os
18
+ import argparse
19
+
20
+ sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
21
+ sys.stdout.reconfigure(encoding="utf-8")
22
+
23
+ from openmark.pipeline.merge import merge_all
24
+ from openmark.embeddings.factory import get_embedder
25
+ from openmark.stores import chroma as chroma_store
26
+ from openmark.stores import neo4j_store
27
+ from openmark import config
28
+
29
+
30
def build_similar_to_edges(items: list[dict], embedder, top_k: int = 5):
    """
    For each item, find its top-k nearest neighbors in ChromaDB and write
    SIMILAR_TO edges in Neo4j. This creates the semantic web inside the graph.

    Self-matches and neighbors at or below 0.5 similarity are skipped.
    Fix: per-item search failures are no longer silently swallowed — they
    are counted and reported at the end (the pass remains best-effort).
    """
    print(f"\nBuilding SIMILAR_TO edges (top-{top_k} per bookmark)...")
    pairs = []
    failures = 0
    total = len(items)

    for i, item in enumerate(items):
        url = item["url"]
        try:
            # +1 because the item itself is usually its own nearest neighbor.
            results = chroma_store.search(
                item["doc_text"], embedder, n=top_k + 1
            )
            for r in results:
                if r["url"] != url and r["similarity"] > 0.5:
                    pairs.append((url, r["url"], r["similarity"]))
        except Exception:
            failures += 1  # one bad item must not stop the whole pass

        if (i + 1) % 500 == 0:
            print(f" Processed {i+1}/{total} for SIMILAR_TO")

    if failures:
        print(f" {failures} items failed similarity search (skipped)")
    print(f" Writing {len(pairs)} SIMILAR_TO edges to Neo4j...")
    neo4j_store.add_similar_to_edges(pairs)
    print(" SIMILAR_TO done.")
58
+
59
+
60
def main():
    """CLI entry point: parse flags, then run the full ingest pipeline."""
    parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
    parser.add_argument("--provider", default=None, help="Embedding provider: local or azure")
    parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
    parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
    args = parser.parse_args()

    if args.provider:
        # NOTE(review): config is imported above — if it reads
        # EMBEDDING_PROVIDER at import time, setting the env var here has no
        # effect on config.EMBEDDING_PROVIDER. Verify that get_embedder()
        # re-reads the environment.
        os.environ["EMBEDDING_PROVIDER"] = args.provider

    print("=" * 60)
    print("OPENMARK INGEST PIPELINE")
    print(f"Embedding: {config.EMBEDDING_PROVIDER}")
    print("=" * 60)

    # Step 1: Merge all sources
    print("\n[1/4] Merging data sources...")
    items = merge_all(include_fresh_raindrop=args.fresh_raindrop)

    # Step 2: Load embedder
    print(f"\n[2/4] Loading {config.EMBEDDING_PROVIDER} embedder...")
    embedder = get_embedder()

    # Step 3: ChromaDB
    print("\n[3/4] Ingesting into ChromaDB...")
    chroma_store.ingest(items, embedder)

    # Step 4: Neo4j
    print("\n[4/4] Ingesting into Neo4j...")
    neo4j_store.ingest(items)

    # Step 5: SIMILAR_TO edges (optional; the [x/4] labels above predate it)
    if not args.skip_similar:
        build_similar_to_edges(items, embedder, top_k=5)

    print("\n" + "=" * 60)
    print("INGEST COMPLETE")
    chroma = chroma_store.get_stats()
    neo4j = neo4j_store.get_stats()
    print(f" ChromaDB: {chroma.get('total', 0)} vectors")
    print(f" Neo4j: {neo4j.get('bookmarks', 0)} bookmarks, {neo4j.get('tags', 0)} tags")
    print("=" * 60)
    print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
    print(" or: C:\\Python313\\python -m openmark.ui.app")
104
+
105
+
106
if __name__ == "__main__":
    # Script entry point — keeps the module importable without side effects.
    main()
scripts/search.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
OpenMark CLI Search — instant search from terminal.

Usage:
    C:\\Python313\\python scripts/search.py "RAG tools"
    C:\\Python313\\python scripts/search.py "LangGraph" --category "Agent Development"
    C:\\Python313\\python scripts/search.py "embeddings" --n 20
    C:\\Python313\\python scripts/search.py --tag "rag"
    C:\\Python313\\python scripts/search.py --stats
"""

import sys
import os
import argparse

# Make the repository root importable when this script is run directly from
# scripts/ (one directory below the openmark package).
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
# Force UTF-8 stdout so titles/tags with non-ASCII characters print cleanly
# (notably on Windows consoles, whose default encoding may not be UTF-8).
sys.stdout.reconfigure(encoding="utf-8")

from openmark.embeddings.factory import get_embedder
from openmark.stores import chroma as chroma_store
from openmark.stores import neo4j_store
+
23
+
24
def print_results(results: list[dict]):
    """Pretty-print search hits to stdout, one entry per result.

    Always shows rank, title (falling back to URL), and URL; category, tags,
    score, and similarity appear only when they carry a truthy value.
    """
    if not results:
        print("No results found.")
        return

    for entry in results:
        heading = entry.get("title") or entry.get("url")
        print(f"\n {entry.get('rank', '-')}. {heading}")
        print(f" {entry.get('url', '')}")
        # Optional metadata, rendered table-style so each field follows the
        # same "only print when truthy" rule.
        optional_fields = (
            ("Category", entry.get("category", "")),
            ("Tags", ", ".join(t for t in entry.get("tags", []) if t)),
            ("Score", entry.get("score", "")),
            ("Similarity", entry.get("similarity", "")),
        )
        for label, value in optional_fields:
            if value:
                print(f" {label}: {value}")
41
+
42
+
43
def main():
    """CLI entry point: stats report, tag lookup, or semantic search.

    Modes (checked in order):
      --stats   print vector/graph store counts and exit
      --tag     graph lookup by tag via Neo4j and exit
      query     embed the query and run a ChromaDB similarity search
    With no query and no mode flag, the argparse help text is shown.
    """
    parser = argparse.ArgumentParser(description="OpenMark CLI Search")
    parser.add_argument("query", nargs="?", default=None, help="Search query")
    parser.add_argument("--category", default=None, help="Filter by category")
    parser.add_argument("--tag", default=None, help="Search by tag (graph lookup)")
    parser.add_argument("--n", type=int, default=10, help="Number of results")
    parser.add_argument("--stats", action="store_true", help="Show knowledge base stats")
    args = parser.parse_args()

    # Mode 1: knowledge-base statistics from both stores.
    if args.stats:
        chroma = chroma_store.get_stats()
        neo4j = neo4j_store.get_stats()
        print("\nOpenMark Stats:")
        print(f" ChromaDB vectors: {chroma.get('total', 0)}")
        print(f" Neo4j bookmarks: {neo4j.get('bookmarks', 0)}")
        print(f" Neo4j tags: {neo4j.get('tags', 0)}")
        return

    # Mode 2: exact tag lookup through the graph store.
    if args.tag:
        print(f"\nSearching by tag: '{args.tag}'")
        for hit in neo4j_store.find_by_tag(args.tag, limit=args.n):
            print(f"\n - {hit.get('title', '')}")
            print(f" {hit.get('url', '')} (score: {hit.get('score', '')})")
        return

    # No query and no mode flag: show usage instead of searching.
    if not args.query:
        parser.print_help()
        return

    # Mode 3: semantic search over the vector store.
    print(f"\nSearching: '{args.query}'")
    if args.category:
        print(f"Category filter: {args.category}")

    model = get_embedder()
    hits = chroma_store.search(
        args.query, model, n=args.n, category=args.category
    )
    print_results(hits)


if __name__ == "__main__":
    main()