Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- .env.example +26 -0
- .gitattributes +8 -35
- .gitignore +45 -0
- LICENSE +21 -0
- README.md +267 -31
- app.py +8 -0
- data/.gitkeep +0 -0
- docs/architecture.md +188 -0
- docs/data-collection.md +144 -0
- docs/huggingface.md +129 -0
- docs/ingest.md +154 -0
- docs/troubleshooting.md +127 -0
- openmark/__init__.py +0 -0
- openmark/agent/__init__.py +0 -0
- openmark/agent/graph.py +70 -0
- openmark/agent/tools.py +138 -0
- openmark/config.py +73 -0
- openmark/embeddings/__init__.py +0 -0
- openmark/embeddings/azure.py +42 -0
- openmark/embeddings/base.py +21 -0
- openmark/embeddings/factory.py +15 -0
- openmark/embeddings/local.py +54 -0
- openmark/pipeline/__init__.py +0 -0
- openmark/pipeline/merge.py +97 -0
- openmark/pipeline/normalize.py +98 -0
- openmark/pipeline/raindrop.py +83 -0
- openmark/stores/__init__.py +0 -0
- openmark/stores/chroma.py +127 -0
- openmark/stores/neo4j_store.py +213 -0
- openmark/ui/__init__.py +0 -0
- openmark/ui/app.py +165 -0
- requirements.txt +14 -0
- scripts/ingest.py +107 -0
- scripts/search.py +85 -0
.env.example
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Embedding Provider: "local" or "azure" ───────────────────
|
| 2 |
+
EMBEDDING_PROVIDER=local
|
| 3 |
+
|
| 4 |
+
# ── Local pplx-embed ─────────────────────────────────────────
|
| 5 |
+
PPLX_QUERY_MODEL=perplexity-ai/pplx-embed-v1-0.6b
|
| 6 |
+
PPLX_DOC_MODEL=perplexity-ai/pplx-embed-context-v1-0.6b
|
| 7 |
+
|
| 8 |
+
# ── Azure AI Foundry ──────────────────────────────────────────
|
| 9 |
+
AZURE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
| 10 |
+
AZURE_API_KEY=your-azure-api-key
|
| 11 |
+
AZURE_DEPLOYMENT_LLM=gpt-4o-mini
|
| 12 |
+
AZURE_DEPLOYMENT_EMBED=text-embedding-ada-002
|
| 13 |
+
AZURE_API_VERSION=2024-05-01-preview
|
| 14 |
+
|
| 15 |
+
# ── Neo4j ─────────────────────────────────────────────────────
|
| 16 |
+
NEO4J_URI=bolt://127.0.0.1:7687
|
| 17 |
+
NEO4J_USER=neo4j
|
| 18 |
+
NEO4J_PASSWORD=your-neo4j-password
|
| 19 |
+
NEO4J_DATABASE=db1
|
| 20 |
+
|
| 21 |
+
# ── Raindrop ──────────────────────────────────────────────────
|
| 22 |
+
RAINDROP_TOKEN=your-raindrop-test-token
|
| 23 |
+
|
| 24 |
+
# ── Data paths ────────────────────────────────────────────────
|
| 25 |
+
RAINDROP_MISSION_DIR=C:\path\to\raindrop-mission
|
| 26 |
+
CHROMA_PATH=C:\path\to\OpenMark\data\chroma_db
|
.gitattributes
CHANGED
|
@@ -1,35 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
*
|
| 3 |
-
*.
|
| 4 |
-
*.
|
| 5 |
-
*.
|
| 6 |
-
*.
|
| 7 |
-
*.
|
| 8 |
-
*.
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Normalize line endings
|
| 2 |
+
* text=auto
|
| 3 |
+
*.py text eol=lf
|
| 4 |
+
*.md text eol=lf
|
| 5 |
+
*.txt text eol=lf
|
| 6 |
+
*.json text eol=lf
|
| 7 |
+
*.env text eol=lf
|
| 8 |
+
*.gitignore text eol=lf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Credentials — NEVER commit ───────────────────────────────
|
| 2 |
+
.env
|
| 3 |
+
|
| 4 |
+
# ── Personal data — your bookmark vectors ────────────────────
|
| 5 |
+
data/chroma_db/
|
| 6 |
+
|
| 7 |
+
# ── Python ────────────────────────────────────────────────────
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.py[cod]
|
| 10 |
+
*.pyo
|
| 11 |
+
*.pyd
|
| 12 |
+
.Python
|
| 13 |
+
*.egg-info/
|
| 14 |
+
dist/
|
| 15 |
+
build/
|
| 16 |
+
*.egg
|
| 17 |
+
.eggs/
|
| 18 |
+
|
| 19 |
+
# ── Virtual environments ──────────────────────────────────────
|
| 20 |
+
venv/
|
| 21 |
+
.venv/
|
| 22 |
+
env/
|
| 23 |
+
ENV/
|
| 24 |
+
|
| 25 |
+
# ── IDE ───────────────────────────────────────────────────────
|
| 26 |
+
.idea/
|
| 27 |
+
.vscode/
|
| 28 |
+
*.swp
|
| 29 |
+
*.swo
|
| 30 |
+
.DS_Store
|
| 31 |
+
Thumbs.db
|
| 32 |
+
|
| 33 |
+
# ── Logs & temp ───────────────────────────────────────────────
|
| 34 |
+
*.log
|
| 35 |
+
*.tmp
|
| 36 |
+
*.bak
|
| 37 |
+
|
| 38 |
+
# ── HuggingFace cache (large model files) ────────────────────
|
| 39 |
+
.cache/
|
| 40 |
+
|
| 41 |
+
# ── Raw data exports — personal, not for the repo ────────────
|
| 42 |
+
raindrop-mission/
|
| 43 |
+
data/linkedin_saved.json
|
| 44 |
+
data/youtube_MASTER.json
|
| 45 |
+
data/CATEGORIZED.json
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Ahmad Othman Ammar Adi
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,31 +1,267 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
---
|
| 22 |
-
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenMark
|
| 2 |
+
|
| 3 |
+
**Your personal knowledge graph — built from everything you've ever saved.**
|
| 4 |
+
|
| 5 |
+
OpenMark ingests your bookmarks, LinkedIn saved posts, and YouTube videos into a dual-store knowledge system: **ChromaDB** for semantic vector search and **Neo4j** for graph-based connection discovery. A LangGraph agent sits on top, letting you query everything in natural language.
|
| 6 |
+
|
| 7 |
+
Built by [Ahmad Othman Ammar Adi](https://github.com/OthmanAdi).
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## What it does
|
| 12 |
+
|
| 13 |
+
- Pulls all your saved content from multiple sources into one place
|
| 14 |
+
- Embeds everything using [pplx-embed](https://huggingface.co/collections/perplexity-ai/pplx-embed) (local, free) or Azure AI Foundry (fast, cheap)
|
| 15 |
+
- Stores vectors in **ChromaDB** — find things by *meaning*, not keywords
|
| 16 |
+
- Builds a **Neo4j knowledge graph** — discover how topics connect
|
| 17 |
+
- Runs a **LangGraph agent** (powered by gpt-4o-mini) that searches both stores intelligently
|
| 18 |
+
- Serves a **Gradio UI** with Chat, Search, and Stats tabs
|
| 19 |
+
- Also works as a **CLI** — `python scripts/search.py "RAG tools"`
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Data Sources
|
| 24 |
+
|
| 25 |
+
### 1. Raindrop.io
|
| 26 |
+
|
| 27 |
+
Create a test token at [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations).
|
| 28 |
+
OpenMark pulls **all collections** automatically via the Raindrop REST API.
|
| 29 |
+
|
| 30 |
+
### 2. Browser Bookmarks
|
| 31 |
+
|
| 32 |
+
Export your bookmarks as an HTML file from Edge, Chrome, or Firefox:
|
| 33 |
+
- **Edge:** `Settings → Favourites → ··· → Export favourites` → save as `favorites.html`
|
| 34 |
+
- **Chrome/Firefox:** `Bookmarks Manager → Export`
|
| 35 |
+
|
| 36 |
+
Point `RAINDROP_MISSION_DIR` in your `.env` to the folder containing the exported HTML files.
|
| 37 |
+
The pipeline parses the Netscape bookmark format automatically.
|
| 38 |
+
|
| 39 |
+
### 3. LinkedIn Saved Posts
|
| 40 |
+
|
| 41 |
+
LinkedIn does not provide a public API for saved posts. The included `linkedin_fetch.py` script uses your browser session cookie to call LinkedIn's internal Voyager GraphQL API.
|
| 42 |
+
|
| 43 |
+
**Steps:**
|
| 44 |
+
1. Log into LinkedIn in your browser
|
| 45 |
+
2. Open DevTools → Application → Cookies → copy the value of `li_at`
|
| 46 |
+
3. Run:
|
| 47 |
+
```bash
|
| 48 |
+
python raindrop-mission/linkedin_fetch.py
|
| 49 |
+
```
|
| 50 |
+
Paste your `li_at` cookie when prompted. The script fetches all saved posts and writes `linkedin_saved.json`.
|
| 51 |
+
|
| 52 |
+
> **Personal use only.** This uses LinkedIn's internal API which is not publicly documented or officially supported. Use responsibly.
|
| 53 |
+
|
| 54 |
+
### 4. YouTube
|
| 55 |
+
|
| 56 |
+
Uses the official [YouTube Data API v3](https://developers.google.com/youtube/v3) via OAuth 2.0.
|
| 57 |
+
|
| 58 |
+
**Steps:**
|
| 59 |
+
1. Go to [Google Cloud Console](https://console.cloud.google.com/) → Create a project
|
| 60 |
+
2. Enable the **YouTube Data API v3**
|
| 61 |
+
3. Create OAuth 2.0 credentials → Download as `client_secret.json`
|
| 62 |
+
4. Add your Google account as a test user (OAuth consent screen → Test users)
|
| 63 |
+
5. Run:
|
| 64 |
+
```bash
|
| 65 |
+
python raindrop-mission/youtube_fetch.py
|
| 66 |
+
```
|
| 67 |
+
A browser window opens for auth. After that, `youtube_MASTER.json` is written with liked videos, watch later, and playlists.
|
| 68 |
+
|
| 69 |
+
---
|
| 70 |
+
|
| 71 |
+
## How it works
|
| 72 |
+
|
| 73 |
+
```
|
| 74 |
+
Your saved content
|
| 75 |
+
│
|
| 76 |
+
▼
|
| 77 |
+
normalize.py ← clean titles, dedupe by URL, fix categories
|
| 78 |
+
│
|
| 79 |
+
▼
|
| 80 |
+
EmbeddingProvider ← LOCAL: pplx-embed-context-v1-0.6b (documents)
|
| 81 |
+
pplx-embed-v1-0.6b (queries)
|
| 82 |
+
AZURE: text-embedding-ada-002
|
| 83 |
+
│
|
| 84 |
+
├──────────────────────────────────┐
|
| 85 |
+
▼ ▼
|
| 86 |
+
ChromaDB Neo4j
|
| 87 |
+
(vector store) (knowledge graph)
|
| 88 |
+
find by meaning find by connection
|
| 89 |
+
|
| 90 |
+
"show me RAG tools" "what connects LangGraph
|
| 91 |
+
to my Neo4j saves?"
|
| 92 |
+
│ │
|
| 93 |
+
└──────────────┬───────────────────┘
|
| 94 |
+
▼
|
| 95 |
+
LangGraph Agent
|
| 96 |
+
(gpt-4o-mini)
|
| 97 |
+
│
|
| 98 |
+
▼
|
| 99 |
+
Gradio UI / CLI
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
### Why embeddings?
|
| 103 |
+
|
| 104 |
+
An embedding is a list of numbers that represents the *meaning* of a piece of text. Two pieces of text with similar meaning will have similar numbers — even if they use completely different words. This is how OpenMark finds "retrieval augmented generation tutorials" when you search "RAG tools."
|
| 105 |
+
|
| 106 |
+
### Why ChromaDB?
|
| 107 |
+
|
| 108 |
+
ChromaDB stores those embedding vectors locally on your disk. It's a persistent vector database — no server, no cloud, no API key. When you search, it compares your query's embedding against all stored embeddings and returns the closest matches.
|
| 109 |
+
|
| 110 |
+
### Why Neo4j?
|
| 111 |
+
|
| 112 |
+
Embeddings answer "what's similar?" — Neo4j answers "how are these connected?" Every bookmark is a node. Tags, categories, domains, and sources are also nodes. Edges connect them. After ingestion, OpenMark also writes `SIMILAR_TO` edges derived from embedding neighbors — so the graph contains semantic connections you never manually created. You can then traverse: *"start from this LangChain article, walk similar-to 2 hops, what clusters emerge?"*
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
|
| 116 |
+
## Requirements
|
| 117 |
+
|
| 118 |
+
- Python 3.13
|
| 119 |
+
- Neo4j Desktop (local) or AuraDB (cloud) — [neo4j.com/download](https://neo4j.com/download/)
|
| 120 |
+
- **Either** Azure AI Foundry account **or** enough disk space for local pplx-embed (~1.2 GB)
|
| 121 |
+
|
| 122 |
+
---
|
| 123 |
+
|
| 124 |
+
## Setup
|
| 125 |
+
|
| 126 |
+
### 1. Clone and install
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
git clone https://github.com/OthmanAdi/OpenMark.git
|
| 130 |
+
cd OpenMark
|
| 131 |
+
pip install -r requirements.txt
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
### 2. Configure
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
cp .env.example .env
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
Edit `.env` with your values:
|
| 141 |
+
|
| 142 |
+
```env
|
| 143 |
+
# Choose your embedding provider
|
| 144 |
+
EMBEDDING_PROVIDER=local # or: azure
|
| 145 |
+
|
| 146 |
+
# Azure AI Foundry (required if EMBEDDING_PROVIDER=azure, also used for the LLM agent)
|
| 147 |
+
AZURE_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
|
| 148 |
+
AZURE_API_KEY=your-key
|
| 149 |
+
AZURE_DEPLOYMENT_LLM=gpt-4o-mini
|
| 150 |
+
AZURE_DEPLOYMENT_EMBED=text-embedding-ada-002
|
| 151 |
+
|
| 152 |
+
# Neo4j
|
| 153 |
+
NEO4J_URI=bolt://127.0.0.1:7687
|
| 154 |
+
NEO4J_USER=neo4j
|
| 155 |
+
NEO4J_PASSWORD=your-password
|
| 156 |
+
NEO4J_DATABASE=neo4j
|
| 157 |
+
|
| 158 |
+
# Raindrop (get token at app.raindrop.io/settings/integrations)
|
| 159 |
+
RAINDROP_TOKEN=your-token
|
| 160 |
+
|
| 161 |
+
# Path to your raindrop-mission data folder
|
| 162 |
+
RAINDROP_MISSION_DIR=C:\path\to\raindrop-mission
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
### 3. Ingest
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
# Local embeddings (free, ~20 min for 8K items on CPU)
|
| 169 |
+
python scripts/ingest.py
|
| 170 |
+
|
| 171 |
+
# Azure embeddings (fast, ~5 min, costs ~€0.30 for 8K items)
|
| 172 |
+
python scripts/ingest.py --provider azure
|
| 173 |
+
|
| 174 |
+
# Also pull fresh from Raindrop API during ingest
|
| 175 |
+
python scripts/ingest.py --fresh-raindrop
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### 4. Search (CLI)
|
| 179 |
+
|
| 180 |
+
```bash
|
| 181 |
+
python scripts/search.py "RAG tools"
|
| 182 |
+
python scripts/search.py "LangGraph" --category "Agent Development"
|
| 183 |
+
python scripts/search.py --tag "rag"
|
| 184 |
+
python scripts/search.py --stats
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
### 5. Launch UI
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
python openmark/ui/app.py
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
Open [http://localhost:7860](http://localhost:7860)
|
| 194 |
+
|
| 195 |
+
---
|
| 196 |
+
|
| 197 |
+
## Required API Keys
|
| 198 |
+
|
| 199 |
+
| Key | Where to get it | Required? |
|
| 200 |
+
|-----|----------------|-----------|
|
| 201 |
+
| `RAINDROP_TOKEN` | [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations) | Yes |
|
| 202 |
+
| `AZURE_API_KEY` | Azure Portal → your AI Foundry resource | Only if `EMBEDDING_PROVIDER=azure` |
|
| 203 |
+
| `NEO4J_PASSWORD` | Set when creating your Neo4j database | Yes |
|
| 204 |
+
| YouTube OAuth | Google Cloud Console → YouTube Data API v3 | Only if ingesting YouTube |
|
| 205 |
+
|
| 206 |
+
No HuggingFace token is needed for local pplx-embed. The models are open weights and download automatically. You will see a warning `"You are sending unauthenticated requests to the HF Hub"` — this is harmless and can be silenced by setting `HF_TOKEN` in your `.env` if you want higher rate limits.
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
## Project Structure
|
| 211 |
+
|
| 212 |
+
```
|
| 213 |
+
OpenMark/
|
| 214 |
+
├── openmark/
|
| 215 |
+
│ ├── config.py ← all settings loaded from .env
|
| 216 |
+
│ ├── pipeline/
|
| 217 |
+
│ │ ├── raindrop.py ← pull all Raindrop collections via API
|
| 218 |
+
│ │ ├── normalize.py ← clean, dedupe, build embedding text
|
| 219 |
+
│ │ └── merge.py ← combine all sources
|
| 220 |
+
│ ├── embeddings/
|
| 221 |
+
│ │ ├── base.py ← abstract EmbeddingProvider interface
|
| 222 |
+
│ │ ├── local.py ← pplx-embed (local, free)
|
| 223 |
+
│ │ ├── azure.py ← Azure AI Foundry
|
| 224 |
+
│ │ └── factory.py ← returns provider based on .env
|
| 225 |
+
│ ├── stores/
|
| 226 |
+
│ │ ├── chroma.py ← ChromaDB: ingest + semantic search
|
| 227 |
+
│ │ └── neo4j_store.py ← Neo4j: graph nodes, edges, traversal
|
| 228 |
+
│ ├── agent/
|
| 229 |
+
│ │ ├── tools.py ← LangGraph tools (search, tag, graph)
|
| 230 |
+
│ │ └── graph.py ← create_react_agent with gpt-4o-mini
|
| 231 |
+
│ └── ui/
|
| 232 |
+
│ └── app.py ← Gradio UI (Chat / Search / Stats)
|
| 233 |
+
└── scripts/
|
| 234 |
+
├── ingest.py ← full pipeline runner
|
| 235 |
+
└── search.py ← CLI search
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
---
|
| 239 |
+
|
| 240 |
+
## Roadmap
|
| 241 |
+
|
| 242 |
+
- [ ] OpenAI embeddings integration
|
| 243 |
+
- [ ] Ollama local LLM support
|
| 244 |
+
- [ ] Pinecone vector store option
|
| 245 |
+
- [ ] Web scraping — fetch full page content for richer embeddings
|
| 246 |
+
- [ ] Browser extension for real-time saving to OpenMark
|
| 247 |
+
- [ ] Comet / Arc browser bookmark import
|
| 248 |
+
- [ ] Automatic re-ingestion on schedule
|
| 249 |
+
- [ ] Export to Obsidian / Notion
|
| 250 |
+
- [ ] Multi-user support
|
| 251 |
+
|
| 252 |
+
---
|
| 253 |
+
|
| 254 |
+
## Documentation
|
| 255 |
+
|
| 256 |
+
| Doc | What's in it |
|
| 257 |
+
|-----|-------------|
|
| 258 |
+
| [docs/data-collection.md](docs/data-collection.md) | Full guide for each data source — Raindrop, Edge, LinkedIn cookie method, YouTube OAuth, daily.dev console script |
|
| 259 |
+
| [docs/ingest.md](docs/ingest.md) | All ingest flags, timing for each step, how SIMILAR_TO edges work, re-run behavior |
|
| 260 |
+
| [docs/architecture.md](docs/architecture.md) | Dual-store design, Neo4j graph schema, embedding patches, Cypher query examples, agent tools |
|
| 261 |
+
| [docs/troubleshooting.md](docs/troubleshooting.md) | pplx-embed compatibility fixes, LinkedIn queryId changes, Neo4j connection issues, Windows encoding |
|
| 262 |
+
|
| 263 |
+
---
|
| 264 |
+
|
| 265 |
+
## License
|
| 266 |
+
|
| 267 |
+
MIT
|
app.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""HuggingFace Space entry point — launches the OpenMark Gradio UI."""
import os
import sys

# Make the repo root importable so `openmark` resolves even when the Space
# (or a user) runs this file from a different working directory.
# os.path.dirname(__file__) alone can be "" for a relative invocation like
# `python app.py`, which would silently depend on the cwd — abspath fixes that.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from openmark.ui.app import build_ui

if __name__ == "__main__":
    # Build the Gradio Blocks app and serve it (default: http://localhost:7860).
    ui = build_ui()
    ui.launch()
|
data/.gitkeep
ADDED
|
File without changes
|
docs/architecture.md
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
OpenMark uses a **dual-store architecture** — two databases working together, each doing what it's best at.
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
User Query
|
| 9 |
+
│
|
| 10 |
+
LangGraph Agent
|
| 11 |
+
(gpt-4o-mini)
|
| 12 |
+
/ \
|
| 13 |
+
ChromaDB Neo4j
|
| 14 |
+
(vector store) (graph store)
|
| 15 |
+
|
| 16 |
+
"find by meaning" "find by connection"
|
| 17 |
+
"what's similar?" "how are things linked?"
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Embedding Layer
|
| 23 |
+
|
| 24 |
+
The embedding layer is **provider-agnostic** — swap between local and cloud with one env var.
|
| 25 |
+
|
| 26 |
+
```
|
| 27 |
+
EMBEDDING_PROVIDER=local → LocalEmbedder (pplx-embed, runs on your machine)
|
| 28 |
+
EMBEDDING_PROVIDER=azure → AzureEmbedder (Azure AI Foundry, API call)
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
**Why two pplx-embed models?**
|
| 32 |
+
|
| 33 |
+
Perplexity AI ships two variants:
|
| 34 |
+
- `pplx-embed-v1-0.6b` — for encoding **queries** (what the user types)
|
| 35 |
+
- `pplx-embed-context-v1-0.6b` — for encoding **documents** (the bookmarks, surrounding context matters)
|
| 36 |
+
|
| 37 |
+
Using the correct model for each role improves retrieval quality. Most implementations take the shortcut of using one model for both; the dual-model setup used here is the correct production pattern.
|
| 38 |
+
|
| 39 |
+
**The compatibility patches:**
|
| 40 |
+
|
| 41 |
+
pplx-embed models ship with custom Python code (`st_quantize.py`) that has two incompatibilities with modern libraries:
|
| 42 |
+
|
| 43 |
+
1. **`sentence_transformers 4.x` removed the `Module` base class** — pplx-embed's code imports it. Fixed by installing `torch.nn.Module` under the `sentence_transformers.models.Module` name before the model code is imported.
|
| 44 |
+
|
| 45 |
+
2. **`transformers 4.57` added `list_repo_templates()`** — it looks for an `additional_chat_templates` folder in every model repo. pplx-embed doesn't have one, causing a hard 404 crash. Fixed by monkey-patching the function to return an empty list on exception.
|
| 46 |
+
|
| 47 |
+
Both patches are applied in `openmark/embeddings/local.py` before any model loading.
|
| 48 |
+
|
| 49 |
+
**Why `sentence-transformers==3.3.1` specifically?**
|
| 50 |
+
|
| 51 |
+
Version 4.x removed the `Module` base class that pplx-embed depends on. Pin to 3.3.1.
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## ChromaDB
|
| 56 |
+
|
| 57 |
+
Local, file-based vector database. No server, no API key, no cloud.
|
| 58 |
+
|
| 59 |
+
**Collection:** `openmark_bookmarks`
|
| 60 |
+
**Similarity metric:** cosine
|
| 61 |
+
**Data path:** `CHROMA_PATH` in `.env` (default: `OpenMark/data/chroma_db/`)
|
| 62 |
+
|
| 63 |
+
**What's stored per item:**
|
| 64 |
+
```python
|
| 65 |
+
{
|
| 66 |
+
"id": url, # primary key
|
| 67 |
+
"document": doc_text, # rich text used for embedding
|
| 68 |
+
"metadata": {
|
| 69 |
+
"title": str,
|
| 70 |
+
"category": str,
|
| 71 |
+
"source": str, # raindrop, linkedin, youtube_liked, edge, etc.
|
| 72 |
+
"score": float, # quality score 1-10
|
| 73 |
+
"tags": str, # comma-separated
|
| 74 |
+
"folder": str,
|
| 75 |
+
},
|
| 76 |
+
"embedding": [float x 1024] # or 1536 for Azure
|
| 77 |
+
}
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
**Querying:**
|
| 81 |
+
```python
|
| 82 |
+
collection.query(
|
| 83 |
+
query_embeddings=[embedder.embed_query("RAG tools")],
|
| 84 |
+
n_results=10,
|
| 85 |
+
where={"category": {"$eq": "RAG & Vector Search"}}, # optional filter
|
| 86 |
+
)
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## Neo4j Graph Schema
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
(:Bookmark {url, title, score})
|
| 95 |
+
-[:IN_CATEGORY]-> (:Category {name})
|
| 96 |
+
-[:TAGGED]-> (:Tag {name})
|
| 97 |
+
-[:FROM_SOURCE]-> (:Source {name})
|
| 98 |
+
-[:FROM_DOMAIN]-> (:Domain {name})
|
| 99 |
+
-[:SIMILAR_TO {score}]-> (:Bookmark) ← from embeddings
|
| 100 |
+
|
| 101 |
+
(:Tag)-[:CO_OCCURS_WITH {count}]-(:Tag) ← tags that appear together
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Useful Cypher queries:**
|
| 105 |
+
|
| 106 |
+
```cypher
|
| 107 |
+
// Count everything
|
| 108 |
+
MATCH (b:Bookmark) RETURN count(b) AS bookmarks
|
| 109 |
+
MATCH (t:Tag) RETURN count(t) AS tags
|
| 110 |
+
|
| 111 |
+
// Top categories
|
| 112 |
+
MATCH (b:Bookmark)-[:IN_CATEGORY]->(c:Category)
|
| 113 |
+
RETURN c.name, count(b) AS count ORDER BY count DESC
|
| 114 |
+
|
| 115 |
+
// All bookmarks tagged 'rag'
|
| 116 |
+
MATCH (b:Bookmark)-[:TAGGED]->(t:Tag {name: 'rag'})
|
| 117 |
+
RETURN b.title, b.url ORDER BY b.score DESC
|
| 118 |
+
|
| 119 |
+
// Find what connects to 'langchain' tag (2 hops)
|
| 120 |
+
MATCH (t:Tag {name: 'langchain'})-[:CO_OCCURS_WITH*1..2]-(related:Tag)
|
| 121 |
+
RETURN related.name, count(*) AS strength ORDER BY strength DESC
|
| 122 |
+
|
| 123 |
+
// Similar bookmarks to a URL
|
| 124 |
+
MATCH (b:Bookmark {url: 'https://...'})-[r:SIMILAR_TO]-(other)
|
| 125 |
+
RETURN other.title, other.url, r.score ORDER BY r.score DESC
|
| 126 |
+
|
| 127 |
+
// Most connected domains
|
| 128 |
+
MATCH (b:Bookmark)-[:FROM_DOMAIN]->(d:Domain)
|
| 129 |
+
RETURN d.name, count(b) AS saved ORDER BY saved DESC LIMIT 20
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
---
|
| 133 |
+
|
| 134 |
+
## LangGraph Agent
|
| 135 |
+
|
| 136 |
+
Built with `create_react_agent` from LangGraph 1.0.x.
|
| 137 |
+
|
| 138 |
+
**Model:** Azure gpt-4o-mini (streaming enabled)
|
| 139 |
+
**Memory:** `MemorySaver` — conversation history persists per `thread_id` within a session
|
| 140 |
+
|
| 141 |
+
**Tools:**
|
| 142 |
+
|
| 143 |
+
| Tool | Store | Description |
|
| 144 |
+
|------|-------|-------------|
|
| 145 |
+
| `search_semantic` | ChromaDB | Natural language vector search |
|
| 146 |
+
| `search_by_category` | ChromaDB | Filter by category + optional query |
|
| 147 |
+
| `find_by_tag` | Neo4j | Exact tag lookup |
|
| 148 |
+
| `find_similar_bookmarks` | Neo4j | SIMILAR_TO edge traversal |
|
| 149 |
+
| `explore_tag_cluster` | Neo4j | CO_OCCURS_WITH traversal (2 hops) |
|
| 150 |
+
| `get_stats` | Both | Count totals |
|
| 151 |
+
| `run_cypher` | Neo4j | Raw Cypher for power users |
|
| 152 |
+
|
| 153 |
+
**Agent routing:** The LLM decides which tool(s) to call based on the query. For "what do I know about RAG" it will call `search_semantic` + `search_by_category` + `find_by_tag`. For "how does LangGraph connect to my Neo4j saves" it will call `explore_tag_cluster` and `run_cypher`.
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
## Gradio UI
|
| 158 |
+
|
| 159 |
+
Three tabs:
|
| 160 |
+
|
| 161 |
+
| Tab | What it does |
|
| 162 |
+
|-----|-------------|
|
| 163 |
+
| Chat | Full LangGraph agent conversation. Remembers context within session. |
|
| 164 |
+
| Search | Direct ChromaDB search with category filter, min score slider, result count. |
|
| 165 |
+
| Stats | Neo4j category breakdown + top tags. Loads on startup. |
|
| 166 |
+
|
| 167 |
+
Run: `python openmark/ui/app.py` → `http://localhost:7860`
|
| 168 |
+
|
| 169 |
+
---
|
| 170 |
+
|
| 171 |
+
## Data Flow Summary
|
| 172 |
+
|
| 173 |
+
```
|
| 174 |
+
Source files (JSON, HTML)
|
| 175 |
+
│
|
| 176 |
+
merge.py → normalize.py
|
| 177 |
+
│
|
| 178 |
+
8,007 items with doc_text
|
| 179 |
+
│
|
| 180 |
+
EmbeddingProvider.embed_documents()
|
| 181 |
+
│
|
| 182 |
+
┌────┴────┐
|
| 183 |
+
│ │
|
| 184 |
+
ChromaDB Neo4j
|
| 185 |
+
add() MERGE nodes + relationships
|
| 186 |
+
CO_OCCURS_WITH edges
|
| 187 |
+
SIMILAR_TO edges (from ChromaDB top-5 per item)
|
| 188 |
+
```
|
docs/data-collection.md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Collection Guide
|
| 2 |
+
|
| 3 |
+
Everything you need to collect your saved content from each source before running the ingest pipeline.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 1. Raindrop.io
|
| 8 |
+
|
| 9 |
+
OpenMark pulls **all your Raindrop collections automatically** via the official REST API. You just need a token.
|
| 10 |
+
|
| 11 |
+
**Steps:**
|
| 12 |
+
1. Go to [app.raindrop.io/settings/integrations](https://app.raindrop.io/settings/integrations)
|
| 13 |
+
2. Under "For Developers" → click **Create new app**
|
| 14 |
+
3. Copy the **Test token** (permanent, no expiry)
|
| 15 |
+
4. Add to `.env`:
|
| 16 |
+
```
|
| 17 |
+
RAINDROP_TOKEN=your-token-here
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
The pipeline fetches every collection, every sub-collection, and every unsorted raindrop automatically. No manual export needed.
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## 2. Browser Bookmarks (Edge / Chrome / Firefox)
|
| 25 |
+
|
| 26 |
+
Export your bookmarks as an HTML file in the Netscape bookmark format (all browsers support this).
|
| 27 |
+
|
| 28 |
+
**Edge:**
|
| 29 |
+
`Settings → Favourites → ··· (three dots) → Export favourites` → save as `favorites.html`
|
| 30 |
+
|
| 31 |
+
**Chrome:**
|
| 32 |
+
`Bookmarks Manager (Ctrl+Shift+O) → ··· → Export bookmarks` → save as `bookmarks.html`
|
| 33 |
+
|
| 34 |
+
**Firefox:**
|
| 35 |
+
`Bookmarks → Manage Bookmarks → Import and Backup → Export Bookmarks to HTML`
|
| 36 |
+
|
| 37 |
+
**After exporting:**
|
| 38 |
+
- Place the HTML file(s) in your `raindrop-mission` folder (or wherever `RAINDROP_MISSION_DIR` points)
|
| 39 |
+
- The pipeline (`merge.py`) looks for `favorites_*.html` and `bookmarks_*.html` patterns
|
| 40 |
+
- It parses the Netscape format and extracts URLs + titles + folder structure
|
| 41 |
+
|
| 42 |
+
> **Tip:** Export fresh before every ingest to capture new bookmarks.
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## 3. LinkedIn Saved Posts
|
| 47 |
+
|
| 48 |
+
LinkedIn has no public API for saved posts. OpenMark uses LinkedIn's internal **Voyager GraphQL API** — the same API the LinkedIn web app uses internally.
|
| 49 |
+
|
| 50 |
+
**This is the exact endpoint used:**
|
| 51 |
+
```
|
| 52 |
+
https://www.linkedin.com/voyager/api/graphql
|
| 53 |
+
?variables=(start:0,count:10,paginationToken:null,
|
| 54 |
+
query:(flagshipSearchIntent:SEARCH_MY_ITEMS_SAVED_POSTS))
|
| 55 |
+
&queryId=voyagerSearchDashClusters.05111e1b90ee7fea15bebe9f9410ced9
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
**How to get your session cookie:**
|
| 59 |
+
|
| 60 |
+
1. Log into LinkedIn in your browser
|
| 61 |
+
2. Open DevTools (`F12`) → **Application** tab → **Cookies** → `https://www.linkedin.com`
|
| 62 |
+
3. Find the cookie named `li_at` — copy its value
|
| 63 |
+
4. Also find `JSESSIONID` — copy its value (used as CSRF token, format: `ajax:XXXXXXXXXXXXXXXXXX`)
|
| 64 |
+
|
| 65 |
+
**Run the fetch script:**
|
| 66 |
+
```bash
|
| 67 |
+
python raindrop-mission/linkedin_fetch.py
|
| 68 |
+
```
|
| 69 |
+
Paste your `li_at` value when prompted.
|
| 70 |
+
|
| 71 |
+
**Output:** `raindrop-mission/linkedin_saved.json` — 1,260 saved posts with author, content, and URL.
|
| 72 |
+
|
| 73 |
+
**Pagination:** LinkedIn returns 10 posts per page. The script detects end of results when no `nextPageToken` is returned. With 1,260 posts that's ~126 pages.
|
| 74 |
+
|
| 75 |
+
> **Important:** The `queryId` (`voyagerSearchDashClusters.05111e1b90ee7fea15bebe9f9410ced9`) is hardcoded in LinkedIn's JavaScript bundle and can change with LinkedIn deployments. If the script returns 0 results, intercept a fresh request from your browser's Network tab — filter for `voyagerSearchDashClusters`, copy the new `queryId`.
|
| 76 |
+
|
| 77 |
+
> **Personal use only.** This method is not officially supported by LinkedIn. Do not use for scraping at scale.
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## 4. YouTube
|
| 82 |
+
|
| 83 |
+
Uses the official **YouTube Data API v3** via OAuth 2.0. Collects liked videos, watch later playlist, and any saved playlists.
|
| 84 |
+
|
| 85 |
+
**One-time setup:**
|
| 86 |
+
|
| 87 |
+
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
|
| 88 |
+
2. Create a new project (e.g. "OpenMark")
|
| 89 |
+
3. Enable **YouTube Data API v3** (APIs & Services → Enable APIs)
|
| 90 |
+
4. Create credentials: **OAuth 2.0 Client ID** → Desktop App
|
| 91 |
+
5. Download the JSON file — rename it to `client_secret.json` and place it in `raindrop-mission/`
|
| 92 |
+
6. Go to **OAuth consent screen** → Test users → add your Google account email
|
| 93 |
+
|
| 94 |
+
**Run the fetch script:**
|
| 95 |
+
```bash
|
| 96 |
+
python raindrop-mission/youtube_fetch.py
|
| 97 |
+
```
|
| 98 |
+
A browser window opens for Google sign-in. After auth, a token is cached locally — you won't need to auth again.
|
| 99 |
+
|
| 100 |
+
**Output:** `raindrop-mission/youtube_MASTER.json` with:
|
| 101 |
+
- `liked_videos` — videos you've liked (up to ~3,200 via API limit)
|
| 102 |
+
- `watch_later` — requires Google Takeout (see below)
|
| 103 |
+
- `playlists` — saved playlists
|
| 104 |
+
|
| 105 |
+
**Watch Later via Google Takeout:**
|
| 106 |
+
YouTube's API does not expose Watch Later directly. Export it via [takeout.google.com](https://takeout.google.com):
|
| 107 |
+
- Select only **YouTube** → **Playlists** → Download
|
| 108 |
+
- Extract the CSV file named `Watch later-videos.csv`
|
| 109 |
+
- Place it in `raindrop-mission/`
|
| 110 |
+
- The `youtube_organize.py` script fetches video titles via API and includes them in `youtube_MASTER.json`
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## 5. daily.dev Bookmarks
|
| 115 |
+
|
| 116 |
+
daily.dev does not provide a public API. Use the included browser console script to extract bookmarks directly from the page.
|
| 117 |
+
|
| 118 |
+
**Steps:**
|
| 119 |
+
1. Go to [app.daily.dev](https://app.daily.dev) → **Bookmarks**
|
| 120 |
+
2. Scroll all the way down to load all bookmarks
|
| 121 |
+
3. Open DevTools → **Console** tab
|
| 122 |
+
4. Paste and run `raindrop-mission/dailydev_console_script.js`
|
| 123 |
+
5. The script copies a JSON array to your clipboard
|
| 124 |
+
6. Paste into a file named `dailydev_bookmarks.json` in `raindrop-mission/`
|
| 125 |
+
|
| 126 |
+
> The script filters for `/posts/` URLs only — it ignores profile links, squad links, and other noise.
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## Summary
|
| 131 |
+
|
| 132 |
+
| Source | Method | Output file |
|
| 133 |
+
|--------|--------|-------------|
|
| 134 |
+
| Raindrop | REST API (auto) | pulled live |
|
| 135 |
+
| Edge/Chrome bookmarks | HTML export | `favorites.html` / `bookmarks.html` |
|
| 136 |
+
| LinkedIn saved posts | Voyager GraphQL + session cookie | `linkedin_saved.json` |
|
| 137 |
+
| YouTube liked/playlists | YouTube Data API v3 + OAuth | `youtube_MASTER.json` |
|
| 138 |
+
| YouTube watch later | Google Takeout CSV | included in `youtube_MASTER.json` |
|
| 139 |
+
| daily.dev bookmarks | Browser console script | `dailydev_bookmarks.json` |
|
| 140 |
+
|
| 141 |
+
Once all files are in place, run:
|
| 142 |
+
```bash
|
| 143 |
+
python scripts/ingest.py
|
| 144 |
+
```
|
docs/huggingface.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace Publishing Guide
|
| 2 |
+
|
| 3 |
+
OpenMark publishes two things on HuggingFace:
|
| 4 |
+
1. **Space** — live Gradio demo at `OthmanAdi/OpenMark`
|
| 5 |
+
2. **Dataset** — the categorized bookmarks at `OthmanAdi/openmark-bookmarks`
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Prerequisites
|
| 10 |
+
|
| 11 |
+
You need a HuggingFace account and a **write-access token**:
|
| 12 |
+
1. Go to [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
| 13 |
+
2. Create a new token → **Write** access
|
| 14 |
+
3. Add to your `.env`:
|
| 15 |
+
```
|
| 16 |
+
HF_TOKEN=hf_your_token_here
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 1. HuggingFace Space (Gradio Demo)
|
| 22 |
+
|
| 23 |
+
The Space hosts the Gradio UI publicly (or privately until you're ready).
|
| 24 |
+
|
| 25 |
+
**Create the Space:**
|
| 26 |
+
```bash
|
| 27 |
+
pip install huggingface_hub
|
| 28 |
+
python -c "
|
| 29 |
+
from huggingface_hub import HfApi
|
| 30 |
+
import os
|
| 31 |
+
from dotenv import load_dotenv
|
| 32 |
+
load_dotenv()
|
| 33 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 34 |
+
api.create_repo(
|
| 35 |
+
repo_id='OthmanAdi/OpenMark',
|
| 36 |
+
repo_type='space',
|
| 37 |
+
space_sdk='gradio',
|
| 38 |
+
private=True,
|
| 39 |
+
)
|
| 40 |
+
print('Space created: https://huggingface.co/spaces/OthmanAdi/OpenMark')
|
| 41 |
+
"
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
**Push the code to the Space:**
|
| 45 |
+
```bash
|
| 46 |
+
python -c "
|
| 47 |
+
from huggingface_hub import HfApi
|
| 48 |
+
import os
|
| 49 |
+
from dotenv import load_dotenv
|
| 50 |
+
load_dotenv()
|
| 51 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 52 |
+
api.upload_folder(
|
| 53 |
+
folder_path='.',
|
| 54 |
+
repo_id='OthmanAdi/OpenMark',
|
| 55 |
+
repo_type='space',
|
| 56 |
+
ignore_patterns=['.env', 'data/chroma_db/*', '__pycache__/*', '.git/*'],
|
| 57 |
+
)
|
| 58 |
+
"
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
> **Note:** The Space version requires your ChromaDB and Neo4j data to be pre-loaded. For a public demo, you would host a sample dataset. For private use, the full local setup is better.
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 2. HuggingFace Dataset
|
| 66 |
+
|
| 67 |
+
The dataset card publishes your 8,000+ categorized bookmarks as a reusable dataset for RAG experiments.
|
| 68 |
+
|
| 69 |
+
**What's in the dataset:**
|
| 70 |
+
- URL, title, category (19 categories), tags, score (1-10), source
|
| 71 |
+
- Sources: Raindrop, Edge browser, LinkedIn, YouTube, daily.dev
|
| 72 |
+
- ~8,007 unique items after deduplication
|
| 73 |
+
|
| 74 |
+
**Create the dataset repo:**
|
| 75 |
+
```bash
|
| 76 |
+
python -c "
|
| 77 |
+
from huggingface_hub import HfApi
|
| 78 |
+
import os, json
|
| 79 |
+
from dotenv import load_dotenv
|
| 80 |
+
load_dotenv()
|
| 81 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 82 |
+
|
| 83 |
+
# Create private dataset repo
|
| 84 |
+
api.create_repo(
|
| 85 |
+
repo_id='OthmanAdi/openmark-bookmarks',
|
| 86 |
+
repo_type='dataset',
|
| 87 |
+
private=True,
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
# Upload dataset card
|
| 91 |
+
api.upload_file(
|
| 92 |
+
path_or_fileobj='docs/dataset_card.md',
|
| 93 |
+
path_in_repo='README.md',
|
| 94 |
+
repo_id='OthmanAdi/openmark-bookmarks',
|
| 95 |
+
repo_type='dataset',
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# Upload the data (RAINDROP_MISSION_DIR/CATEGORIZED.json)
|
| 99 |
+
api.upload_file(
|
| 100 |
+
path_or_fileobj=os.path.join(os.getenv('RAINDROP_MISSION_DIR'), 'CATEGORIZED.json'),
|
| 101 |
+
path_in_repo='data/bookmarks.json',
|
| 102 |
+
repo_id='OthmanAdi/openmark-bookmarks',
|
| 103 |
+
repo_type='dataset',
|
| 104 |
+
)
|
| 105 |
+
print('Dataset created: https://huggingface.co/datasets/OthmanAdi/openmark-bookmarks')
|
| 106 |
+
"
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
## Making Public
|
| 112 |
+
|
| 113 |
+
When you're ready to go public, flip visibility:
|
| 114 |
+
```bash
|
| 115 |
+
python -c "
|
| 116 |
+
from huggingface_hub import HfApi
|
| 117 |
+
import os
|
| 118 |
+
from dotenv import load_dotenv
|
| 119 |
+
load_dotenv()
|
| 120 |
+
api = HfApi(token=os.getenv('HF_TOKEN'))
|
| 121 |
+
|
| 122 |
+
# Make Space public
|
| 123 |
+
api.update_repo_visibility('OthmanAdi/OpenMark', private=False, repo_type='space')
|
| 124 |
+
|
| 125 |
+
# Make Dataset public
|
| 126 |
+
api.update_repo_visibility('OthmanAdi/openmark-bookmarks', private=False, repo_type='dataset')
|
| 127 |
+
print('Both are now public.')
|
| 128 |
+
"
|
| 129 |
+
```
|
docs/ingest.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ingest Pipeline
|
| 2 |
+
|
| 3 |
+
The ingest pipeline is the heart of OpenMark. It merges all your data, embeds everything, and writes to both ChromaDB and Neo4j.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Command
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
python scripts/ingest.py [options]
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
| Flag | Default | Description |
|
| 14 |
+
|------|---------|-------------|
|
| 15 |
+
| `--provider local` | from `.env` | Use local pplx-embed models |
|
| 16 |
+
| `--provider azure` | from `.env` | Use Azure AI Foundry embeddings |
|
| 17 |
+
| `--fresh-raindrop` | off | Also pull live from Raindrop API during merge |
|
| 18 |
+
| `--skip-similar` | off | Skip SIMILAR_TO edge computation (saves ~30 min) |
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Pipeline Steps
|
| 23 |
+
|
| 24 |
+
### Step 1 — Merge
|
| 25 |
+
|
| 26 |
+
Loads and deduplicates all sources:
|
| 27 |
+
- `CATEGORIZED.json` — pre-categorized bookmarks from Edge + Raindrop + daily.dev
|
| 28 |
+
- `linkedin_saved.json` — LinkedIn saved posts
|
| 29 |
+
- `youtube_MASTER.json` — liked videos, watch later, playlists (not subscriptions)
|
| 30 |
+
|
| 31 |
+
Deduplication is URL-based (case-insensitive, trailing slash stripped). If the same URL appears in multiple sources, the first occurrence wins.
|
| 32 |
+
|
| 33 |
+
Each item gets a `doc_text` field built for embedding:
|
| 34 |
+
```
|
| 35 |
+
{title} | {category} | {tag1 tag2 tag3} | {content/excerpt/channel}
|
| 36 |
+
```
|
| 37 |
+
This rich text is what gets embedded — not just the title.
|
| 38 |
+
|
| 39 |
+
**Output:** ~8,000 normalized items in memory.
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
### Step 2 — Embedding
|
| 44 |
+
|
| 45 |
+
Loads the embedding provider specified by `EMBEDDING_PROVIDER` in `.env` (or `--provider` flag).
|
| 46 |
+
|
| 47 |
+
**Local (pplx-embed):**
|
| 48 |
+
- Query model: `perplexity-ai/pplx-embed-v1-0.6b` — used for user search queries
|
| 49 |
+
- Document model: `perplexity-ai/pplx-embed-context-v1-0.6b` — used for bookmark documents
|
| 50 |
+
- Output dimension: 1024
|
| 51 |
+
- Downloaded once to HuggingFace cache (~1.2 GB total), free on every subsequent run
|
| 52 |
+
- **Known compatibility issue:** pplx-embed requires `sentence-transformers==3.3.1` and two runtime patches (applied automatically in `local.py`). See [troubleshooting.md](troubleshooting.md) for details.
|
| 53 |
+
|
| 54 |
+
**Azure:**
|
| 55 |
+
- Uses `text-embedding-ada-002` (or configured `AZURE_DEPLOYMENT_EMBED`)
|
| 56 |
+
- Output dimension: 1536
|
| 57 |
+
- Cost: ~€0.30 for 8,000 items (as of 2026)
|
| 58 |
+
- Batched in groups of 100 with progress logging
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
### Step 3 — ChromaDB Ingest
|
| 63 |
+
|
| 64 |
+
Embeds all documents in batches of 100 and stores in ChromaDB.
|
| 65 |
+
|
| 66 |
+
- Skips items already in ChromaDB (resumable — safe to re-run)
|
| 67 |
+
- Stores: URL (as ID), embedding vector, title, category, source, score, tags
|
| 68 |
+
- Uses cosine similarity for the vector index (`hnsw:space` set to `cosine`)
|
| 69 |
+
- Database written to disk at `CHROMA_PATH` (default: `OpenMark/data/chroma_db/`)
|
| 70 |
+
|
| 71 |
+
**Timing:**
|
| 72 |
+
| Provider | 8K items | Notes |
|
| 73 |
+
|----------|----------|-------|
|
| 74 |
+
| Local pplx-embed (CPU) | ~20 min | No GPU detected = CPU inference |
|
| 75 |
+
| Local pplx-embed (GPU) | ~3 min | NVIDIA GPU with CUDA |
|
| 76 |
+
| Azure AI Foundry | ~5 min | Network bound |
|
| 77 |
+
|
| 78 |
+
---
|
| 79 |
+
|
| 80 |
+
### Step 4 — Neo4j Ingest
|
| 81 |
+
|
| 82 |
+
Creates nodes and relationships in batches of 200.
|
| 83 |
+
|
| 84 |
+
**Nodes created:**
|
| 85 |
+
- `Bookmark` — url, title, score
|
| 86 |
+
- `Category` — name
|
| 87 |
+
- `Tag` — name
|
| 88 |
+
- `Source` — name (raindrop, linkedin, youtube_liked, edge, dailydev, etc.)
|
| 89 |
+
- `Domain` — extracted from URL (e.g. `github.com`, `medium.com`)
|
| 90 |
+
|
| 91 |
+
**Relationships created:**
|
| 92 |
+
- `(Bookmark)-[:IN_CATEGORY]->(Category)`
|
| 93 |
+
- `(Bookmark)-[:TAGGED]->(Tag)`
|
| 94 |
+
- `(Bookmark)-[:FROM_SOURCE]->(Source)`
|
| 95 |
+
- `(Bookmark)-[:FROM_DOMAIN]->(Domain)`
|
| 96 |
+
- `(Tag)-[:CO_OCCURS_WITH {count}]-(Tag)` — built after all nodes are written
|
| 97 |
+
|
| 98 |
+
**Timing:** ~3-5 minutes for 8K items.
|
| 99 |
+
|
| 100 |
+
**Idempotent:** Uses `MERGE` everywhere — safe to re-run, won't create duplicates.
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
### Step 5 — SIMILAR_TO Edges
|
| 105 |
+
|
| 106 |
+
This is the most powerful and most time-consuming step.
|
| 107 |
+
|
| 108 |
+
For each of the 8K bookmarks, OpenMark queries ChromaDB for its top-5 nearest semantic neighbors and writes those as `SIMILAR_TO` edges in Neo4j with a similarity score.
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
(Bookmark {url: "...langchain-docs..."})-[:SIMILAR_TO {score: 0.94}]->(Bookmark {url: "...langgraph-tutorial..."})
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
These edges encode **semantic connections you never manually created**. The knowledge graph becomes a web of meaning, not just a web of tags.
|
| 115 |
+
|
| 116 |
+
**Timing:** ~25-40 minutes on CPU for 8K items. This is the longest step.
|
| 117 |
+
|
| 118 |
+
**Skip it if you're in a hurry:**
|
| 119 |
+
```bash
|
| 120 |
+
python scripts/ingest.py --skip-similar
|
| 121 |
+
```
|
| 122 |
+
Everything else works without SIMILAR_TO edges. You only lose the `find_similar_bookmarks` tool in the agent and the graph traversal from those edges.
|
| 123 |
+
|
| 124 |
+
**Only edges with similarity > 0.5 are written.** Low-quality connections are discarded.
|
| 125 |
+
|
| 126 |
+
---
|
| 127 |
+
|
| 128 |
+
## Re-running the Pipeline
|
| 129 |
+
|
| 130 |
+
The pipeline is safe to re-run at any time:
|
| 131 |
+
|
| 132 |
+
- **ChromaDB:** skips already-ingested URLs automatically
|
| 133 |
+
- **Neo4j:** uses `MERGE` — no duplicates created
|
| 134 |
+
- **SIMILAR_TO:** edges are overwritten (not duplicated) via `MERGE`
|
| 135 |
+
|
| 136 |
+
To add new bookmarks after the first run:
|
| 137 |
+
1. Update your source files (fresh Raindrop pull, new LinkedIn export, etc.)
|
| 138 |
+
2. Run `python scripts/ingest.py` — only new items get embedded and stored
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
## Checking What's Ingested
|
| 143 |
+
|
| 144 |
+
```bash
|
| 145 |
+
# Quick stats
|
| 146 |
+
python scripts/search.py --stats
|
| 147 |
+
|
| 148 |
+
# Search to verify
|
| 149 |
+
python scripts/search.py "RAG tools"
|
| 150 |
+
|
| 151 |
+
# Neo4j — open browser
|
| 152 |
+
# http://localhost:7474
|
| 153 |
+
# Run: MATCH (b:Bookmark) RETURN count(b)
|
| 154 |
+
```
|
docs/troubleshooting.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Troubleshooting
|
| 2 |
+
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
## pplx-embed fails to load
|
| 6 |
+
|
| 7 |
+
**Error:** `ImportError: cannot import name 'Module' from 'sentence_transformers.models'`
|
| 8 |
+
|
| 9 |
+
**Cause:** pplx-embed's custom `st_quantize.py` imports `Module` from `sentence_transformers.models`, which was removed in version 4.x.
|
| 10 |
+
|
| 11 |
+
**Fix:** Pin to the correct version:
|
| 12 |
+
```bash
|
| 13 |
+
pip install "sentence-transformers==3.3.1"
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## pplx-embed crashes with 404 on chat templates
|
| 19 |
+
|
| 20 |
+
**Error:** `RemoteEntryNotFoundError: 404 ... additional_chat_templates does not exist`
|
| 21 |
+
|
| 22 |
+
**Cause:** `transformers 4.57+` added `list_repo_templates()` which looks for an `additional_chat_templates` folder in every model repo. pplx-embed predates this feature and doesn't have the folder.
|
| 23 |
+
|
| 24 |
+
**Fix:** Already handled automatically in `openmark/embeddings/local.py` via a monkey-patch applied before model loading. If you see this error outside of OpenMark, apply:
|
| 25 |
+
```python
|
| 26 |
+
from transformers.utils import hub as _hub
|
| 27 |
+
import transformers.tokenization_utils_base as _tub
|
| 28 |
+
_orig = _hub.list_repo_templates
|
| 29 |
+
def _safe(*a, **kw):
|
| 30 |
+
try: return _orig(*a, **kw)
|
| 31 |
+
except Exception: return []
|
| 32 |
+
_hub.list_repo_templates = _safe
|
| 33 |
+
_tub.list_repo_templates = _safe
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Neo4j connection error: "Unable to retrieve routing information"
|
| 39 |
+
|
| 40 |
+
**Cause:** Using `neo4j://` URI (routing protocol) with a single local Neo4j instance.
|
| 41 |
+
|
| 42 |
+
**Fix:** Use `bolt://` instead:
|
| 43 |
+
```env
|
| 44 |
+
NEO4J_URI=bolt://127.0.0.1:7687
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## Neo4j error: "Database does not exist"
|
| 50 |
+
|
| 51 |
+
**Cause:** The database name in `.env` doesn't match what's in Neo4j Desktop.
|
| 52 |
+
|
| 53 |
+
**Fix:** Open `http://localhost:7474`, check what databases exist:
|
| 54 |
+
```cypher
|
| 55 |
+
SHOW DATABASES
|
| 56 |
+
```
|
| 57 |
+
Update `NEO4J_DATABASE` in `.env` to match.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## LinkedIn script returns 0 results or 404
|
| 62 |
+
|
| 63 |
+
**Cause:** LinkedIn's internal `queryId` changes when they deploy new JavaScript bundles.
|
| 64 |
+
|
| 65 |
+
**Fix:**
|
| 66 |
+
1. Open LinkedIn in your browser → go to Saved Posts
|
| 67 |
+
2. Open DevTools → Network tab → filter for `voyagerSearchDashClusters`
|
| 68 |
+
3. Click one of the requests → copy the full URL
|
| 69 |
+
4. Extract the new `queryId` value
|
| 70 |
+
5. Update `linkedin_fetch.py` with the new `queryId`
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## YouTube OAuth "Access Blocked: App not verified"
|
| 75 |
+
|
| 76 |
+
**Cause:** Your Google Cloud app is in testing mode and your account isn't listed as a test user.
|
| 77 |
+
|
| 78 |
+
**Fix:**
|
| 79 |
+
1. Google Cloud Console → OAuth consent screen
|
| 80 |
+
2. Scroll to "Test users" → Add users → add your Google account email
|
| 81 |
+
3. Re-run `youtube_fetch.py`
|
| 82 |
+
|
| 83 |
+
---
|
| 84 |
+
|
| 85 |
+
## ChromaDB ingest is slow
|
| 86 |
+
|
| 87 |
+
On CPU with local pplx-embed, embedding 8K items takes ~20 minutes. This is normal.
|
| 88 |
+
|
| 89 |
+
**Options:**
|
| 90 |
+
- Use Azure instead: `python scripts/ingest.py --provider azure` (~5 min, ~€0.30)
|
| 91 |
+
- The ingest is resumable — if interrupted, re-run and it skips already-ingested items
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## SIMILAR_TO step takes too long
|
| 96 |
+
|
| 97 |
+
Building SIMILAR_TO edges queries ChromaDB for every bookmark's top-5 neighbors, then writes to Neo4j. For 8K items on CPU this takes ~25-40 minutes.
|
| 98 |
+
|
| 99 |
+
**Skip it:**
|
| 100 |
+
```bash
|
| 101 |
+
python scripts/ingest.py --skip-similar
|
| 102 |
+
```
|
| 103 |
+
The app works without SIMILAR_TO edges. You only lose the `find_similar_bookmarks` agent tool and cross-topic graph traversal.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## Windows UnicodeEncodeError in terminal
|
| 108 |
+
|
| 109 |
+
**Error:** `UnicodeEncodeError: 'charmap' codec can't encode character`
|
| 110 |
+
|
| 111 |
+
**Cause:** Windows terminal (cmd/PowerShell) defaults to cp1252 encoding which can't handle emoji or some Unicode characters in bookmark titles.
|
| 112 |
+
|
| 113 |
+
**Fix:** Run from Windows Terminal (supports UTF-8) or add to the top of the script:
|
| 114 |
+
```python
|
| 115 |
+
import sys
|
| 116 |
+
sys.stdout.reconfigure(encoding='utf-8')
|
| 117 |
+
```
|
| 118 |
+
All OpenMark scripts already include this.
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
## gradio not found on Python 3.13
|
| 123 |
+
|
| 124 |
+
gradio may have been installed under a different Python version (e.g. Python 3.14). If you are running Python 3.13, install it for that interpreter explicitly:
|
| 125 |
+
```bash
|
| 126 |
+
C:\Python313\python -m pip install gradio
|
| 127 |
+
```
|
openmark/__init__.py
ADDED
|
File without changes
|
openmark/agent/__init__.py
ADDED
|
File without changes
|
openmark/agent/graph.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LangGraph ReAct agent for OpenMark.
|
| 3 |
+
Uses Azure gpt-4o-mini as the LLM.
|
| 4 |
+
Has access to all OpenMark tools (ChromaDB + Neo4j).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from langchain_openai import AzureChatOpenAI
|
| 8 |
+
from langgraph.prebuilt import create_react_agent
|
| 9 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 10 |
+
from openmark import config
|
| 11 |
+
from openmark.agent.tools import ALL_TOOLS
|
| 12 |
+
|
| 13 |
+
SYSTEM_PROMPT = """You are OpenMark — Ahmad's personal AI knowledge assistant.
|
| 14 |
+
|
| 15 |
+
You have access to his entire curated knowledge base of 7,000+ saved bookmarks,
|
| 16 |
+
LinkedIn posts, and YouTube videos — all categorized, tagged, and connected in a
|
| 17 |
+
knowledge graph.
|
| 18 |
+
|
| 19 |
+
Your job:
|
| 20 |
+
- Help Ahmad find exactly what he saved and can't remember
|
| 21 |
+
- Discover connections between topics he didn't know existed
|
| 22 |
+
- Answer questions by searching his real saved content (not your training data)
|
| 23 |
+
- Be direct and useful — no filler
|
| 24 |
+
|
| 25 |
+
When answering:
|
| 26 |
+
- Always use tools to search first before responding
|
| 27 |
+
- Show the actual URLs and titles from results
|
| 28 |
+
- Group results by relevance
|
| 29 |
+
- If one search doesn't find enough, try a different angle (by tag, by category, by similarity)
|
| 30 |
+
|
| 31 |
+
Available search modes:
|
| 32 |
+
- search_semantic: natural language search (most useful for general queries)
|
| 33 |
+
- search_by_category: filter by topic category
|
| 34 |
+
- find_by_tag: exact tag lookup in the knowledge graph
|
| 35 |
+
- find_similar_bookmarks: find related content to a specific URL
|
| 36 |
+
- explore_tag_cluster: discover what else connects to a topic
|
| 37 |
+
- get_stats: see what's in the knowledge base
|
| 38 |
+
- run_cypher: advanced graph queries (for power users)
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def build_agent():
    """Construct and return the OpenMark ReAct agent.

    Wires an Azure-hosted chat model (deterministic: temperature 0, with
    streaming enabled) to the full OpenMark tool set, and attaches an
    in-memory checkpointer so conversation state persists per thread_id
    for the lifetime of the process.
    """
    model = AzureChatOpenAI(
        azure_endpoint=config.AZURE_ENDPOINT,
        api_key=config.AZURE_API_KEY,
        azure_deployment=config.AZURE_DEPLOYMENT_LLM,
        api_version=config.AZURE_API_VERSION,
        temperature=0,
        streaming=True,
    )
    # MemorySaver keeps per-thread conversation history in RAM only;
    # state is lost when the process exits.
    return create_react_agent(
        model=model,
        tools=ALL_TOOLS,
        prompt=SYSTEM_PROMPT,
        checkpointer=MemorySaver(),
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def ask(agent, question: str, thread_id: str = "default") -> str:
    """Send *question* to *agent* and return its final text reply.

    The thread_id scopes conversation memory: calls sharing a thread_id
    continue the same conversation via the agent's checkpointer.
    """
    run_config = {"configurable": {"thread_id": thread_id}}
    payload = {"messages": [{"role": "user", "content": question}]}
    response = agent.invoke(payload, config=run_config)
    # The last message in the returned state is the agent's final answer.
    final_message = response["messages"][-1]
    return final_message.content
|
openmark/agent/tools.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LangGraph tools for the OpenMark agent.
|
| 3 |
+
Each tool hits either ChromaDB (semantic) or Neo4j (graph) or both.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from langchain_core.tools import tool
|
| 7 |
+
from openmark.embeddings.factory import get_embedder
|
| 8 |
+
from openmark.stores import chroma as chroma_store
|
| 9 |
+
from openmark.stores import neo4j_store
|
| 10 |
+
|
| 11 |
+
# Embedder is loaded once and reused
# (module-level lazy singleton — loading the local models is expensive)
_embedder = None

def _get_embedder():
    """Return the shared embedding provider, creating it on first use."""
    global _embedder
    if _embedder is None:
        _embedder = get_embedder()
    return _embedder
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@tool
def search_semantic(query: str, n: int = 10) -> str:
    """
    Search bookmarks by semantic meaning using vector similarity.
    Use this for natural language queries like 'RAG tools', 'LangGraph tutorials', etc.
    Returns top N most relevant bookmarks.
    """
    # NOTE: the docstring above is the tool description shown to the LLM —
    # kept verbatim so agent behavior is unchanged.
    hits = chroma_store.search(query, _get_embedder(), n=n)
    if not hits:
        return "No results found."
    formatted = []
    for hit in hits:
        formatted.append(
            f"{hit['rank']}. [{hit['category']}] {hit['title']}\n {hit['url']} (similarity: {hit['similarity']}, score: {hit['score']})"
        )
    return "\n".join(formatted)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@tool
def search_by_category(category: str, query: str = "", n: int = 15) -> str:
    """
    Find bookmarks in a specific category, optionally filtered by semantic query.
    Categories: RAG & Vector Search, Agent Development, LangChain / LangGraph,
    MCP & Tool Use, Context Engineering, AI Tools & Platforms, GitHub Repos & OSS,
    Learning & Courses, YouTube & Video, Web Development, Cloud & Infrastructure,
    Data Science & ML, Knowledge Graphs & Neo4j, Career & Jobs, LLM Fine-tuning,
    Finance & Crypto, Design & UI/UX, News & Articles, Entertainment & Other
    """
    # With no explicit query, the category name itself serves as the semantic
    # query so results within the category are still ranked.
    search_text = query if query else category
    matches = chroma_store.search(search_text, _get_embedder(), n=n, category=category)
    if not matches:
        return f"No bookmarks found in category '{category}'."
    listing = "\n".join(f"{m['rank']}. {m['title']}\n {m['url']}" for m in matches)
    return f"Category '{category}' — top results:\n" + listing
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@tool
def find_by_tag(tag: str) -> str:
    """
    Find all bookmarks tagged with a specific tag using the knowledge graph.
    Returns bookmarks ordered by quality score.
    """
    matches = neo4j_store.find_by_tag(tag, limit=20)
    if not matches:
        return f"No bookmarks found with tag '{tag}'."
    body = "\n".join(
        f"- {m['title']}\n {m['url']} (score: {m['score']})" for m in matches
    )
    return f"Bookmarks tagged '{tag}':\n" + body
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@tool
def find_similar_bookmarks(url: str) -> str:
    """
    Find bookmarks semantically similar to a given URL.
    Uses SIMILAR_TO edges in the knowledge graph (built from embedding neighbors).
    """
    neighbors = neo4j_store.find_similar(url, limit=10)
    if not neighbors:
        return f"No similar bookmarks found for {url}."
    body = "\n".join(
        f"- {nb['title']}\n {nb['url']} (similarity: {nb['similarity']:.3f})"
        for nb in neighbors
    )
    return "Similar bookmarks:\n" + body
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@tool
def explore_tag_cluster(tag: str) -> str:
    """
    Explore the knowledge graph around a tag — find related tags and their bookmarks.
    Traverses CO_OCCURS_WITH edges (2 hops) to discover connected topics.
    Great for discovering what else you know about a topic.
    """
    cluster = neo4j_store.find_tag_cluster(tag, hops=2, limit=25)
    if not cluster:
        return f"No cluster found for tag '{tag}'."
    body = "\n".join(
        f"- [{c['via_tag']}] {c['title']}\n {c['url']}" for c in cluster
    )
    return f"Knowledge cluster around '{tag}':\n" + body
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@tool
def get_stats() -> str:
    """
    Get statistics about the OpenMark knowledge base.
    Shows total bookmarks, tags, categories in both ChromaDB and Neo4j.
    """
    vector_stats = chroma_store.get_stats()
    graph_stats = neo4j_store.get_stats()
    summary = [
        "OpenMark Knowledge Base Stats:",
        f" ChromaDB vectors: {vector_stats.get('total', 0)}",
        f" Neo4j bookmarks: {graph_stats.get('bookmarks', 0)}",
        f" Neo4j tags: {graph_stats.get('tags', 0)}",
        f" Neo4j categories: {graph_stats.get('categories', 0)}",
    ]
    return "\n".join(summary)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@tool
def run_cypher(cypher: str) -> str:
    """
    Run a raw Cypher query against the Neo4j knowledge graph.
    Use for advanced graph traversals. Example:
    MATCH (b:Bookmark)-[:TAGGED]->(t:Tag) WHERE t.name='rag' RETURN b.title, b.url LIMIT 10
    """
    # NOTE(review): this executes agent-generated Cypher verbatim — nothing
    # here blocks writes/DELETEs, so the Neo4j credentials used should be
    # read-only. Confirm before exposing this tool publicly.
    try:
        rows = neo4j_store.query(cypher)
        if not rows:
            return "Query returned no results."
        # Cap output at 20 rows to keep the tool response LLM-sized.
        lines = [str(r) for r in rows[:20]]
        return "\n".join(lines)
    except Exception as e:
        # Surface the error as text so the agent can correct its query.
        return f"Cypher error: {e}"
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# Registry of tools handed to the agent (graph.py passes this to
# create_react_agent via tools=ALL_TOOLS).
ALL_TOOLS = [
    search_semantic,
    search_by_category,
    find_by_tag,
    find_similar_bookmarks,
    explore_tag_cluster,
    get_stats,
    run_cypher,
]
|
openmark/config.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Load .env from the repository root (one directory above this package).
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"))

# Embedding
EMBEDDING_PROVIDER = os.getenv("EMBEDDING_PROVIDER", "local")  # "local" or "azure"
PPLX_QUERY_MODEL = os.getenv("PPLX_QUERY_MODEL", "perplexity-ai/pplx-embed-v1-0.6b")
PPLX_DOC_MODEL = os.getenv("PPLX_DOC_MODEL", "perplexity-ai/pplx-embed-context-v1-0.6b")

# Azure
AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_API_KEY = os.getenv("AZURE_API_KEY")
AZURE_DEPLOYMENT_LLM = os.getenv("AZURE_DEPLOYMENT_LLM", "gpt-4o-mini")
AZURE_DEPLOYMENT_EMBED = os.getenv("AZURE_DEPLOYMENT_EMBED", "text-embedding-ada-002")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION", "2024-05-01-preview")

# Neo4j
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

# Raindrop
RAINDROP_TOKEN = os.getenv("RAINDROP_TOKEN")

# Paths
# NOTE(review): these defaults are machine-specific Windows paths; set the
# RAINDROP_MISSION_DIR / CHROMA_PATH env vars on any other host (e.g. the
# HF Space) or these fallbacks will not exist.
RAINDROP_MISSION_DIR = os.getenv("RAINDROP_MISSION_DIR", r"C:\Users\oasrvadmin\Documents\raindrop-mission")
CHROMA_PATH = os.getenv("CHROMA_PATH", r"C:\Users\oasrvadmin\Documents\OpenMark\data\chroma_db")

# Canonical categories
# Single source of truth: normalize.fix_category rejects anything not listed
# here, and the search_by_category tool docstring mirrors this list.
CATEGORIES = [
    "RAG & Vector Search",
    "LLM Fine-tuning",
    "Agent Development",
    "LangChain / LangGraph",
    "MCP & Tool Use",
    "Context Engineering",
    "AI Tools & Platforms",
    "GitHub Repos & OSS",
    "Learning & Courses",
    "YouTube & Video",
    "Web Development",
    "Cloud & Infrastructure",
    "Data Science & ML",
    "Knowledge Graphs & Neo4j",
    "Career & Jobs",
    "Finance & Crypto",
    "Design & UI/UX",
    "News & Articles",
    "Entertainment & Other",
]

# Maps legacy/variant category names onto canonical CATEGORIES above; any
# category still unknown after this remap falls back to "News & Articles"
# (see pipeline/normalize.fix_category).
CATEGORY_MAP = {
    "UI/UX Design": "Design & UI/UX",
    "UI/UX": "Design & UI/UX",
    "Real_Estate": "Finance & Crypto",
    "Real Estate": "Finance & Crypto",
    "Social_Media": "News & Articles",
    "Social/Community": "News & Articles",
    "Social": "News & Articles",
    "E-commerce & Marketplaces": "News & Articles",
    "Research & Articles": "News & Articles",
    "Blogs & Articles": "News & Articles",
    "Research": "News & Articles",
    "AI Thought Leaders & Media": "News & Articles",
    "Debugging & Tools": "AI Tools & Platforms",
    "Health & Wellness": "Entertainment & Other",
    "Email & Productivity": "AI Tools & Platforms",
    "Legal": "Entertainment & Other",
    "NoCode - LowCode": "AI Tools & Platforms",
    "Security": "AI Tools & Platforms",
}
|
openmark/embeddings/__init__.py
ADDED
|
File without changes
|
openmark/embeddings/azure.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Azure AI Foundry embedding provider.
|
| 3 |
+
Uses text-embedding-ada-002 (or whatever deployment is configured).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from openai import AzureOpenAI
|
| 7 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 8 |
+
from openmark import config
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AzureEmbedder(EmbeddingProvider):
    """Embeddings via an Azure OpenAI deployment (ada-002 by default)."""

    # Documents are sent to the API in chunks of this size.
    _BATCH = 100

    def __init__(self):
        self._client = AzureOpenAI(
            azure_endpoint=config.AZURE_ENDPOINT,
            api_key=config.AZURE_API_KEY,
            api_version=config.AZURE_API_VERSION,
        )
        self._deployment = config.AZURE_DEPLOYMENT_EMBED
        print(f"Azure embedder ready — deployment: {self._deployment}")

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """One API round trip; returns one vector per input text."""
        reply = self._client.embeddings.create(
            input=texts,
            model=self._deployment,
        )
        return [row.embedding for row in reply.data]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents in batches, printing progress after each batch."""
        vectors: list[list[float]] = []
        for offset in range(0, len(texts), self._BATCH):
            vectors.extend(self._embed(texts[offset:offset + self._BATCH]))
            print(f" Azure embedded {min(offset + self._BATCH, len(texts))}/{len(texts)}")
        return vectors

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string."""
        return self._embed([text])[0]

    @property
    def dimension(self) -> int:
        return 1536  # ada-002 dimension
|
openmark/embeddings/base.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class EmbeddingProvider(ABC):
    """Abstract base — swap local pplx-embed or Azure without changing any other code."""

    @abstractmethod
    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a list of document strings; one vector per input."""
        ...

    @abstractmethod
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string."""
        ...

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Output embedding dimension."""
        ...
|
openmark/embeddings/factory.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from openmark import config
|
| 2 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def get_embedder() -> EmbeddingProvider:
    """Return the configured embedding provider based on EMBEDDING_PROVIDER env var.

    Imports are deliberately kept inside the branches so only the selected
    backend (and its heavy dependencies) is loaded.
    """
    choice = config.EMBEDDING_PROVIDER.lower()
    if choice == "azure":
        from openmark.embeddings.azure import AzureEmbedder
        return AzureEmbedder()
    if choice == "local":
        from openmark.embeddings.local import LocalEmbedder
        return LocalEmbedder()
    raise ValueError(f"Unknown EMBEDDING_PROVIDER: '{choice}'. Use 'local' or 'azure'.")
|
openmark/embeddings/local.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Local pplx-embed embedding provider.
|
| 3 |
+
Uses:
|
| 4 |
+
- perplexity-ai/pplx-embed-v1-0.6b for queries
|
| 5 |
+
- perplexity-ai/pplx-embed-context-v1-0.6b for documents
|
| 6 |
+
|
| 7 |
+
Two patches applied at import time:
|
| 8 |
+
1. transformers 4.57 crashes on models without additional_chat_templates folder → catch 404
|
| 9 |
+
2. pplx-embed's st_quantize.py imports sentence_transformers.models.Module (removed in 3.x) → add it back
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
# ── Patch 1: transformers 4.57 list_repo_templates 404 crash ─
# transformers 4.57 crashes when loading models that lack an
# additional_chat_templates folder; wrapping list_repo_templates to return
# [] on any error lets pplx-embed load normally.
from transformers.utils import hub as _hub
import transformers.tokenization_utils_base as _tub
_orig_lrt = _hub.list_repo_templates
def _safe_lrt(*a, **kw):
    try:
        return _orig_lrt(*a, **kw)
    except Exception:
        return []
# Patch both the defining module and the module that imported the name.
_hub.list_repo_templates = _safe_lrt
_tub.list_repo_templates = _safe_lrt

# ── Patch 2: sentence_transformers.models.Module missing ─────
# pplx-embed's st_quantize.py imports sentence_transformers.models.Module,
# removed in sentence-transformers 3.x; torch.nn.Module stands in for it.
import torch.nn as _nn
import sentence_transformers.models as _st_models
if not hasattr(_st_models, "Module"):
    _st_models.Module = _nn.Module
|
| 29 |
+
|
| 30 |
+
from sentence_transformers import SentenceTransformer
|
| 31 |
+
import numpy as np
|
| 32 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 33 |
+
from openmark import config
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class LocalEmbedder(EmbeddingProvider):
    """Two-model local embedder: one pplx-embed model for queries, one for documents."""

    def __init__(self):
        print("Loading pplx-embed query model...")
        self._query_model = SentenceTransformer(config.PPLX_QUERY_MODEL, trust_remote_code=True)
        print("Loading pplx-embed document model...")
        self._doc_model = SentenceTransformer(config.PPLX_DOC_MODEL, trust_remote_code=True)
        print("Local embedder ready.")

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode documents with the context model (batched, with progress bar)."""
        matrix = self._doc_model.encode(texts, batch_size=32, show_progress_bar=True)
        return matrix.astype(float).tolist()

    def embed_query(self, text: str) -> list[float]:
        """Encode a single query with the query model."""
        row = self._query_model.encode([text])[0]
        return row.astype(float).tolist()

    @property
    def dimension(self) -> int:
        # Hard-coded output size for pplx-embed-v1-0.6b — TODO confirm against model card.
        return 1024
|
openmark/pipeline/__init__.py
ADDED
|
File without changes
|
openmark/pipeline/merge.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Merge ALL data sources into one clean list:
|
| 3 |
+
- CATEGORIZED.json (Edge + old Raindrop + daily.dev — already categorized)
|
| 4 |
+
- linkedin_saved.json (1,260 LinkedIn posts)
|
| 5 |
+
- youtube_MASTER.json (liked + watch_later + playlists)
|
| 6 |
+
- Fresh Raindrop pull (new items not yet in CATEGORIZED)
|
| 7 |
+
|
| 8 |
+
Deduplicates by URL. Normalizes categories.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
from openmark import config
|
| 14 |
+
from openmark.pipeline.normalize import normalize_item, dedupe
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_categorized() -> list[dict]:
    """Load the pre-categorized bookmark dump (CATEGORIZED.json)."""
    categorized_path = os.path.join(config.RAINDROP_MISSION_DIR, "CATEGORIZED.json")
    with open(categorized_path, encoding="utf-8") as fh:
        loaded = json.load(fh)
    print(f"CATEGORIZED.json: {len(loaded)} items")
    return loaded
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_linkedin() -> list[dict]:
    """Convert saved LinkedIn posts into bookmark items; [] if the file is absent."""
    linkedin_path = os.path.join(config.RAINDROP_MISSION_DIR, "linkedin_saved.json")
    if not os.path.exists(linkedin_path):
        print("LinkedIn: file not found, skipping")
        return []
    with open(linkedin_path, encoding="utf-8") as fh:
        posts = json.load(fh)

    converted = []
    for post in posts:
        body = post.get("content", "")
        who = post.get("author", "")
        # Title prefers "author — snippet"; falls back to a bare content snippet.
        heading = f"{who} — {body[:80]}" if who else body[:100]
        converted.append({
            "url": post.get("url", ""),
            "title": heading,
            "content": body[:300],
            "author": who,
            "folder": "LinkedIn Saved",
            "source": "linkedin",
            "tags": [],
            "category": None,  # will be assigned by normalize
            "score": 6,
        })
    print(f"LinkedIn: {len(converted)} posts")
    return converted
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_youtube() -> list[dict]:
    """Flatten youtube_MASTER.json (liked + watch_later + playlists) into items."""
    yt_path = os.path.join(config.RAINDROP_MISSION_DIR, "youtube_MASTER.json")
    if not os.path.exists(yt_path):
        print("YouTube: file not found, skipping")
        return []
    with open(yt_path, encoding="utf-8") as fh:
        master = json.load(fh)

    videos = []
    for section in ("liked_videos", "watch_later", "playlists"):
        for video in master.get(section, []):
            videos.append({
                "url": video.get("url", ""),
                "title": video.get("title", ""),
                "channel": video.get("channel", ""),
                "folder": f"YouTube / {section}",
                "source": f"youtube_{section}",
                "tags": video.get("tags", [])[:5],
                "category": "YouTube & Video",
                "score": 7,
            })
    print(f"YouTube: {len(videos)} videos (liked + watch_later + playlists)")
    return videos
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def merge_all(include_fresh_raindrop: bool = False) -> list[dict]:
    """
    Merge all sources. Returns deduplicated, normalized list.
    Set include_fresh_raindrop=True to also pull live from Raindrop API.
    """
    combined: list[dict] = []
    combined += load_categorized()
    combined += load_linkedin()
    combined += load_youtube()

    if include_fresh_raindrop:
        # Imported lazily so offline runs never touch the Raindrop client.
        from openmark.pipeline.raindrop import pull_all
        combined += pull_all()

    # Normalize first so dedupe sees cleaned URLs.
    cleaned = [normalize_item(entry) for entry in combined]
    deduped = dedupe(cleaned)
    print(f"\nTotal after merge + dedup: {len(deduped)} items")
    return deduped
|
openmark/pipeline/normalize.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Normalize, clean, and deduplicate bookmark items.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from openmark import config
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def clean_title(title: str) -> str:
    """Decode HTML entities, trim whitespace, and cap the title at 300 chars.

    Returns "" for falsy input. The previous version tried to strip a
    hand-picked set of entities with re.sub, but the patterns had degraded
    into identity substitutions (e.g. replacing "&" with "&"), so entities
    were never actually decoded; html.unescape handles every named and
    numeric entity correctly.
    """
    if not title:
        return ""
    import html  # stdlib; local import keeps module import cost unchanged
    title = html.unescape(title)
    # Strip leading/trailing whitespace and truncate
    return title.strip()[:300]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def fix_category(cat: str | None) -> str:
    """Map a raw category onto the canonical list, defaulting to 'News & Articles'."""
    fallback = "News & Articles"
    if not cat:
        return fallback
    # Apply known remapping, then validate against the canonical list.
    remapped = config.CATEGORY_MAP.get(cat, cat)
    return remapped if remapped in config.CATEGORIES else fallback
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def build_document_text(item: dict) -> str:
    """
    Build a single rich text string for embedding.
    Combines title + category + tags + content/excerpt snippet + channel +
    author, joined with " | ", so the embedder sees every signal at once.
    """
    content = item.get("content") or ""
    excerpt = item.get("excerpt") or ""
    # Content wins over excerpt; either way only the first 200 chars are used.
    snippet = content[:200] if content else excerpt[:200]
    fields = [
        item.get("title") or "",
        item.get("category") or "",
        " ".join(item["tags"]) if item.get("tags") else "",
        snippet,
        item.get("channel") or "",
        item.get("author") or "",
    ]
    return " | ".join(field for field in fields if field)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def normalize_item(item: dict) -> dict:
    """Clean and normalize a single bookmark item.

    Builds a dict with url/title/category/tags/score/source/folder, carries
    over optional text fields capped at 300 chars, and attaches the combined
    "doc_text" used for embedding.
    """
    raw_score = item.get("score", 5)
    if not isinstance(raw_score, (int, float)):
        raw_score = 5  # non-numeric scores fall back to the neutral default

    normalized = {
        "url": item.get("url", "").strip(),
        "title": clean_title(item.get("title", "")),
        "category": fix_category(item.get("category")),
        # Tags lowercased/stripped; at most five kept.
        "tags": [t.lower().strip() for t in item.get("tags", []) if t][:5],
        "score": raw_score,
        "source": item.get("source", "unknown"),
        "folder": item.get("folder", ""),
    }

    # Preserve optional fields
    for optional in ("content", "excerpt", "author", "channel", "description"):
        value = item.get(optional)
        if value:
            normalized[optional] = value[:300]

    # Build the document text for embedding
    normalized["doc_text"] = build_document_text(normalized)
    return normalized
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def dedupe(items: list[dict]) -> list[dict]:
    """Remove duplicates by URL (case-insensitive, trailing slash stripped).

    Items with an empty URL are dropped. First occurrence wins; input order
    is otherwise preserved.
    """
    kept: list[dict] = []
    seen_keys: set[str] = set()
    for entry in items:
        key = entry.get("url", "").rstrip("/").lower()
        if key and key not in seen_keys:
            seen_keys.add(key)
            kept.append(entry)
    return kept
|
openmark/pipeline/raindrop.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fresh pull of ALL Raindrop bookmarks via API.
|
| 3 |
+
Fetches every collection and every raindrop inside it, paginated.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import time
|
| 7 |
+
import requests
|
| 8 |
+
from openmark import config
|
| 9 |
+
|
| 10 |
+
HEADERS = {"Authorization": f"Bearer {config.RAINDROP_TOKEN}"}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def fetch_all_collections() -> list[dict]:
    """Return all collections (top-level and nested).

    Hits /collections for root collections, then /collections/childrens for
    nested ones ("childrens" is the literal Raindrop endpoint name). A
    request timeout is set so a stalled API call cannot hang the pipeline —
    the previous version had no timeout and could block forever.
    """
    resp = requests.get(
        "https://api.raindrop.io/rest/v1/collections",
        headers=HEADERS,
        timeout=30,
    )
    resp.raise_for_status()
    collections = resp.json().get("items", [])

    # Also fetch children; failure here is tolerated (nested collections are optional).
    resp2 = requests.get(
        "https://api.raindrop.io/rest/v1/collections/childrens",
        headers=HEADERS,
        timeout=30,
    )
    if resp2.status_code == 200:
        collections += resp2.json().get("items", [])

    return collections
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def fetch_raindrops_for_collection(collection_id: int, title: str) -> list[dict]:
    """Fetch all raindrops in a collection, paginated 50 per page.

    Stops on a non-200 response, an empty page, or a short (final) page, and
    sleeps briefly between pages to stay under Raindrop's rate limits. A
    request timeout is set so one stalled page cannot hang the whole pull —
    the previous version had no timeout.

    Args:
        collection_id: Raindrop collection id (-1 means "Unsorted").
        title: human-readable collection name, stored as each item's folder.
    """
    items = []
    page = 0
    while True:
        resp = requests.get(
            f"https://api.raindrop.io/rest/v1/raindrops/{collection_id}",
            headers=HEADERS,
            params={"perpage": 50, "page": page},
            timeout=30,
        )
        if resp.status_code != 200:
            break
        batch = resp.json().get("items", [])
        if not batch:
            break
        for item in batch:
            items.append({
                "url": item.get("link", ""),
                "title": item.get("title", ""),
                "excerpt": item.get("excerpt", "")[:200],
                "tags": item.get("tags", [])[:5],
                "folder": title,
                "source": "raindrop",
            })
        if len(batch) < 50:
            # Short page means we've reached the end.
            break
        page += 1
        time.sleep(0.2)  # be polite to the API between pages
    return items
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def fetch_unsorted() -> list[dict]:
    """Fetch raindrops not in any collection (unsorted)."""
    # Raindrop uses the special collection id -1 for "Unsorted".
    return fetch_raindrops_for_collection(-1, "Unsorted")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def pull_all() -> list[dict]:
    """Pull every raindrop from every collection. Returns flat list."""
    print("Fetching Raindrop collections...")
    collections = fetch_all_collections()
    print(f" Found {len(collections)} collections")

    gathered: list[dict] = []
    for collection in collections:
        name = collection.get("title", "Unknown")
        drops = fetch_raindrops_for_collection(collection["_id"], name)
        print(f" [{name}] {len(drops)} items")
        gathered.extend(drops)
        time.sleep(0.1)  # small pause between collections for rate limiting

    unsorted_drops = fetch_unsorted()
    print(f" [Unsorted] {len(unsorted_drops)} items")
    gathered.extend(unsorted_drops)

    print(f"Raindrop total: {len(gathered)}")
    return gathered
|
openmark/stores/__init__.py
ADDED
|
File without changes
|
openmark/stores/chroma.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ChromaDB store — semantic vector search.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import chromadb
|
| 6 |
+
from openmark import config
|
| 7 |
+
from openmark.embeddings.base import EmbeddingProvider
|
| 8 |
+
|
| 9 |
+
COLLECTION_NAME = "openmark_bookmarks"
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_client() -> chromadb.PersistentClient:
    """Return a ChromaDB client persisted at config.CHROMA_PATH."""
    return chromadb.PersistentClient(path=config.CHROMA_PATH)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_collection(client: chromadb.PersistentClient, embedder: EmbeddingProvider):
    """Get or create the bookmarks collection.

    NOTE(review): `embedder` is currently unused — callers compute embeddings
    themselves and pass them to add/query; get_stats even passes None here.
    Kept in the signature for interface symmetry.
    """
    return client.get_or_create_collection(
        name=COLLECTION_NAME,
        # Cosine distance, so similarity is reported as 1 - distance in search().
        metadata={"hnsw:space": "cosine"},
    )
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def ingest(items: list[dict], embedder: EmbeddingProvider, batch_size: int = 100):
    """Embed all items and store in ChromaDB.

    Idempotent: the bookmark URL is the Chroma id, and items whose id is
    already present are skipped — re-running only embeds new bookmarks.

    Args:
        items: normalized bookmark dicts (require url/doc_text/title/category/
            source/score/tags; folder optional).
        embedder: provider used for document embeddings.
        batch_size: items embedded and added per round trip.
    """
    client = get_client()
    collection = get_collection(client, embedder)

    # Check already ingested
    existing = set(collection.get(include=[])["ids"])
    new_items = [i for i in items if i["url"] not in existing]
    print(f"ChromaDB: {len(existing)} already ingested, {len(new_items)} new")

    if not new_items:
        return

    total = 0
    for start in range(0, len(new_items), batch_size):
        batch = new_items[start:start + batch_size]

        texts = [i["doc_text"] for i in batch]
        ids = [i["url"] for i in batch]
        metas = [
            {
                "title": i["title"][:500],
                "category": i["category"],
                "source": i["source"],
                "score": float(i["score"]),
                # Tags are joined to a string because metadata values are scalar.
                "tags": ",".join(i["tags"]),
                "folder": i.get("folder", ""),
            }
            for i in batch
        ]

        embeddings = embedder.embed_documents(texts)

        collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metas,
        )
        total += len(batch)
        print(f" ChromaDB ingested {total}/{len(new_items)}")

    print(f"ChromaDB total: {collection.count()} items")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def search(
    query: str,
    embedder: EmbeddingProvider,
    n: int = 10,
    category: str | None = None,
    source: str | None = None,
    min_score: float | None = None,
) -> list[dict]:
    """Semantic search with optional metadata filters.

    Returns ranked hits as dicts with url/title/category/source/score/tags
    and a cosine-derived `similarity` (1 - distance, rounded to 4 places).
    """
    client = get_client()
    collection = get_collection(client, embedder)

    query_vec = embedder.embed_query(query)

    # Assemble the Chroma `where` clause from whichever filters were given.
    clauses = []
    if category:
        clauses.append({"category": {"$eq": category}})
    if source:
        clauses.append({"source": {"$eq": source}})
    if min_score is not None:
        clauses.append({"score": {"$gte": min_score}})

    if not clauses:
        where = None
    elif len(clauses) == 1:
        where = clauses[0]
    else:
        where = {"$and": clauses}

    raw = collection.query(
        query_embeddings=[query_vec],
        n_results=n,
        where=where,
        include=["metadatas", "documents", "distances"],
    )

    ids = raw["ids"][0]
    triples = zip(raw["metadatas"][0], raw["documents"][0], raw["distances"][0])

    hits = []
    for idx, (meta, _doc, dist) in enumerate(triples):
        hits.append({
            "rank": idx + 1,
            "url": ids[idx],
            "title": meta.get("title", ""),
            "category": meta.get("category", ""),
            "source": meta.get("source", ""),
            "score": meta.get("score", 0),
            "tags": meta.get("tags", "").split(","),
            "similarity": round(1 - dist, 4),
        })
    return hits
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def get_stats() -> dict:
    """Return the total number of vectors stored in the collection."""
    collection = get_collection(get_client(), None)
    return {"total": collection.count()}
|
openmark/stores/neo4j_store.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Neo4j store — knowledge graph.
|
| 3 |
+
|
| 4 |
+
Nodes: Bookmark, Tag, Category, Source, Domain
|
| 5 |
+
Edges: TAGGED, IN_CATEGORY, FROM_SOURCE, FROM_DOMAIN, SIMILAR_TO, CO_OCCURS_WITH
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from urllib.parse import urlparse
|
| 10 |
+
from neo4j import GraphDatabase
|
| 11 |
+
from openmark import config
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_driver():
    """Create a new Neo4j driver from config credentials.

    Caller is responsible for closing it (driver.close()).
    """
    return GraphDatabase.driver(
        config.NEO4J_URI,
        auth=(config.NEO4J_USER, config.NEO4J_PASSWORD),
    )
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def setup_constraints(driver):
    """Create uniqueness constraints once."""
    # One uniqueness constraint per node label / key property.
    stmts = (
        "CREATE CONSTRAINT bookmark_url IF NOT EXISTS FOR (b:Bookmark) REQUIRE b.url IS UNIQUE",
        "CREATE CONSTRAINT tag_name IF NOT EXISTS FOR (t:Tag) REQUIRE t.name IS UNIQUE",
        "CREATE CONSTRAINT category_name IF NOT EXISTS FOR (c:Category) REQUIRE c.name IS UNIQUE",
        "CREATE CONSTRAINT source_name IF NOT EXISTS FOR (s:Source) REQUIRE s.name IS UNIQUE",
        "CREATE CONSTRAINT domain_name IF NOT EXISTS FOR (d:Domain) REQUIRE d.name IS UNIQUE",
    )
    with driver.session(database=config.NEO4J_DATABASE) as session:
        for stmt in stmts:
            try:
                session.run(stmt)
            except Exception as e:
                # Best-effort: the constraint may already exist on re-runs.
                print(f" Constraint (already exists or error): {e}")
    print("Constraints ready.")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def extract_domain(url: str) -> str:
    """Return the URL's host with a leading "www." stripped.

    Falls back to "unknown" when the URL cannot be parsed.
    """
    try:
        # removeprefix only strips a *leading* "www." — the previous
        # str.replace("www.", "") also mangled hosts that merely contain
        # "www." elsewhere (e.g. "mywww.example.com").
        return urlparse(url).netloc.removeprefix("www.")
    except Exception:
        return "unknown"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def ingest(items: list[dict], driver=None):
    """Write all nodes and relationships to Neo4j.

    Args:
        items: normalized bookmark dicts (url, title, category, tags,
            score, source).
        driver: optional existing Neo4j driver; when omitted one is
            created here and closed before returning — even on failure
            (the original leaked the driver if a batch raised).
    """
    own_driver = driver is None
    if own_driver:
        driver = get_driver()

    try:
        setup_constraints(driver)

        total = len(items)
        batch_size = 200

        print(f"Neo4j ingesting {total} items...")

        for start in range(0, total, batch_size):
            batch = items[start:start + batch_size]

            # One write transaction per batch keeps transactions small.
            with driver.session(database=config.NEO4J_DATABASE) as session:
                session.execute_write(_write_batch, batch)

            print(f" Neo4j wrote {min(start + batch_size, total)}/{total}")

        print("Building tag co-occurrence edges...")
        _build_tag_cooccurrence(driver)

        print("Neo4j ingestion complete.")
    finally:
        # Only close drivers we opened ourselves.
        if own_driver:
            driver.close()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _write_batch(tx, batch: list[dict]):
    """Write one batch of bookmarks (nodes + edges) in a single transaction.

    Creates/updates Bookmark, Category, Source, Domain and Tag nodes and
    the relationships connecting them. MERGE keeps the writes idempotent
    across re-runs.
    """
    for item in batch:
        url = item["url"]
        # Cap stored title length to match the ChromaDB metadata cap.
        title = item["title"][:500]
        category = item["category"]
        tags = item["tags"]
        score = float(item["score"])
        source = item["source"]
        domain = extract_domain(url)

        # Bookmark node
        tx.run("""
            MERGE (b:Bookmark {url: $url})
            SET b.title = $title, b.score = $score
        """, url=url, title=title, score=score)

        # Category node + relationship
        tx.run("""
            MERGE (c:Category {name: $cat})
            WITH c
            MATCH (b:Bookmark {url: $url})
            MERGE (b)-[:IN_CATEGORY]->(c)
        """, cat=category, url=url)

        # Source node + relationship
        tx.run("""
            MERGE (s:Source {name: $src})
            WITH s
            MATCH (b:Bookmark {url: $url})
            MERGE (b)-[:FROM_SOURCE]->(s)
        """, src=source, url=url)

        # Domain node + relationship (skipped when parsing failed)
        if domain and domain != "unknown":
            tx.run("""
                MERGE (d:Domain {name: $domain})
                WITH d
                MATCH (b:Bookmark {url: $url})
                MERGE (b)-[:FROM_DOMAIN]->(d)
            """, domain=domain, url=url)

        # Tag nodes + relationships
        for tag in tags:
            if not tag:
                # Skip empty strings (e.g. from splitting "a,,b").
                continue
            tx.run("""
                MERGE (t:Tag {name: $tag})
                WITH t
                MATCH (b:Bookmark {url: $url})
                MERGE (b)-[:TAGGED]->(t)
            """, tag=tag, url=url)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _build_tag_cooccurrence(driver):
    """
    For each bookmark with multiple tags, create CO_OCCURS_WITH edges between tags.
    Weight = number of bookmarks where both tags appear together.

    The weight is recomputed from scratch via an aggregate, so re-running
    the ingest does not inflate previously written counts (the original
    ON CREATE/ON MATCH increment double-counted on every re-run).
    """
    with driver.session(database=config.NEO4J_DATABASE) as session:
        session.run("""
            MATCH (b:Bookmark)-[:TAGGED]->(t1:Tag)
            MATCH (b)-[:TAGGED]->(t2:Tag)
            WHERE t1.name < t2.name
            WITH t1, t2, count(b) AS pairs
            MERGE (t1)-[r:CO_OCCURS_WITH]-(t2)
            SET r.count = pairs
        """)
    print(" Tag co-occurrence edges built.")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def add_similar_to_edges(similar_pairs: list[tuple[str, str, float]], driver=None):
    """
    Write SIMILAR_TO edges derived from ChromaDB nearest-neighbor search.
    similar_pairs = [(url_a, url_b, similarity_score), ...]

    When no driver is supplied, one is created here and closed before
    returning — even on failure (the original leaked the driver if a
    write raised).
    """
    own_driver = driver is None
    if own_driver:
        driver = get_driver()

    try:
        with driver.session(database=config.NEO4J_DATABASE) as session:
            for url_a, url_b, score in similar_pairs:
                session.run("""
                    MATCH (a:Bookmark {url: $url_a})
                    MATCH (b:Bookmark {url: $url_b})
                    MERGE (a)-[r:SIMILAR_TO]-(b)
                    SET r.score = $score
                """, url_a=url_a, url_b=url_b, score=score)

        print(f" SIMILAR_TO: {len(similar_pairs)} edges written.")
    finally:
        if own_driver:
            driver.close()
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def query(cypher: str, params: dict | None = None) -> list[dict]:
    """Run arbitrary Cypher and return results as a list of dicts.

    Opens a fresh driver per call and guarantees it is closed even when
    the query raises (the original leaked the driver on error).
    """
    driver = get_driver()
    try:
        with driver.session(database=config.NEO4J_DATABASE) as session:
            result = session.run(cypher, params or {})
            # Consume inside the session — records are invalid after close.
            return [dict(r) for r in result]
    finally:
        driver.close()
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def get_stats() -> dict:
    """Return node counts (bookmarks, tags, categories) from the graph."""
    rows = query("""
        MATCH (b:Bookmark) WITH count(b) AS bookmarks
        MATCH (t:Tag) WITH bookmarks, count(t) AS tags
        MATCH (c:Category) WITH bookmarks, tags, count(c) AS categories
        RETURN bookmarks, tags, categories
    """)
    # Single-row result; empty dict when the graph is empty of any label.
    return rows[0] if rows else {}
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def find_similar(url: str, limit: int = 10) -> list[dict]:
    """Return bookmarks linked to *url* via SIMILAR_TO, best match first."""
    cypher = """
        MATCH (b:Bookmark {url: $url})-[r:SIMILAR_TO]-(other:Bookmark)
        RETURN other.url AS url, other.title AS title, r.score AS similarity
        ORDER BY r.score DESC LIMIT $limit
    """
    return query(cypher, {"url": url, "limit": limit})
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def find_by_tag(tag: str, limit: int = 20) -> list[dict]:
    """Return bookmarks carrying *tag* (lowercased), highest score first."""
    params = {"tag": tag.lower(), "limit": limit}
    cypher = """
        MATCH (b:Bookmark)-[:TAGGED]->(t:Tag {name: $tag})
        RETURN b.url AS url, b.title AS title, b.score AS score
        ORDER BY b.score DESC LIMIT $limit
    """
    return query(cypher, params)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def find_tag_cluster(tag: str, hops: int = 2, limit: int = 30) -> list[dict]:
    """Follow CO_OCCURS_WITH edges to find related tags and their bookmarks.

    `hops` is interpolated into the Cypher pattern (variable-length path
    bounds cannot be parameterized), so it is validated as a positive int
    first to keep the query safe from injection via a non-int argument.
    """
    hops = int(hops)
    if hops < 1:
        raise ValueError("hops must be >= 1")
    return query(f"""
        MATCH (t:Tag {{name: $tag}})-[:CO_OCCURS_WITH*1..{hops}]-(related:Tag)
        MATCH (b:Bookmark)-[:TAGGED]->(related)
        RETURN DISTINCT b.url AS url, b.title AS title, b.score AS score, related.name AS via_tag
        ORDER BY b.score DESC LIMIT $limit
    """, {"tag": tag.lower(), "limit": limit})
|
openmark/ui/__init__.py
ADDED
|
File without changes
|
openmark/ui/app.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenMark Gradio UI — 3 tabs:
|
| 3 |
+
1. Chat — talk to the LangGraph agent
|
| 4 |
+
2. Search — instant semantic search with filters
|
| 5 |
+
3. Stats — knowledge base overview
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 11 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
from openmark.agent.graph import build_agent, ask
|
| 15 |
+
from openmark.embeddings.factory import get_embedder
|
| 16 |
+
from openmark.stores import chroma as chroma_store
|
| 17 |
+
from openmark.stores import neo4j_store
|
| 18 |
+
from openmark import config
|
| 19 |
+
|
| 20 |
+
# Load once at startup
|
| 21 |
+
print("Loading OpenMark...")
|
| 22 |
+
_embedder = get_embedder()
|
| 23 |
+
_agent = build_agent()
|
| 24 |
+
print("OpenMark ready.")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ── Chat tab ──────────────────────────────────────────────────
|
| 28 |
+
|
| 29 |
+
def chat_fn(message: str, history: list, thread_id: str):
    """Handle one chat turn: query the agent, append both turns to history.

    Returns (updated_history, "") — the empty string clears the input box.
    """
    if not message.strip():
        return history, ""
    reply = ask(_agent, message, thread_id=thread_id or "default")
    history.extend([
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply},
    ])
    return history, ""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── Search tab ────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
def search_fn(query: str, category: str, min_score: float, n_results: int):
    """Run a semantic search and render the hits as Markdown."""
    if not query.strip():
        return "Enter a search query."

    # "All" and a 0 score slider mean "no filter".
    hits = chroma_store.search(
        query,
        _embedder,
        n=int(n_results),
        category=None if category == "All" else category,
        min_score=min_score if min_score > 0 else None,
    )

    if not hits:
        return "No results found."

    blocks = []
    for hit in hits:
        tag_list = ", ".join(t for t in hit["tags"] if t)
        blocks.append(
            f"**{hit['rank']}. {hit['title'] or hit['url']}**\n"
            f"🔗 {hit['url']}\n"
            f"📁 {hit['category']} | 📌 {tag_list} | "
            f"⭐ {hit['score']} | 🎯 {hit['similarity']:.3f} similarity\n"
        )
    return "\n---\n".join(blocks)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ── Stats tab ─────────────────────────────────────────────────
|
| 67 |
+
|
| 68 |
+
def stats_fn():
    """Build the Markdown knowledge-base overview shown in the Stats tab."""
    chroma = chroma_store.get_stats()
    neo4j = neo4j_store.get_stats()

    # Category breakdown from Neo4j
    cat_rows = neo4j_store.query("""
        MATCH (b:Bookmark)-[:IN_CATEGORY]->(c:Category)
        RETURN c.name AS category, count(b) AS count
        ORDER BY count DESC
    """)
    cat_lines = "\n".join(f" {r['category']:<35} {r['count']:>5}" for r in cat_rows)

    # Top tags
    tag_rows = neo4j_store.query("""
        MATCH (b:Bookmark)-[:TAGGED]->(t:Tag)
        RETURN t.name AS tag, count(b) AS count
        ORDER BY count DESC LIMIT 20
    """)
    tag_lines = ", ".join(f"{r['tag']} ({r['count']})" for r in tag_rows)

    return (
        f"## OpenMark Knowledge Base\n\n"
        f"**ChromaDB vectors:** {chroma.get('total', 0)}\n"
        f"**Neo4j bookmarks:** {neo4j.get('bookmarks', 0)}\n"
        f"**Neo4j tags:** {neo4j.get('tags', 0)}\n"
        f"**Neo4j categories:** {neo4j.get('categories', 0)}\n\n"
        f"### By Category\n```\n{cat_lines}\n```\n\n"
        f"### Top Tags\n{tag_lines}"
    )
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# ── Build UI ──────────────────────────────────────────────────
|
| 100 |
+
|
| 101 |
+
def build_ui():
    """Assemble the 3-tab Gradio Blocks app (Chat / Search / Stats)."""
    categories = ["All"] + config.CATEGORIES

    with gr.Blocks(title="OpenMark", theme=gr.themes.Soft()) as app:
        gr.Markdown("# OpenMark — Your Personal Knowledge Graph")

        with gr.Tabs():

            # Tab 1: Chat
            with gr.Tab("Chat"):
                thread = gr.Textbox(value="default", label="Session ID", scale=1)
                chatbot = gr.Chatbot(type="messages", height=500)
                msg_box = gr.Textbox(
                    placeholder="Ask anything about your saved bookmarks...",
                    label="Message", lines=2,
                )
                send_btn = gr.Button("Send", variant="primary")

                # Button click and Enter-submit share the same handler.
                send_btn.click(
                    chat_fn,
                    inputs=[msg_box, chatbot, thread],
                    outputs=[chatbot, msg_box],
                )
                msg_box.submit(
                    chat_fn,
                    inputs=[msg_box, chatbot, thread],
                    outputs=[chatbot, msg_box],
                )

            # Tab 2: Search
            with gr.Tab("Search"):
                with gr.Row():
                    q_input = gr.Textbox(placeholder="Search your knowledge base...", label="Query", scale=3)
                    cat_input = gr.Dropdown(categories, value="All", label="Category")
                with gr.Row():
                    score_input = gr.Slider(0, 10, value=0, step=1, label="Min Quality Score")
                    n_input = gr.Slider(5, 50, value=10, step=5, label="Results")
                search_btn = gr.Button("Search", variant="primary")
                search_output = gr.Markdown()

                search_btn.click(
                    search_fn,
                    inputs=[q_input, cat_input, score_input, n_input],
                    outputs=search_output,
                )
                q_input.submit(
                    search_fn,
                    inputs=[q_input, cat_input, score_input, n_input],
                    outputs=search_output,
                )

            # Tab 3: Stats
            with gr.Tab("Stats"):
                refresh_btn = gr.Button("Refresh Stats")
                stats_output = gr.Markdown()

                refresh_btn.click(stats_fn, outputs=stats_output)
                # Populate stats automatically on page load.
                app.load(stats_fn, outputs=stats_output)

    return app
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
if __name__ == "__main__":
|
| 164 |
+
ui = build_ui()
|
| 165 |
+
ui.launch(server_name="0.0.0.0", server_port=7860, share=False)
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
chromadb>=1.5.4
|
| 2 |
+
langchain>=0.3.25
|
| 3 |
+
langgraph>=1.0.1
|
| 4 |
+
langchain-openai>=0.3.23
|
| 5 |
+
langchain-neo4j>=0.4.0
|
| 6 |
+
sentence-transformers==3.3.1
|
| 7 |
+
transformers>=4.57.0
|
| 8 |
+
huggingface_hub>=1.6.0
|
| 9 |
+
torch>=2.0.0
|
| 10 |
+
neo4j>=5.28.1
|
| 11 |
+
gradio>=6.6.0
|
| 12 |
+
requests>=2.31.0
|
| 13 |
+
python-dotenv>=1.0.0
|
| 14 |
+
numpy>=1.24.0
|
scripts/ingest.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenMark Full Ingest Pipeline
|
| 3 |
+
Run this once (or again to update) to:
|
| 4 |
+
1. Merge all data sources (CATEGORIZED.json + LinkedIn + YouTube)
|
| 5 |
+
2. Embed everything with chosen provider (local pplx-embed or Azure)
|
| 6 |
+
3. Store in ChromaDB (semantic search)
|
| 7 |
+
4. Store in Neo4j (knowledge graph)
|
| 8 |
+
5. Compute SIMILAR_TO edges (top-5 neighbors per bookmark → graph edges)
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
C:\\Python313\\python scripts/ingest.py
|
| 12 |
+
C:\\Python313\\python scripts/ingest.py --provider azure
|
| 13 |
+
C:\\Python313\\python scripts/ingest.py --fresh-raindrop (also pulls live from Raindrop API)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import sys
|
| 17 |
+
import os
|
| 18 |
+
import argparse
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 21 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 22 |
+
|
| 23 |
+
from openmark.pipeline.merge import merge_all
|
| 24 |
+
from openmark.embeddings.factory import get_embedder
|
| 25 |
+
from openmark.stores import chroma as chroma_store
|
| 26 |
+
from openmark.stores import neo4j_store
|
| 27 |
+
from openmark import config
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def build_similar_to_edges(items: list[dict], embedder, top_k: int = 5):
    """
    For each item, find its top-k nearest neighbors in ChromaDB
    and write SIMILAR_TO edges in Neo4j.
    This creates the semantic web inside the graph.

    Lookups are best-effort: a failing query for one bookmark is counted
    and reported at the end instead of silently swallowed, but never
    aborts the run.
    """
    print(f"\nBuilding SIMILAR_TO edges (top-{top_k} per bookmark)...")
    pairs = []
    failures = 0
    total = len(items)

    for i, item in enumerate(items):
        url = item["url"]
        try:
            # top_k + 1 because the item itself is usually its own best match.
            results = chroma_store.search(
                item["doc_text"], embedder, n=top_k + 1
            )
            for r in results:
                # 0.5 similarity floor keeps weak links out of the graph.
                if r["url"] != url and r["similarity"] > 0.5:
                    pairs.append((url, r["url"], r["similarity"]))
        except Exception:
            failures += 1  # best-effort: skip items whose lookup fails

        if (i + 1) % 500 == 0:
            print(f" Processed {i+1}/{total} for SIMILAR_TO")

    if failures:
        print(f" Warning: {failures} neighbor lookups failed and were skipped.")
    print(f" Writing {len(pairs)} SIMILAR_TO edges to Neo4j...")
    neo4j_store.add_similar_to_edges(pairs)
    print(" SIMILAR_TO done.")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def main():
    """Run the full ingest: merge sources, embed, store in Chroma + Neo4j."""
    parser = argparse.ArgumentParser(description="OpenMark Ingest Pipeline")
    parser.add_argument("--provider", default=None, help="Embedding provider: local or azure")
    parser.add_argument("--fresh-raindrop", action="store_true", help="Also pull fresh from Raindrop API")
    parser.add_argument("--skip-similar", action="store_true", help="Skip SIMILAR_TO edge computation")
    args = parser.parse_args()

    # Override the provider via env var so config picks it up lazily.
    if args.provider:
        os.environ["EMBEDDING_PROVIDER"] = args.provider

    print("=" * 60)
    print("OPENMARK INGEST PIPELINE")
    print(f"Embedding: {config.EMBEDDING_PROVIDER}")
    print("=" * 60)

    # Step 1: Merge all sources
    print("\n[1/4] Merging data sources...")
    items = merge_all(include_fresh_raindrop=args.fresh_raindrop)

    # Step 2: Load embedder
    print(f"\n[2/4] Loading {config.EMBEDDING_PROVIDER} embedder...")
    embedder = get_embedder()

    # Step 3: ChromaDB
    print("\n[3/4] Ingesting into ChromaDB...")
    chroma_store.ingest(items, embedder)

    # Step 4: Neo4j
    print("\n[4/4] Ingesting into Neo4j...")
    neo4j_store.ingest(items)

    # Step 5: SIMILAR_TO edges
    # NOTE(review): labels above say [1/4] but this is a fifth step —
    # consider renumbering the progress labels to [n/5].
    if not args.skip_similar:
        build_similar_to_edges(items, embedder, top_k=5)

    print("\n" + "=" * 60)
    print("INGEST COMPLETE")
    chroma = chroma_store.get_stats()
    neo4j = neo4j_store.get_stats()
    print(f" ChromaDB: {chroma.get('total', 0)} vectors")
    print(f" Neo4j: {neo4j.get('bookmarks', 0)} bookmarks, {neo4j.get('tags', 0)} tags")
    print("=" * 60)
    print("\nNow run: C:\\Python313\\python scripts/search.py \"your query\"")
    print("    or: C:\\Python313\\python -m openmark.ui.app")
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
if __name__ == "__main__":
|
| 107 |
+
main()
|
scripts/search.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenMark CLI Search — instant search from terminal.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
C:\\Python313\\python scripts/search.py "RAG tools"
|
| 6 |
+
C:\\Python313\\python scripts/search.py "LangGraph" --category "Agent Development"
|
| 7 |
+
C:\\Python313\\python scripts/search.py "embeddings" --n 20
|
| 8 |
+
C:\\Python313\\python scripts/search.py --tag "rag"
|
| 9 |
+
C:\\Python313\\python scripts/search.py --stats
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
import argparse
|
| 15 |
+
|
| 16 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 17 |
+
sys.stdout.reconfigure(encoding="utf-8")
|
| 18 |
+
|
| 19 |
+
from openmark.embeddings.factory import get_embedder
|
| 20 |
+
from openmark.stores import chroma as chroma_store
|
| 21 |
+
from openmark.stores import neo4j_store
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def print_results(results: list[dict]):
    """Pretty-print search results to the terminal.

    Score and similarity use an empty-string sentinel when missing; they
    are compared against "" (not truthiness) so a legitimate value of 0
    is still printed — the original `if score:` hid zero scores.
    """
    if not results:
        print("No results found.")
        return
    for r in results:
        title = r.get("title") or r.get("url")
        url = r.get("url", "")
        cat = r.get("category", "")
        sim = r.get("similarity", "")
        score = r.get("score", "")
        tags = ", ".join(t for t in r.get("tags", []) if t)
        print(f"\n {r.get('rank', '-')}. {title}")
        print(f" {url}")
        if cat:
            print(f" Category: {cat}")
        if tags:
            print(f" Tags: {tags}")
        if score != "":
            print(f" Score: {score}")
        if sim != "":
            print(f" Similarity: {sim}")
|
| 42 |
+
|
| 43 |
+
def main():
    """CLI entry point: stats, tag lookup (graph), or semantic search."""
    parser = argparse.ArgumentParser(description="OpenMark CLI Search")
    parser.add_argument("query", nargs="?", default=None, help="Search query")
    parser.add_argument("--category", default=None, help="Filter by category")
    parser.add_argument("--tag", default=None, help="Search by tag (graph lookup)")
    parser.add_argument("--n", type=int, default=10, help="Number of results")
    parser.add_argument("--stats", action="store_true", help="Show knowledge base stats")
    args = parser.parse_args()

    # --stats short-circuits: no embedder load needed.
    if args.stats:
        chroma = chroma_store.get_stats()
        neo4j = neo4j_store.get_stats()
        print("\nOpenMark Stats:")
        print(f" ChromaDB vectors: {chroma.get('total', 0)}")
        print(f" Neo4j bookmarks: {neo4j.get('bookmarks', 0)}")
        print(f" Neo4j tags: {neo4j.get('tags', 0)}")
        return

    # --tag goes straight to the Neo4j graph, no embedding involved.
    if args.tag:
        print(f"\nSearching by tag: '{args.tag}'")
        results = neo4j_store.find_by_tag(args.tag, limit=args.n)
        for r in results:
            print(f"\n - {r.get('title', '')}")
            print(f"   {r.get('url', '')} (score: {r.get('score', '')})")
        return

    if not args.query:
        parser.print_help()
        return

    print(f"\nSearching: '{args.query}'")
    if args.category:
        print(f"Category filter: {args.category}")

    # Semantic path: load embedder lazily, only when actually searching.
    embedder = get_embedder()
    results = chroma_store.search(
        args.query, embedder, n=args.n, category=args.category
    )
    print_results(results)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
main()
|