File size: 3,796 Bytes
81598c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""ChromaDB store — semantic vector search over bookmark embeddings."""

import chromadb
from openmark import config
from openmark.embeddings.base import EmbeddingProvider

COLLECTION_NAME = "openmark_bookmarks"


def get_client() -> chromadb.PersistentClient:
    """Return a persistent ChromaDB client rooted at the configured path."""
    client = chromadb.PersistentClient(path=config.CHROMA_PATH)
    return client


def get_collection(client: chromadb.PersistentClient, embedder: EmbeddingProvider):
    """Get or create the bookmarks collection (cosine HNSW space).

    NOTE(review): `embedder` is unused here; callers pass it for interface
    symmetry (get_stats even passes None).
    """
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
    )
    return collection


def ingest(items: list[dict], embedder: EmbeddingProvider, batch_size: int = 100):
    """Embed all items and store in ChromaDB.

    Items whose URL is already an ID in the collection are skipped; the rest
    are embedded with `embedder.embed_documents` and added in batches of
    `batch_size`. Progress is reported via print.
    """
    client = get_client()
    collection = get_collection(client, embedder)

    # IDs are bookmark URLs — skip anything already stored.
    already = set(collection.get(include=[])["ids"])
    pending = [item for item in items if item["url"] not in already]
    print(f"ChromaDB: {len(already)} already ingested, {len(pending)} new")

    if not pending:
        return

    done = 0
    for offset in range(0, len(pending), batch_size):
        chunk = pending[offset:offset + batch_size]

        docs = [item["doc_text"] for item in chunk]
        urls = [item["url"] for item in chunk]
        metadata_rows = []
        for item in chunk:
            metadata_rows.append({
                "title":    item["title"][:500],   # cap title length for metadata
                "category": item["category"],
                "source":   item["source"],
                "score":    float(item["score"]),
                "tags":     ",".join(item["tags"]),  # Chroma metadata is scalar; store as CSV
                "folder":   item.get("folder", ""),
            })

        vectors = embedder.embed_documents(docs)

        collection.add(
            ids=urls,
            embeddings=vectors,
            documents=docs,
            metadatas=metadata_rows,
        )
        done += len(chunk)
        print(f"  ChromaDB ingested {done}/{len(pending)}")

    print(f"ChromaDB total: {collection.count()} items")


def search(
    query: str,
    embedder: EmbeddingProvider,
    n: int = 10,
    category: str | None = None,
    source: str | None = None,
    min_score: float | None = None,
) -> list[dict]:
    """Semantic search with optional metadata filters.

    Args:
        query: Free-text query, embedded via `embedder.embed_query`.
        embedder: Provider used to embed the query.
        n: Maximum number of results returned.
        category: If truthy, exact-match filter on the stored category.
        source: If truthy, exact-match filter on the stored source.
        min_score: If given, only items with stored score >= this value.

    Returns:
        Ranked result dicts: rank, url, title, category, source, score,
        tags (list), and similarity (1 - distance, rounded to 4 places —
        the collection uses cosine distance).
    """
    client     = get_client()
    collection = get_collection(client, embedder)

    q_embedding = embedder.embed_query(query)

    # Build a Chroma `where` clause; multiple conditions need an explicit $and.
    filters = []
    if category:
        filters.append({"category": {"$eq": category}})
    if source:
        filters.append({"source": {"$eq": source}})
    if min_score is not None:
        filters.append({"score": {"$gte": min_score}})

    where = None
    if len(filters) == 1:
        where = filters[0]
    elif len(filters) > 1:
        where = {"$and": filters}

    results = collection.query(
        query_embeddings=[q_embedding],
        n_results=n,
        where=where,
        include=["metadatas", "documents", "distances"],
    )

    output = []
    for i, (meta, doc, dist) in enumerate(zip(
        results["metadatas"][0],
        results["documents"][0],
        results["distances"][0],
    )):
        # Fix: "".split(",") yields [""], so bookmarks stored with no tags
        # used to come back as [""] — return an empty list instead.
        raw_tags = meta.get("tags", "")
        output.append({
            "rank":       i + 1,
            "url":        results["ids"][0][i],
            "title":      meta.get("title", ""),
            "category":   meta.get("category", ""),
            "source":     meta.get("source", ""),
            "score":      meta.get("score", 0),
            "tags":       raw_tags.split(",") if raw_tags else [],
            "similarity": round(1 - dist, 4),
        })
    return output


def get_stats() -> dict:
    """Return collection-level stats (currently just the total item count)."""
    collection = get_collection(get_client(), None)
    return {"total": collection.count()}