File size: 1,665 Bytes
d0abef8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# src/embed_service/cache_manager.py
import os
import json
import numpy as np

# Directory (relative to the working directory) that holds the cache files.
CACHE_DIR = "cache"
# JSON file with the per-file cache metadata.
META_PATH = CACHE_DIR + "/embed_meta.json"
# NumPy .npy file with the stacked embedding vectors.
EMB_PATH = CACHE_DIR + "/embeddings.npy"

class CacheManager:
    """Persistent on-disk cache that maps file names to embedding vectors.

    Two files back the cache: a JSON metadata file mapping each file name to
    ``{"hash": <content hash>, "index": <row number>}``, and a ``.npy`` file
    holding a 2-D float32 matrix with one embedding per row.

    Attributes:
        meta: dict of ``filename -> {"hash": ..., "index": int}``.
        embeddings: 2-D array; row ``meta[name]["index"]`` belongs to ``name``.
    """

    # File names used inside an explicitly supplied cache directory.
    _META_FILENAME = "embed_meta.json"
    _EMB_FILENAME = "embeddings.npy"

    def __init__(self, cache_dir=None):
        """Load the cache from disk, creating the cache directory if needed.

        Args:
            cache_dir: Directory that holds the cache files. When omitted,
                the module-level ``CACHE_DIR`` / ``META_PATH`` / ``EMB_PATH``
                locations are used.
        """
        if cache_dir is None:
            cache_dir = CACHE_DIR
            self._meta_path = META_PATH
            self._emb_path = EMB_PATH
        else:
            self._meta_path = os.path.join(cache_dir, self._META_FILENAME)
            self._emb_path = os.path.join(cache_dir, self._EMB_FILENAME)
        os.makedirs(cache_dir, exist_ok=True)

        if os.path.exists(self._meta_path):
            with open(self._meta_path, "r", encoding="utf-8") as f:
                self.meta = json.load(f)
        else:
            self.meta = {}  # filename -> {"hash":..., "index": int}

        if os.path.exists(self._emb_path):
            self.embeddings = np.load(self._emb_path)
        else:
            # Empty placeholder; it is replaced by the first embedding added,
            # so the width used here does not constrain later vectors.
            self.embeddings = np.zeros((0, 384), dtype="float32")

        # The two files can drift apart (e.g. the .npy file was deleted).
        # Forget entries whose row is gone so exists() never reports a hit
        # that get_embedding() cannot serve.
        row_count = len(self.embeddings)
        self.meta = {
            name: entry
            for name, entry in self.meta.items()
            if self._has_valid_row(entry, row_count)
        }

    @staticmethod
    def _has_valid_row(entry, row_count) -> bool:
        """Return True if *entry* points at a row inside ``[0, row_count)``."""
        try:
            return 0 <= int(entry["index"]) < row_count
        except (KeyError, TypeError, ValueError):
            return False

    def save(self) -> None:
        """Write the embeddings and the metadata to disk.

        Each file is written to a temporary sibling and then moved into place
        with ``os.replace`` so an interrupted write never leaves a truncated
        file. The embeddings go first so the metadata on disk never refers to
        a row that has not been stored yet.
        """
        emb_tmp = self._emb_path + ".tmp"
        # Pass a file object: given a path, np.save would append ".npy".
        with open(emb_tmp, "wb") as f:
            np.save(f, self.embeddings)
        os.replace(emb_tmp, self._emb_path)

        meta_tmp = self._meta_path + ".tmp"
        with open(meta_tmp, "w", encoding="utf-8") as f:
            json.dump(self.meta, f, indent=2)
        os.replace(meta_tmp, self._meta_path)

    def exists(self, filename: str, file_hash: str) -> bool:
        """Return True if *filename* is cached with exactly *file_hash*."""
        return filename in self.meta and self.meta[filename]["hash"] == file_hash

    def get_embedding(self, filename: str):
        """Return the cached embedding row for *filename*.

        Raises:
            KeyError: if *filename* is not in the cache.
        """
        idx = int(self.meta[filename]["index"])
        return self.embeddings[idx]

    def add_embedding(self, filename: str, file_hash: str, embedding) -> None:
        """Store *embedding* for *filename* and persist the cache.

        If *filename* is already cached (for example its content hash
        changed), its existing row is overwritten instead of appending a new
        row and orphaning the old one.

        Args:
            filename: Cache key.
            file_hash: Content hash recorded alongside the embedding.
            embedding: Array-like vector; it is copied, converted to float32
                and flattened to a single row.

        Raises:
            ValueError: if the vector length differs from the vectors already
                stored. The cache is left unchanged in that case.
        """
        # np.array copies, so the cache never aliases the caller's buffer.
        row = np.array(embedding, dtype="float32").reshape(1, -1)
        row_count = self.embeddings.shape[0]

        # Validate before touching any state so a failure cannot leave the
        # metadata pointing at a row that was never written.
        if row_count and row.shape[1] != self.embeddings.shape[1]:
            raise ValueError(
                f"embedding for {filename!r} has dimension {row.shape[1]}, "
                f"but the cache stores dimension {self.embeddings.shape[1]}"
            )

        previous = self.meta.get(filename)
        if previous is not None and self._has_valid_row(previous, row_count):
            idx = int(previous["index"])
            self.embeddings[idx] = row[0]
        elif row_count == 0:
            idx = 0
            self.embeddings = row
        else:
            idx = row_count
            self.embeddings = np.vstack([self.embeddings, row])

        self.meta[filename] = {"hash": file_hash, "index": idx}
        self.save()

    def all_embeddings(self):
        """Return ``(meta, embeddings)``: the live metadata dict and matrix."""
        return self.meta, self.embeddings