garvitcpp committed on
Commit
a914655
·
verified ·
1 Parent(s): f5ab068

Update app/services/vector_service.py

Browse files
Files changed (1) hide show
  1. app/services/vector_service.py +44 -15
app/services/vector_service.py CHANGED
@@ -9,31 +9,48 @@ logger = logging.getLogger(__name__)
9
 
10
  class VectorService:
11
  def __init__(self):
12
- self.client = chromadb.PersistentClient(
13
- path="./chroma_db",
14
- settings=Settings(
 
 
 
 
 
 
 
 
15
  anonymized_telemetry=False,
16
  allow_reset=True
17
- )
18
- )
19
- logger.info("πŸ—„οΈ ChromaDB client initialized")
 
 
 
 
 
 
20
 
21
  def create_collection(self, repository_id: int) -> chromadb.Collection:
22
  collection_name = f"repo_{repository_id}"
23
 
24
  try:
25
  collection = self.client.get_collection(collection_name)
 
26
  logger.info(f"πŸ“š Using existing collection: {collection_name}")
27
  except:
28
  collection = self.client.create_collection(
29
  name=collection_name,
30
  metadata={"repository_id": repository_id}
31
  )
 
32
  logger.info(f"πŸ†• Created new collection: {collection_name}")
33
 
34
  return collection
35
 
36
  async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
 
37
  logger.info(f"πŸ’Ύ Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
38
 
39
  collection = self.create_collection(repository_id)
@@ -59,16 +76,26 @@ class VectorService:
59
  ids.append(chunk_id)
60
 
61
  batch_size = 100
62
- for i in range(0, len(documents), batch_size):
 
 
63
  end_idx = min(i + batch_size, len(documents))
64
 
65
- collection.add(
66
- documents=documents[i:end_idx],
67
- embeddings=embeddings[i:end_idx],
68
- metadatas=metadatas[i:end_idx],
69
- ids=ids[i:end_idx]
70
- )
 
 
 
 
 
 
 
71
 
 
72
  logger.info(f"βœ… Successfully stored all embeddings for repository {repository_id}")
73
 
74
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
@@ -108,11 +135,13 @@ class VectorService:
108
  logger.info(f"πŸ” Found {len(search_results)} similar code chunks")
109
  return search_results
110
 
111
- def delete_repository_data(self, repository_id: int):
112
  collection_name = f"repo_{repository_id}"
113
 
114
  try:
115
  self.client.delete_collection(collection_name)
 
116
  logger.info(f"πŸ—‘οΈ Deleted collection: {collection_name}")
117
- except:
 
118
  logger.warning(f"⚠️ Collection {collection_name} not found for deletion")
 
9
 
10
  class VectorService:
11
  def __init__(self):
12
+ try:
13
+ # Create absolute path for ChromaDB in HuggingFace Spaces
14
+ chroma_path = "/app/chroma_db"
15
+ os.makedirs(chroma_path, exist_ok=True)
16
+
17
+ print(f"πŸ”§ [CHROMA] Using database path: {chroma_path}", flush=True)
18
+
19
+ # Use Client instead of PersistentClient for HuggingFace compatibility
20
+ self.client = chromadb.Client(Settings(
21
+ chroma_db_impl="duckdb+parquet",
22
+ persist_directory=chroma_path,
23
  anonymized_telemetry=False,
24
  allow_reset=True
25
+ ))
26
+
27
+ print("βœ… [CHROMA] ChromaDB client initialized successfully!", flush=True)
28
+ logger.info("πŸ—„οΈ ChromaDB client initialized")
29
+
30
+ except Exception as e:
31
+ print(f"❌ [CHROMA] Failed to initialize ChromaDB: {e}", flush=True)
32
+ logger.error(f"❌ Failed to initialize ChromaDB: {e}")
33
+ raise Exception(f"Failed to initialize ChromaDB: {e}")
34
 
35
  def create_collection(self, repository_id: int) -> chromadb.Collection:
36
  collection_name = f"repo_{repository_id}"
37
 
38
  try:
39
  collection = self.client.get_collection(collection_name)
40
+ print(f"πŸ“š [CHROMA] Using existing collection: {collection_name}", flush=True)
41
  logger.info(f"πŸ“š Using existing collection: {collection_name}")
42
  except:
43
  collection = self.client.create_collection(
44
  name=collection_name,
45
  metadata={"repository_id": repository_id}
46
  )
47
+ print(f"πŸ†• [CHROMA] Created new collection: {collection_name}", flush=True)
48
  logger.info(f"πŸ†• Created new collection: {collection_name}")
49
 
50
  return collection
51
 
52
  async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
53
+ print(f"πŸ’Ύ [CHROMA] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
54
  logger.info(f"πŸ’Ύ Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
55
 
56
  collection = self.create_collection(repository_id)
 
76
  ids.append(chunk_id)
77
 
78
  batch_size = 100
79
+ total_batches = (len(documents) + batch_size - 1) // batch_size
80
+
81
+ for batch_num, i in enumerate(range(0, len(documents), batch_size), 1):
82
  end_idx = min(i + batch_size, len(documents))
83
 
84
+ try:
85
+ collection.add(
86
+ documents=documents[i:end_idx],
87
+ embeddings=embeddings[i:end_idx],
88
+ metadatas=metadatas[i:end_idx],
89
+ ids=ids[i:end_idx]
90
+ )
91
+
92
+ print(f"βœ… [CHROMA] Stored batch {batch_num}/{total_batches} ({end_idx-i} embeddings)", flush=True)
93
+
94
+ except Exception as e:
95
+ print(f"❌ [CHROMA] Error storing batch {batch_num}: {e}", flush=True)
96
+ raise
97
 
98
+ print(f"πŸŽ‰ [CHROMA] Successfully stored all {len(embedded_chunks)} embeddings for repository {repository_id}!", flush=True)
99
  logger.info(f"βœ… Successfully stored all embeddings for repository {repository_id}")
100
 
101
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
 
135
  logger.info(f"πŸ” Found {len(search_results)} similar code chunks")
136
  return search_results
137
 
138
+ async def delete_repository_data(self, repository_id: int):
139
  collection_name = f"repo_{repository_id}"
140
 
141
  try:
142
  self.client.delete_collection(collection_name)
143
+ print(f"πŸ—‘οΈ [CHROMA] Deleted collection: {collection_name}", flush=True)
144
  logger.info(f"πŸ—‘οΈ Deleted collection: {collection_name}")
145
+ except Exception as e:
146
+ print(f"⚠️ [CHROMA] Collection {collection_name} not found for deletion: {e}", flush=True)
147
  logger.warning(f"⚠️ Collection {collection_name} not found for deletion")