garvitcpp committed on
Commit
699d2be
·
verified ·
1 Parent(s): 5b86222

Update app/services/vector_service.py

Browse files
Files changed (1) hide show
  1. app/services/vector_service.py +16 -131
app/services/vector_service.py CHANGED
@@ -1,147 +1,32 @@
1
- import chromadb
2
- from chromadb.config import Settings
3
- import os
4
  from typing import List, Dict, Optional
5
  import logging
6
- import numpy as np
7
 
8
  logger = logging.getLogger(__name__)
9
 
10
  class VectorService:
 
 
11
  def __init__(self):
12
  try:
13
- # Create absolute path for ChromaDB in HuggingFace Spaces
14
- chroma_path = "/app/chroma_db"
15
- os.makedirs(chroma_path, exist_ok=True)
16
-
17
- print(f"πŸ”§ [CHROMA] Using database path: {chroma_path}", flush=True)
18
-
19
- # Use Client instead of PersistentClient for HuggingFace compatibility
20
- self.client = chromadb.Client(Settings(
21
- chroma_db_impl="duckdb+parquet",
22
- persist_directory=chroma_path,
23
- anonymized_telemetry=False,
24
- allow_reset=True
25
- ))
26
-
27
- print("βœ… [CHROMA] ChromaDB client initialized successfully!", flush=True)
28
- logger.info("πŸ—„οΈ ChromaDB client initialized")
29
 
30
  except Exception as e:
31
- print(f"❌ [CHROMA] Failed to initialize ChromaDB: {e}", flush=True)
32
- logger.error(f"❌ Failed to initialize ChromaDB: {e}")
33
- raise Exception(f"Failed to initialize ChromaDB: {e}")
34
-
35
- def create_collection(self, repository_id: int) -> chromadb.Collection:
36
- collection_name = f"repo_{repository_id}"
37
-
38
- try:
39
- collection = self.client.get_collection(collection_name)
40
- print(f"πŸ“š [CHROMA] Using existing collection: {collection_name}", flush=True)
41
- logger.info(f"πŸ“š Using existing collection: {collection_name}")
42
- except:
43
- collection = self.client.create_collection(
44
- name=collection_name,
45
- metadata={"repository_id": repository_id}
46
- )
47
- print(f"πŸ†• [CHROMA] Created new collection: {collection_name}", flush=True)
48
- logger.info(f"πŸ†• Created new collection: {collection_name}")
49
-
50
- return collection
51
 
52
  async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
53
- print(f"πŸ’Ύ [CHROMA] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
54
- logger.info(f"πŸ’Ύ Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
55
-
56
- collection = self.create_collection(repository_id)
57
-
58
- documents = []
59
- embeddings = []
60
- metadatas = []
61
- ids = []
62
-
63
- for i, chunk in enumerate(embedded_chunks):
64
- chunk_id = f"chunk_{repository_id}_{chunk['chunk_index']}_{i}"
65
-
66
- documents.append(chunk['content'])
67
- embeddings.append(chunk['embedding'])
68
- metadatas.append({
69
- 'file_path': chunk['file_path'],
70
- 'start_line': chunk['start_line'],
71
- 'end_line': chunk['end_line'],
72
- 'chunk_type': chunk['chunk_type'],
73
- 'content_length': chunk['content_length'],
74
- 'repository_id': repository_id
75
- })
76
- ids.append(chunk_id)
77
-
78
- batch_size = 100
79
- total_batches = (len(documents) + batch_size - 1) // batch_size
80
-
81
- for batch_num, i in enumerate(range(0, len(documents), batch_size), 1):
82
- end_idx = min(i + batch_size, len(documents))
83
-
84
- try:
85
- collection.add(
86
- documents=documents[i:end_idx],
87
- embeddings=embeddings[i:end_idx],
88
- metadatas=metadatas[i:end_idx],
89
- ids=ids[i:end_idx]
90
- )
91
-
92
- print(f"βœ… [CHROMA] Stored batch {batch_num}/{total_batches} ({end_idx-i} embeddings)", flush=True)
93
-
94
- except Exception as e:
95
- print(f"❌ [CHROMA] Error storing batch {batch_num}: {e}", flush=True)
96
- raise
97
-
98
- print(f"πŸŽ‰ [CHROMA] Successfully stored all {len(embedded_chunks)} embeddings for repository {repository_id}!", flush=True)
99
- logger.info(f"βœ… Successfully stored all embeddings for repository {repository_id}")
100
 
101
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
102
- collection_name = f"repo_{repository_id}"
103
-
104
- try:
105
- collection = self.client.get_collection(collection_name)
106
- except:
107
- logger.warning(f"⚠️ Collection {collection_name} not found")
108
- return []
109
-
110
- results = collection.query(
111
- query_embeddings=[query_embedding],
112
- n_results=top_k,
113
- include=['documents', 'metadatas', 'distances']
114
- )
115
-
116
- search_results = []
117
- for i in range(len(results['documents'][0])):
118
- # Fix similarity calculation
119
- distance = results['distances'][0][i]
120
- # Convert distance to similarity (higher is better)
121
- similarity = max(0.0, 1.0 - distance) # Ensure positive similarity
122
-
123
- search_results.append({
124
- 'content': results['documents'][0][i],
125
- 'metadata': results['metadatas'][0][i],
126
- 'similarity': similarity,
127
- 'file_path': results['metadatas'][0][i]['file_path'],
128
- 'start_line': results['metadatas'][0][i]['start_line'],
129
- 'end_line': results['metadatas'][0][i]['end_line']
130
- })
131
-
132
- # Sort by similarity (highest first)
133
- search_results.sort(key=lambda x: x['similarity'], reverse=True)
134
-
135
- logger.info(f"πŸ” Found {len(search_results)} similar code chunks")
136
- return search_results
137
 
138
  async def delete_repository_data(self, repository_id: int):
139
- collection_name = f"repo_{repository_id}"
140
-
141
- try:
142
- self.client.delete_collection(collection_name)
143
- print(f"πŸ—‘οΈ [CHROMA] Deleted collection: {collection_name}", flush=True)
144
- logger.info(f"πŸ—‘οΈ Deleted collection: {collection_name}")
145
- except Exception as e:
146
- print(f"⚠️ [CHROMA] Collection {collection_name} not found for deletion: {e}", flush=True)
147
- logger.warning(f"⚠️ Collection {collection_name} not found for deletion")
 
 
 
 
1
  from typing import List, Dict, Optional
2
  import logging
3
+ from .pinecone_service import PineconeService
4
 
5
  logger = logging.getLogger(__name__)
6
 
7
  class VectorService:
8
+ """Vector service that uses Pinecone for production-ready vector storage"""
9
+
10
  def __init__(self):
11
  try:
12
+ print("πŸš€ [VECTOR] Initializing production vector service with Pinecone", flush=True)
13
+ self.pinecone_service = PineconeService()
14
+ print("βœ… [VECTOR] Vector service initialized successfully!", flush=True)
15
+ logger.info("πŸ—„οΈ Vector service initialized with Pinecone")
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  except Exception as e:
18
+ print(f"❌ [VECTOR] Failed to initialize vector service: {e}", flush=True)
19
+ logger.error(f"❌ Failed to initialize vector service: {e}")
20
+ raise Exception(f"Failed to initialize vector service: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
23
+ """Store embeddings using Pinecone"""
24
+ return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
27
+ """Search for similar code using Pinecone"""
28
+ return await self.pinecone_service.search_similar_code(repository_id, query_embedding, top_k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  async def delete_repository_data(self, repository_id: int):
31
+ """Delete repository data using Pinecone"""
32
+ return await self.pinecone_service.delete_repository_data(repository_id)