garvitcpp committed on
Commit
5b86222
·
verified ·
1 Parent(s): 5f1d522

Create pinecone_service.py

Browse files
Files changed (1) hide show
  1. app/services/pinecone_service.py +160 -0
app/services/pinecone_service.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pinecone import Pinecone, ServerlessSpec
2
+ from typing import List, Dict, Optional
3
+ import logging
4
+ import os
5
+ from app.core.config import settings
6
+
7
logger = logging.getLogger(__name__)


class PineconeService:
    """Wrapper around a single Pinecone index that stores code-chunk embeddings.

    Each repository's vectors live in their own namespace,
    ``repo_<repository_id>``, inside the index named by
    ``settings.pinecone_index_name``.

    NOTE: the ``async`` methods call the Pinecone client synchronously
    (nothing is awaited), so a slow Pinecone call holds up the caller's
    event loop for its duration.
    """

    # Parameters used only when the index has to be created.
    EMBEDDING_DIMENSION = 384  # all-MiniLM-L6-v2 embedding dimension
    SIMILARITY_METRIC = 'cosine'
    SERVERLESS_CLOUD = 'aws'
    SERVERLESS_REGION = 'us-east-1'

    # Number of vectors sent per upsert request.
    UPSERT_BATCH_SIZE = 100
    # Chunk content is truncated to this many characters before being stored
    # as metadata, to keep the per-vector metadata payload small.
    MAX_CONTENT_CHARS = 1000

    def __init__(self):
        """Create the Pinecone client and connect to the configured index.

        The index is created first if it does not exist yet.

        Raises:
            Exception: if ``settings.pinecone_api_key`` is empty or any
                Pinecone call fails. The underlying error is chained as
                ``__cause__``.
        """
        try:
            print("🔧 [PINECONE] Initializing Pinecone client...", flush=True)

            if not settings.pinecone_api_key:
                raise Exception("PINECONE_API_KEY environment variable is required")

            # Initialize Pinecone client
            self.pc = Pinecone(api_key=settings.pinecone_api_key)

            # Check if index exists, create if not
            self.index_name = settings.pinecone_index_name
            self._ensure_index_exists()

            # Connect to index
            self.index = self.pc.Index(self.index_name)

            print(f"✅ [PINECONE] Connected to index: {self.index_name}", flush=True)
            logger.info(f"🎯 Pinecone service initialized with index: {self.index_name}")

        except Exception as e:
            print(f"❌ [PINECONE] Failed to initialize: {e}", flush=True)
            logger.error(f"❌ Failed to initialize Pinecone: {e}")
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Failed to initialize Pinecone: {e}") from e

    @staticmethod
    def _namespace(repository_id: int) -> str:
        """Return the Pinecone namespace that holds one repository's vectors."""
        return f"repo_{repository_id}"

    def _ensure_index_exists(self):
        """Create the index named ``self.index_name`` if it is not listed yet.

        Raises:
            Exception: any error from listing or creating the index is
                printed and re-raised unchanged.
        """
        try:
            existing_indexes = [index.name for index in self.pc.list_indexes()]

            if self.index_name not in existing_indexes:
                print(f"🆕 [PINECONE] Creating new index: {self.index_name}", flush=True)

                self.pc.create_index(
                    name=self.index_name,
                    dimension=self.EMBEDDING_DIMENSION,
                    metric=self.SIMILARITY_METRIC,
                    spec=ServerlessSpec(
                        cloud=self.SERVERLESS_CLOUD,
                        region=self.SERVERLESS_REGION,
                    ),
                )

                print(f"✅ [PINECONE] Index created successfully: {self.index_name}", flush=True)
            else:
                print(f"📚 [PINECONE] Using existing index: {self.index_name}", flush=True)

        except Exception as e:
            print(f"❌ [PINECONE] Error with index: {e}", flush=True)
            raise

    def _build_vector(self, repository_id: int, position: int, chunk: Dict) -> Dict:
        """Convert one embedded chunk into a Pinecone upsert record.

        ``position`` is the chunk's index in the input list; it is appended
        to the id so two chunks sharing a ``chunk_index`` do not collide.

        Raises:
            KeyError: if ``chunk`` lacks any of the expected keys.
        """
        return {
            "id": f"repo_{repository_id}_chunk_{chunk['chunk_index']}_{position}",
            "values": chunk['embedding'],
            "metadata": {
                "repository_id": repository_id,
                "file_path": chunk['file_path'],
                "start_line": chunk['start_line'],
                "end_line": chunk['end_line'],
                "chunk_type": chunk['chunk_type'],
                "content_length": chunk['content_length'],
                "content": chunk['content'][:self.MAX_CONTENT_CHARS],
            },
        }

    async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
        """Upsert all embedded chunks into the repository's namespace.

        Args:
            repository_id: Repository whose namespace receives the vectors.
            embedded_chunks: Dicts with the keys ``chunk_index``,
                ``embedding``, ``file_path``, ``start_line``, ``end_line``,
                ``chunk_type``, ``content_length`` and ``content``.

        Raises:
            Exception: any error while building or upserting the vectors is
                logged and re-raised unchanged.
        """
        print(f"💾 [PINECONE] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
        logger.info(f"💾 Storing {len(embedded_chunks)} embeddings for repository {repository_id}")

        try:
            vectors = [
                self._build_vector(repository_id, position, chunk)
                for position, chunk in enumerate(embedded_chunks)
            ]

            batch_size = self.UPSERT_BATCH_SIZE
            total_batches = (len(vectors) + batch_size - 1) // batch_size  # ceiling division
            namespace = self._namespace(repository_id)

            for batch_num, start in enumerate(range(0, len(vectors), batch_size), 1):
                batch_vectors = vectors[start:start + batch_size]

                # Upsert to Pinecone
                self.index.upsert(vectors=batch_vectors, namespace=namespace)

                print(f"✅ [PINECONE] Stored batch {batch_num}/{total_batches} ({len(batch_vectors)} vectors)", flush=True)

            print(f"🎉 [PINECONE] Successfully stored all {len(embedded_chunks)} embeddings for repository {repository_id}!", flush=True)
            logger.info(f"✅ Successfully stored all embeddings for repository {repository_id}")

        except Exception as e:
            print(f"❌ [PINECONE] Error storing embeddings: {e}", flush=True)
            logger.error(f"❌ Error storing embeddings in Pinecone: {e}")
            raise

    async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
        """Return the ``top_k`` stored chunks most similar to ``query_embedding``.

        Args:
            repository_id: Repository whose namespace is searched.
            query_embedding: Query vector.
            top_k: Maximum number of matches to request.

        Returns:
            One dict per match with the keys ``content``, ``metadata``,
            ``similarity``, ``file_path``, ``start_line`` and ``end_line``.
            Best-effort: on any error the failure is logged and an empty
            list is returned instead of raising.
        """
        try:
            print(f"🔍 [PINECONE] Searching for {top_k} similar chunks in repository {repository_id}", flush=True)

            # Query Pinecone with repository namespace
            results = self.index.query(
                vector=query_embedding,
                top_k=top_k,
                namespace=self._namespace(repository_id),
                include_metadata=True,
                include_values=False,
            )

            search_results = []
            for match in results.matches:
                # A match without metadata must not abort the whole search:
                # the broad except below would turn it into an empty result.
                metadata = match.metadata or {}

                search_results.append({
                    'content': metadata.get('content', ''),
                    'metadata': metadata,
                    # Cosine similarity score: range -1 to 1, higher is better.
                    'similarity': match.score,
                    'file_path': metadata.get('file_path', ''),
                    'start_line': metadata.get('start_line', 0),
                    'end_line': metadata.get('end_line', 0),
                })

            print(f"✅ [PINECONE] Found {len(search_results)} similar code chunks", flush=True)
            logger.info(f"🔍 Found {len(search_results)} similar code chunks")
            return search_results

        except Exception as e:
            print(f"❌ [PINECONE] Error searching: {e}", flush=True)
            logger.error(f"❌ Error searching in Pinecone: {e}")
            return []

    async def delete_repository_data(self, repository_id: int):
        """Delete every vector in the repository's namespace.

        Best-effort: failures are logged as warnings and not raised.
        """
        try:
            # Delete all vectors in the namespace
            self.index.delete(delete_all=True, namespace=self._namespace(repository_id))

            print(f"🗑️ [PINECONE] Deleted all data for repository {repository_id}", flush=True)
            logger.info(f"🗑️ Deleted all data for repository {repository_id}")

        except Exception as e:
            print(f"⚠️ [PINECONE] Error deleting repository data: {e}", flush=True)
            logger.warning(f"⚠️ Error deleting repository data: {e}")