Update app/services/pinecone_service.py
Browse files
- app/services/pinecone_service.py +13 -10
app/services/pinecone_service.py
CHANGED
|
@@ -59,7 +59,7 @@ class PineconeService:
|
|
| 59 |
raise
|
| 60 |
|
| 61 |
async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
|
| 62 |
-
"""Store embeddings in Pinecone with
|
| 63 |
print(f"💾 [PINECONE] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
|
| 64 |
logger.info(f"💾 Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
|
| 65 |
|
|
@@ -68,17 +68,18 @@ class PineconeService:
|
|
| 68 |
for i, chunk in enumerate(embedded_chunks):
|
| 69 |
vector_id = f"repo_{repository_id}_chunk_{chunk['chunk_index']}_{i}"
|
| 70 |
|
|
|
|
| 71 |
vector = {
|
| 72 |
"id": vector_id,
|
| 73 |
"values": chunk['embedding'],
|
| 74 |
"metadata": {
|
| 75 |
"repository_id": repository_id,
|
| 76 |
"file_path": chunk['file_path'],
|
|
|
|
| 77 |
"start_line": chunk['start_line'],
|
| 78 |
"end_line": chunk['end_line'],
|
| 79 |
-
"chunk_type": chunk['chunk_type']
|
| 80 |
-
|
| 81 |
-
"content": chunk['content'][:1000] # Pinecone metadata limit
|
| 82 |
}
|
| 83 |
}
|
| 84 |
vectors.append(vector)
|
|
@@ -108,7 +109,7 @@ class PineconeService:
|
|
| 108 |
raise
|
| 109 |
|
| 110 |
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
|
| 111 |
-
"""Search for similar code using Pinecone"""
|
| 112 |
try:
|
| 113 |
print(f"π [PINECONE] Searching for {top_k} similar chunks in repository {repository_id}", flush=True)
|
| 114 |
|
|
@@ -126,16 +127,18 @@ class PineconeService:
|
|
| 126 |
similarity = match.score # Cosine similarity (0-1, higher is better)
|
| 127 |
metadata = match.metadata
|
| 128 |
|
|
|
|
| 129 |
search_results.append({
|
| 130 |
-
'
|
| 131 |
-
'metadata': metadata,
|
| 132 |
-
'similarity': similarity,
|
| 133 |
'file_path': metadata.get('file_path', ''),
|
|
|
|
| 134 |
'start_line': metadata.get('start_line', 0),
|
| 135 |
-
'end_line': metadata.get('end_line', 0)
|
|
|
|
|
|
|
| 136 |
})
|
| 137 |
|
| 138 |
-
print(f"✅ [PINECONE] Found {len(search_results)} similar code chunks", flush=True)
|
| 139 |
logger.info(f"π Found {len(search_results)} similar code chunks")
|
| 140 |
return search_results
|
| 141 |
|
|
|
|
| 59 |
raise
|
| 60 |
|
| 61 |
async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
|
| 62 |
+
"""Store embeddings in Pinecone with minimal metadata (content stored in PostgreSQL)"""
|
| 63 |
print(f"💾 [PINECONE] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
|
| 64 |
logger.info(f"💾 Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
|
| 65 |
|
|
|
|
| 68 |
for i, chunk in enumerate(embedded_chunks):
|
| 69 |
vector_id = f"repo_{repository_id}_chunk_{chunk['chunk_index']}_{i}"
|
| 70 |
|
| 71 |
+
# Store ONLY identifiers - full content is in PostgreSQL
|
| 72 |
vector = {
|
| 73 |
"id": vector_id,
|
| 74 |
"values": chunk['embedding'],
|
| 75 |
"metadata": {
|
| 76 |
"repository_id": repository_id,
|
| 77 |
"file_path": chunk['file_path'],
|
| 78 |
+
"chunk_index": chunk['chunk_index'],
|
| 79 |
"start_line": chunk['start_line'],
|
| 80 |
"end_line": chunk['end_line'],
|
| 81 |
+
"chunk_type": chunk['chunk_type']
|
| 82 |
+
# NO content field - saves Pinecone storage!
|
|
|
|
| 83 |
}
|
| 84 |
}
|
| 85 |
vectors.append(vector)
|
|
|
|
| 109 |
raise
|
| 110 |
|
| 111 |
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
|
| 112 |
+
"""Search for similar code using Pinecone - returns identifiers only"""
|
| 113 |
try:
|
| 114 |
print(f"π [PINECONE] Searching for {top_k} similar chunks in repository {repository_id}", flush=True)
|
| 115 |
|
|
|
|
| 127 |
similarity = match.score # Cosine similarity (0-1, higher is better)
|
| 128 |
metadata = match.metadata
|
| 129 |
|
| 130 |
+
# Return identifiers to fetch full content from PostgreSQL
|
| 131 |
search_results.append({
|
| 132 |
+
'repository_id': metadata.get('repository_id'),
|
|
|
|
|
|
|
| 133 |
'file_path': metadata.get('file_path', ''),
|
| 134 |
+
'chunk_index': metadata.get('chunk_index', 0),
|
| 135 |
'start_line': metadata.get('start_line', 0),
|
| 136 |
+
'end_line': metadata.get('end_line', 0),
|
| 137 |
+
'chunk_type': metadata.get('chunk_type', ''),
|
| 138 |
+
'similarity': similarity
|
| 139 |
})
|
| 140 |
|
| 141 |
+
print(f"✅ [PINECONE] Found {len(search_results)} similar code chunks (identifiers only)", flush=True)
|
| 142 |
logger.info(f"π Found {len(search_results)} similar code chunks")
|
| 143 |
return search_results
|
| 144 |
|