Update app/services/vector_service.py
Browse files
app/services/vector_service.py
CHANGED
|
@@ -23,9 +23,67 @@ class VectorService:
|
|
| 23 |
"""Store embeddings using Pinecone"""
|
| 24 |
return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
|
| 25 |
|
| 26 |
-
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
|
| 27 |
-
"""Search for similar code using Pinecone"""
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
async def delete_repository_data(self, repository_id: int):
|
| 31 |
"""Delete repository data using Pinecone"""
|
|
|
|
| 23 |
"""Store embeddings using Pinecone"""
|
| 24 |
return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
|
| 25 |
|
| 26 |
+
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5, query_text: str = "") -> List[Dict]:
    """Search for similar code using Pinecone, optionally with hybrid re-ranking.

    Args:
        repository_id: Identifier forwarded to the Pinecone service.
        query_embedding: Embedding vector of the query.
        top_k: Maximum number of results to return.
        query_text: Raw query text. When non-empty, a larger candidate pool
            is fetched and re-ranked with ``_apply_keyword_boost``.

    Returns:
        At most ``top_k`` results. If the Pinecone service returns an empty
        (falsy) value, that value is passed through unchanged.
    """
    # Over-fetch only when re-ranking will actually run, so keyword-boosted
    # candidates outside the plain top_k have a chance to surface.
    fetch_k = top_k * 3 if query_text else top_k
    results = await self.pinecone_service.search_similar_code(repository_id, query_embedding, fetch_k)

    if not results:
        return results

    if query_text:
        print(f"[HYBRID] Applying keyword boost for: '{query_text}'", flush=True)
        results = self._apply_keyword_boost(results, query_text)

    # Truncate on every path: callers asked for top_k, never the candidate pool.
    results = results[:top_k]

    if query_text:
        print(f"[HYBRID] Re-ranked and returning top {len(results)} results", flush=True)

    return results
|
| 40 |
+
|
| 41 |
+
def _apply_keyword_boost(self, results: List[Dict], query: str) -> List[Dict]:
|
| 42 |
+
"""Apply keyword-based boosting to semantic search results"""
|
| 43 |
+
|
| 44 |
+
# Extract important keywords from query
|
| 45 |
+
query_lower = query.lower()
|
| 46 |
+
query_words = set(query_lower.split())
|
| 47 |
+
|
| 48 |
+
# Remove common stop words
|
| 49 |
+
stop_words = {'the', 'is', 'a', 'an', 'in', 'on', 'at', 'for', 'to', 'of', 'and', 'or', 'how', 'what', 'why', 'where', 'when', 'it', 'this', 'that', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can', 'may', 'might', 'must'}
|
| 50 |
+
query_keywords = query_words - stop_words
|
| 51 |
+
|
| 52 |
+
print(f"π [KEYWORDS] Extracted: {query_keywords}", flush=True)
|
| 53 |
+
|
| 54 |
+
for result in results:
|
| 55 |
+
content = result.get('content', '').lower()
|
| 56 |
+
file_path = result.get('file_path', '').lower()
|
| 57 |
+
|
| 58 |
+
# Count keyword matches in content
|
| 59 |
+
content_matches = sum(1 for keyword in query_keywords if keyword in content)
|
| 60 |
+
|
| 61 |
+
# Count keyword matches in file path (weighted higher)
|
| 62 |
+
path_matches = sum(1 for keyword in query_keywords if keyword in file_path)
|
| 63 |
+
|
| 64 |
+
# Calculate keyword score (0 to 1)
|
| 65 |
+
if query_keywords:
|
| 66 |
+
keyword_score = (content_matches + path_matches * 2) / (len(query_keywords) * 3)
|
| 67 |
+
else:
|
| 68 |
+
keyword_score = 0
|
| 69 |
+
|
| 70 |
+
# Original semantic similarity
|
| 71 |
+
semantic_score = result.get('similarity', 0)
|
| 72 |
+
|
| 73 |
+
# Hybrid score: 70% semantic + 30% keyword
|
| 74 |
+
hybrid_score = (semantic_score * 0.7) + (keyword_score * 0.3)
|
| 75 |
+
|
| 76 |
+
# Update the result
|
| 77 |
+
result['similarity'] = hybrid_score
|
| 78 |
+
result['semantic_score'] = semantic_score
|
| 79 |
+
result['keyword_score'] = keyword_score
|
| 80 |
+
|
| 81 |
+
print(f"π [SCORE] {file_path}: semantic={semantic_score:.3f}, keyword={keyword_score:.3f}, hybrid={hybrid_score:.3f}", flush=True)
|
| 82 |
+
|
| 83 |
+
# Re-sort by hybrid score
|
| 84 |
+
results.sort(key=lambda x: x['similarity'], reverse=True)
|
| 85 |
+
|
| 86 |
+
return results
|
| 87 |
|
| 88 |
async def delete_repository_data(self, repository_id: int):
|
| 89 |
"""Delete repository data using Pinecone"""
|