Update app/services/vector_service.py
Browse files
app/services/vector_service.py
CHANGED
|
@@ -24,64 +24,13 @@ class VectorService:
|
|
| 24 |
return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
|
| 25 |
|
| 26 |
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5, query_text: str = "") -> List[Dict]:
|
| 27 |
-
"""Search for similar code using Pinecone
|
| 28 |
|
| 29 |
-
# Get
|
| 30 |
-
results = await self.pinecone_service.search_similar_code(repository_id, query_embedding, top_k
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
print(f"π [HYBRID] Applying keyword boost for: '{query_text}'", flush=True)
|
| 35 |
-
results = self._apply_keyword_boost(results, query_text)
|
| 36 |
-
results = results[:top_k] # Return only top_k after re-ranking
|
| 37 |
-
print(f"β
[HYBRID] Re-ranked and returning top {len(results)} results", flush=True)
|
| 38 |
-
|
| 39 |
-
return results
|
| 40 |
-
|
| 41 |
-
def _apply_keyword_boost(self, results: List[Dict], query: str) -> List[Dict]:
|
| 42 |
-
"""Apply keyword-based boosting to semantic search results"""
|
| 43 |
-
|
| 44 |
-
# Extract important keywords from query
|
| 45 |
-
query_lower = query.lower()
|
| 46 |
-
query_words = set(query_lower.split())
|
| 47 |
-
|
| 48 |
-
# Remove common stop words
|
| 49 |
-
stop_words = {'the', 'is', 'a', 'an', 'in', 'on', 'at', 'for', 'to', 'of', 'and', 'or', 'how', 'what', 'why', 'where', 'when', 'it', 'this', 'that', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can', 'may', 'might', 'must'}
|
| 50 |
-
query_keywords = query_words - stop_words
|
| 51 |
-
|
| 52 |
-
print(f"π [KEYWORDS] Extracted: {query_keywords}", flush=True)
|
| 53 |
-
|
| 54 |
-
for result in results:
|
| 55 |
-
content = result.get('content', '').lower()
|
| 56 |
-
file_path = result.get('file_path', '').lower()
|
| 57 |
-
|
| 58 |
-
# Count keyword matches in content
|
| 59 |
-
content_matches = sum(1 for keyword in query_keywords if keyword in content)
|
| 60 |
-
|
| 61 |
-
# Count keyword matches in file path (weighted higher)
|
| 62 |
-
path_matches = sum(1 for keyword in query_keywords if keyword in file_path)
|
| 63 |
-
|
| 64 |
-
# Calculate keyword score (0 to 1)
|
| 65 |
-
if query_keywords:
|
| 66 |
-
keyword_score = (content_matches + path_matches * 2) / (len(query_keywords) * 3)
|
| 67 |
-
else:
|
| 68 |
-
keyword_score = 0
|
| 69 |
-
|
| 70 |
-
# Original semantic similarity
|
| 71 |
-
semantic_score = result.get('similarity', 0)
|
| 72 |
-
|
| 73 |
-
# Hybrid score: 70% semantic + 30% keyword
|
| 74 |
-
hybrid_score = (semantic_score * 0.7) + (keyword_score * 0.3)
|
| 75 |
-
|
| 76 |
-
# Update the result
|
| 77 |
-
result['similarity'] = hybrid_score
|
| 78 |
-
result['semantic_score'] = semantic_score
|
| 79 |
-
result['keyword_score'] = keyword_score
|
| 80 |
-
|
| 81 |
-
print(f"π [SCORE] {file_path}: semantic={semantic_score:.3f}, keyword={keyword_score:.3f}, hybrid={hybrid_score:.3f}", flush=True)
|
| 82 |
-
|
| 83 |
-
# Re-sort by hybrid score
|
| 84 |
-
results.sort(key=lambda x: x['similarity'], reverse=True)
|
| 85 |
|
| 86 |
return results
|
| 87 |
|
|
|
|
| 24 |
return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
|
| 25 |
|
| 26 |
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5, query_text: str = "") -> List[Dict]:
|
| 27 |
+
"""Search for similar code using Pinecone - returns identifiers only"""
|
| 28 |
|
| 29 |
+
# Get results from Pinecone (now returns identifiers only, no content)
|
| 30 |
+
results = await self.pinecone_service.search_similar_code(repository_id, query_embedding, top_k)
|
| 31 |
|
| 32 |
+
# Note: Keyword boosting now happens at the PostgreSQL level when fetching content
|
| 33 |
+
# Pinecone handles semantic similarity, PostgreSQL provides full content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
return results
|
| 36 |
|