garvitcpp committed on
Commit
e85c608
·
verified ·
1 Parent(s): a722b8e

Update app/services/vector_service.py

Browse files
Files changed (1) hide show
  1. app/services/vector_service.py +61 -3
app/services/vector_service.py CHANGED
@@ -23,9 +23,67 @@ class VectorService:
23
  """Store embeddings using Pinecone"""
24
  return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
25
 
26
- async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
27
- """Search for similar code using Pinecone"""
28
- return await self.pinecone_service.search_similar_code(repository_id, query_embedding, top_k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  async def delete_repository_data(self, repository_id: int):
31
  """Delete repository data using Pinecone"""
 
23
  """Store embeddings using Pinecone"""
24
  return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
25
 
26
async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5, query_text: str = "") -> List[Dict]:
    """Search for similar code using Pinecone, optionally with hybrid re-ranking.

    Args:
        repository_id: Identifier forwarded unchanged to the Pinecone service.
        query_embedding: Embedding vector forwarded unchanged to the Pinecone
            service.
        top_k: Maximum number of results to return.
        query_text: Optional raw query text. When it contains non-whitespace
            characters, a larger candidate pool is fetched and re-ranked with
            ``_apply_keyword_boost`` before truncation.

    Returns:
        At most ``top_k`` results. Falsy responses from the Pinecone service
        (empty list or ``None``) are returned as-is.
    """
    use_hybrid = bool(query_text and query_text.strip())

    # Over-fetch only when re-ranking will actually happen; otherwise the
    # caller would receive (and pay for) up to 3x the requested results.
    fetch_k = top_k * 3 if use_hybrid else top_k
    results = await self.pinecone_service.search_similar_code(repository_id, query_embedding, fetch_k)

    if not results:
        return results

    if use_hybrid:
        print(f"🔄 [HYBRID] Applying keyword boost for: '{query_text}'", flush=True)
        results = self._apply_keyword_boost(results, query_text)
        results = results[:top_k]  # Keep only top_k after re-ranking
        print(f"✅ [HYBRID] Re-ranked and returning top {len(results)} results", flush=True)
        return results

    # Honour the top_k contract even if the service returns extra matches.
    return results[:top_k]
40
+
41
def _apply_keyword_boost(self, results: List[Dict], query: str) -> List[Dict]:
    """Apply keyword-based boosting to semantic search results.

    Each result dict is updated in place:

    * ``semantic_score`` receives the original ``similarity`` value
      (a missing or ``None`` value is treated as 0).
    * ``keyword_score`` receives a value in the range 0..1: one point per
      query keyword found in ``content`` plus two points per keyword found
      in ``file_path``, divided by three points per keyword.
    * ``similarity`` is overwritten with ``0.7 * semantic + 0.3 * keyword``.

    Matching is a case-insensitive substring test.

    Args:
        results: Result dicts; the ``content``, ``file_path`` and
            ``similarity`` keys are read when present.
        query: Raw query text from which keywords are extracted.

    Returns:
        The same list object, sorted in place by the new ``similarity`` in
        descending order (ties keep their incoming relative order).
    """
    stop_words = {'the', 'is', 'a', 'an', 'in', 'on', 'at', 'for', 'to', 'of', 'and', 'or', 'how', 'what', 'why', 'where', 'when', 'it', 'this', 'that', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can', 'may', 'might', 'must'}

    # Trim punctuation stuck to word edges ("work?" -> "work") so that such
    # words can match and so "what?" is still recognised as a stop word.
    # Underscores are kept because they are part of identifiers.
    edge_punctuation = ".,;:!?\"'`()[]{}<>"
    tokens = (word.strip(edge_punctuation) for word in (query or "").lower().split())
    # Drop empty tokens: "" is a substring of everything and would inflate scores.
    query_keywords = {token for token in tokens if token and token not in stop_words}

    print(f"🔑 [KEYWORDS] Extracted: {query_keywords}", flush=True)

    # Maximum attainable points: 1 (content) + 2 (path) per keyword.
    max_points = len(query_keywords) * 3

    for result in results:
        # `or ''` also covers keys that are present but hold None.
        content = (result.get('content') or '').lower()
        file_path = (result.get('file_path') or '').lower()

        content_matches = sum(1 for keyword in query_keywords if keyword in content)
        # Path hits are weighted twice as heavily as content hits.
        path_matches = sum(1 for keyword in query_keywords if keyword in file_path)

        if max_points:
            keyword_score = (content_matches + path_matches * 2) / max_points
        else:
            keyword_score = 0.0

        semantic_score = result.get('similarity') or 0.0

        # Hybrid score: 70% semantic + 30% keyword
        hybrid_score = (semantic_score * 0.7) + (keyword_score * 0.3)

        result['similarity'] = hybrid_score
        result['semantic_score'] = semantic_score
        result['keyword_score'] = keyword_score

        print(f"📊 [SCORE] {file_path}: semantic={semantic_score:.3f}, keyword={keyword_score:.3f}, hybrid={hybrid_score:.3f}", flush=True)

    # Re-sort by hybrid score
    results.sort(key=lambda x: x['similarity'], reverse=True)

    return results
87
 
88
  async def delete_repository_data(self, repository_id: int):
89
  """Delete repository data using Pinecone"""