garvitcpp committed on
Commit
a3346fc
·
verified ·
1 Parent(s): 99a1431

Update app/services/vector_service.py

Browse files
Files changed (1) hide show
  1. app/services/vector_service.py +5 -56
app/services/vector_service.py CHANGED
@@ -24,64 +24,13 @@ class VectorService:
24
  return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
25
 
26
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5, query_text: str = "") -> List[Dict]:
27
- """Search for similar code using Pinecone with hybrid search"""
28
 
29
- # Get initial results from Pinecone
30
- results = await self.pinecone_service.search_similar_code(repository_id, query_embedding, top_k * 3)
31
 
32
- # If query text provided, apply keyword boosting
33
- if query_text and results:
34
- print(f"🔄 [HYBRID] Applying keyword boost for: '{query_text}'", flush=True)
35
- results = self._apply_keyword_boost(results, query_text)
36
- results = results[:top_k] # Return only top_k after re-ranking
37
- print(f"✅ [HYBRID] Re-ranked and returning top {len(results)} results", flush=True)
38
-
39
- return results
40
-
41
- def _apply_keyword_boost(self, results: List[Dict], query: str) -> List[Dict]:
42
- """Apply keyword-based boosting to semantic search results"""
43
-
44
- # Extract important keywords from query
45
- query_lower = query.lower()
46
- query_words = set(query_lower.split())
47
-
48
- # Remove common stop words
49
- stop_words = {'the', 'is', 'a', 'an', 'in', 'on', 'at', 'for', 'to', 'of', 'and', 'or', 'how', 'what', 'why', 'where', 'when', 'it', 'this', 'that', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can', 'may', 'might', 'must'}
50
- query_keywords = query_words - stop_words
51
-
52
- print(f"🔑 [KEYWORDS] Extracted: {query_keywords}", flush=True)
53
-
54
- for result in results:
55
- content = result.get('content', '').lower()
56
- file_path = result.get('file_path', '').lower()
57
-
58
- # Count keyword matches in content
59
- content_matches = sum(1 for keyword in query_keywords if keyword in content)
60
-
61
- # Count keyword matches in file path (weighted higher)
62
- path_matches = sum(1 for keyword in query_keywords if keyword in file_path)
63
-
64
- # Calculate keyword score (0 to 1)
65
- if query_keywords:
66
- keyword_score = (content_matches + path_matches * 2) / (len(query_keywords) * 3)
67
- else:
68
- keyword_score = 0
69
-
70
- # Original semantic similarity
71
- semantic_score = result.get('similarity', 0)
72
-
73
- # Hybrid score: 70% semantic + 30% keyword
74
- hybrid_score = (semantic_score * 0.7) + (keyword_score * 0.3)
75
-
76
- # Update the result
77
- result['similarity'] = hybrid_score
78
- result['semantic_score'] = semantic_score
79
- result['keyword_score'] = keyword_score
80
-
81
- print(f"📊 [SCORE] {file_path}: semantic={semantic_score:.3f}, keyword={keyword_score:.3f}, hybrid={hybrid_score:.3f}", flush=True)
82
-
83
- # Re-sort by hybrid score
84
- results.sort(key=lambda x: x['similarity'], reverse=True)
85
 
86
  return results
87
 
 
24
  return await self.pinecone_service.store_embeddings(repository_id, embedded_chunks)
25
 
26
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5, query_text: str = "") -> List[Dict]:
27
+ """Search for similar code using Pinecone - returns identifiers only"""
28
 
29
+ # Get results from Pinecone (now returns identifiers only, no content)
30
+ results = await self.pinecone_service.search_similar_code(repository_id, query_embedding, top_k)
31
 
32
+ # Note: Keyword boosting now happens at the PostgreSQL level when fetching content
33
+ # Pinecone handles semantic similarity, PostgreSQL provides full content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  return results
36