garvitcpp committed on
Commit
99a1431
·
verified ·
1 Parent(s): 71c1c20

Update app/services/pinecone_service.py

Browse files
Files changed (1) hide show
  1. app/services/pinecone_service.py +13 -10
app/services/pinecone_service.py CHANGED
@@ -59,7 +59,7 @@ class PineconeService:
59
  raise
60
 
61
  async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
62
- """Store embeddings in Pinecone with repository namespace"""
63
  print(f"πŸ’Ύ [PINECONE] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
64
  logger.info(f"πŸ’Ύ Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
65
 
@@ -68,17 +68,18 @@ class PineconeService:
68
  for i, chunk in enumerate(embedded_chunks):
69
  vector_id = f"repo_{repository_id}_chunk_{chunk['chunk_index']}_{i}"
70
 
 
71
  vector = {
72
  "id": vector_id,
73
  "values": chunk['embedding'],
74
  "metadata": {
75
  "repository_id": repository_id,
76
  "file_path": chunk['file_path'],
 
77
  "start_line": chunk['start_line'],
78
  "end_line": chunk['end_line'],
79
- "chunk_type": chunk['chunk_type'],
80
- "content_length": chunk['content_length'],
81
- "content": chunk['content'][:1000] # Pinecone metadata limit
82
  }
83
  }
84
  vectors.append(vector)
@@ -108,7 +109,7 @@ class PineconeService:
108
  raise
109
 
110
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
111
- """Search for similar code using Pinecone"""
112
  try:
113
  print(f"πŸ” [PINECONE] Searching for {top_k} similar chunks in repository {repository_id}", flush=True)
114
 
@@ -126,16 +127,18 @@ class PineconeService:
126
  similarity = match.score # Cosine similarity (0-1, higher is better)
127
  metadata = match.metadata
128
 
 
129
  search_results.append({
130
- 'content': metadata.get('content', ''),
131
- 'metadata': metadata,
132
- 'similarity': similarity,
133
  'file_path': metadata.get('file_path', ''),
 
134
  'start_line': metadata.get('start_line', 0),
135
- 'end_line': metadata.get('end_line', 0)
 
 
136
  })
137
 
138
- print(f"βœ… [PINECONE] Found {len(search_results)} similar code chunks", flush=True)
139
  logger.info(f"πŸ” Found {len(search_results)} similar code chunks")
140
  return search_results
141
 
 
59
  raise
60
 
61
  async def store_embeddings(self, repository_id: int, embedded_chunks: List[Dict]):
62
+ """Store embeddings in Pinecone with minimal metadata (content stored in PostgreSQL)"""
63
  print(f"πŸ’Ύ [PINECONE] Storing {len(embedded_chunks)} embeddings for repository {repository_id}", flush=True)
64
  logger.info(f"πŸ’Ύ Storing {len(embedded_chunks)} embeddings for repository {repository_id}")
65
 
 
68
  for i, chunk in enumerate(embedded_chunks):
69
  vector_id = f"repo_{repository_id}_chunk_{chunk['chunk_index']}_{i}"
70
 
71
+ # Store ONLY identifiers - full content is in PostgreSQL
72
  vector = {
73
  "id": vector_id,
74
  "values": chunk['embedding'],
75
  "metadata": {
76
  "repository_id": repository_id,
77
  "file_path": chunk['file_path'],
78
+ "chunk_index": chunk['chunk_index'],
79
  "start_line": chunk['start_line'],
80
  "end_line": chunk['end_line'],
81
+ "chunk_type": chunk['chunk_type']
82
+ # NO content field - saves Pinecone storage!
 
83
  }
84
  }
85
  vectors.append(vector)
 
109
  raise
110
 
111
  async def search_similar_code(self, repository_id: int, query_embedding: List[float], top_k: int = 5) -> List[Dict]:
112
+ """Search for similar code using Pinecone - returns identifiers only"""
113
  try:
114
  print(f"πŸ” [PINECONE] Searching for {top_k} similar chunks in repository {repository_id}", flush=True)
115
 
 
127
  similarity = match.score # Cosine similarity (0-1, higher is better)
128
  metadata = match.metadata
129
 
130
+ # Return identifiers to fetch full content from PostgreSQL
131
  search_results.append({
132
+ 'repository_id': metadata.get('repository_id'),
 
 
133
  'file_path': metadata.get('file_path', ''),
134
+ 'chunk_index': metadata.get('chunk_index', 0),
135
  'start_line': metadata.get('start_line', 0),
136
+ 'end_line': metadata.get('end_line', 0),
137
+ 'chunk_type': metadata.get('chunk_type', ''),
138
+ 'similarity': similarity
139
  })
140
 
141
+ print(f"βœ… [PINECONE] Found {len(search_results)} similar code chunks (identifiers only)", flush=True)
142
  logger.info(f"πŸ” Found {len(search_results)} similar code chunks")
143
  return search_results
144