VcRlAgent committed on
Commit
74b575c
·
1 Parent(s): a59bb17

Debug retriever and search

Browse files
app/routes/ingest_routes.py CHANGED
@@ -48,6 +48,7 @@ async def ingest_data(file: UploadFile = File(...)):
48
 
49
  # Load data from temporary file
50
  records = DataIngestionService.load_data(temp_file_path)
 
51
 
52
  if not records:
53
  raise HTTPException(status_code=400, detail="No records found in file")
 
48
 
49
  # Load data from temporary file
50
  records = DataIngestionService.load_data(temp_file_path)
51
+ logger.debug(f"Loaded {len(records)} records from file")
52
 
53
  if not records:
54
  raise HTTPException(status_code=400, detail="No records found in file")
app/services/embeddings.py CHANGED
@@ -19,8 +19,10 @@ class EmbeddingService:
19
 
20
  def embed_text(self, text: str) -> List[float]:
21
  """Generate embedding for a single text"""
22
- embedding = self.model.encode(text, convert_to_numpy=True)
 
23
  return embedding.tolist()
 
24
 
25
  def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
26
  """Generate embeddings for a batch of texts"""
@@ -29,12 +31,14 @@ class EmbeddingService:
29
  texts,
30
  batch_size=batch_size,
31
  show_progress_bar=True,
32
- convert_to_numpy=True
 
33
  )
34
  return embeddings.tolist()
35
 
36
  def get_dimension(self) -> int:
37
  """Return embedding dimension"""
 
38
  return self.dimension
39
 
40
  # Global instance
 
19
 
20
  def embed_text(self, text: str) -> List[float]:
21
  """Generate embedding for a single text"""
22
+ embedding = self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
23
+ logger.debug(f"Generated embedding for text: {embedding}")
24
  return embedding.tolist()
25
+
26
 
27
  def embed_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
28
  """Generate embeddings for a batch of texts"""
 
31
  texts,
32
  batch_size=batch_size,
33
  show_progress_bar=True,
34
+ convert_to_numpy=True,
35
+ normalize_embeddings=True
36
  )
37
  return embeddings.tolist()
38
 
39
  def get_dimension(self) -> int:
40
  """Return embedding dimension"""
41
+ logger.debug(f"Embedding dimension requested: {self.dimension}")
42
  return self.dimension
43
 
44
  # Global instance
app/services/retriever.py CHANGED
@@ -16,12 +16,15 @@ class RetrieverService:
16
 
17
  def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
18
  """Retrieve relevant documents for a query"""
 
 
19
  if top_k is None:
20
  top_k = settings.TOP_K
21
 
22
  # Generate query embedding
23
  logger.info(f"Retrieving documents for query: {query}")
24
  query_embedding = self.embedding_service.embed_text(query)
 
25
 
26
  #FAISS
27
  results = self.vector_store.search(
@@ -30,6 +33,10 @@ class RetrieverService:
30
  score_threshold=settings.SCORE_THRESHOLD
31
  )
32
 
 
 
 
 
33
  #Qdrant
34
  # Search vector database
35
  # results = self.vector_store.search(
 
16
 
17
  def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
18
  """Retrieve relevant documents for a query"""
19
+ logger.debug(f"top_k: {top_k}")
20
+ logger.debug(f"User Query: {query}")
21
  if top_k is None:
22
  top_k = settings.TOP_K
23
 
24
  # Generate query embedding
25
  logger.info(f"Retrieving documents for query: {query}")
26
  query_embedding = self.embedding_service.embed_text(query)
27
+ logger.debug(f"Embedded query: {query_embedding}")
28
 
29
  #FAISS
30
  results = self.vector_store.search(
 
33
  score_threshold=settings.SCORE_THRESHOLD
34
  )
35
 
36
+ logger.debug(f"FAISS total vectors: {index.ntotal}")
37
+ D, I = self.vector_store.index.search(np.array([query_embedding]).astype("float32"), k=3)
38
+ logger.debug(f"Distances: {D}, Indices: {I}")
39
+
40
  #Qdrant
41
  # Search vector database
42
  # results = self.vector_store.search(
app/utils/vector_store.py DELETED
@@ -1,96 +0,0 @@
1
- """Qdrant vector store service"""
2
- from qdrant_client import QdrantClient
3
- from qdrant_client.http import models
4
- from typing import List, Dict, Any
5
- from app.config import settings
6
- from app.utils.logger import setup_logger
7
-
8
- logger = setup_logger(__name__)
9
-
10
- class VectorStoreService:
11
- """Manages Qdrant vector database operations"""
12
-
13
- def __init__(self):
14
- """Initialize Qdrant client"""
15
- logger.info(f"Connecting to Qdrant at {settings.QDRANT_URL}")
16
- self.client = QdrantClient(
17
- url=settings.QDRANT_URL,
18
- api_key=settings.QDRANT_API_KEY if settings.QDRANT_API_KEY else None
19
- )
20
- self.collection_name = settings.QDRANT_COLLECTION_NAME
21
-
22
- def create_collection(self, vector_size: int):
23
- """Create or recreate the collection"""
24
- try:
25
- # Delete if exists
26
- self.client.delete_collection(collection_name=self.collection_name)
27
- logger.info(f"Deleted existing collection: {self.collection_name}")
28
- except:
29
- pass
30
-
31
- # Create new collection
32
- self.client.create_collection(
33
- collection_name=self.collection_name,
34
- vectors_config=models.VectorParams(
35
- size=vector_size,
36
- distance=models.Distance.COSINE
37
- )
38
- )
39
- logger.info(f"Created collection: {self.collection_name}")
40
-
41
- def upsert_vectors(
42
- self,
43
- vectors: List[List[float]],
44
- payloads: List[Dict[str, Any]]
45
- ) -> int:
46
- """Insert vectors with metadata"""
47
- points = [
48
- models.PointStruct(
49
- id=idx,
50
- vector=vector,
51
- payload=payload
52
- )
53
- for idx, (vector, payload) in enumerate(zip(vectors, payloads))
54
- ]
55
-
56
- self.client.upsert(
57
- collection_name=self.collection_name,
58
- points=points
59
- )
60
-
61
- logger.info(f"Upserted {len(points)} vectors")
62
- return len(points)
63
-
64
- def search(
65
- self,
66
- query_vector: List[float],
67
- limit: int = 5,
68
- score_threshold: float = 0.5
69
- ) -> List[Dict[str, Any]]:
70
- """Search for similar vectors"""
71
- results = self.client.search(
72
- collection_name=self.collection_name,
73
- query_vector=query_vector,
74
- limit=limit,
75
- score_threshold=score_threshold
76
- )
77
-
78
- return [
79
- {
80
- "id": result.id,
81
- "score": result.score,
82
- "payload": result.payload
83
- }
84
- for result in results
85
- ]
86
-
87
- def get_collection_info(self) -> Dict[str, Any]:
88
- """Get collection statistics"""
89
- info = self.client.get_collection(collection_name=self.collection_name)
90
- return {
91
- "vectors_count": info.vectors_count,
92
- "status": info.status
93
- }
94
-
95
- # Global instance
96
- vector_store = VectorStoreService()