Update services/pdf_service.py
Browse files
- services/pdf_service.py +10 -6
services/pdf_service.py
CHANGED
|
@@ -121,30 +121,34 @@ class PDFService:
|
|
| 121 |
top_k: int = 5,
|
| 122 |
min_score: float = 0.5
|
| 123 |
) -> List[Dict[str, Any]]:
|
| 124 |
-
"""Search indexed PDFs"""
|
| 125 |
print("--------------------------- query ----------------------------------")
|
| 126 |
print(query)
|
| 127 |
if not self.index or not self.chunks:
|
| 128 |
await self.index_pdfs()
|
| 129 |
-
|
| 130 |
try:
|
| 131 |
# Create query embedding
|
| 132 |
query_embedding = self.embedder.encode([query], convert_to_tensor=True)
|
| 133 |
query_embedding_np = query_embedding.cpu().detach().numpy()
|
|
|
|
| 134 |
|
| 135 |
# Search in FAISS index
|
| 136 |
distances, indices = self.index.search(query_embedding_np, top_k)
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# Process results
|
| 139 |
results = []
|
| 140 |
for i, idx in enumerate(indices[0]):
|
| 141 |
if idx >= len(self.chunks):
|
| 142 |
continue # Skip invalid indices
|
| 143 |
-
|
| 144 |
-
score = 1 - distances[0][i] #
|
|
|
|
| 145 |
if score < min_score:
|
| 146 |
continue # Skip low scores
|
| 147 |
-
|
| 148 |
chunk = self.chunks[idx].copy()
|
| 149 |
chunk['score'] = score
|
| 150 |
results.append(chunk)
|
|
@@ -156,7 +160,7 @@ class PDFService:
|
|
| 156 |
print(results)
|
| 157 |
|
| 158 |
return results[:top_k]
|
| 159 |
-
|
| 160 |
except Exception as e:
|
| 161 |
logger.error(f"Error searching PDFs: {e}")
|
| 162 |
raise
|
|
|
|
| 121 |
top_k: int = 5,
|
| 122 |
min_score: float = 0.5
|
| 123 |
) -> List[Dict[str, Any]]:
|
| 124 |
+
"""Search indexed PDFs with debug logs"""
|
| 125 |
print("--------------------------- query ----------------------------------")
|
| 126 |
print(query)
|
| 127 |
if not self.index or not self.chunks:
|
| 128 |
await self.index_pdfs()
|
| 129 |
+
|
| 130 |
try:
|
| 131 |
# Create query embedding
|
| 132 |
query_embedding = self.embedder.encode([query], convert_to_tensor=True)
|
| 133 |
query_embedding_np = query_embedding.cpu().detach().numpy()
|
| 134 |
+
print("Query Embedding Shape:", query_embedding_np.shape)
|
| 135 |
|
| 136 |
# Search in FAISS index
|
| 137 |
distances, indices = self.index.search(query_embedding_np, top_k)
|
| 138 |
+
print("Distances:", distances)
|
| 139 |
+
print("Indices:", indices)
|
| 140 |
|
| 141 |
# Process results
|
| 142 |
results = []
|
| 143 |
for i, idx in enumerate(indices[0]):
|
| 144 |
if idx >= len(self.chunks):
|
| 145 |
continue # Skip invalid indices
|
| 146 |
+
|
| 147 |
+
score = 1 - distances[0][i] # Convert distance to similarity score
|
| 148 |
+
print(f"Chunk Index: {idx}, Distance: {distances[0][i]}, Score: {score}")
|
| 149 |
if score < min_score:
|
| 150 |
continue # Skip low scores
|
| 151 |
+
|
| 152 |
chunk = self.chunks[idx].copy()
|
| 153 |
chunk['score'] = score
|
| 154 |
results.append(chunk)
|
|
|
|
| 160 |
print(results)
|
| 161 |
|
| 162 |
return results[:top_k]
|
| 163 |
+
|
| 164 |
except Exception as e:
|
| 165 |
logger.error(f"Error searching PDFs: {e}")
|
| 166 |
raise
|