Spaces:
Running
Running
SUBHRAJIT MOHANTY
committed on
Commit
·
5d2f302
1
Parent(s):
8a3e144
app.py updated
Browse files
app.py
CHANGED
|
@@ -65,8 +65,8 @@ class Config:
|
|
| 65 |
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
|
| 66 |
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "documents")
|
| 67 |
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 68 |
-
TOP_K = int(os.getenv("TOP_K", "5"))
|
| 69 |
-
SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.7"))
|
| 70 |
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
|
| 71 |
|
| 72 |
class ApplicationState:
|
|
@@ -299,9 +299,13 @@ class DocumentManager:
|
|
| 299 |
try:
|
| 300 |
await self._ensure_collection_exists()
|
| 301 |
|
|
|
|
|
|
|
| 302 |
# Generate query embedding
|
| 303 |
query_embedding = await self.embedding_service.get_query_embedding(query)
|
| 304 |
|
|
|
|
|
|
|
| 305 |
# Search in Qdrant
|
| 306 |
search_results = await self.qdrant_client.search(
|
| 307 |
collection_name=self.collection_name,
|
|
@@ -310,22 +314,29 @@ class DocumentManager:
|
|
| 310 |
score_threshold=min_score
|
| 311 |
)
|
| 312 |
|
|
|
|
|
|
|
| 313 |
# Format results
|
| 314 |
results = []
|
| 315 |
-
for result in search_results:
|
|
|
|
|
|
|
|
|
|
| 316 |
results.append({
|
| 317 |
"score": result.score,
|
| 318 |
-
"text":
|
| 319 |
"file_path": result.payload.get("file_path", ""),
|
| 320 |
"document_id": result.payload.get("document_id", ""),
|
| 321 |
"chunk_index": result.payload.get("chunk_index", 0)
|
| 322 |
})
|
| 323 |
|
| 324 |
-
print(f"✓ Found {len(results)} results for query: '{query}'")
|
| 325 |
return results
|
| 326 |
|
| 327 |
except Exception as e:
|
| 328 |
print(f"Error searching: {e}")
|
|
|
|
|
|
|
| 329 |
return []
|
| 330 |
|
| 331 |
async def list_documents(self) -> List[Dict[str, Any]]:
|
|
@@ -409,13 +420,31 @@ class RAGService:
|
|
| 409 |
print("Error: Document manager is not initialized")
|
| 410 |
return []
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
# Use the document manager's search functionality
|
| 413 |
results = await app_state.document_manager.search_documents(
|
| 414 |
query=query,
|
| 415 |
limit=top_k,
|
| 416 |
-
min_score=
|
| 417 |
)
|
| 418 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
return results
|
| 420 |
|
| 421 |
except Exception as e:
|
|
|
|
| 65 |
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
|
| 66 |
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "documents")
|
| 67 |
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 68 |
+
TOP_K = int(os.getenv("TOP_K", "10")) # Increased from 5
|
| 69 |
+
SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.1")) # Lowered from 0.7
|
| 70 |
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
|
| 71 |
|
| 72 |
class ApplicationState:
|
|
|
|
| 299 |
try:
|
| 300 |
await self._ensure_collection_exists()
|
| 301 |
|
| 302 |
+
print(f"Document Search - Query: '{query}', Limit: {limit}, Min Score: {min_score}")
|
| 303 |
+
|
| 304 |
# Generate query embedding
|
| 305 |
query_embedding = await self.embedding_service.get_query_embedding(query)
|
| 306 |
|
| 307 |
+
print(f"Document Search - Generated embedding vector of size: {len(query_embedding)}")
|
| 308 |
+
|
| 309 |
# Search in Qdrant
|
| 310 |
search_results = await self.qdrant_client.search(
|
| 311 |
collection_name=self.collection_name,
|
|
|
|
| 314 |
score_threshold=min_score
|
| 315 |
)
|
| 316 |
|
| 317 |
+
print(f"Document Search - Qdrant returned {len(search_results)} results")
|
| 318 |
+
|
| 319 |
# Format results
|
| 320 |
results = []
|
| 321 |
+
for i, result in enumerate(search_results):
|
| 322 |
+
content = result.payload.get("content", result.payload.get("chunk_text", ""))
|
| 323 |
+
print(f"Document Search - Result {i+1}: Score={result.score:.4f}, Content preview: {content[:100]}...")
|
| 324 |
+
|
| 325 |
results.append({
|
| 326 |
"score": result.score,
|
| 327 |
+
"text": content,
|
| 328 |
"file_path": result.payload.get("file_path", ""),
|
| 329 |
"document_id": result.payload.get("document_id", ""),
|
| 330 |
"chunk_index": result.payload.get("chunk_index", 0)
|
| 331 |
})
|
| 332 |
|
| 333 |
+
print(f"✓ Document Search - Found {len(results)} results for query: '{query}'")
|
| 334 |
return results
|
| 335 |
|
| 336 |
except Exception as e:
|
| 337 |
print(f"Error searching: {e}")
|
| 338 |
+
import traceback
|
| 339 |
+
traceback.print_exc()
|
| 340 |
return []
|
| 341 |
|
| 342 |
async def list_documents(self) -> List[Dict[str, Any]]:
|
|
|
|
| 420 |
print("Error: Document manager is not initialized")
|
| 421 |
return []
|
| 422 |
|
| 423 |
+
# Use a lower similarity threshold for RAG to get more results
|
| 424 |
+
# Try multiple thresholds if needed
|
| 425 |
+
min_score = 0.1 # Lower threshold for RAG
|
| 426 |
+
|
| 427 |
+
print(f"RAG Search - Query: '{query}', Limit: {top_k}, Min Score: {min_score}")
|
| 428 |
+
|
| 429 |
# Use the document manager's search functionality
|
| 430 |
results = await app_state.document_manager.search_documents(
|
| 431 |
query=query,
|
| 432 |
limit=top_k,
|
| 433 |
+
min_score=min_score
|
| 434 |
)
|
| 435 |
|
| 436 |
+
print(f"RAG Search - Found {len(results)} results")
|
| 437 |
+
|
| 438 |
+
# If no results with low threshold, try even lower
|
| 439 |
+
if not results:
|
| 440 |
+
print("No results with min_score=0.1, trying with min_score=0.0")
|
| 441 |
+
results = await app_state.document_manager.search_documents(
|
| 442 |
+
query=query,
|
| 443 |
+
limit=top_k,
|
| 444 |
+
min_score=0.0
|
| 445 |
+
)
|
| 446 |
+
print(f"RAG Search - Found {len(results)} results with min_score=0.0")
|
| 447 |
+
|
| 448 |
return results
|
| 449 |
|
| 450 |
except Exception as e:
|