senatus123 committed on
Commit
87ab461
·
verified ·
1 Parent(s): 8d33417

Upload doc_searcher_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. doc_searcher_v2.py +15 -9
doc_searcher_v2.py CHANGED
@@ -4,6 +4,7 @@ from qdrant_client import QdrantClient, models
4
  from reranker import Reranker
5
  from sentence_transformers import SentenceTransformer
6
  from config import DENSE_MODEL, SPARSE_MODEL, QDRANT_URL, QDRANT_API_KEY
 
7
 
8
  class DocSearcherV2:
9
 
@@ -11,7 +12,7 @@ class DocSearcherV2:
11
  self.collection_name = collection_name
12
  self.reranker = Reranker()
13
  self.model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B",device="cpu")
14
- self.sparse_model = SparseTextEmbedding(SPARSE_MODEL)
15
  self.qdrant_client = QdrantClient(QDRANT_URL,api_key=QDRANT_API_KEY,timeout=30)
16
 
17
  async def search_semantic(self, text: str, qdrant_limit: int = 20, top_k: int = 3):
@@ -25,20 +26,15 @@ class DocSearcherV2:
25
  """
26
  queries = [text]
27
  dense_query = self.model.encode(text).tolist()
28
- sparse_query = next(self.sparse_model.query_embed(text))
29
 
30
- # Hybrid search: dense + sparse za bolje rezultate
31
  prefetch = [
32
  models.Prefetch(
33
  query=dense_query,
34
  using="Qwen/Qwen3-Embedding-0.6B",
35
  limit=qdrant_limit
36
  ),
37
- models.Prefetch(
38
- query=models.SparseVector(**sparse_query.as_object()),
39
- using=SPARSE_MODEL,
40
- limit=qdrant_limit
41
- ),
42
  ]
43
 
44
  search_result = self.qdrant_client.query_points(
@@ -80,9 +76,19 @@ class DocSearcherV2:
80
  # Pronađi originalni hit po tekstu
81
  if document_text in text_to_hit:
82
  hit = text_to_hit[document_text]
 
 
 
 
 
 
 
 
 
 
83
  # Vrati kompletan payload sa skorom
84
  result = {
85
- "score": float(score),
86
  "id": str(hit.id),
87
  "text": document_text,
88
  "payload": hit.payload # Kompletan payload sa svim podacima
 
4
  from reranker import Reranker
5
  from sentence_transformers import SentenceTransformer
6
  from config import DENSE_MODEL, SPARSE_MODEL, QDRANT_URL, QDRANT_API_KEY
7
+ import math
8
 
9
  class DocSearcherV2:
10
 
 
12
  self.collection_name = collection_name
13
  self.reranker = Reranker()
14
  self.model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B",device="cpu")
15
+ # self.sparse_model = SparseTextEmbedding(SPARSE_MODEL) # Disabled - not needed without sparse search
16
  self.qdrant_client = QdrantClient(QDRANT_URL,api_key=QDRANT_API_KEY,timeout=30)
17
 
18
  async def search_semantic(self, text: str, qdrant_limit: int = 20, top_k: int = 3):
 
26
  """
27
  queries = [text]
28
  dense_query = self.model.encode(text).tolist()
29
+ # sparse_query = next(self.sparse_model.query_embed(text)) # Disabled - collection not configured
30
 
31
+ # Dense-only search (sparse disabled due to collection config)
32
  prefetch = [
33
  models.Prefetch(
34
  query=dense_query,
35
  using="Qwen/Qwen3-Embedding-0.6B",
36
  limit=qdrant_limit
37
  ),
 
 
 
 
 
38
  ]
39
 
40
  search_result = self.qdrant_client.query_points(
 
76
  # Pronađi originalni hit po tekstu
77
  if document_text in text_to_hit:
78
  hit = text_to_hit[document_text]
79
+
80
+ # Sanitizuj score - osiguraj da je validna float vrijednost za JSON
81
+ score_float = float(score)
82
+ if math.isnan(score_float) or math.isinf(score_float):
83
+ score_float = 0.0 # Default za invalid skorove
84
+ elif score_float < 0:
85
+ score_float = 0.0
86
+ elif score_float > 1:
87
+ score_float = 1.0
88
+
89
  # Vrati kompletan payload sa skorom
90
  result = {
91
+ "score": score_float,
92
  "id": str(hit.id),
93
  "text": document_text,
94
  "payload": hit.payload # Kompletan payload sa svim podacima