minh-4T committed on
Commit
f42dd10
·
1 Parent(s): d942ae3

Update & change payload

Browse files
core/collection_router_retriever.py CHANGED
@@ -1,8 +1,10 @@
1
  import hashlib
2
  import logging
3
- from typing import List
 
4
 
5
  from langchain_core.documents import Document as LangChainDocument
 
6
 
7
  from .collection_utils import collection_matches_year
8
  from .document_db import SessionLocal, list_active_collection_names
@@ -10,6 +12,47 @@ from .document_db import SessionLocal, list_active_collection_names
10
  logger = logging.getLogger(__name__)
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class CollectionRouterRetriever:
14
  def __init__(
15
  self,
@@ -61,7 +104,7 @@ class CollectionRouterRetriever:
61
 
62
  return active_collections[: self.top_n_collections]
63
 
64
- def _search_target_collections(self, query: str, collections: List[str], limit: int) -> List:
65
  if not collections:
66
  return []
67
 
@@ -71,6 +114,11 @@ class CollectionRouterRetriever:
71
  logger.exception("Failed to embed query for collection routing")
72
  return []
73
 
 
 
 
 
 
74
  scored_docs = []
75
  for collection_name in collections:
76
  try:
@@ -79,9 +127,10 @@ class CollectionRouterRetriever:
79
  query_vector=query_vector,
80
  limit=limit,
81
  with_payload=True,
 
82
  )
83
- except Exception:
84
- logger.exception("Qdrant search failed for collection=%s", collection_name)
85
  continue
86
 
87
  for point in points:
@@ -95,9 +144,11 @@ class CollectionRouterRetriever:
95
  "source_file": payload.get("filename") or payload.get("stored_name") or "",
96
  "source_relpath": payload.get("object_path") or payload.get("path") or "",
97
  "object_path": payload.get("object_path") or "",
 
98
  "folder_key": payload.get("folder_key") or "",
99
  "collection_name": collection_name,
100
  "academic_year": payload.get("academic_year") or "",
 
101
  "chunk_index": payload.get("chunk_index"),
102
  "page_number": payload.get("page_number"),
103
  }
@@ -126,6 +177,7 @@ class CollectionRouterRetriever:
126
  query=query,
127
  collections=target_collections,
128
  limit=candidate_k,
 
129
  )
130
 
131
  if year_scoped:
 
1
  import hashlib
2
  import logging
3
+ import re
4
+ from typing import List, Optional
5
 
6
  from langchain_core.documents import Document as LangChainDocument
7
+ from qdrant_client.models import Filter, FieldCondition, HasIdCondition, MatchAny
8
 
9
  from .collection_utils import collection_matches_year
10
  from .document_db import SessionLocal, list_active_collection_names
 
12
  logger = logging.getLogger(__name__)
13
 
14
 
15
def _build_year_filter(year_scope: Optional[str]) -> Optional[Filter]:
    """Build a Qdrant payload filter from a year-scope string.

    ``year_scope`` is either a single year (``"2023"``) or a dash-separated
    range (``"2023-2024"``). Every dash-separated part that parses as an
    integer becomes a target year; parts that do not parse are ignored.

    Args:
        year_scope: The year scope to filter on, or ``None``/empty for no
            filtering.

    Returns:
        A ``Filter`` requiring the ``years`` payload field to match any of
        the target years, or ``None`` when ``year_scope`` is empty, contains
        no parseable year, or the filter could not be constructed.
    """
    if not year_scope:
        return None

    # str.split returns the whole string as one part when there is no "-",
    # so a single loop covers both the "2023" and the "2023-2024" forms.
    year_targets = []
    for part in year_scope.strip().split("-"):
        try:
            year_targets.append(int(part.strip()))
        except ValueError:
            # Deliberately lenient: non-numeric fragments are skipped.
            continue

    if not year_targets:
        return None

    try:
        return Filter(
            must=[
                FieldCondition(
                    key="years",
                    match=MatchAny(any=year_targets),
                )
            ]
        )
    except Exception:
        # Best-effort fallback kept from the original (search proceeds
        # unfiltered), but no longer silent: an unfiltered search is a
        # behaviour change the operator should be able to see in the logs.
        logger.exception("Failed to build Qdrant year filter for year_scope=%s", year_scope)
        return None
54
+
55
+
56
  class CollectionRouterRetriever:
57
  def __init__(
58
  self,
 
104
 
105
  return active_collections[: self.top_n_collections]
106
 
107
+ def _search_target_collections(self, query: str, collections: List[str], limit: int, year_scope: Optional[str] = None) -> List:
108
  if not collections:
109
  return []
110
 
 
114
  logger.exception("Failed to embed query for collection routing")
115
  return []
116
 
117
+ # Tạo filter Qdrant nếu có year_scope
118
+ year_filter = _build_year_filter(year_scope)
119
+ if year_filter:
120
+ logger.info(f"Áp dụng Qdrant Filter cho year_scope: {year_scope}")
121
+
122
  scored_docs = []
123
  for collection_name in collections:
124
  try:
 
127
  query_vector=query_vector,
128
  limit=limit,
129
  with_payload=True,
130
+ query_filter=year_filter, # NEW: Áp dụng Qdrant Filter native
131
  )
132
+ except Exception as e:
133
+ logger.exception(f"Qdrant search failed for collection={collection_name}: {e}")
134
  continue
135
 
136
  for point in points:
 
144
  "source_file": payload.get("filename") or payload.get("stored_name") or "",
145
  "source_relpath": payload.get("object_path") or payload.get("path") or "",
146
  "object_path": payload.get("object_path") or "",
147
+ "source_url": payload.get("source_url") or "", # NEW: Thêm source_url
148
  "folder_key": payload.get("folder_key") or "",
149
  "collection_name": collection_name,
150
  "academic_year": payload.get("academic_year") or "",
151
+ "years": payload.get("years") or [], # NEW: Thêm years array
152
  "chunk_index": payload.get("chunk_index"),
153
  "page_number": payload.get("page_number"),
154
  }
 
177
  query=query,
178
  collections=target_collections,
179
  limit=candidate_k,
180
+ year_scope=year_scope, # NEW: Pass year_scope để Qdrant Filter
181
  )
182
 
183
  if year_scoped:
core/document_ingest_service.py CHANGED
@@ -1,5 +1,7 @@
 
1
  import logging
2
  import os
 
3
  import uuid
4
  from datetime import datetime, timezone
5
  from typing import List, Optional
@@ -18,7 +20,7 @@ from qdrant_client.models import (
18
  )
19
 
20
  from .chunking import smart_chunking
21
- from .config import QDRANT_API_KEY, QDRANT_COLLECTION, QDRANT_URL
22
  from .document_db import Document, DocumentChunk, SessionLocal
23
  from .models import embeddings
24
  from .text_utils import clean_text
@@ -26,10 +28,38 @@ from .vectorstore import extract_academic_year, load_documents_from_file
26
 
27
  logger = logging.getLogger(__name__)
28
 
 
29
  _ALLOWED_EXTENSIONS = {".pdf", ".docx", ".txt"}
30
  _ENSURED_PAYLOAD_INDEX_COLLECTIONS = set()
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def _load_documents_for_ingest(path: str, extension: str) -> List[LangChainDocument]:
34
  extension = extension.lower()
35
  if extension not in _ALLOWED_EXTENSIONS:
@@ -111,13 +141,28 @@ def _ensure_payload_indexes(client: QdrantClient, collection_name: str) -> None:
111
  if collection_name in _ENSURED_PAYLOAD_INDEX_COLLECTIONS:
112
  return
113
 
114
- for field_name in ("object_path", "document_id"):
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  client.create_payload_index(
116
  collection_name=collection_name,
117
- field_name=field_name,
118
- field_schema=PayloadSchemaType.KEYWORD,
119
  wait=True,
120
  )
 
 
121
 
122
  _ENSURED_PAYLOAD_INDEX_COLLECTIONS.add(collection_name)
123
 
@@ -127,6 +172,163 @@ def _is_missing_payload_index_error(error: Exception) -> bool:
127
  return "Index required but not found" in message
128
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _delete_existing_document_points(
131
  client: QdrantClient,
132
  collection_name: str,
@@ -242,41 +444,20 @@ def process_document_ingest(
242
  _delete_existing_document_points(client, target_collection, source_object_ref, document.id)
243
 
244
  created_at = datetime.now(timezone.utc).isoformat()
245
- points: List[PointStruct] = []
246
- db_chunk_rows: List[DocumentChunk] = []
247
-
248
- for index, (chunk_doc, vector) in enumerate(zip(chunk_docs, vectors)):
249
- chunk_text = chunk_doc.page_content
250
- metadata = chunk_doc.metadata if isinstance(chunk_doc.metadata, dict) else {}
251
- point_id = str(uuid.uuid4())
252
- payload = {
253
- "document_id": document.id,
254
- "filename": document.original_name,
255
- "stored_name": document.stored_name,
256
- "path": effective_source_path or document.path,
257
- "object_path": source_object_ref,
258
- "folder_key": document.folder_key,
259
- "collection_name": target_collection,
260
- "source_file": metadata.get("source_file") or source_name,
261
- "source_relpath": metadata.get("source_relpath") or source_relpath,
262
- "academic_year": metadata.get("academic_year") or "ALL",
263
- "page_number": metadata.get("page_number"),
264
- "source_updated_at": source_updated_at,
265
- "source_etag": source_etag,
266
- "chunk_index": index,
267
- "created_at": created_at,
268
- "content": chunk_text,
269
- }
270
-
271
- points.append(PointStruct(id=point_id, vector=vector, payload=payload))
272
- db_chunk_rows.append(
273
- DocumentChunk(
274
- document_id=document.id,
275
- chunk_index=index,
276
- content_preview=chunk_text[:200],
277
- qdrant_point_id=point_id,
278
- )
279
- )
280
 
281
  client.upsert(collection_name=target_collection, points=points, wait=True)
282
 
 
1
+ import hashlib
2
  import logging
3
  import os
4
+ import re
5
  import uuid
6
  from datetime import datetime, timezone
7
  from typing import List, Optional
 
20
  )
21
 
22
  from .chunking import smart_chunking
23
+ from .config import QDRANT_API_KEY, QDRANT_COLLECTION, QDRANT_URL, SUPABASE_URL, SUPABASE_STORAGE_BUCKET
24
  from .document_db import Document, DocumentChunk, SessionLocal
25
  from .models import embeddings
26
  from .text_utils import clean_text
 
28
 
29
  logger = logging.getLogger(__name__)
30
 
31
+ ACTIVE_CODE_PATTERN = re.compile(r"(20\d{2})\s*[-_/]\s*(20\d{2})")
32
  _ALLOWED_EXTENSIONS = {".pdf", ".docx", ".txt"}
33
  _ENSURED_PAYLOAD_INDEX_COLLECTIONS = set()
34
 
35
 
36
def _build_supabase_file_url(object_path: str) -> str:
    """Return the public Supabase Storage URL for ``object_path``.

    The URL has the form
    ``{SUPABASE_URL}/storage/v1/object/public/{SUPABASE_STORAGE_BUCKET}/{path}``.

    Args:
        object_path: Object key inside the storage bucket; leading slashes
            are ignored.

    Returns:
        The full URL, or ``""`` when ``object_path``, ``SUPABASE_URL`` or
        ``SUPABASE_STORAGE_BUCKET`` is empty.
    """
    if not object_path or not SUPABASE_URL or not SUPABASE_STORAGE_BUCKET:
        return ""

    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # A trailing slash on the configured base URL would otherwise produce
    # "...//storage/v1/...".
    base_url = SUPABASE_URL.rstrip("/")
    # Percent-encode spaces / non-ASCII characters in the object key so the
    # result is a valid URL; "/" is kept as the path separator.
    clean_path = quote(object_path.lstrip("/"), safe="/")
    return f"{base_url}/storage/v1/object/public/{SUPABASE_STORAGE_BUCKET}/{clean_path}"
43
+
44
+
45
def _extract_years_from_academic_year(academic_year: str) -> List[int]:
    """Extract the start and end year from an academic-year label.

    Example: ``"2023-2024"`` gives ``[2023, 2024]``.

    Args:
        academic_year: Label to parse; an empty value or the sentinel
            ``"ALL"`` means "no specific year".

    Returns:
        ``[start_year, end_year]`` taken from the first match of
        ``ACTIVE_CODE_PATTERN``, or ``[]`` when the label is empty, is
        ``"ALL"``, or does not match the pattern.
    """
    if not academic_year or academic_year == "ALL":
        return []

    found = ACTIVE_CODE_PATTERN.search(academic_year)
    if found is None:
        return []

    # Both groups only ever capture digits, so int() cannot fail here.
    return [int(found.group(1)), int(found.group(2))]
61
+
62
+
63
  def _load_documents_for_ingest(path: str, extension: str) -> List[LangChainDocument]:
64
  extension = extension.lower()
65
  if extension not in _ALLOWED_EXTENSIONS:
 
141
  if collection_name in _ENSURED_PAYLOAD_INDEX_COLLECTIONS:
142
  return
143
 
144
+ # KEYWORD indexes cho filtering nhanh
145
+ for field_name in ("object_path", "document_id", "content_hash"):
146
+ try:
147
+ client.create_payload_index(
148
+ collection_name=collection_name,
149
+ field_name=field_name,
150
+ field_schema=PayloadSchemaType.KEYWORD,
151
+ wait=True,
152
+ )
153
+ except Exception as e:
154
+ logger.warning(f"Failed to create KEYWORD index for {field_name}: {e}")
155
+
156
+ # INTEGER array index cho years
157
+ try:
158
  client.create_payload_index(
159
  collection_name=collection_name,
160
+ field_name="years",
161
+ field_schema=PayloadSchemaType.INTEGER,
162
  wait=True,
163
  )
164
+ except Exception as e:
165
+ logger.warning(f"Failed to create INTEGER index for years: {e}")
166
 
167
  _ENSURED_PAYLOAD_INDEX_COLLECTIONS.add(collection_name)
168
 
 
172
  return "Index required but not found" in message
173
 
174
 
175
def _get_or_create_deduplicated_points(
    client: QdrantClient,
    collection_name: str,
    chunk_docs: List[LangChainDocument],
    vectors: List,
    source_object_ref: str,
    document: Document,
    source_updated_at: Optional[str],
    source_etag: Optional[str],
    created_at: str,
    effective_source_path: Optional[str] = None,
) -> tuple[List[PointStruct], List[DocumentChunk]]:
    """Build Qdrant points and DB chunk rows, de-duplicating by content hash.

    Each chunk's text is hashed with MD5. If the collection already holds a
    point whose ``content_hash`` payload equals that hash, the existing point
    is reused: its ``years`` list is merged with this chunk's years and its
    ``document_id`` / ``source_updated_at`` are refreshed in place, and no
    new point is produced for the chunk. Otherwise a new point with a fresh
    UUID is built. A failed lookup or a failed update falls back to building
    a new point.

    Args:
        client: Qdrant client used for the duplicate lookup and the in-place
            payload update.
        collection_name: Collection searched for duplicates.
        chunk_docs: Chunk documents, paired positionally with ``vectors``.
        vectors: One embedding vector per chunk.
        source_object_ref: Object path of the source file.
        document: Document record that owns the chunks.
        source_updated_at: Source modification marker stored in the payload.
        source_etag: Source ETag stored in the payload.
        created_at: Creation timestamp stored in the payload.
        effective_source_path: Forwarded to ``_build_payload``.

    Returns:
        ``(points, db_chunk_rows)``: the newly built points (this function
        does not upsert them) and one ``DocumentChunk`` row per new point.
    """
    points: List[PointStruct] = []
    db_chunk_rows: List[DocumentChunk] = []

    # The URL depends only on the source object, so compute it once instead
    # of once per chunk.
    source_url = _build_supabase_file_url(source_object_ref)

    for index, (chunk_doc, vector) in enumerate(zip(chunk_docs, vectors)):
        chunk_text = chunk_doc.page_content
        metadata = chunk_doc.metadata if isinstance(chunk_doc.metadata, dict) else {}

        content_hash = hashlib.md5(chunk_text.encode("utf-8")).hexdigest()
        academic_year = metadata.get("academic_year") or "ALL"
        years = _extract_years_from_academic_year(academic_year)

        # Look for an existing point carrying the same content hash.
        existing_point = None
        try:
            # scroll() returns (records, next_page_offset); the payload is
            # requested here so no separate retrieve() round trip is needed.
            records, _next_offset = client.scroll(
                collection_name=collection_name,
                limit=1,
                scroll_filter=Filter(
                    must=[
                        FieldCondition(
                            key="content_hash",
                            match=MatchValue(value=content_hash),
                        )
                    ]
                ),
                with_payload=True,
            )
            if records:
                existing_point = records[0]
                logger.info(f"Tìm thấy content đã tồn tại hash={content_hash[:8]}..., sẽ cập nhật years")
        except Exception as e:
            logger.debug(f"Không thể tìm kiếm existing points: {e}")

        if existing_point is not None:
            try:
                existing_payload = existing_point.payload or {}
                merged_years = sorted(set(years) | set(existing_payload.get("years") or []))

                # QdrantClient has no ``update_payload`` method; ``set_payload``
                # merges the given keys into the stored payload, so only the
                # changed keys need to be sent.
                client.set_payload(
                    collection_name=collection_name,
                    payload={
                        "years": merged_years,
                        "document_id": document.id,
                        "source_updated_at": source_updated_at
                        or existing_payload.get("source_updated_at"),
                    },
                    points=[existing_point.id],
                )
                logger.info(f"Đã cập nhật years cho hash {content_hash[:8]}...: {merged_years}")
                # NOTE(review): as in the original, a reused point gets no
                # DocumentChunk row, and ``points`` may end up empty when every
                # chunk is a duplicate -- confirm the caller handles both.
                continue
            except Exception as e:
                logger.warning(f"Lỗi cập nhật years cho point đã tồn tại: {e}, sẽ tạo point mới")

        # No duplicate found (or the in-place update failed): build a new point.
        point_id = str(uuid.uuid4())
        payload = _build_payload(
            document, source_object_ref, chunk_text, index, metadata,
            academic_year, years, content_hash, source_url,
            source_updated_at, source_etag, created_at, effective_source_path,
        )
        points.append(PointStruct(id=point_id, vector=vector, payload=payload))
        db_chunk_rows.append(
            DocumentChunk(
                document_id=document.id,
                chunk_index=index,
                content_preview=chunk_text[:200],
                qdrant_point_id=point_id,
            )
        )

    return points, db_chunk_rows
288
+
289
+
290
def _build_payload(
    document: Document,
    source_object_ref: str,
    chunk_text: str,
    index: int,
    metadata: dict,
    academic_year: str,
    years: List[int],
    content_hash: str,
    source_url: str,
    source_updated_at: Optional[str],
    source_etag: Optional[str],
    created_at: str,
    effective_source_path: Optional[str] = None,
) -> dict:
    """Build the payload dictionary stored with one chunk's Qdrant point.

    Args:
        document: Document record supplying id, names, path, folder key and
            collection name.
        source_object_ref: Object path of the source file; its basename is
            the default ``source_file``.
        chunk_text: Text of the chunk, stored under ``content``.
        index: Position of the chunk, stored under ``chunk_index``.
        metadata: Chunk metadata; may override ``source_file`` and
            ``source_relpath`` and supplies ``page_number``.
        academic_year: Academic-year label for the chunk.
        years: Integer years for the chunk.
        content_hash: Hash of ``chunk_text``.
        source_url: URL of the source file.
        source_updated_at: Source modification marker.
        source_etag: Source ETag.
        created_at: Creation timestamp.
        effective_source_path: When set, stored as ``path`` instead of
            ``document.path``.

    Returns:
        The payload dictionary.
    """
    source_name = os.path.basename(source_object_ref) if source_object_ref else document.stored_name
    source_relpath = source_object_ref or source_name

    return {
        "document_id": document.id,
        "filename": document.original_name,
        # Fix: this key had been corrupted to "stored_effective_source_path or name".
        "stored_name": document.stored_name,
        # Fix: honour effective_source_path, which was accepted but never used.
        "path": effective_source_path or document.path,
        "object_path": source_object_ref,
        "folder_key": document.folder_key,
        "collection_name": document.collection_name or "",
        "source_file": metadata.get("source_file") or source_name,
        "source_relpath": metadata.get("source_relpath") or source_relpath,
        "source_url": source_url,
        "academic_year": academic_year,
        "years": years,
        "content_hash": content_hash,
        "page_number": metadata.get("page_number"),
        "source_updated_at": source_updated_at,
        "source_etag": source_etag,
        "chunk_index": index,
        "created_at": created_at,
        "content": chunk_text,
    }
330
+
331
+
332
  def _delete_existing_document_points(
333
  client: QdrantClient,
334
  collection_name: str,
 
444
  _delete_existing_document_points(client, target_collection, source_object_ref, document.id)
445
 
446
  created_at = datetime.now(timezone.utc).isoformat()
447
+
448
+ # NEW: Sử dụng deduplication logic
449
+ points, db_chunk_rows = _get_or_create_deduplicated_points(
450
+ client=client,
451
+ collection_name=target_collection,
452
+ chunk_docs=chunk_docs,
453
+ vectors=vectors,
454
+ source_object_ref=source_object_ref,
455
+ document=document,
456
+ source_updated_at=source_updated_at,
457
+ source_etag=source_etag,
458
+ created_at=created_at,
459
+ effective_source_path=effective_source_path,
460
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
  client.upsert(collection_name=target_collection, points=points, wait=True)
463
 
core/retriever.py CHANGED
@@ -13,6 +13,57 @@ class HybridRetriever:
13
  self.rrf_c = 60
14
  print(" BM25 sẵn sàng!")
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  @staticmethod
17
  def _doc_key(doc) -> str:
18
  metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
@@ -23,7 +74,6 @@ class HybridRetriever:
23
  return f"{source}|{page}|{digest}"
24
 
25
  def search(self, query: str, k: int = 10, alpha: float = 0.6, year_scope: str | None = None) -> List:
26
- del year_scope
27
  if not self.documents or k <= 0:
28
  return []
29
 
@@ -34,7 +84,15 @@ class HybridRetriever:
34
  # Lấy top k từ BM25
35
  tokenized_query = query.lower().split()
36
  candidate_k = min(max(k * 4, k), len(self.documents))
37
- bm25_top_docs = self.bm25.get_top_n(tokenized_query, self.documents, n=candidate_k)
 
 
 
 
 
 
 
 
38
 
39
  bm25_ranked = {}
40
  all_retrieved = {}
 
13
  self.rrf_c = 60
14
  print(" BM25 sẵn sàng!")
15
 
16
+ @staticmethod
17
+ def _filter_by_year_scope(documents: List, year_scope: str | None) -> List:
18
+ """Filter documents theo year_scope (ví dụ: '2023-2024' hoặc '2023')."""
19
+ if not year_scope:
20
+ return documents
21
+
22
+ filtered = []
23
+ year_targets = set()
24
+
25
+ # Parse year_scope: có thể là "2023-2024" hoặc "2023"
26
+ if "-" in year_scope:
27
+ parts = year_scope.split("-")
28
+ try:
29
+ year_targets = {int(p.strip()) for p in parts if p.strip()}
30
+ except ValueError:
31
+ return documents
32
+ else:
33
+ try:
34
+ year_targets = {int(year_scope.strip())}
35
+ except ValueError:
36
+ return documents
37
+
38
+ for doc in documents:
39
+ metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
40
+
41
+ # Check years array (mới)
42
+ doc_years = metadata.get("years", [])
43
+ if isinstance(doc_years, list) and any(y in year_targets for y in doc_years):
44
+ filtered.append(doc)
45
+ continue
46
+
47
+ # Check academic_year string (cũ, để backwards compatibility)
48
+ academic_year = metadata.get("academic_year", "")
49
+ if academic_year and academic_year != "ALL":
50
+ doc_year_tokens = set()
51
+ for potential_year in academic_year.split("-"):
52
+ try:
53
+ doc_year_tokens.add(int(potential_year.strip()))
54
+ except ValueError:
55
+ pass
56
+
57
+ if doc_year_tokens.intersection(year_targets):
58
+ filtered.append(doc)
59
+ continue
60
+
61
+ # Include ALL documents không có year info
62
+ if not doc_years and academic_year == "ALL":
63
+ filtered.append(doc)
64
+
65
+ return filtered if filtered else documents
66
+
67
  @staticmethod
68
  def _doc_key(doc) -> str:
69
  metadata = doc.metadata if isinstance(doc.metadata, dict) else {}
 
74
  return f"{source}|{page}|{digest}"
75
 
76
  def search(self, query: str, k: int = 10, alpha: float = 0.6, year_scope: str | None = None) -> List:
 
77
  if not self.documents or k <= 0:
78
  return []
79
 
 
84
  # Lấy top k từ BM25
85
  tokenized_query = query.lower().split()
86
  candidate_k = min(max(k * 4, k), len(self.documents))
87
+
88
+ # Filter documents theo year_scope nếu có
89
+ docs_to_search = self.documents
90
+ if year_scope:
91
+ docs_to_search = self._filter_by_year_scope(self.documents, year_scope)
92
+ if not docs_to_search:
93
+ docs_to_search = self.documents # Fallback nếu không có doc match year
94
+
95
+ bm25_top_docs = self.bm25.get_top_n(tokenized_query, docs_to_search, n=candidate_k)
96
 
97
  bm25_ranked = {}
98
  all_retrieved = {}