MrSimple07 commited on
Commit
9f55dc6
·
1 Parent(s): b867de8

top_k = 80; max table chunk size = 3000

Browse files
Files changed (3)
  1. config.py +1 -1
  2. documents_prep.py +41 -0
  3. index_retriever.py +3 -3
config.py CHANGED
@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
- MAX_CHARS_TABLE = 2500
56
  MAX_ROWS_TABLE = 10
57
 
58
  CUSTOM_PROMPT = """
 
52
  CHUNK_SIZE = 1500
53
  CHUNK_OVERLAP = 128
54
 
55
+ MAX_CHARS_TABLE = 3000
56
  MAX_ROWS_TABLE = 10
57
 
58
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -196,8 +196,43 @@ def format_table_rows(rows):
196
 
197
 
198
  def format_table_footer(table_identifier, doc_id):
 
199
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  def load_json_documents(repo_id, hf_token, json_dir):
202
  import zipfile
203
  import tempfile
@@ -327,6 +362,7 @@ def load_json_documents(repo_id, hf_token, json_dir):
327
  return documents
328
 
329
  def extract_sections_from_json(json_path):
 
330
  documents = []
331
 
332
  try:
@@ -378,6 +414,7 @@ def extract_sections_from_json(json_path):
378
 
379
 
380
  def load_table_documents(repo_id, hf_token, table_dir):
 
381
  log_message("Loading tables...")
382
 
383
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -395,11 +432,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
395
 
396
  with open(local_path, 'r', encoding='utf-8') as f:
397
  data = json.load(f)
 
 
398
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
399
 
400
  for sheet in data.get('sheets', []):
 
401
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
402
 
 
403
  chunks = chunk_table_by_content(sheet, sheet_doc_id)
404
  all_chunks.extend(chunks)
405
 
 
196
 
197
 
198
  def format_table_footer(table_identifier, doc_id):
199
+ """Format table footer"""
200
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
201
 
202
+ def load_table_documents(repo_id, hf_token, table_dir):
203
+ log_message("Loading tables...")
204
+
205
+ files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
206
+ table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
207
+
208
+ all_chunks = []
209
+ for file_path in table_files:
210
+ try:
211
+ local_path = hf_hub_download(
212
+ repo_id=repo_id,
213
+ filename=file_path,
214
+ repo_type="dataset",
215
+ token=hf_token
216
+ )
217
+
218
+ with open(local_path, 'r', encoding='utf-8') as f:
219
+ data = json.load(f)
220
+
221
+ file_doc_id = data.get('document_id', data.get('document', 'unknown'))
222
+
223
+ for sheet in data.get('sheets', []):
224
+ sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
225
+
226
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
227
+ all_chunks.extend(chunks)
228
+
229
+ except Exception as e:
230
+ log_message(f"Error loading {file_path}: {e}")
231
+
232
+ log_message(f"✓ Loaded {len(all_chunks)} table chunks")
233
+ return all_chunks
234
+
235
+
236
  def load_json_documents(repo_id, hf_token, json_dir):
237
  import zipfile
238
  import tempfile
 
362
  return documents
363
 
364
  def extract_sections_from_json(json_path):
365
+ """Extract sections from a single JSON file"""
366
  documents = []
367
 
368
  try:
 
414
 
415
 
416
  def load_table_documents(repo_id, hf_token, table_dir):
417
+ """Load and chunk tables"""
418
  log_message("Loading tables...")
419
 
420
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 
432
 
433
  with open(local_path, 'r', encoding='utf-8') as f:
434
  data = json.load(f)
435
+
436
+ # Extract file-level document_id
437
  file_doc_id = data.get('document_id', data.get('document', 'unknown'))
438
 
439
  for sheet in data.get('sheets', []):
440
+ # Use sheet-level document_id if available, otherwise use file-level
441
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
442
 
443
+ # CRITICAL: Pass document_id to chunk function
444
  chunks = chunk_table_by_content(sheet, sheet_doc_id)
445
  all_chunks.extend(chunks)
446
 
index_retriever.py CHANGED
@@ -46,18 +46,18 @@ def create_query_engine(vector_index):
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
- similarity_top_k=70
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
- similarity_top_k=70,
55
  similarity_cutoff=0.55
56
  )
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
- similarity_top_k=70,
61
  num_queries=1
62
  )
63
 
 
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
+ similarity_top_k=80
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
+ similarity_top_k=80,
55
  similarity_cutoff=0.55
56
  )
57
 
58
  hybrid_retriever = QueryFusionRetriever(
59
  [vector_retriever, bm25_retriever],
60
+ similarity_top_k=80,
61
  num_queries=1
62
  )
63