MrSimple07 committed
Commit c697463 · 1 Parent(s): ff92caa

added the load_table_data function

Files changed (3)
  1. documents_prep.py +25 -6
  2. index_retriever.py +30 -15
  3. table_prep.py +50 -18
documents_prep.py CHANGED
@@ -391,7 +391,6 @@ def load_image_data(repo_id, hf_token, image_data_dir):
         log_message(f"Ошибка загрузки данных изображений: {str(e)}")
         return []
 
-
 def load_table_data(repo_id, hf_token, table_data_dir):
     """Load and process table data from HuggingFace repo"""
     log_message("=" * 60)
@@ -431,7 +430,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                     table_data = json.load(f)
 
                 if isinstance(table_data, dict):
-                    document_id = table_data.get('document', 'unknown')
+                    # FIXED: Properly extract document_id from multiple possible sources
+                    document_id = (
+                        table_data.get('document_id') or
+                        table_data.get('document') or
+                        table_data.get('Обозначение документа') or
+                        'unknown'
+                    )
 
                     # Handle multiple sheets
                     if 'sheets' in table_data:
@@ -441,8 +446,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                         )
 
                         for sheet in sorted_sheets:
-                            sheet['document'] = document_id
-                            docs_list = table_to_document(sheet, document_id)
+                            # FIXED: Ensure document_id is always set in sheet data
+                            if 'document' not in sheet and 'document_id' not in sheet:
+                                sheet['document'] = document_id
+                                sheet['document_id'] = document_id
+
+                            # FIXED: Pass document_id explicitly
+                            docs_list = table_to_document(sheet, document_id=document_id)
                             table_documents.extend(docs_list)
 
                             for doc in docs_list:
@@ -452,8 +462,13 @@ def load_table_data(repo_id, hf_token, table_data_dir):
                                 stats['by_document'][document_id]['count'] += 1
                                 stats['by_document'][document_id]['size'] += size
                     else:
-                        # Single table
-                        docs_list = table_to_document(table_data, document_id)
+                        # Single table - FIXED: Ensure document_id is in table_data
+                        if 'document_id' not in table_data:
+                            table_data['document_id'] = document_id
+                        if 'document' not in table_data:
+                            table_data['document'] = document_id
+
+                        docs_list = table_to_document(table_data, document_id=document_id)
                         table_documents.extend(docs_list)
 
                         for doc in docs_list:
@@ -465,6 +480,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
 
             except Exception as e:
                 log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
+                import traceback
+                log_message(f"Traceback: {traceback.format_exc()}")
                 continue
 
         # Log summary
@@ -486,6 +503,8 @@ def load_table_data(repo_id, hf_token, table_data_dir):
 
     except Exception as e:
         log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
+        import traceback
+        log_message(f"Traceback: {traceback.format_exc()}")
         return []
 
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
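The heart of this change is the document_id fallback chain. A minimal standalone sketch of how that or-chain behaves on a few possible table JSON shapes (the extract_document_id helper and the sample dicts are illustrative, not part of the repo):

def extract_document_id(table_data: dict) -> str:
    # Same fallback chain the commit adds in load_table_data:
    # the first key that yields a non-empty value wins.
    return (
        table_data.get('document_id') or
        table_data.get('document') or
        table_data.get('Обозначение документа') or
        'unknown'
    )

print(extract_document_id({'document_id': 'ГОСТ 2.105-2019'}))                   # ГОСТ 2.105-2019
print(extract_document_id({'document': '', 'Обозначение документа': 'СП 484'}))  # empty string falls through -> СП 484
print(extract_document_id({'sheets': []}))                                       # unknown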
index_retriever.py CHANGED
@@ -14,20 +14,21 @@ def create_vector_index(documents):
 
 def create_query_engine(vector_index):
     try:
+        # FIXED: Increase retrieval numbers for tables
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=30
+            similarity_top_k=50  # Increased from 30
         )
 
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
-            similarity_top_k=30,
-            similarity_cutoff=0.65
+            similarity_top_k=50,  # Increased from 30
+            similarity_cutoff=0.55  # FIXED: Lowered from 0.65 to catch more tables
         )
 
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=40,
+            similarity_top_k=60,  # Increased from 40
             num_queries=1
         )
 
@@ -42,14 +43,15 @@ def create_query_engine(vector_index):
             response_synthesizer=response_synthesizer
         )
 
-        log_message("Query engine успешно создан")
+        log_message("Query engine успешно создан с улучшенными параметрами поиска таблиц")
        return query_engine
 
     except Exception as e:
         log_message(f"Ошибка создания query engine: {str(e)}")
         raise
 
-def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
+
+def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, diversity_penalty=0.2):  # FIXED: Adjusted defaults
     if not nodes or not reranker:
         return nodes[:top_k]
 
@@ -62,6 +64,7 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
 
     scored_nodes.sort(key=lambda x: x[1], reverse=True)
 
+    # FIXED: Lower threshold and add special handling for tables
     if min_score_threshold is not None:
         scored_nodes = [(node, score) for node, score in scored_nodes
                         if score >= min_score_threshold]
@@ -71,13 +74,14 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
             log_message("Нет узлов после фильтрации, снижаю порог")
             scored_nodes = list(zip(nodes, scores))
             scored_nodes.sort(key=lambda x: x[1], reverse=True)
-            min_score_threshold = scored_nodes[0][1] * 0.6
+            min_score_threshold = scored_nodes[0][1] * 0.5  # FIXED: Lower threshold
             scored_nodes = [(node, score) for node, score in scored_nodes
                             if score >= min_score_threshold]
 
     selected_nodes = []
     selected_docs = set()
     selected_sections = set()
+    selected_tables = set()  # FIXED: Track tables separately
 
     for node, score in scored_nodes:
         if len(selected_nodes) >= top_k:
@@ -85,25 +89,36 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
 
         metadata = node.metadata if hasattr(node, 'metadata') else {}
         doc_id = metadata.get('document_id', 'unknown')
+        node_type = metadata.get('type', 'text')
         section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
+        table_key = f"{doc_id}_{metadata.get('table_number', '')}" if node_type == 'table' else None
 
-        # Apply diversity penalty
+        # FIXED: Lower diversity penalty for tables
         penalty = 0
-        if doc_id in selected_docs:
-            penalty += diversity_penalty * 0.5
-        if section_key in selected_sections:
-            penalty += diversity_penalty
+        if node_type == 'table':
+            # Tables get less penalty - we want multiple tables from same document
+            if table_key and table_key in selected_tables:
+                penalty += diversity_penalty * 0.3
+            else:
+                penalty += diversity_penalty * 0.1 if doc_id in selected_docs else 0
+        else:
+            if doc_id in selected_docs:
+                penalty += diversity_penalty * 0.5
+            if section_key in selected_sections:
+                penalty += diversity_penalty
 
         adjusted_score = score * (1 - penalty)
 
-        # Add if still competitive
-        if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.6:
+        # FIXED: More lenient threshold for adding nodes
+        if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.5:
            selected_nodes.append((node, score))
            selected_docs.add(doc_id)
            selected_sections.add(section_key)
+           if table_key:
+               selected_tables.add(table_key)
 
    log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
-   log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}")
+   log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}, таблиц: {len(selected_tables)}")
 
    if selected_nodes:
        log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
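For reference, the score math behind the new penalty branch, pulled out into a standalone sketch (the diversity_penalty_for helper and the sample keys are illustrative; in the repo this logic lives inline in rerank_nodes):

def diversity_penalty_for(node_type, doc_id, section_key, table_key,
                          selected_docs, selected_sections, selected_tables,
                          diversity_penalty=0.2):
    # Mirrors the committed branch: tables are penalised far less,
    # so several tables from the same document can survive reranking.
    penalty = 0.0
    if node_type == 'table':
        if table_key and table_key in selected_tables:
            penalty += diversity_penalty * 0.3
        else:
            penalty += diversity_penalty * 0.1 if doc_id in selected_docs else 0
    else:
        if doc_id in selected_docs:
            penalty += diversity_penalty * 0.5
        if section_key in selected_sections:
            penalty += diversity_penalty
    return penalty

# With the new default diversity_penalty=0.2, a repeated text section loses
# 30% of its score while a repeated table loses only 6%:
score = 0.8
text_pen = diversity_penalty_for('text', 'doc1', 'doc1_s1', None, {'doc1'}, {'doc1_s1'}, set())
table_pen = diversity_penalty_for('table', 'doc1', 'doc1_s1', 'doc1_t3', {'doc1'}, set(), {'doc1_t3'})
print(score * (1 - text_pen), score * (1 - table_pen))  # roughly 0.56 vs 0.75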
table_prep.py CHANGED
@@ -5,28 +5,42 @@ from my_logging import log_message
 
 def create_table_content(table_data):
     """Create formatted content from table data"""
-    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
+    # FIXED: More robust field extraction
+    doc_id = (
+        table_data.get('document_id') or
+        table_data.get('document') or
+        table_data.get('Обозначение документа') or
+        'Неизвестно'
+    )
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
-    section = table_data.get('section', 'Неизвестно')
+    section = (
+        table_data.get('section') or
+        table_data.get('Раздел документа') or
+        'Неизвестно'
+    )
 
-    # Header section
-    content = f"Таблица: {table_num}\n"
-    content += f"Название: {table_title}\n"
-    content += f"Документ: {doc_id}\n"
-    content += f"Раздел: {section}\n"
+    # FIXED: Add more context in content for better semantic search
+    content = f"Документ: {doc_id}\n"
+    content += f"Таблица: {table_num}\n"
+    content += f"Название таблицы: {table_title}\n"
+    content += f"Раздел документа: {section}\n"
+    content += f"Стандарт/ГОСТ: {doc_id}\n"  # Explicitly mention GOST for queries
 
     headers = table_data.get('headers', [])
     if headers:
-        content += f"\nЗаголовки: {' | '.join(headers)}\n"
+        content += f"\nЗаголовки колонок: {' | '.join(str(h) for h in headers)}\n"
 
     # Data section
     if 'data' in table_data and isinstance(table_data['data'], list):
-        content += "\nДанные таблицы:\n"
+        content += "\nСодержимое таблицы:\n"
         for row_idx, row in enumerate(table_data['data'], start=1):
             if isinstance(row, dict):
                 row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
                 content += f"Строка {row_idx}: {row_text}\n"
+            elif isinstance(row, list):
+                row_text = " | ".join([str(v) for v in row if v])
+                content += f"Строка {row_idx}: {row_text}\n"
 
     return content
 
@@ -122,10 +136,25 @@ def table_to_document(table_data, document_id=None):
     if not isinstance(table_data, dict):
         return []
 
-    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
+    # FIXED: More robust document_id extraction with multiple fallbacks
+    doc_id = (
+        document_id or
+        table_data.get('document_id') or
+        table_data.get('document') or
+        table_data.get('Обозначение документа') or
+        'Неизвестно'
+    )
+
     table_num = table_data.get('table_number', 'Неизвестно')
     table_title = table_data.get('table_title', 'Неизвестно')
-    section = table_data.get('section', 'Неизвестно')
+
+    # FIXED: More robust section extraction
+    section = (
+        table_data.get('section') or
+        table_data.get('Раздел документа') or
+        table_data.get('section_id') or
+        'Неизвестно'
+    )
 
     table_rows = table_data.get('data', [])
     if not table_rows:
@@ -135,17 +164,20 @@ def table_to_document(table_data, document_id=None):
     content = create_table_content(table_data)
     content_size = len(content)
 
+    # FIXED: Enhanced metadata with more searchable fields
     base_doc = Document(
         text=content,
         metadata={
             "type": "table",
-            "table_number": table_num,
-            "table_title": table_title,
-            "document_id": doc_id,
-            "section": section,
-            "section_id": section,
+            "table_number": str(table_num),
+            "table_title": str(table_title),
+            "document_id": str(doc_id),
+            "section": str(section),
+            "section_id": str(section),
             "total_rows": len(table_rows),
-            "content_size": content_size
+            "content_size": content_size,
+            # FIXED: Add searchable composite field for better retrieval
+            "search_key": f"{doc_id} {table_num} {table_title} {section}".lower()
         }
     )
 
@@ -154,5 +186,5 @@ def table_to_document(table_data, document_id=None):
         log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
         return chunk_table_document(base_doc)
     else:
-        log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов)")
+        log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
         return [base_doc]
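To make the formatting change concrete, a sketch of what the reworked create_table_content would emit for a small table (the sample dict is made up for illustration; only the keys match what the function reads):

sample = {
    'document': 'ГОСТ 2.105-2019',
    'table_number': '1',
    'table_title': 'Размеры полей',
    'Раздел документа': '6.1',
    'headers': ['Поле', 'Размер, мм'],
    'data': [
        {'Поле': 'Левое', 'Размер, мм': '30'},
        ['Правое', '10'],  # list rows are now handled by the new elif branch
    ],
}

# create_table_content(sample) would start roughly like this:
#   Документ: ГОСТ 2.105-2019
#   Таблица: 1
#   Название таблицы: Размеры полей
#   Раздел документа: 6.1
#   Стандарт/ГОСТ: ГОСТ 2.105-2019
#
#   Заголовки колонок: Поле | Размер, мм
#
#   Содержимое таблицы:
#   Строка 1: Поле: Левое | Размер, мм: 30
#   Строка 2: Правое | 10

The same doc_id, table number, title and section also land in the Document metadata, along with the lowercased search_key; per the commit's own comments, the intent is that queries naming a specific ГОСТ or table number have more text to match against during retrieval.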