MrSimple07 committed on
Commit
b01a551
·
1 Parent(s): c697463

added the load_table_data function

Browse files
Files changed (2) hide show
  1. index_retriever.py +49 -26
  2. table_prep.py +109 -66
index_retriever.py CHANGED
@@ -14,21 +14,21 @@ def create_vector_index(documents):
14
 
15
  def create_query_engine(vector_index):
16
  try:
17
- # FIXED: Increase retrieval numbers for tables
18
  bm25_retriever = BM25Retriever.from_defaults(
19
  docstore=vector_index.docstore,
20
- similarity_top_k=50 # Increased from 30
21
  )
22
 
23
  vector_retriever = VectorIndexRetriever(
24
  index=vector_index,
25
- similarity_top_k=50, # Increased from 30
26
- similarity_cutoff=0.55 # FIXED: Lowered from 0.65 to catch more tables
27
  )
28
 
29
  hybrid_retriever = QueryFusionRetriever(
30
  [vector_retriever, bm25_retriever],
31
- similarity_top_k=60, # Increased from 40
32
  num_queries=1
33
  )
34
 
@@ -51,7 +51,7 @@ def create_query_engine(vector_index):
51
  raise
52
 
53
 
54
- def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, diversity_penalty=0.2): # FIXED: Adjusted defaults
55
  if not nodes or not reranker:
56
  return nodes[:top_k]
57
 
@@ -64,24 +64,37 @@ def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, div
64
 
65
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
66
 
67
- # FIXED: Lower threshold and add special handling for tables
 
 
 
 
 
 
 
 
 
 
 
 
68
  if min_score_threshold is not None:
69
- scored_nodes = [(node, score) for node, score in scored_nodes
70
- if score >= min_score_threshold]
71
- log_message(f"После фильтрации по порогу {min_score_threshold}: {len(scored_nodes)} узлов")
72
-
73
- if not scored_nodes:
74
- log_message("Нет узлов после фильтрации, снижаю порог")
75
- scored_nodes = list(zip(nodes, scores))
76
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
77
- min_score_threshold = scored_nodes[0][1] * 0.5 # FIXED: Lower threshold
78
- scored_nodes = [(node, score) for node, score in scored_nodes
79
- if score >= min_score_threshold]
80
 
81
  selected_nodes = []
82
  selected_docs = set()
83
  selected_sections = set()
84
- selected_tables = set() # FIXED: Track tables separately
 
85
 
86
  for node, score in scored_nodes:
87
  if len(selected_nodes) >= top_k:
@@ -91,16 +104,26 @@ def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, div
91
  doc_id = metadata.get('document_id', 'unknown')
92
  node_type = metadata.get('type', 'text')
93
  section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
94
- table_key = f"{doc_id}_{metadata.get('table_number', '')}" if node_type == 'table' else None
95
 
96
- # FIXED: Lower diversity penalty for tables
 
 
 
 
 
 
 
 
 
 
 
97
  penalty = 0
98
  if node_type == 'table':
99
- # Tables get less penalty - we want multiple tables from same document
100
  if table_key and table_key in selected_tables:
101
- penalty += diversity_penalty * 0.3
102
  else:
103
- penalty += diversity_penalty * 0.1 if doc_id in selected_docs else 0
104
  else:
105
  if doc_id in selected_docs:
106
  penalty += diversity_penalty * 0.5
@@ -109,8 +132,8 @@ def rerank_nodes(query, nodes, reranker, top_k=30, min_score_threshold=0.45, div
109
 
110
  adjusted_score = score * (1 - penalty)
111
 
112
- # FIXED: More lenient threshold for adding nodes
113
- if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.5:
114
  selected_nodes.append((node, score))
115
  selected_docs.add(doc_id)
116
  selected_sections.add(section_key)
 
14
 
15
  def create_query_engine(vector_index):
16
  try:
17
+ # FIXED: Significantly increased retrieval for tables and lowered BM25 threshold
18
  bm25_retriever = BM25Retriever.from_defaults(
19
  docstore=vector_index.docstore,
20
+ similarity_top_k=80 # Increased from 50
21
  )
22
 
23
  vector_retriever = VectorIndexRetriever(
24
  index=vector_index,
25
+ similarity_top_k=80, # Increased from 50
26
+ similarity_cutoff=0.45 # FIXED: Lowered from 0.55 to catch more tables
27
  )
28
 
29
  hybrid_retriever = QueryFusionRetriever(
30
  [vector_retriever, bm25_retriever],
31
+ similarity_top_k=100, # Increased from 60 to ensure tables aren't filtered early
32
  num_queries=1
33
  )
34
 
 
51
  raise
52
 
53
 
54
+ def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15): # FIXED: More lenient
55
  if not nodes or not reranker:
56
  return nodes[:top_k]
57
 
 
64
 
65
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
66
 
67
+ # FIXED: Much lower threshold and special boost for tables
68
+ table_boost = 0.15 # Boost table scores
69
+ boosted_scored_nodes = []
70
+ for node, score in scored_nodes:
71
+ metadata = node.metadata if hasattr(node, 'metadata') else {}
72
+ if metadata.get('type') == 'table':
73
+ boosted_score = min(1.0, score * (1 + table_boost))
74
+ boosted_scored_nodes.append((node, boosted_score))
75
+ else:
76
+ boosted_scored_nodes.append((node, score))
77
+
78
+ boosted_scored_nodes.sort(key=lambda x: x[1], reverse=True)
79
+
80
  if min_score_threshold is not None:
81
+ filtered_nodes = [(node, score) for node, score in boosted_scored_nodes
82
+ if score >= min_score_threshold]
83
+ log_message(f"После фильтрации по порогу {min_score_threshold}: {len(filtered_nodes)} узлов")
84
+ if filtered_nodes:
85
+ scored_nodes = filtered_nodes
86
+ else:
87
+ # Fallback: take top nodes even if below threshold
88
+ log_message("⚠️ Нет узлов после фильтрации, беру топ-40 без порога")
89
+ scored_nodes = boosted_scored_nodes[:40]
90
+ else:
91
+ scored_nodes = boosted_scored_nodes
92
 
93
  selected_nodes = []
94
  selected_docs = set()
95
  selected_sections = set()
96
+ selected_tables = set()
97
+ selected_appendix_tables = set() # FIXED: Track appendix tables separately
98
 
99
  for node, score in scored_nodes:
100
  if len(selected_nodes) >= top_k:
 
104
  doc_id = metadata.get('document_id', 'unknown')
105
  node_type = metadata.get('type', 'text')
106
  section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
 
107
 
108
+ # FIXED: Better table tracking with appendix awareness
109
+ if node_type == 'table':
110
+ table_num = metadata.get('table_number_clean', metadata.get('table_number', ''))
111
+ appendix_num = metadata.get('appendix_number')
112
+ if appendix_num:
113
+ table_key = f"{doc_id}_appendix_{appendix_num}_table_{table_num}"
114
+ else:
115
+ table_key = f"{doc_id}_table_{table_num}"
116
+ else:
117
+ table_key = None
118
+
119
+ # FIXED: Even lower diversity penalty for tables
120
  penalty = 0
121
  if node_type == 'table':
122
+ # Tables get minimal penalty - we want all relevant tables
123
  if table_key and table_key in selected_tables:
124
+ penalty += diversity_penalty * 0.2
125
  else:
126
+ penalty += diversity_penalty * 0.05 if doc_id in selected_docs else 0
127
  else:
128
  if doc_id in selected_docs:
129
  penalty += diversity_penalty * 0.5
 
132
 
133
  adjusted_score = score * (1 - penalty)
134
 
135
+ # FIXED: Very lenient threshold for adding nodes
136
+ if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.3:
137
  selected_nodes.append((node, score))
138
  selected_docs.add(doc_id)
139
  selected_sections.add(section_key)
table_prep.py CHANGED
@@ -5,7 +5,6 @@ from my_logging import log_message
5
 
6
  def create_table_content(table_data):
7
  """Create formatted content from table data"""
8
- # FIXED: More robust field extraction
9
  doc_id = (
10
  table_data.get('document_id') or
11
  table_data.get('document') or
@@ -20,31 +19,134 @@ def create_table_content(table_data):
20
  'Неизвестно'
21
  )
22
 
23
- # FIXED: Add more context in content for better semantic search
 
 
 
24
  content = f"Документ: {doc_id}\n"
25
- content += f"Таблица: {table_num}\n"
 
 
26
  content += f"Название таблицы: {table_title}\n"
27
  content += f"Раздел документа: {section}\n"
28
- content += f"Стандарт/ГОСТ: {doc_id}\n" # Explicitly mention GOST for queries
 
 
 
 
29
 
30
  headers = table_data.get('headers', [])
31
  if headers:
32
- content += f"\nЗаголовки колонок: {' | '.join(str(h) for h in headers)}\n"
 
 
 
33
 
34
- # Data section
35
  if 'data' in table_data and isinstance(table_data['data'], list):
36
  content += "\nСодержимое таблицы:\n"
 
 
 
37
  for row_idx, row in enumerate(table_data['data'], start=1):
38
  if isinstance(row, dict):
39
  row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
40
  content += f"Строка {row_idx}: {row_text}\n"
 
 
41
  elif isinstance(row, list):
42
  row_text = " | ".join([str(v) for v in row if v])
43
  content += f"Строка {row_idx}: {row_text}\n"
 
 
 
 
 
44
 
45
  return content
46
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
49
  if chunk_size is None:
50
  chunk_size = CHUNK_SIZE
@@ -128,63 +230,4 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
128
  )
129
  chunked_docs.append(chunked_doc)
130
 
131
- return chunked_docs
132
-
133
-
134
- def table_to_document(table_data, document_id=None):
135
- """Convert table data to Document, with smart chunking if needed"""
136
- if not isinstance(table_data, dict):
137
- return []
138
-
139
- # FIXED: More robust document_id extraction with multiple fallbacks
140
- doc_id = (
141
- document_id or
142
- table_data.get('document_id') or
143
- table_data.get('document') or
144
- table_data.get('Обозначение документа') or
145
- 'Неизвестно'
146
- )
147
-
148
- table_num = table_data.get('table_number', 'Неизвестно')
149
- table_title = table_data.get('table_title', 'Неизвестно')
150
-
151
- # FIXED: More robust section extraction
152
- section = (
153
- table_data.get('section') or
154
- table_data.get('Раздел документа') or
155
- table_data.get('section_id') or
156
- 'Неизвестно'
157
- )
158
-
159
- table_rows = table_data.get('data', [])
160
- if not table_rows:
161
- log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
162
- return []
163
-
164
- content = create_table_content(table_data)
165
- content_size = len(content)
166
-
167
- # FIXED: Enhanced metadata with more searchable fields
168
- base_doc = Document(
169
- text=content,
170
- metadata={
171
- "type": "table",
172
- "table_number": str(table_num),
173
- "table_title": str(table_title),
174
- "document_id": str(doc_id),
175
- "section": str(section),
176
- "section_id": str(section),
177
- "total_rows": len(table_rows),
178
- "content_size": content_size,
179
- # FIXED: Add searchable composite field for better retrieval
180
- "search_key": f"{doc_id} {table_num} {table_title} {section}".lower()
181
- }
182
- )
183
-
184
- # Apply smart chunking if too large
185
- if content_size > CHUNK_SIZE:
186
- log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
187
- return chunk_table_document(base_doc)
188
- else:
189
- log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
190
- return [base_doc]
 
5
 
6
  def create_table_content(table_data):
7
  """Create formatted content from table data"""
 
8
  doc_id = (
9
  table_data.get('document_id') or
10
  table_data.get('document') or
 
19
  'Неизвестно'
20
  )
21
 
22
+ # FIXED: Normalize table number and create variations
23
+ table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
24
+
25
+ # FIXED: Enhanced content with multiple references for better matching
26
  content = f"Документ: {doc_id}\n"
27
+ content += f"ГОСТ/Стандарт: {doc_id}\n"
28
+ content += f"Таблица номер: {table_num}\n"
29
+ content += f"Таблица: {table_num_clean}\n"
30
  content += f"Название таблицы: {table_title}\n"
31
  content += f"Раздел документа: {section}\n"
32
+
33
+ # FIXED: Add explicit appendix reference if present
34
+ if 'приложени' in section.lower():
35
+ appendix_match = section.lower().split('приложени')[1].split()[0] if 'приложени' in section.lower() else ''
36
+ content += f"Таблица {table_num_clean} Приложения {appendix_match}\n"
37
 
38
  headers = table_data.get('headers', [])
39
  if headers:
40
+ # FIXED: Add headers as searchable keywords
41
+ headers_text = ' | '.join(str(h) for h in headers)
42
+ content += f"\nЗаголовки колонок: {headers_text}\n"
43
+ content += f"Параметры: {headers_text}\n" # Alternative keyword
44
 
45
+ # FIXED: Extract and emphasize key data values for better semantic search
46
  if 'data' in table_data and isinstance(table_data['data'], list):
47
  content += "\nСодержимое таблицы:\n"
48
+ # Extract unique values for search enhancement
49
+ all_values = set()
50
+
51
  for row_idx, row in enumerate(table_data['data'], start=1):
52
  if isinstance(row, dict):
53
  row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
54
  content += f"Строка {row_idx}: {row_text}\n"
55
+ # Collect values
56
+ all_values.update([str(v) for v in row.values() if v and str(v).strip()])
57
  elif isinstance(row, list):
58
  row_text = " | ".join([str(v) for v in row if v])
59
  content += f"Строка {row_idx}: {row_text}\n"
60
+ all_values.update([str(v) for v in row if v and str(v).strip()])
61
+
62
+ # FIXED: Add searchable keywords from data
63
+ if all_values:
64
+ content += f"\nКлючевые значения: {' '.join(list(all_values)[:50])}\n"
65
 
66
  return content
67
 
68
 
69
+ def table_to_document(table_data, document_id=None):
70
+ """Convert table data to Document, with smart chunking if needed"""
71
+ if not isinstance(table_data, dict):
72
+ return []
73
+
74
+ doc_id = (
75
+ document_id or
76
+ table_data.get('document_id') or
77
+ table_data.get('document') or
78
+ table_data.get('Обозначение документа') or
79
+ 'Неизвестно'
80
+ )
81
+
82
+ table_num = table_data.get('table_number', 'Неизвестно')
83
+ table_num_clean = str(table_num).replace('№', '').replace('№', '').strip()
84
+ table_title = table_data.get('table_title', 'Неизвестно')
85
+
86
+ section = (
87
+ table_data.get('section') or
88
+ table_data.get('Раздел документа') or
89
+ table_data.get('section_id') or
90
+ 'Неизвестно'
91
+ )
92
+
93
+ table_rows = table_data.get('data', [])
94
+ if not table_rows:
95
+ log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
96
+ return []
97
+
98
+ content = create_table_content(table_data)
99
+ content_size = len(content)
100
+
101
+ # FIXED: Extract appendix info for better identification
102
+ appendix_num = None
103
+ if 'приложени' in section.lower():
104
+ import re
105
+ match = re.search(r'приложени[ея]\s*(\d+)', section.lower())
106
+ if match:
107
+ appendix_num = match.group(1)
108
+
109
+ # FIXED: Create comprehensive search variations
110
+ search_variations = [
111
+ f"{doc_id} таблица {table_num_clean}",
112
+ f"{doc_id} {table_num}",
113
+ f"таблица {table_num_clean} {doc_id}",
114
+ table_title.lower(),
115
+ section.lower()
116
+ ]
117
+
118
+ if appendix_num:
119
+ search_variations.extend([
120
+ f"таблица {table_num_clean} приложения {appendix_num}",
121
+ f"приложение {appendix_num} таблица {table_num_clean}"
122
+ ])
123
+
124
+ base_doc = Document(
125
+ text=content,
126
+ metadata={
127
+ "type": "table",
128
+ "table_number": str(table_num),
129
+ "table_number_clean": str(table_num_clean), # FIXED: Add normalized version
130
+ "table_title": str(table_title),
131
+ "document_id": str(doc_id),
132
+ "section": str(section),
133
+ "section_id": str(section),
134
+ "appendix_number": str(appendix_num) if appendix_num else None, # FIXED: Add appendix tracking
135
+ "total_rows": len(table_rows),
136
+ "content_size": content_size,
137
+ "search_key": " | ".join(search_variations), # FIXED: Enhanced search key
138
+ "headers": " ".join(str(h) for h in table_data.get('headers', [])) # FIXED: Add headers as metadata
139
+ }
140
+ )
141
+
142
+ # Apply smart chunking if too large
143
+ if content_size > CHUNK_SIZE:
144
+ log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
145
+ return chunk_table_document(base_doc)
146
+ else:
147
+ log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов, doc_id={doc_id})")
148
+ return [base_doc]
149
+
150
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
151
  if chunk_size is None:
152
  chunk_size = CHUNK_SIZE
 
230
  )
231
  chunked_docs.append(chunked_doc)
232
 
233
+ return chunked_docs