MrSimple07 committed on
Commit
6370d73
·
1 Parent(s): afcac41

added sheet_name

Browse files
Files changed (2) hide show
  1. index_retriever.py +97 -63
  2. table_prep.py +94 -51
index_retriever.py CHANGED
@@ -13,77 +13,111 @@ def create_vector_index(documents):
13
  return VectorStoreIndex.from_documents(documents)
14
 
15
  def create_query_engine(vector_index):
16
- bm25_retriever = BM25Retriever.from_defaults(
17
- docstore=vector_index.docstore,
18
- similarity_top_k=80
19
- )
20
-
21
- vector_retriever = VectorIndexRetriever(
22
- index=vector_index,
23
- similarity_top_k=80,
24
- similarity_cutoff=0.45
25
- )
26
-
27
- hybrid_retriever = QueryFusionRetriever(
28
- [vector_retriever, bm25_retriever],
29
- similarity_top_k=100,
30
- num_queries=1
31
- )
32
-
33
- custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
34
- response_synthesizer = get_response_synthesizer(
35
- response_mode=ResponseMode.TREE_SUMMARIZE,
36
- text_qa_template=custom_prompt_template
37
- )
38
-
39
- query_engine = RetrieverQueryEngine(
40
- retriever=hybrid_retriever,
41
- response_synthesizer=response_synthesizer
42
- )
43
-
44
- return query_engine
 
 
 
 
 
 
45
 
46
-
47
- def rerank_nodes(query, nodes, reranker, top_k=40, min_score_threshold=0.35, diversity_penalty=0.15):
48
  if not nodes or not reranker:
49
  return nodes[:top_k]
50
 
51
- pairs = [[query, node.text] for node in nodes]
52
- scores = reranker.predict(pairs)
53
- scored_nodes = list(zip(nodes, scores))
54
- scored_nodes.sort(key=lambda x: x[1], reverse=True)
55
-
56
- if min_score_threshold:
57
- scored_nodes = [(node, score) for node, score in scored_nodes
58
- if score >= min_score_threshold]
59
-
60
- if not scored_nodes:
61
  scored_nodes = list(zip(nodes, scores))
 
62
  scored_nodes.sort(key=lambda x: x[1], reverse=True)
63
- scored_nodes = scored_nodes[:top_k]
64
-
65
- selected = []
66
- seen_docs = {}
67
-
68
- for node, score in scored_nodes:
69
- if len(selected) >= top_k:
70
- break
71
 
72
- meta = node.metadata if hasattr(node, 'metadata') else {}
73
- doc_id = meta.get('document_id', 'unknown')
74
- node_type = meta.get('type', 'text')
75
- table_num = meta.get('table_number', '')
 
76
 
77
- key = f"{doc_id}_{table_num}" if node_type == 'table' else f"{doc_id}_{meta.get('section_id', '')}"
 
 
 
 
 
 
78
 
79
- if key in seen_docs:
80
- penalty = diversity_penalty * 0.2 if node_type == 'table' else diversity_penalty
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  adjusted_score = score * (1 - penalty)
82
- else:
83
- adjusted_score = score
84
- seen_docs[key] = 1
 
 
85
 
86
- if not selected or adjusted_score >= selected[0][1] * 0.4:
87
- selected.append((node, score))
88
-
89
- return [node for node, score in selected]
 
 
 
 
 
 
 
 
 
13
  return VectorStoreIndex.from_documents(documents)
14
 
15
def create_query_engine(vector_index):
    """Build a hybrid (BM25 + dense vector) query engine over the given index.

    Two retrievers are fused: a keyword BM25 retriever over the index's
    docstore and a dense VectorIndexRetriever, each taking the top 30
    candidates. The fused result (top 40) is answered with a tree-summarize
    synthesizer driven by the custom PROMPT_SIMPLE_POISK template.

    Args:
        vector_index: A VectorStoreIndex exposing a ``docstore`` attribute.

    Returns:
        A RetrieverQueryEngine wired to the hybrid retriever.

    Raises:
        Exception: any construction error is logged and re-raised unchanged.
    """
    try:
        keyword_retriever = BM25Retriever.from_defaults(
            docstore=vector_index.docstore,
            similarity_top_k=30,
        )

        dense_retriever = VectorIndexRetriever(
            index=vector_index,
            similarity_top_k=30,
            similarity_cutoff=0.65,
        )

        # num_queries=1 disables LLM query generation: fusion only merges
        # the two retrievers' results for the original query.
        fused_retriever = QueryFusionRetriever(
            [dense_retriever, keyword_retriever],
            similarity_top_k=40,
            num_queries=1,
        )

        qa_prompt = PromptTemplate(PROMPT_SIMPLE_POISK)
        synthesizer = get_response_synthesizer(
            response_mode=ResponseMode.TREE_SUMMARIZE,
            text_qa_template=qa_prompt,
        )

        engine = RetrieverQueryEngine(
            retriever=fused_retriever,
            response_synthesizer=synthesizer,
        )

        log_message("Query engine успешно создан")
        return engine

    except Exception as exc:
        log_message(f"Ошибка создания query engine: {str(exc)}")
        raise
51
 
52
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.45, diversity_penalty=0.2):
    """Rerank retrieved nodes with a cross-encoder and diversify the result.

    Args:
        query: User query string (only the first 50 chars are logged).
        nodes: Retrieved nodes; each must expose ``.text`` and may expose
            ``.metadata`` (dict with 'document_id', 'type', 'full_table_id').
        reranker: Cross-encoder with a ``predict(pairs)`` method returning one
            relevance score per (query, text) pair.
        top_k: Maximum number of nodes to return.
        min_score_threshold: Minimum reranker score; ``None`` disables the
            filter. If filtering removes everything, the threshold is relaxed
            to ``max(0.3, best_score * 0.5)`` and filtering is retried.
        diversity_penalty: Per-repeat score penalty for nodes from an already
            selected document (total penalty capped at 0.5).

    Returns:
        Up to ``top_k`` nodes in descending reranker-score order. If the
        reranker raises, the first ``top_k`` input nodes are returned as-is.
    """
    if not nodes or not reranker:
        return nodes[:top_k]

    try:
        log_message(f"Переранжирую {len(nodes)} узлов для запроса: {query[:50]}...")

        pairs = [[query, node.text] for node in nodes]
        scores = reranker.predict(pairs)
        scored_nodes = list(zip(nodes, scores))
        scored_nodes.sort(key=lambda x: x[1], reverse=True)

        # Lower threshold for technical queries
        if min_score_threshold is not None:
            scored_nodes = [(node, score) for node, score in scored_nodes
                            if score >= min_score_threshold]
            log_message(f"После фильтрации (порог {min_score_threshold}): {len(scored_nodes)} узлов")

        if not scored_nodes:
            log_message("⚠️ Нет узлов после фильтрации, снижаю порог до 0.3")
            scored_nodes = list(zip(nodes, scores))
            scored_nodes.sort(key=lambda x: x[1], reverse=True)
            min_score_threshold = max(0.3, scored_nodes[0][1] * 0.5)
            scored_nodes = [(node, score) for node, score in scored_nodes
                            if score >= min_score_threshold]

        selected_nodes = []
        selected_docs = {}  # doc_id -> how many nodes already taken from it
        selected_tables = set()

        for node, score in scored_nodes:
            if len(selected_nodes) >= top_k:
                break

            metadata = node.metadata if hasattr(node, 'metadata') else {}
            doc_id = metadata.get('document_id', 'unknown')
            node_type = metadata.get('type', 'text')

            # Deduplicate chunks of the same table by full_table_id.
            # FIX: only dedupe on a non-empty id — previously all tables
            # missing 'full_table_id' collided on '' and every such table
            # after the first was silently dropped as a "duplicate".
            if node_type == 'table':
                table_id = metadata.get('full_table_id', '')
                if table_id:
                    if table_id in selected_tables:
                        continue  # Skip duplicate table chunks
                    selected_tables.add(table_id)

            # Apply lighter diversity penalty, growing with repeats per doc.
            penalty = 0
            doc_count = selected_docs.get(doc_id, 0)
            if doc_count > 0:
                penalty = min(diversity_penalty * doc_count, 0.5)

            adjusted_score = score * (1 - penalty)

            # Accept if still competitive with the best node selected so far.
            if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.5:
                selected_nodes.append((node, score))
                selected_docs[doc_id] = doc_count + 1

        log_message(f"✓ Выбрано {len(selected_nodes)} узлов")
        log_message(f" Уникальных документов: {len(selected_docs)}")
        log_message(f" Уникальных таблиц: {len(selected_tables)}")

        if selected_nodes:
            log_message(f" Score: {selected_nodes[0][1]:.3f} → {selected_nodes[-1][1]:.3f}")

        return [node for node, score in selected_nodes]

    except Exception as e:
        log_message(f"❌ Ошибка переранжировки: {str(e)}")
        return nodes[:top_k]
table_prep.py CHANGED
@@ -17,9 +17,8 @@ def normalize_table_number(table_num, section):
17
 
18
  return f"№{tn}"
19
 
20
-
21
  def create_table_content(table_data):
22
- """Create formatted content with strong contextual anchors"""
23
  doc_id = (
24
  table_data.get('document_id') or
25
  table_data.get('document') or
@@ -33,43 +32,48 @@ def create_table_content(table_data):
33
  table_data.get('Раздел документа') or
34
  'Неизвестно'
35
  )
 
36
 
37
- # Normalize table number
38
  normalized_num = normalize_table_number(table_num, section)
 
 
 
 
 
 
 
39
 
40
- # STRONG ANCHOR: Unique identification for semantic search
41
- content = f"=== ИСТОЧНИК ДАННЫХ ===\n"
42
- content += f"Документ: {doc_id}\n"
43
- content += f"Стандарт: {doc_id}\n"
44
  content += f"Раздел: {section}\n"
45
  content += f"Таблица: {normalized_num}\n"
46
- content += f"Полное название: {table_title}\n"
47
- content += f"Уникальный ID: {doc_id} | {section} | {normalized_num}\n"
48
- content += f"======================\n\n"
 
49
 
50
  headers = table_data.get('headers', [])
51
  if headers:
52
- content += f"Заголовки колонок: {' | '.join(str(h) for h in headers)}\n\n"
 
53
 
54
- # Structured row data with JSON-like clarity
55
  if 'data' in table_data and isinstance(table_data['data'], list):
56
- content += "Содержимое таблицы:\n"
57
  for row_idx, row in enumerate(table_data['data'], start=1):
58
  if isinstance(row, dict):
59
- # Add row identifier if available
60
- row_id = row.get('Условное обозначение сварного соединения',
61
- row.get('Обозначение', ''))
62
- if row_id:
63
- content += f"Строка {row_idx} ({row_id}): "
64
- else:
65
- content += f"Строка {row_idx}: "
66
 
67
- # Structured key-value pairs for better semantic understanding
68
- row_parts = [f"{k}={v}" for k, v in row.items() if v and str(v).strip()]
69
- content += " | ".join(row_parts) + "\n"
70
  elif isinstance(row, list):
71
- content += f"Строка {row_idx}: "
72
- content += " | ".join([str(v) for v in row if v and str(v).strip()]) + "\n"
 
73
 
74
  return content, normalized_num
75
 
@@ -83,28 +87,20 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
83
  table_num = doc.metadata.get('table_number', 'unknown')
84
  doc_id = doc.metadata.get('document_id', 'unknown')
85
  section = doc.metadata.get('section', 'Неизвестно')
86
- table_title = doc.metadata.get('table_title', 'Неизвестно')
87
 
88
- # Create unique anchor for this table
89
  full_table_id = f"{doc_id} | {section} | {table_num}"
90
 
91
  lines = doc.text.strip().split('\n')
92
 
93
- # Extract header (everything before data rows)
94
- table_header_lines = []
95
- data_rows = []
96
- in_data = False
97
-
98
- for line in lines:
99
- if line.startswith('Содержимое таблицы:'):
100
- in_data = True
101
- table_header_lines.append(line)
102
- elif in_data and line.startswith('Строка'):
103
- data_rows.append(line)
104
- elif not in_data:
105
- table_header_lines.append(line)
106
 
107
- table_header = '\n'.join(table_header_lines) + '\n'
 
108
 
109
  if not data_rows or len(doc.text) < chunk_size * 1.5:
110
  log_message(f" 📊 {full_table_id}: малая таблица, без разбиения")
@@ -113,7 +109,7 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
113
  log_message(f" 📋 {full_table_id}: {len(data_rows)} строк → chunking")
114
 
115
  header_size = len(table_header)
116
- available_size = chunk_size - header_size - 200 # More reserve for anchor
117
 
118
  text_chunks = []
119
  current_chunk_rows = []
@@ -123,11 +119,11 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
123
  row_size = len(row) + 1
124
 
125
  if current_size + row_size > available_size and current_chunk_rows:
126
- chunk_text = table_header + '\n'.join(current_chunk_rows)
127
  text_chunks.append(chunk_text)
128
 
129
- # Overlap: keep last 3 rows for better context
130
- overlap_count = min(3, len(current_chunk_rows))
131
  current_chunk_rows = current_chunk_rows[-overlap_count:]
132
  current_size = sum(len(r) + 1 for r in current_chunk_rows)
133
 
@@ -135,12 +131,11 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
135
  current_size += row_size
136
 
137
  if current_chunk_rows:
138
- chunk_text = table_header + '\n'.join(current_chunk_rows)
139
  text_chunks.append(chunk_text)
140
 
141
  log_message(f" ✂️ {full_table_id} → {len(text_chunks)} чанков")
142
 
143
- # Create chunks with strong anchors
144
  chunked_docs = []
145
  for i, chunk_text in enumerate(text_chunks):
146
  chunk_metadata = doc.metadata.copy()
@@ -149,11 +144,8 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
149
  "total_chunks": len(text_chunks),
150
  "chunk_size": len(chunk_text),
151
  "is_chunked": True,
152
- # CRITICAL: Add unique identifiers
153
  "full_table_id": full_table_id,
154
- "chunk_anchor": f"{full_table_id} | chunk_{i+1}/{len(text_chunks)}",
155
- "document_section": section,
156
- "table_number_normalized": table_num
157
  })
158
 
159
  chunked_doc = Document(
@@ -165,6 +157,57 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
165
  return chunked_docs
166
 
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  def table_to_document(table_data, document_id=None):
169
  """Convert table data to Document with proper metadata"""
170
  if not isinstance(table_data, dict):
 
17
 
18
  return f"№{tn}"
19
 
 
20
  def create_table_content(table_data):
21
+ """Create formatted content optimized for semantic search"""
22
  doc_id = (
23
  table_data.get('document_id') or
24
  table_data.get('document') or
 
32
  table_data.get('Раздел документа') or
33
  'Неизвестно'
34
  )
35
+ sheet_name = table_data.get('sheet_name', '')
36
 
37
+ # Enhanced table number with appendix context
38
  normalized_num = normalize_table_number(table_num, section)
39
+ if 'Приложени' in str(section):
40
+ # Extract appendix number
41
+ import re
42
+ appendix_match = re.search(r'Приложени[ея]\s*(\d+)', str(section))
43
+ if appendix_match:
44
+ appendix_num = appendix_match.group(1)
45
+ normalized_num = f"{normalized_num} Приложения {appendix_num}"
46
 
47
+ # Build searchable header
48
+ content = f"Документ: {doc_id}\n"
 
 
49
  content += f"Раздел: {section}\n"
50
  content += f"Таблица: {normalized_num}\n"
51
+ content += f"Название: {table_title}\n"
52
+ if sheet_name:
53
+ content += f"Лист: {sheet_name}\n"
54
+ content += f"\n"
55
 
56
  headers = table_data.get('headers', [])
57
  if headers:
58
+ header_str = ' | '.join(str(h) for h in headers)
59
+ content += f"Колонки: {header_str}\n\n"
60
 
61
+ # CRITICAL: Preserve searchable row identifiers
62
  if 'data' in table_data and isinstance(table_data['data'], list):
 
63
  for row_idx, row in enumerate(table_data['data'], start=1):
64
  if isinstance(row, dict):
65
+ # Extract ALL key-value pairs naturally
66
+ row_parts = []
67
+ for k, v in row.items():
68
+ if v and str(v).strip() and str(v) != 'nan':
69
+ row_parts.append(f"{k}: {v}")
 
 
70
 
71
+ if row_parts:
72
+ content += ' | '.join(row_parts) + "\n"
 
73
  elif isinstance(row, list):
74
+ row_str = ' | '.join([str(v) for v in row if v and str(v).strip() and str(v) != 'nan'])
75
+ if row_str:
76
+ content += row_str + "\n"
77
 
78
  return content, normalized_num
79
 
 
87
  table_num = doc.metadata.get('table_number', 'unknown')
88
  doc_id = doc.metadata.get('document_id', 'unknown')
89
  section = doc.metadata.get('section', 'Неизвестно')
 
90
 
 
91
  full_table_id = f"{doc_id} | {section} | {table_num}"
92
 
93
  lines = doc.text.strip().split('\n')
94
 
95
+ # Find where data rows start
96
+ data_start_idx = 0
97
+ for i, line in enumerate(lines):
98
+ if line.startswith('Колонки:'):
99
+ data_start_idx = i + 2 # Skip header and blank line
100
+ break
 
 
 
 
 
 
 
101
 
102
+ table_header = '\n'.join(lines[:data_start_idx])
103
+ data_rows = lines[data_start_idx:]
104
 
105
  if not data_rows or len(doc.text) < chunk_size * 1.5:
106
  log_message(f" 📊 {full_table_id}: малая таблица, без разбиения")
 
109
  log_message(f" 📋 {full_table_id}: {len(data_rows)} строк → chunking")
110
 
111
  header_size = len(table_header)
112
+ available_size = chunk_size - header_size - 100
113
 
114
  text_chunks = []
115
  current_chunk_rows = []
 
119
  row_size = len(row) + 1
120
 
121
  if current_size + row_size > available_size and current_chunk_rows:
122
+ chunk_text = table_header + '\n' + '\n'.join(current_chunk_rows)
123
  text_chunks.append(chunk_text)
124
 
125
+ # Keep last 2 rows for overlap
126
+ overlap_count = min(2, len(current_chunk_rows))
127
  current_chunk_rows = current_chunk_rows[-overlap_count:]
128
  current_size = sum(len(r) + 1 for r in current_chunk_rows)
129
 
 
131
  current_size += row_size
132
 
133
  if current_chunk_rows:
134
+ chunk_text = table_header + '\n' + '\n'.join(current_chunk_rows)
135
  text_chunks.append(chunk_text)
136
 
137
  log_message(f" ✂️ {full_table_id} → {len(text_chunks)} чанков")
138
 
 
139
  chunked_docs = []
140
  for i, chunk_text in enumerate(text_chunks):
141
  chunk_metadata = doc.metadata.copy()
 
144
  "total_chunks": len(text_chunks),
145
  "chunk_size": len(chunk_text),
146
  "is_chunked": True,
 
147
  "full_table_id": full_table_id,
148
+ "table_number_normalized": doc.metadata.get('table_number_normalized')
 
 
149
  })
150
 
151
  chunked_doc = Document(
 
157
  return chunked_docs
158
 
159
 
160
def table_to_document(table_data, document_id=None):
    """Convert one parsed table dict into indexable Document(s).

    Builds the searchable text via create_table_content(), attaches the
    table's metadata, and delegates oversized tables (text longer than
    CHUNK_SIZE) to chunk_table_document().

    NOTE(review): a second ``table_to_document`` appears to be defined later
    in this file; if so, that later definition shadows this one at import
    time — confirm and remove the duplicate.

    Args:
        table_data: Parsed table dict ('data', 'headers', id/section keys).
        document_id: Fallback document id used when the table carries none.

    Returns:
        List of Document objects (one, or several chunks); an empty list when
        ``table_data`` is not a dict or the table has no data rows.
    """
    if not isinstance(table_data, dict):
        return []

    # Prefer ids carried by the table itself, then the caller's fallback.
    own_doc_id = (
        table_data.get('document_id')
        or table_data.get('document')
        or table_data.get('Обозначение документа')
    )
    doc_id = own_doc_id or document_id or 'Неизвестно'

    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', table_data.get('Раздел документа', 'Неизвестно'))
    sheet_name = table_data.get('sheet_name', '')

    rows = table_data.get('data', [])
    if not rows:
        log_message(f"⚠️ Таблица {table_num} ({doc_id}) пропущена: нет данных")
        return []

    content, normalized_num = create_table_content(table_data)
    size = len(content)

    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "table_number_normalized": normalized_num,
            "table_title": table_title,
            "document_id": doc_id,
            "section": section,
            "section_id": section,
            "sheet_name": sheet_name,
            "total_rows": len(rows),
            "content_size": size,
            "full_table_id": f"{doc_id} | {section} | {normalized_num}",
        },
    )

    # Oversized tables are split into row-window chunks; small ones pass through.
    if size > CHUNK_SIZE:
        log_message(f"📊 CHUNKING: {doc_id} | {normalized_num} | {size} > {CHUNK_SIZE}")
        return chunk_table_document(base_doc)

    log_message(f"✓ {doc_id} | {normalized_num} ({size} символов)")
    return [base_doc]
209
+
210
+
211
  def table_to_document(table_data, document_id=None):
212
  """Convert table data to Document with proper metadata"""
213
  if not isinstance(table_data, dict):