MrSimple07 committed on
Commit
90e6b4c
·
1 Parent(s): f85ad1c

a new way with keywords

Browse files
Files changed (5) hide show
  1. documents_prep.py +68 -127
  2. index_retriever.py +139 -1
  3. requirements.txt +2 -1
  4. table_prep.py +108 -100
  5. utils.py +20 -10
documents_prep.py CHANGED
@@ -14,206 +14,147 @@ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
14
  chunk_size = CHUNK_SIZE
15
  if chunk_overlap is None:
16
  chunk_overlap = CHUNK_OVERLAP
 
 
 
 
 
17
 
18
- text = doc.text
19
 
20
- # Try to split by double newlines (paragraphs) first
21
- paragraphs = text.split('\n\n')
22
-
23
- chunks = []
24
- current_chunk = ""
25
-
26
- for para in paragraphs:
27
- para = para.strip()
28
- if not para:
29
- continue
30
-
31
- # If adding this paragraph exceeds limit, save current chunk
32
- if len(current_chunk) + len(para) + 2 > chunk_size and current_chunk:
33
- chunks.append(current_chunk.strip())
34
- # Add overlap from end of previous chunk
35
- overlap_text = current_chunk[-chunk_overlap:] if len(current_chunk) > chunk_overlap else current_chunk
36
- current_chunk = overlap_text + "\n\n" + para
37
- else:
38
- if current_chunk:
39
- current_chunk += "\n\n" + para
40
- else:
41
- current_chunk = para
42
-
43
- # Add last chunk
44
- if current_chunk:
45
- chunks.append(current_chunk.strip())
46
-
47
- # If single paragraph is too large, fall back to sentence splitting
48
- final_chunks = []
49
- for chunk_text in chunks:
50
- if len(chunk_text) > chunk_size:
51
- splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
52
- final_chunks.extend(splitter.split_text(chunk_text))
53
- else:
54
- final_chunks.append(chunk_text)
55
-
56
- log_message(f" ✂️ Текст разбит на {len(final_chunks)} семантических чанков")
57
-
58
- # Create documents
59
  chunked_docs = []
60
- for i, chunk_text in enumerate(final_chunks):
61
  chunk_metadata = doc.metadata.copy()
62
  chunk_metadata.update({
63
  "chunk_id": i,
64
- "total_chunks": len(final_chunks),
65
  "chunk_size": len(chunk_text),
66
- "is_chunked": True
67
  })
68
- chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))
 
 
 
 
 
69
 
70
  return chunked_docs
71
 
72
  def process_documents_with_chunking(documents):
73
- log_message("\n" + "="*60)
74
- log_message("🔄 НАЧАЛО ПРОЦЕССА ЧАНКИНГА")
75
- log_message("="*60)
76
-
77
  all_chunked_docs = []
78
  chunk_info = []
 
 
 
 
 
79
 
80
- # Counters
81
- table_whole_count = 0 # Целые таблицы (не нуждаются в чанкинге)
82
- table_chunked_count = 0 # Таблицы, которые УЖЕ разбиты
83
- image_whole_count = 0 # Целые изображения
84
- image_chunked_count = 0 # Изображения, разбитые на чанки
85
- text_whole_count = 0 # Целые текстовые документы
86
- text_chunked_count = 0 # Текстовые документы, разбитые на чанки
87
-
88
- for idx, doc in enumerate(documents):
89
  doc_type = doc.metadata.get('type', 'text')
90
  is_already_chunked = doc.metadata.get('is_chunked', False)
91
- doc_size = len(doc.text)
92
-
93
- log_message(f"\n📄 Документ {idx+1}/{len(documents)} | "
94
- f"Тип: {doc_type} | "
95
- f"Размер: {doc_size} | "
96
- f"Уже разбит: {is_already_chunked}")
97
 
98
  if doc_type == 'table':
99
  if is_already_chunked:
100
- # Таблица уже разбита на чанки в table_prep.py
101
- table_chunked_count += 1
102
  all_chunked_docs.append(doc)
103
- log_message(f" ✓ Таблица (чанк {doc.metadata.get('chunk_id', 0) + 1}/"
104
- f"{doc.metadata.get('total_chunks', 1)}) добавлена без изменений")
 
 
 
 
 
 
 
 
105
  else:
106
- # Целая таблица
107
- table_whole_count += 1
108
  all_chunked_docs.append(doc)
109
- log_message(f" ✓ Целая таблица добавлена | "
110
- f"Номер: {doc.metadata.get('table_number', 'unknown')}")
111
-
112
- chunk_info.append({
113
- 'document_id': doc.metadata.get('document_id', 'unknown'),
114
- 'section_id': doc.metadata.get('section_id', 'unknown'),
115
- 'chunk_id': doc.metadata.get('chunk_id', 0),
116
- 'total_chunks': doc.metadata.get('total_chunks', 1),
117
- 'chunk_size': doc_size,
118
- 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
119
- 'type': 'table',
120
- 'table_number': doc.metadata.get('table_number', 'unknown'),
121
- 'is_chunked': is_already_chunked
122
- })
123
 
124
  elif doc_type == 'image':
 
 
125
  if doc_size > CHUNK_SIZE:
126
- log_message(f" 📷 Изображение требует чанкинга | Размер: {doc_size} > {CHUNK_SIZE}")
 
127
  chunked_docs = chunk_document(doc)
128
- image_chunked_count += len(chunked_docs)
129
  all_chunked_docs.extend(chunked_docs)
 
130
 
131
- for chunk_doc in chunked_docs:
132
  chunk_info.append({
133
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
134
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
135
- 'chunk_id': chunk_doc.metadata.get('chunk_id', 0),
136
- 'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
137
  'chunk_size': len(chunk_doc.text),
138
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
139
  'type': 'image',
140
- 'image_number': chunk_doc.metadata.get('image_number', 'unknown'),
141
- 'is_chunked': True
142
  })
143
  else:
144
- image_whole_count += 1
145
  all_chunked_docs.append(doc)
146
- log_message(f" ✓ Целое изображение добавлено | Размер: {doc_size}")
147
-
148
  chunk_info.append({
149
  'document_id': doc.metadata.get('document_id', 'unknown'),
150
  'section_id': doc.metadata.get('section_id', 'unknown'),
151
  'chunk_id': 0,
152
- 'total_chunks': 1,
153
  'chunk_size': doc_size,
154
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
155
  'type': 'image',
156
- 'image_number': doc.metadata.get('image_number', 'unknown'),
157
- 'is_chunked': False
158
  })
159
 
160
- else: # text
 
161
  if doc_size > CHUNK_SIZE:
162
- log_message(f" 📝 Текст требует чанкинга | "
163
- f"Документ: {doc.metadata.get('document_id', 'unknown')} | "
164
- f"Раздел: {doc.metadata.get('section_id', 'unknown')} | "
165
  f"Размер: {doc_size} > {CHUNK_SIZE}")
166
-
167
  chunked_docs = chunk_document(doc)
168
- text_chunked_count += len(chunked_docs)
169
  all_chunked_docs.extend(chunked_docs)
 
170
 
171
- for chunk_doc in chunked_docs:
172
  chunk_info.append({
173
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
174
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
175
- 'chunk_id': chunk_doc.metadata.get('chunk_id', 0),
176
- 'total_chunks': chunk_doc.metadata.get('total_chunks', 1),
177
  'chunk_size': len(chunk_doc.text),
178
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
179
- 'type': 'text',
180
- 'is_chunked': True
181
  })
182
  else:
183
- text_whole_count += 1
184
  all_chunked_docs.append(doc)
185
- log_message(f" ✓ Целый текстовый документ добавлен | Размер: {doc_size}")
186
-
187
  chunk_info.append({
188
  'document_id': doc.metadata.get('document_id', 'unknown'),
189
  'section_id': doc.metadata.get('section_id', 'unknown'),
190
  'chunk_id': 0,
191
- 'total_chunks': 1,
192
  'chunk_size': doc_size,
193
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
194
- 'type': 'text',
195
- 'is_chunked': False
196
  })
197
 
198
  log_message(f"\n{'='*60}")
199
- log_message(f"📊 ИТОГОВАЯ СТАТИСТИКА ЧАНКИНГА:")
200
- log_message(f"{'='*60}")
201
- log_message(f" ТАБЛИЦЫ:")
202
- log_message(f" Целые (не нуждались в чанкинге): {table_whole_count}")
203
- log_message(f" Чанки (разбиты в table_prep.py): {table_chunked_count}")
204
- log_message(f" ИЗОБРАЖЕНИЯ:")
205
- log_message(f" Целые: {image_whole_count}")
206
- log_message(f" • Чанки: {image_chunked_count}")
207
- log_message(f" ТЕКСТ:")
208
- log_message(f" • Целые документы: {text_whole_count}")
209
- log_message(f" • Чанки: {text_chunked_count}")
210
- log_message(f" {'─'*58}")
211
- log_message(f" ВСЕГО ДОКУМЕНТОВ В ИНДЕКСЕ: {len(all_chunked_docs)}")
212
  log_message(f"{'='*60}\n")
213
 
214
  return all_chunked_docs, chunk_info
215
 
216
-
217
  def extract_text_from_json(data, document_id, document_name):
218
  documents = []
219
 
 
14
  chunk_size = CHUNK_SIZE
15
  if chunk_overlap is None:
16
  chunk_overlap = CHUNK_OVERLAP
17
+ text_splitter = SentenceSplitter(
18
+ chunk_size=chunk_size,
19
+ chunk_overlap=chunk_overlap,
20
+ separator=" "
21
+ )
22
 
23
+ text_chunks = text_splitter.split_text(doc.text)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  chunked_docs = []
26
+ for i, chunk_text in enumerate(text_chunks):
27
  chunk_metadata = doc.metadata.copy()
28
  chunk_metadata.update({
29
  "chunk_id": i,
30
+ "total_chunks": len(text_chunks),
31
  "chunk_size": len(chunk_text),
32
+ "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
33
  })
34
+
35
+ chunked_doc = Document(
36
+ text=chunk_text,
37
+ metadata=chunk_metadata
38
+ )
39
+ chunked_docs.append(chunked_doc)
40
 
41
  return chunked_docs
42
 
43
  def process_documents_with_chunking(documents):
 
 
 
 
44
  all_chunked_docs = []
45
  chunk_info = []
46
+ table_count = 0
47
+ table_chunks_count = 0
48
+ image_count = 0
49
+ image_chunks_count = 0
50
+ text_chunks_count = 0
51
 
52
+ for doc in documents:
 
 
 
 
 
 
 
 
53
  doc_type = doc.metadata.get('type', 'text')
54
  is_already_chunked = doc.metadata.get('is_chunked', False)
 
 
 
 
 
 
55
 
56
  if doc_type == 'table':
57
  if is_already_chunked:
58
+ table_chunks_count += 1
 
59
  all_chunked_docs.append(doc)
60
+ chunk_info.append({
61
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
62
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
63
+ 'chunk_id': doc.metadata.get('chunk_id', 0),
64
+ 'total_chunks': doc.metadata.get('total_chunks', 1),
65
+ 'chunk_size': len(doc.text),
66
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
67
+ 'type': 'table',
68
+ 'table_number': doc.metadata.get('table_number', 'unknown')
69
+ })
70
  else:
71
+ table_count += 1
 
72
  all_chunked_docs.append(doc)
73
+ chunk_info.append({
74
+ 'document_id': doc.metadata.get('document_id', 'unknown'),
75
+ 'section_id': doc.metadata.get('section_id', 'unknown'),
76
+ 'chunk_id': 0,
77
+ 'chunk_size': len(doc.text),
78
+ 'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
79
+ 'type': 'table',
80
+ 'table_number': doc.metadata.get('table_number', 'unknown')
81
+ })
 
 
 
 
 
82
 
83
  elif doc_type == 'image':
84
+ image_count += 1
85
+ doc_size = len(doc.text)
86
  if doc_size > CHUNK_SIZE:
87
+ log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number', 'unknown')} | "
88
+ f"Размер: {doc_size} > {CHUNK_SIZE}")
89
  chunked_docs = chunk_document(doc)
90
+ image_chunks_count += len(chunked_docs)
91
  all_chunked_docs.extend(chunked_docs)
92
+ log_message(f" ✂️ Разделено на {len(chunked_docs)} чанков")
93
 
94
+ for i, chunk_doc in enumerate(chunked_docs):
95
  chunk_info.append({
96
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
97
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
98
+ 'chunk_id': i,
 
99
  'chunk_size': len(chunk_doc.text),
100
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
101
  'type': 'image',
102
+ 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
 
103
  })
104
  else:
 
105
  all_chunked_docs.append(doc)
 
 
106
  chunk_info.append({
107
  'document_id': doc.metadata.get('document_id', 'unknown'),
108
  'section_id': doc.metadata.get('section_id', 'unknown'),
109
  'chunk_id': 0,
 
110
  'chunk_size': doc_size,
111
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
112
  'type': 'image',
113
+ 'image_number': doc.metadata.get('image_number', 'unknown')
 
114
  })
115
 
116
+ else:
117
+ doc_size = len(doc.text)
118
  if doc_size > CHUNK_SIZE:
119
+ log_message(f"📝 CHUNKING: Текст из '{doc.metadata.get('document_id', 'unknown')}' | "
 
 
120
  f"Размер: {doc_size} > {CHUNK_SIZE}")
 
121
  chunked_docs = chunk_document(doc)
122
+ text_chunks_count += len(chunked_docs)
123
  all_chunked_docs.extend(chunked_docs)
124
+ log_message(f" ✂️ Разделен на {len(chunked_docs)} чанков")
125
 
126
+ for i, chunk_doc in enumerate(chunked_docs):
127
  chunk_info.append({
128
  'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
129
  'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
130
+ 'chunk_id': i,
 
131
  'chunk_size': len(chunk_doc.text),
132
  'chunk_preview': chunk_doc.text[:200] + "..." if len(chunk_doc.text) > 200 else chunk_doc.text,
133
+ 'type': 'text'
 
134
  })
135
  else:
 
136
  all_chunked_docs.append(doc)
 
 
137
  chunk_info.append({
138
  'document_id': doc.metadata.get('document_id', 'unknown'),
139
  'section_id': doc.metadata.get('section_id', 'unknown'),
140
  'chunk_id': 0,
 
141
  'chunk_size': doc_size,
142
  'chunk_preview': doc.text[:200] + "..." if len(doc.text) > 200 else doc.text,
143
+ 'type': 'text'
 
144
  })
145
 
146
  log_message(f"\n{'='*60}")
147
+ log_message(f"ИТОГО ОБРАБОТАНО ДОКУМЕНТОВ:")
148
+ log_message(f" • Таблицы (целые): {table_count}")
149
+ log_message(f" • Таблицы (чанки): {table_chunks_count}")
150
+ log_message(f" Изображения (целые): {image_count - (image_chunks_count > 0)}")
151
+ log_message(f" Изображения (чанки): {image_chunks_count}")
152
+ log_message(f" • Текстовые чанки: {text_chunks_count}")
153
+ log_message(f" Всего документов: {len(all_chunked_docs)}")
 
 
 
 
 
 
154
  log_message(f"{'='*60}\n")
155
 
156
  return all_chunked_docs, chunk_info
157
 
 
158
  def extract_text_from_json(data, document_id, document_name):
159
  documents = []
160
 
index_retriever.py CHANGED
@@ -112,4 +112,142 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
112
 
113
  except Exception as e:
114
  log_message(f"Ошибка переранжировки: {str(e)}")
115
- return nodes[:top_k]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  except Exception as e:
114
  log_message(f"Ошибка переранжировки: {str(e)}")
115
+ return nodes[:top_k]
116
+
117
+
118
+ from rank_bm25 import BM25Okapi
119
+ import numpy as np
120
+
121
+ class HybridRetriever:
122
+ def __init__(self, vector_retriever, documents):
123
+ self.vector_retriever = vector_retriever
124
+ self.documents = documents
125
+
126
+ # Build BM25 index
127
+ tokenized_docs = [doc.text.lower().split() for doc in documents]
128
+ self.bm25 = BM25Okapi(tokenized_docs)
129
+
130
+ # Build metadata index for exact matching
131
+ self.metadata_index = self._build_metadata_index(documents)
132
+
133
+ def _build_metadata_index(self, documents):
134
+ """Index by materials, GOSTs, classes for exact matching"""
135
+ index = {
136
+ 'materials': {},
137
+ 'gosts': {},
138
+ 'classes': {},
139
+ 'key_terms': {}
140
+ }
141
+
142
+ for i, doc in enumerate(documents):
143
+ metadata = doc.metadata
144
+
145
+ # Index materials
146
+ for material in metadata.get('materials', []):
147
+ if material not in index['materials']:
148
+ index['materials'][material] = []
149
+ index['materials'][material].append(i)
150
+
151
+ # Index GOSTs
152
+ for gost in metadata.get('gosts', []):
153
+ if gost not in index['gosts']:
154
+ index['gosts'][gost] = []
155
+ index['gosts'][gost].append(i)
156
+
157
+ # Index classes
158
+ for cls in metadata.get('classes', []):
159
+ if cls not in index['classes']:
160
+ index['classes'][cls] = []
161
+ index['classes'][cls].append(i)
162
+
163
+ # Index key terms
164
+ for term in metadata.get('key_terms', []):
165
+ term_lower = term.lower()
166
+ if term_lower not in index['key_terms']:
167
+ index['key_terms'][term_lower] = []
168
+ index['key_terms'][term_lower].append(i)
169
+
170
+ return index
171
+
172
+ def retrieve(self, query, top_k=20, vector_weight=0.5, bm25_weight=0.3, metadata_weight=0.2):
173
+ """Hybrid retrieval combining vector, BM25, and metadata matching"""
174
+
175
+ # 1. Vector search
176
+ vector_results = self.vector_retriever.retrieve(query)
177
+ vector_scores = {node.node_id: node.score for node in vector_results}
178
+
179
+ # 2. BM25 search
180
+ tokenized_query = query.lower().split()
181
+ bm25_scores = self.bm25.get_scores(tokenized_query)
182
+
183
+ # 3. Metadata exact matching
184
+ metadata_scores = self._get_metadata_scores(query)
185
+
186
+ # 4. Combine scores
187
+ all_node_ids = set(list(vector_scores.keys()) +
188
+ list(range(len(self.documents))))
189
+
190
+ combined_scores = {}
191
+ for node_id in all_node_ids:
192
+ vec_score = vector_scores.get(node_id, 0.0)
193
+ bm25_score = bm25_scores[node_id] if isinstance(node_id, int) and node_id < len(bm25_scores) else 0.0
194
+ meta_score = metadata_scores.get(node_id, 0.0)
195
+
196
+ # Normalize and combine
197
+ combined_scores[node_id] = (
198
+ vector_weight * vec_score +
199
+ bm25_weight * (bm25_score / (max(bm25_scores) + 1e-6)) +
200
+ metadata_weight * meta_score
201
+ )
202
+
203
+ # 5. Get top-k
204
+ sorted_nodes = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
205
+
206
+ # Return as node objects
207
+ results = []
208
+ for node_id, score in sorted_nodes:
209
+ if isinstance(node_id, int) and node_id < len(self.documents):
210
+ doc = self.documents[node_id]
211
+ # Create node-like object
212
+ from types import SimpleNamespace
213
+ node = SimpleNamespace(
214
+ text=doc.text,
215
+ metadata=doc.metadata,
216
+ score=score,
217
+ node_id=node_id
218
+ )
219
+ results.append(node)
220
+
221
+ return results
222
+
223
+ def _get_metadata_scores(self, query):
224
+ """Score documents by exact metadata matches"""
225
+ scores = {}
226
+ query_lower = query.lower()
227
+
228
+ # Check for material codes
229
+ import re
230
+ material_pattern = r'\b\d{2}[ХНТМКВБА]+\d{1,2}[ХНТМКВБА]*\d*\b'
231
+ materials_in_query = re.findall(material_pattern, query, re.IGNORECASE)
232
+
233
+ for material in materials_in_query:
234
+ if material in self.metadata_index['materials']:
235
+ for doc_id in self.metadata_index['materials'][material]:
236
+ scores[doc_id] = scores.get(doc_id, 0) + 1.0
237
+
238
+ # Check for GOSTs
239
+ gost_pattern = r'ГОСТ\s+[РЕ��\s]*\d+[\.\-\d]*'
240
+ gosts_in_query = re.findall(gost_pattern, query, re.IGNORECASE)
241
+
242
+ for gost in gosts_in_query:
243
+ if gost in self.metadata_index['gosts']:
244
+ for doc_id in self.metadata_index['gosts'][gost]:
245
+ scores[doc_id] = scores.get(doc_id, 0) + 0.8
246
+
247
+ # Check for key terms
248
+ for term, doc_ids in self.metadata_index['key_terms'].items():
249
+ if term in query_lower:
250
+ for doc_id in doc_ids:
251
+ scores[doc_id] = scores.get(doc_id, 0) + 0.5
252
+
253
+ return scores
requirements.txt CHANGED
@@ -15,4 +15,5 @@ openpyxl
15
  llama-index-llms-openai
16
  llama-index-vector-stores-faiss
17
  llama-index-retrievers-bm25
18
- tiktoken
 
 
15
  llama-index-llms-openai
16
  llama-index-vector-stores-faiss
17
  llama-index-retrievers-bm25
18
+ tiktoken
19
+ rank-bm25
table_prep.py CHANGED
@@ -32,80 +32,93 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- def create_table_chunks_with_headers(table_data, rows_per_chunk=10):
36
- """
37
- Intelligently chunk tables by preserving headers and grouping rows
38
- """
39
- doc_id = table_data.get('document_id') or table_data.get('document', 'Неизвестно')
40
- table_num = table_data.get('table_number', 'Неизвестно')
41
- table_title = table_data.get('table_title', 'Неизвестно')
42
- section = table_data.get('section', 'Неизвестно')
43
- headers = table_data.get('headers', [])
44
- table_rows = table_data.get('data', [])
45
-
46
- if not table_rows:
47
- return []
48
-
49
- # Create header string that will be included in EVERY chunk
50
- header_context = f"Таблица {table_num}: {table_title}\n"
51
- header_context += f"Документ: {doc_id}\n"
52
- header_context += f"Раздел: {section}\n"
53
- if headers:
54
- header_context += f"Заголовки: {' | '.join(headers)}\n"
55
- header_context += f"Всего строк в таблице: {len(table_rows)}\n\n"
56
 
57
- # Calculate optimal rows per chunk based on content size
58
- avg_row_size = sum(len(str(row)) for row in table_rows[:5]) / min(5, len(table_rows))
59
- max_chunk_size = CHUNK_SIZE - len(header_context) - 500 # Safety margin
60
- optimal_rows = max(5, int(max_chunk_size / avg_row_size))
61
 
62
- log_message(f" 📐 Средний размер строки: {avg_row_size:.0f} символов")
63
- log_message(f" 📊 Оптимальное кол-во строк на чанк: {optimal_rows}")
 
 
 
64
 
65
- chunks = []
66
- total_rows = len(table_rows)
67
 
68
- for i in range(0, total_rows, optimal_rows):
69
- chunk_rows = table_rows[i:i + optimal_rows]
70
-
71
- # Build chunk content
72
- chunk_content = header_context
73
- chunk_content += f"[Строки {i+1}-{min(i+optimal_rows, total_rows)} из {total_rows}]\n"
74
- chunk_content += "Данные:\n"
75
 
76
- for row_idx, row in enumerate(chunk_rows, start=i+1):
77
- if isinstance(row, dict):
78
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
79
- chunk_content += f"Строка {row_idx}: {row_text}\n"
80
-
81
- chunk_metadata = {
82
- "type": "table",
83
- "table_number": table_num,
84
- "table_title": table_title,
85
- "document_id": doc_id,
86
- "section": section,
87
- "section_id": section,
88
- "headers": headers,
89
- "chunk_id": i // optimal_rows,
90
- "total_chunks": (total_rows + optimal_rows - 1) // optimal_rows,
91
- "row_range": f"{i+1}-{min(i+optimal_rows, total_rows)}",
92
- "total_table_rows": total_rows,
93
- "is_chunked": True
94
- }
95
 
96
- doc = Document(text=chunk_content, metadata=chunk_metadata)
97
- chunks.append(doc)
 
 
 
 
98
 
99
- log_message(f" Чанк {len(chunks)}: строки {i+1}-{min(i+optimal_rows, total_rows)} | "
100
- f"{len(chunk_content)} символов")
 
 
 
101
 
102
- return chunks
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  def table_to_document(table_data, document_id=None):
106
- """
107
- Convert table to Document(s) with intelligent chunking
108
- """
109
  if not isinstance(table_data, dict):
110
  log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
111
  return []
@@ -116,46 +129,41 @@ def table_to_document(table_data, document_id=None):
116
  section = table_data.get('section', 'Неизвестно')
117
 
118
  table_rows = table_data.get('data', [])
119
- if not table_rows:
120
- log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных")
121
  return []
122
 
123
- log_message(f"\n📊 Обработка таблицы {table_num} из документа '{doc_id}'")
124
- log_message(f" Название: {table_title}")
125
- log_message(f" Раздел: {section}")
126
- log_message(f" Строк данных: {len(table_rows)}")
127
-
128
- # Estimate if table needs chunking
129
- sample_content = create_table_content(table_data)
130
- estimated_size = len(sample_content)
131
-
132
- log_message(f" Оценочный размер: {estimated_size} символов")
133
-
134
- # Threshold: if table is small enough, keep it whole
135
- if estimated_size <= CHUNK_SIZE * 0.8: # 80% of limit for safety
136
- log_message(f" Таблица достаточно мала, хранится целиком")
137
- doc = Document(
138
- text=sample_content,
139
- metadata={
140
- "type": "table",
141
- "table_number": table_num,
142
- "table_title": table_title,
143
- "document_id": doc_id,
144
- "section": section,
145
- "section_id": section,
146
- "headers": table_data.get('headers', []),
147
- "total_rows": len(table_rows),
148
- "content_size": estimated_size,
149
- "is_chunked": False
150
- }
151
- )
152
- return [doc]
153
  else:
154
- log_message(f" ⚠️ Таблица слишком большая ({estimated_size} > {CHUNK_SIZE})")
155
- log_message(f" 🔄 Применяется умный чанкинг с сохранением заголовков...")
156
- chunks = create_table_chunks_with_headers(table_data)
157
- log_message(f" ✅ Таблица разбита на {len(chunks)} чанков с сохранением структуры")
158
- return chunks
159
 
160
  def load_table_data(repo_id, hf_token, table_data_dir):
161
  log_message("=" * 60)
 
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
+ # In table_prep.py - replace chunk_table_document function
36
+
37
+ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
38
+ if chunk_size is None:
39
+ chunk_size = CHUNK_SIZE
40
+ if chunk_overlap is None:
41
+ chunk_overlap = CHUNK_OVERLAP
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ # Extract critical metadata from table before chunking
44
+ table_metadata = extract_table_metadata(doc.text)
 
 
45
 
46
+ text_splitter = SentenceSplitter(
47
+ chunk_size=chunk_size,
48
+ chunk_overlap=chunk_overlap,
49
+ separator="\n"
50
+ )
51
 
52
+ text_chunks = text_splitter.split_text(doc.text)
 
53
 
54
+ chunked_docs = []
55
+ for i, chunk_text in enumerate(text_chunks):
56
+ chunk_metadata = doc.metadata.copy()
 
 
 
 
57
 
58
+ # Add extracted keywords/materials to each chunk
59
+ chunk_metadata.update({
60
+ "chunk_id": i,
61
+ "total_chunks": len(text_chunks),
62
+ "chunk_size": len(chunk_text),
63
+ "is_chunked": True,
64
+ "materials": table_metadata.get("materials", []), # All materials from table
65
+ "key_terms": table_metadata.get("key_terms", []), # Technical terms
66
+ "table_summary": table_metadata.get("summary", "") # Brief table description
67
+ })
 
 
 
 
 
 
 
 
 
68
 
69
+ # Enrich chunk text with context from full table
70
+ enriched_text = f"""[Таблица {doc.metadata.get('table_number')}: {doc.metadata.get('table_title')}]
71
+ [Материалы в таблице: {', '.join(table_metadata.get('materials', [])[:10])}]
72
+ [Ключевые термины: {', '.join(table_metadata.get('key_terms', [])[:10])}]
73
+
74
+ {chunk_text}"""
75
 
76
+ chunked_doc = Document(
77
+ text=enriched_text,
78
+ metadata=chunk_metadata
79
+ )
80
+ chunked_docs.append(chunked_doc)
81
 
82
+ return chunked_docs
83
+
84
 
85
+ def extract_table_metadata(table_text):
86
+ """Extract searchable metadata from table content"""
87
+ import re
88
+
89
+ # Extract material codes (e.g., 08Х18Н10Т)
90
+ material_pattern = r'\b\d{2}[ХНТМКВБА]+\d{1,2}[ХНТМКВБА]*\d*\b'
91
+ materials = list(set(re.findall(material_pattern, table_text, re.IGNORECASE)))
92
+
93
+ # Extract GOST standards
94
+ gost_pattern = r'ГОСТ\s+[РЕН\s]*\d+[\.\-\d]*'
95
+ gosts = list(set(re.findall(gost_pattern, table_text, re.IGNORECASE)))
96
+
97
+ # Extract class/category codes
98
+ class_pattern = r'\b\d[АБВСI]+[IVX]+[a-z]*\b'
99
+ classes = list(set(re.findall(class_pattern, table_text, re.IGNORECASE)))
100
+
101
+ # Extract common technical terms
102
+ tech_terms = []
103
+ keywords = ['контроль', 'испытание', 'сертификат', 'качество', 'план',
104
+ 'полуфабрикат', 'оборудование', 'арматура', 'деталь']
105
+ for keyword in keywords:
106
+ if keyword.lower() in table_text.lower():
107
+ tech_terms.append(keyword)
108
+
109
+ # Create brief summary
110
+ lines = table_text.split('\n')[:5]
111
+ summary = ' '.join([l.strip() for l in lines if l.strip()])[:200]
112
+
113
+ return {
114
+ "materials": materials,
115
+ "gosts": gosts,
116
+ "classes": classes,
117
+ "key_terms": tech_terms + gosts,
118
+ "summary": summary
119
+ }
120
 
121
  def table_to_document(table_data, document_id=None):
 
 
 
122
  if not isinstance(table_data, dict):
123
  log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
124
  return []
 
129
  section = table_data.get('section', 'Неизвестно')
130
 
131
  table_rows = table_data.get('data', [])
132
+ if not table_rows or len(table_rows) == 0:
133
+ log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
134
  return []
135
 
136
+ content = create_table_content(table_data)
137
+ content_size = len(content)
138
+ row_count = len(table_rows)
139
+
140
+ base_doc = Document(
141
+ text=content,
142
+ metadata={
143
+ "type": "table",
144
+ "table_number": table_num,
145
+ "table_title": table_title,
146
+ "document_id": doc_id,
147
+ "section": section,
148
+ "section_id": section,
149
+ "total_rows": row_count,
150
+ "content_size": content_size
151
+ }
152
+ )
153
+
154
+ if content_size > CHUNK_SIZE:
155
+ log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
156
+ f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
157
+ chunked_docs = chunk_table_document(base_doc)
158
+ log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
159
+ for i, chunk_doc in enumerate(chunked_docs):
160
+ log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
161
+ return chunked_docs
 
 
 
 
162
  else:
163
+ log_message(f" ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
164
+ f"Размер: {content_size} символов | Строк: {row_count}")
165
+ return [base_doc]
166
+
 
167
 
168
  def load_table_data(repo_id, hf_token, table_data_dir):
169
  log_message("=" * 60)
utils.py CHANGED
@@ -21,9 +21,11 @@ def get_llm_model(model_name):
21
  raise Exception(f"API ключ не найден для модели {model_name}")
22
 
23
  if model_config["provider"] == "google":
 
24
  return GoogleGenAI(
25
  model=model_config["model_name"],
26
- api_key=model_config["api_key"]
 
27
  )
28
  elif model_config["provider"] == "openai":
29
  return OpenAI(
@@ -35,7 +37,11 @@ def get_llm_model(model_name):
35
 
36
  except Exception as e:
37
  log_message(f"Ошибка создания модели {model_name}: {str(e)}")
38
- return GoogleGenAI(model="gemini-2.0-flash", api_key=GOOGLE_API_KEY)
 
 
 
 
39
 
40
  def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
41
  return HuggingFaceEmbedding(model_name=model_name)
@@ -225,7 +231,7 @@ def generate_sources_html(nodes, chunks_df=None):
225
 
226
  html += "</div>"
227
  return html
228
- def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
229
  if query_engine is None:
230
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
231
 
@@ -234,18 +240,22 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
234
 
235
  llm = get_llm_model(current_model)
236
 
237
- # Direct retrieval without query expansion
238
- retrieved_nodes = query_engine.retriever.retrieve(question)
239
-
240
- log_message(f"Получено {len(retrieved_nodes)} узлов")
 
 
 
241
 
 
242
  reranked_nodes = rerank_nodes(
243
  question,
244
  retrieved_nodes,
245
  reranker,
246
- top_k=20,
247
- min_score_threshold=0.5,
248
- diversity_penalty=0.3
249
  )
250
 
251
  formatted_context = format_context_for_llm(reranked_nodes)
 
21
  raise Exception(f"API ключ не найден для модели {model_name}")
22
 
23
  if model_config["provider"] == "google":
24
+ # Fix: Remove image_config parameter or set it properly
25
  return GoogleGenAI(
26
  model=model_config["model_name"],
27
+ api_key=model_config["api_key"],
28
+ # Don't pass image_config=None
29
  )
30
  elif model_config["provider"] == "openai":
31
  return OpenAI(
 
37
 
38
  except Exception as e:
39
  log_message(f"Ошибка создания модели {model_name}: {str(e)}")
40
+ # Fix: Also apply to fallback model
41
+ return GoogleGenAI(
42
+ model="gemini-2.0-flash",
43
+ api_key=GOOGLE_API_KEY
44
+ )
45
 
46
  def get_embedding_model(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
47
  return HuggingFaceEmbedding(model_name=model_name)
 
231
 
232
  html += "</div>"
233
  return html
234
+ def answer_question(question, query_engine, reranker, current_model, chunks_df=None, hybrid_retriever=None):
235
  if query_engine is None:
236
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
237
 
 
240
 
241
  llm = get_llm_model(current_model)
242
 
243
+ # Use hybrid retriever if available
244
+ if hybrid_retriever:
245
+ retrieved_nodes = hybrid_retriever.retrieve(question, top_k=30)
246
+ log_message(f"Hybrid retrieval: получено {len(retrieved_nodes)} узлов")
247
+ else:
248
+ retrieved_nodes = query_engine.retriever.retrieve(question)
249
+ log_message(f"Vector retrieval: получено {len(retrieved_nodes)} узлов")
250
 
251
+ # Rerank with increased top_k
252
  reranked_nodes = rerank_nodes(
253
  question,
254
  retrieved_nodes,
255
  reranker,
256
+ top_k=25, # Increased from 20
257
+ min_score_threshold=0.3, # Lowered from 0.5 to catch more results
258
+ diversity_penalty=0.2 # Reduced penalty
259
  )
260
 
261
  formatted_context = format_context_for_llm(reranked_nodes)