MrSimple07 committed on
Commit
aa38fcf
·
1 Parent(s): eefdfd0

table prep changed

Browse files
Files changed (3) hide show
  1. index_retriever.py +2 -116
  2. table_prep.py +102 -53
  3. utils.py +7 -8
index_retriever.py CHANGED
@@ -16,7 +16,7 @@ def create_query_engine(vector_index):
16
  try:
17
  bm25_retriever = BM25Retriever.from_defaults(
18
  docstore=vector_index.docstore,
19
- similarity_top_k=20
20
  )
21
 
22
  vector_retriever = VectorIndexRetriever(
@@ -49,119 +49,6 @@ def create_query_engine(vector_index):
49
  log_message(f"Ошибка создания query engine: {str(e)}")
50
  raise
51
 
52
- import re
53
- from typing import List, Dict, Set
54
- from my_logging import log_message
55
-
56
- def extract_keywords_from_query(query: str) -> Dict[str, List[str]]:
57
- """Extract technical keywords from query"""
58
- keywords = {
59
- 'materials': [],
60
- 'gosts': [],
61
- 'classes': [],
62
- 'technical_terms': []
63
- }
64
-
65
- # Material codes: 08Х18Н10Т, 12Х18Н10Т, etc.
66
- material_pattern = r'\b\d{2}[ХНТМКВБА]+\d{1,2}[ХНТМКВБА]*\d*\b'
67
- keywords['materials'] = re.findall(material_pattern, query, re.IGNORECASE)
68
-
69
- # GOST standards
70
- gost_pattern = r'ГОСТ\s+[РЕН\s]*\d+[\.\-\d]*'
71
- keywords['gosts'] = re.findall(gost_pattern, query, re.IGNORECASE)
72
-
73
- # Classification codes: 3СIIIa, 1А, 2BII, etc.
74
- class_pattern = r'\b\d[АБВГСD]+[IV]+[a-z]?\b'
75
- keywords['classes'] = re.findall(class_pattern, query, re.IGNORECASE)
76
-
77
- # Technical terms
78
- terms = ['полуфабрикат', 'план качества', 'контроль', 'арматура',
79
- 'ультразвуковой', 'сварка', 'испытание']
80
- for term in terms:
81
- if term.lower() in query.lower():
82
- keywords['technical_terms'].append(term)
83
-
84
- return keywords
85
-
86
- def keyword_search_nodes(nodes: List, keywords: Dict[str, List[str]]) -> List:
87
- """Filter nodes by exact keyword matches"""
88
- if not any(keywords.values()):
89
- return nodes
90
-
91
- matched_nodes = []
92
-
93
- for node in nodes:
94
- text_lower = node.text.lower()
95
- metadata = node.metadata if hasattr(node, 'metadata') else {}
96
-
97
- # Check materials
98
- for material in keywords['materials']:
99
- if material.lower() in text_lower:
100
- matched_nodes.append(node)
101
- break
102
- else:
103
- # Check GOSTs
104
- for gost in keywords['gosts']:
105
- if gost.lower() in text_lower:
106
- matched_nodes.append(node)
107
- break
108
- else:
109
- # Check classes
110
- for cls in keywords['classes']:
111
- if cls.lower() in text_lower:
112
- matched_nodes.append(node)
113
- break
114
- else:
115
- # Check technical terms (at least 2 matches)
116
- term_matches = sum(1 for term in keywords['technical_terms']
117
- if term.lower() in text_lower)
118
- if term_matches >= 2:
119
- matched_nodes.append(node)
120
-
121
- return matched_nodes
122
-
123
- def hybrid_retrieve_with_keywords(question: str, query_engine, top_k: int = 40) -> List:
124
- """Retrieve using both vector search and keyword matching"""
125
-
126
- # Extract keywords from query
127
- keywords = extract_keywords_from_query(question)
128
- log_message(f"Извлечены ключевые слова: {keywords}")
129
-
130
- # Get vector search results
131
- vector_nodes = query_engine.retriever.retrieve(question)
132
- log_message(f"Векторный поиск: {len(vector_nodes)} узлов")
133
-
134
- # Apply keyword filtering
135
- if any(keywords.values()):
136
- keyword_nodes = keyword_search_nodes(vector_nodes, keywords)
137
- log_message(f"После фильтрации по ключевым словам: {len(keyword_nodes)} узлов")
138
-
139
- # If keyword search found results, prioritize them
140
- if keyword_nodes:
141
- # Deduplicate and combine
142
- seen_ids = set()
143
- combined_nodes = []
144
-
145
- # First add keyword matches
146
- for node in keyword_nodes[:top_k]:
147
- node_id = id(node)
148
- if node_id not in seen_ids:
149
- combined_nodes.append(node)
150
- seen_ids.add(node_id)
151
-
152
- # Then fill with vector results
153
- for node in vector_nodes:
154
- if len(combined_nodes) >= top_k:
155
- break
156
- node_id = id(node)
157
- if node_id not in seen_ids:
158
- combined_nodes.append(node)
159
- seen_ids.add(node_id)
160
-
161
- return combined_nodes[:top_k]
162
-
163
- return vector_nodes[:top_k]
164
-
165
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
166
  if not nodes or not reranker:
167
  return nodes[:top_k]
@@ -225,5 +112,4 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, dive
225
 
226
  except Exception as e:
227
  log_message(f"Ошибка переранжировки: {str(e)}")
228
- return nodes[:top_k]
229
-
 
16
  try:
17
  bm25_retriever = BM25Retriever.from_defaults(
18
  docstore=vector_index.docstore,
19
+ similarity_top_k=30
20
  )
21
 
22
  vector_retriever = VectorIndexRetriever(
 
49
  log_message(f"Ошибка создания query engine: {str(e)}")
50
  raise
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
53
  if not nodes or not reranker:
54
  return nodes[:top_k]
 
112
 
113
  except Exception as e:
114
  log_message(f"Ошибка переранжировки: {str(e)}")
115
+ return nodes[:top_k]
 
table_prep.py CHANGED
@@ -32,7 +32,21 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- # In table_prep.py - replace chunk_table_document function
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
38
  if chunk_size is None:
@@ -42,37 +56,109 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
42
 
43
  # Extract critical metadata from table before chunking
44
  table_metadata = extract_table_metadata(doc.text)
 
 
 
 
45
 
46
- text_splitter = SentenceSplitter(
47
- chunk_size=chunk_size,
48
- chunk_overlap=chunk_overlap,
49
- separator="\n"
50
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- text_chunks = text_splitter.split_text(doc.text)
53
 
 
54
  chunked_docs = []
 
 
 
55
  for i, chunk_text in enumerate(text_chunks):
56
  chunk_metadata = doc.metadata.copy()
57
-
58
- # Add extracted keywords/materials to each chunk
59
  chunk_metadata.update({
60
  "chunk_id": i,
61
  "total_chunks": len(text_chunks),
62
  "chunk_size": len(chunk_text),
63
  "is_chunked": True,
64
- "materials": table_metadata.get("materials", []), # All materials from table
65
- "key_terms": table_metadata.get("key_terms", []), # Technical terms
66
- "table_summary": table_metadata.get("summary", "") # Brief table description
67
  })
68
 
69
- # Enrich chunk text with context from full table
70
- enriched_text = f"""[Таблица {doc.metadata.get('table_number')}: {doc.metadata.get('table_title')}]
71
- [Материалы в таблице: {', '.join(table_metadata.get('materials', [])[:10])}]
72
- [Ключевые термины: {', '.join(table_metadata.get('key_terms', [])[:10])}]
 
 
 
73
 
74
  {chunk_text}"""
75
 
 
 
 
 
 
76
  chunked_doc = Document(
77
  text=enriched_text,
78
  metadata=chunk_metadata
@@ -81,43 +167,6 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
81
 
82
  return chunked_docs
83
 
84
-
85
- def extract_table_metadata(table_text):
86
- """Extract searchable metadata from table content"""
87
- import re
88
-
89
- # Extract material codes (e.g., 08Х18Н10Т)
90
- material_pattern = r'\b\d{2}[ХНТМКВБА]+\d{1,2}[ХНТМКВБА]*\d*\b'
91
- materials = list(set(re.findall(material_pattern, table_text, re.IGNORECASE)))
92
-
93
- # Extract GOST standards
94
- gost_pattern = r'ГОСТ\s+[РЕН\s]*\d+[\.\-\d]*'
95
- gosts = list(set(re.findall(gost_pattern, table_text, re.IGNORECASE)))
96
-
97
- # Extract class/category codes
98
- class_pattern = r'\b\d[АБВСI]+[IVX]+[a-z]*\b'
99
- classes = list(set(re.findall(class_pattern, table_text, re.IGNORECASE)))
100
-
101
- # Extract common technical terms
102
- tech_terms = []
103
- keywords = ['контроль', 'испытание', 'сертификат', 'качество', 'план',
104
- 'полуфабрикат', 'оборудование', 'арматура', 'деталь']
105
- for keyword in keywords:
106
- if keyword.lower() in table_text.lower():
107
- tech_terms.append(keyword)
108
-
109
- # Create brief summary
110
- lines = table_text.split('\n')[:5]
111
- summary = ' '.join([l.strip() for l in lines if l.strip()])[:200]
112
-
113
- return {
114
- "materials": materials,
115
- "gosts": gosts,
116
- "classes": classes,
117
- "key_terms": tech_terms + gosts,
118
- "summary": summary
119
- }
120
-
121
  def table_to_document(table_data, document_id=None):
122
  if not isinstance(table_data, dict):
123
  log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
 
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
+ def extract_table_metadata(table_text: str) -> dict:
36
+ words = table_text.split()
37
+ unique_words = set(words)
38
+
39
+ from collections import Counter
40
+ stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
41
+ filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
42
+ common = Counter(filtered).most_common(15)
43
+ key_terms = [w for w, _ in common]
44
+
45
+ return {
46
+ "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
47
+ "materials": [], # if you want to extract material names, hook in regex or LLM here
48
+ "key_terms": key_terms
49
+ }
50
 
51
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
52
  if chunk_size is None:
 
56
 
57
  # Extract critical metadata from table before chunking
58
  table_metadata = extract_table_metadata(doc.text)
59
+ table_num = doc.metadata.get('table_number', 'unknown')
60
+ table_title = doc.metadata.get('table_title', 'unknown')
61
+ doc_id = doc.metadata.get('document_id', 'unknown')
62
+ section = doc.metadata.get('section', 'unknown')
63
 
64
+ # Parse table structure from your create_table_content format
65
+ lines = doc.text.strip().split('\n')
66
+
67
+ # Find where data rows start
68
+ table_header_lines = []
69
+ data_rows = []
70
+ in_data = False
71
+
72
+ for line in lines:
73
+ if line.startswith('Данные таблицы:'):
74
+ in_data = True
75
+ table_header_lines.append(line)
76
+ elif in_data and line.startswith('Строка'):
77
+ data_rows.append(line)
78
+ elif not in_data:
79
+ table_header_lines.append(line)
80
+
81
+ table_header = '\n'.join(table_header_lines) + '\n'
82
+
83
+ if not data_rows:
84
+ log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
85
+ text_splitter = SentenceSplitter(
86
+ chunk_size=chunk_size,
87
+ chunk_overlap=chunk_overlap,
88
+ separator="\n"
89
+ )
90
+ text_chunks = text_splitter.split_text(doc.text)
91
+ log_message(f" 📊 Стандартное разбиение: {len(text_chunks)} чанков")
92
+ else:
93
+ # Row-based chunking
94
+ log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
95
+
96
+ header_size = len(table_header)
97
+ # Reserve space for enrichment prefix
98
+ available_size = chunk_size - header_size - 300
99
+
100
+ text_chunks = []
101
+ current_chunk_rows = []
102
+ current_size = 0
103
+
104
+ for row in data_rows:
105
+ row_size = len(row) + 1
106
+
107
+ # Check if adding this row exceeds limit
108
+ if current_size + row_size > available_size and current_chunk_rows:
109
+ # Create chunk
110
+ chunk_text = table_header + '\n'.join(current_chunk_rows)
111
+ text_chunks.append(chunk_text)
112
+ log_message(f" ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
113
+
114
+ # Overlap: keep last 2 rows
115
+ overlap_count = min(2, len(current_chunk_rows))
116
+ current_chunk_rows = current_chunk_rows[-overlap_count:]
117
+ current_size = sum(len(r) + 1 for r in current_chunk_rows)
118
+
119
+ current_chunk_rows.append(row)
120
+ current_size += row_size
121
+
122
+ # Final chunk
123
+ if current_chunk_rows:
124
+ chunk_text = table_header + '\n'.join(current_chunk_rows)
125
+ text_chunks.append(chunk_text)
126
+ log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
127
 
128
+ log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
129
 
130
+ # Create enriched chunks
131
  chunked_docs = []
132
+ materials = table_metadata.get("materials", [])
133
+ key_terms = table_metadata.get("key_terms", [])
134
+
135
  for i, chunk_text in enumerate(text_chunks):
136
  chunk_metadata = doc.metadata.copy()
 
 
137
  chunk_metadata.update({
138
  "chunk_id": i,
139
  "total_chunks": len(text_chunks),
140
  "chunk_size": len(chunk_text),
141
  "is_chunked": True,
142
+ "materials": materials,
143
+ "key_terms": key_terms,
144
+ "table_summary": table_metadata.get("summary", "")
145
  })
146
 
147
+ # Enrichment prefix
148
+ materials_str = ', '.join(materials[:10]) if materials else 'нет'
149
+ terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
150
+
151
+ enriched_text = f"""[Таблица {table_num}: {table_title}]
152
+ [Материалы в таблице: {materials_str}]
153
+ [Ключевые термины: {terms_str}]
154
 
155
  {chunk_text}"""
156
 
157
+ log_message(f" ✓ Чанк {i+1}/{len(text_chunks)}: "
158
+ f"размер={len(enriched_text)}, "
159
+ f"материалов={len(materials)}, "
160
+ f"терминов={len(key_terms)}")
161
+
162
  chunked_doc = Document(
163
  text=enriched_text,
164
  metadata=chunk_metadata
 
167
 
168
  return chunked_docs
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  def table_to_document(table_data, document_id=None):
171
  if not isinstance(table_data, dict):
172
  log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
utils.py CHANGED
@@ -231,8 +231,7 @@ def generate_sources_html(nodes, chunks_df=None):
231
 
232
  html += "</div>"
233
  return html
234
- def answer_question(question, query_engine, reranker, current_model, chunks_df=None, hybrid_retriever=None):
235
- from index_retriever import hybrid_retrieve_with_keywords
236
  if query_engine is None:
237
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
238
 
@@ -241,18 +240,18 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
241
 
242
  llm = get_llm_model(current_model)
243
 
244
- # Use keyword-enhanced retrieval
245
- retrieved_nodes = hybrid_retrieve_with_keywords(question, query_engine, top_k=40)
246
- log_message(f"Hybrid keyword retrieval: получено {len(retrieved_nodes)} узлов")
 
247
 
248
- # Rerank
249
  reranked_nodes = rerank_nodes(
250
  question,
251
  retrieved_nodes,
252
  reranker,
253
  top_k=25,
254
- min_score_threshold=0.3,
255
- diversity_penalty=0.2
256
  )
257
 
258
  formatted_context = format_context_for_llm(reranked_nodes)
 
231
 
232
  html += "</div>"
233
  return html
234
+ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
 
235
  if query_engine is None:
236
  return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
237
 
 
240
 
241
  llm = get_llm_model(current_model)
242
 
243
+ # Direct retrieval without query expansion
244
+ retrieved_nodes = query_engine.retriever.retrieve(question)
245
+
246
+ log_message(f"Получено {len(retrieved_nodes)} узлов")
247
 
 
248
  reranked_nodes = rerank_nodes(
249
  question,
250
  retrieved_nodes,
251
  reranker,
252
  top_k=25,
253
+ min_score_threshold=0.5,
254
+ diversity_penalty=0.3
255
  )
256
 
257
  formatted_context = format_context_for_llm(reranked_nodes)