MrSimple07 committed on
Commit
123a5db
·
1 Parent(s): 83b921a

simplest version

Browse files
Files changed (2) hide show
  1. documents_prep.py +80 -54
  2. utils.py +75 -38
documents_prep.py CHANGED
@@ -38,42 +38,60 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_rows(table_data, doc_id, max_rows=10): # Reduced from 30
42
  headers = table_data.get('headers', [])
43
  rows = table_data.get('data', [])
44
  table_num = table_data.get('table_number', 'unknown')
45
  table_title = table_data.get('table_title', '')
46
  section = table_data.get('section', '')
47
 
48
- # Keep original format
49
  table_num_clean = str(table_num).strip()
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if not rows:
 
52
  return []
53
 
54
- # For small tables, keep as single chunk
 
 
55
  if len(rows) <= max_rows:
56
- content = format_table_content(table_data, headers, rows)
57
  chunk_size = len(content)
58
- log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → 1 chunk ({chunk_size} chars)")
59
 
60
- return [Document(
61
- text=content,
62
- metadata={
63
- 'type': 'table',
64
- 'document_id': doc_id,
65
- 'table_number': table_num_clean,
66
- 'table_title': table_title,
67
- 'section': section,
68
- 'total_rows': len(rows),
69
- 'chunk_size': chunk_size,
70
- 'is_complete_table': True
71
- }
72
- )]
73
-
74
- # For large tables, chunk with overlap
 
 
 
75
  chunks = []
76
- overlap = 3 # Reduced overlap
77
  chunk_num = 0
78
 
79
  for i in range(0, len(rows), max_rows - overlap):
@@ -85,71 +103,79 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=10): # Reduced from 30
85
  table_data,
86
  headers,
87
  chunk_rows,
 
88
  chunk_info=chunk_info
89
  )
90
 
91
  chunk_size = len(content)
92
 
93
- chunks.append(Document(
94
- text=content,
95
- metadata={
96
- 'type': 'table',
97
- 'document_id': doc_id,
98
- 'table_number': table_num_clean,
99
- 'table_title': table_title,
100
- 'section': section,
101
- 'chunk_id': chunk_num,
102
- 'row_start': i,
103
- 'row_end': i + len(chunk_rows),
104
- 'total_rows': len(rows),
105
- 'chunk_size': chunk_size,
106
- 'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
107
- 'is_complete_table': False
108
- }
109
- ))
 
 
 
 
 
110
  chunk_num += 1
111
 
112
- log_message(f" 📊 Table {table_num_clean} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
113
- for idx, chunk in enumerate(chunks):
114
- log_message(f" Chunk {idx+1}: rows {chunk.metadata['row_start']}-{chunk.metadata['row_end']} ({chunk.metadata['chunk_size']} chars)")
115
 
116
  return chunks
117
 
118
 
119
- def format_table_content(table_data, headers, rows, chunk_info=""):
120
  doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
121
  table_num = table_data.get('table_number', 'unknown')
122
  table_title = table_data.get('table_title', '')
123
  section = table_data.get('section', '')
124
 
125
- table_num_clean = str(table_num).replace('№', '').strip()
126
-
127
  content = f"ДОКУМЕНТ: {doc_id}\n"
128
- content += f"ТАБЛИЦА: {table_num_clean}\n"
129
- content += f"НОМЕР ТАБЛИЦЫ: {table_num_clean}\n"
 
130
  if table_title:
131
  content += f"НАЗВАНИЕ: {table_title}\n"
132
  if section:
133
  content += f"РАЗДЕЛ: {section}\n"
134
  content += f"{'='*70}\n\n"
135
 
136
- content += f"Это таблица {table_num_clean} из документа {doc_id}. "
137
- content += f"Номер таблицы: {table_num_clean}. "
 
 
138
  content += f"Документ: {doc_id}. "
139
 
 
 
 
 
 
140
  if table_title:
141
  content += f"Название таблицы: {table_title}. "
142
  content += f"Таблица о: {table_title}. "
143
 
144
- if section:
145
- content += f"Раздел: {section}. "
146
-
147
- content += f"Поиск: таблица {table_num_clean} {doc_id}. "
148
 
149
  if chunk_info:
150
  content += f"\n{chunk_info}\n"
151
 
152
- content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_num_clean}:\n"
153
  content += f"="*70 + "\n\n"
154
 
155
  if headers:
@@ -169,7 +195,7 @@ def format_table_content(table_data, headers, rows, chunk_info=""):
169
  content += f"{idx}. {' | '.join(parts)}\n"
170
 
171
  content += f"\n{'='*70}\n"
172
- content += f"КОНЕЦ ТАБЛИЦЫ {table_num_clean} ИЗ {doc_id}\n"
173
 
174
  return content
175
 
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
42
  headers = table_data.get('headers', [])
43
  rows = table_data.get('data', [])
44
  table_num = table_data.get('table_number', 'unknown')
45
  table_title = table_data.get('table_title', '')
46
  section = table_data.get('section', '')
47
 
48
+ # Enhanced table identification
49
  table_num_clean = str(table_num).strip()
50
 
51
+ # Create unique table identifier with section context
52
+ if 'приложени' in section.lower():
53
+ # Extract appendix number
54
+ import re
55
+ appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
56
+ if appendix_match:
57
+ appendix_num = appendix_match.group(1).upper()
58
+ table_identifier = f"{table_num_clean} (Приложение {appendix_num})"
59
+ else:
60
+ table_identifier = f"{table_num_clean} ({section[:30]})"
61
+ else:
62
+ table_identifier = table_num_clean
63
+
64
  if not rows:
65
+ log_message(f" ⚠️ Table {table_identifier} ({doc_id}): Empty table, skipping")
66
  return []
67
 
68
+ log_message(f" 📊 Processing Table {table_identifier} ({doc_id}): {len(rows)} rows, {len(headers)} columns")
69
+
70
+ # For small tables
71
  if len(rows) <= max_rows:
72
+ content = format_table_content(table_data, headers, rows, table_identifier)
73
  chunk_size = len(content)
 
74
 
75
+ metadata = {
76
+ 'type': 'table',
77
+ 'document_id': doc_id,
78
+ 'table_number': table_num_clean,
79
+ 'table_identifier': table_identifier, # NEW: unique identifier
80
+ 'table_title': table_title,
81
+ 'section': section,
82
+ 'total_rows': len(rows),
83
+ 'chunk_size': chunk_size,
84
+ 'is_complete_table': True
85
+ }
86
+
87
+ log_message(f" ✓ Single chunk created:")
88
+ log_message(f" Metadata: {metadata}")
89
+
90
+ return [Document(text=content, metadata=metadata)]
91
+
92
+ # For large tables with chunking
93
  chunks = []
94
+ overlap = 3
95
  chunk_num = 0
96
 
97
  for i in range(0, len(rows), max_rows - overlap):
 
103
  table_data,
104
  headers,
105
  chunk_rows,
106
+ table_identifier,
107
  chunk_info=chunk_info
108
  )
109
 
110
  chunk_size = len(content)
111
 
112
+ metadata = {
113
+ 'type': 'table',
114
+ 'document_id': doc_id,
115
+ 'table_number': table_num_clean,
116
+ 'table_identifier': table_identifier, # NEW
117
+ 'table_title': table_title,
118
+ 'section': section,
119
+ 'chunk_id': chunk_num,
120
+ 'row_start': i,
121
+ 'row_end': i + len(chunk_rows),
122
+ 'total_rows': len(rows),
123
+ 'chunk_size': chunk_size,
124
+ 'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
125
+ 'is_complete_table': False
126
+ }
127
+
128
+ chunks.append(Document(text=content, metadata=metadata))
129
+
130
+ log_message(f" Chunk {chunk_num+1} created:")
131
+ log_message(f" Rows: {i}-{i+len(chunk_rows)}, Size: {chunk_size} chars")
132
+ log_message(f" Metadata: {metadata}")
133
+
134
  chunk_num += 1
135
 
136
+ log_message(f" Table {table_identifier} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
 
 
137
 
138
  return chunks
139
 
140
 
141
+ def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
142
  doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
143
  table_num = table_data.get('table_number', 'unknown')
144
  table_title = table_data.get('table_title', '')
145
  section = table_data.get('section', '')
146
 
147
+ # Use enhanced identifier
 
148
  content = f"ДОКУМЕНТ: {doc_id}\n"
149
+ content += f"ТАБЛИЦА: {table_identifier}\n"
150
+ content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
151
+ content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
152
  if table_title:
153
  content += f"НАЗВАНИЕ: {table_title}\n"
154
  if section:
155
  content += f"РАЗДЕЛ: {section}\n"
156
  content += f"{'='*70}\n\n"
157
 
158
+ # Enhanced search keywords
159
+ content += f"Это таблица {table_identifier} из документа {doc_id}. "
160
+ content += f"Идентификатор таблицы: {table_identifier}. "
161
+ content += f"Номер: {table_num}. "
162
  content += f"Документ: {doc_id}. "
163
 
164
+ if section:
165
+ content += f"Находится в разделе: {section}. "
166
+ if 'приложени' in section.lower():
167
+ content += f"Таблица из приложения. "
168
+
169
  if table_title:
170
  content += f"Название таблицы: {table_title}. "
171
  content += f"Таблица о: {table_title}. "
172
 
173
+ content += f"Поиск: таблица {table_identifier} {doc_id}. "
 
 
 
174
 
175
  if chunk_info:
176
  content += f"\n{chunk_info}\n"
177
 
178
+ content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
179
  content += f"="*70 + "\n\n"
180
 
181
  if headers:
 
195
  content += f"{idx}. {' | '.join(parts)}\n"
196
 
197
  content += f"\n{'='*70}\n"
198
+ content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
199
 
200
  return content
201
 
utils.py CHANGED
@@ -24,9 +24,15 @@ def format_sources(nodes):
24
  doc_id = meta.get('document_id', 'unknown')
25
 
26
  if doc_type == 'table':
27
- table_num = meta.get('table_number', 'unknown')
28
  title = meta.get('table_title', '')
29
- sources.append(f"📊 {doc_id} - Таблица {table_num}: {title}")
 
 
 
 
 
 
30
  elif doc_type == 'image':
31
  img_num = meta.get('image_number', 'unknown')
32
  sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
@@ -34,76 +40,107 @@ def format_sources(nodes):
34
  section = meta.get('section_id', '')
35
  sources.append(f"📄 {doc_id} - Раздел {section}")
36
 
37
- return "\n".join(set(sources))
38
 
39
  def preprocess_query(question):
40
  import re
41
 
42
  question_lower = question.lower()
43
 
44
- table_match = re.search(r'табли[цу]\w*\s+([а-яa-z0-9\.]+)', question_lower)
 
 
 
 
 
45
  doc_match = re.search(r'(гост|нп|му)[^\s]*\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
46
 
47
  enhanced_query = question
48
 
49
- if table_match:
50
- table_num = table_match.group(1).upper()
51
- enhanced_query += f" таблица номер {table_num}"
 
 
 
 
 
 
 
 
52
 
53
  if doc_match:
54
  doc_id = f"{doc_match.group(1).upper()} {doc_match.group(2)}"
55
  enhanced_query += f" документ {doc_id}"
 
 
56
 
57
  return enhanced_query
58
 
59
  def answer_question(question, query_engine, reranker):
60
  try:
 
61
  log_message(f"Query: {question}")
 
62
 
63
  enhanced_query = preprocess_query(question)
64
  if enhanced_query != question:
65
  log_message(f"Enhanced query: {enhanced_query}")
66
 
67
  retrieved = query_engine.retriever.retrieve(enhanced_query)
68
- log_message(f"Retrieved {len(retrieved)} nodes")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- doc_ids = [n.metadata.get('document_id', 'unknown') for n in retrieved]
71
- table_nums = [n.metadata.get('table_number', '') for n in retrieved if n.metadata.get('type') == 'table']
72
- log_message(f"Retrieved from documents: {set(doc_ids)}")
73
- if table_nums:
74
- log_message(f"Retrieved tables: {set(table_nums)}")
 
 
 
75
 
76
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25)
77
- log_message(f"Reranked to {len(reranked)} nodes")
78
 
79
- doc_ids_reranked = [n.metadata.get('document_id', 'unknown') for n in reranked]
80
- table_nums_reranked = [n.metadata.get('table_number', '') for n in reranked if n.metadata.get('type') == 'table']
81
- log_message(f"After reranking - documents: {set(doc_ids_reranked)}")
82
- if table_nums_reranked:
83
- log_message(f"After reranking - tables: {set(table_nums_reranked)}")
84
-
85
- context_parts = []
86
  for n in reranked:
87
- meta = n.metadata
88
- doc_id = meta.get('document_id', 'unknown')
89
- doc_type = meta.get('type', 'text')
90
 
91
- if doc_type == 'table':
92
- table_num = meta.get('table_number', 'unknown')
93
- title = meta.get('table_title', '')
94
- source_label = f"[ТАБЛИЦА {table_num} - {doc_id}]"
95
- if title:
96
- source_label += f" {title}"
97
- elif doc_type == 'image':
98
- img_num = meta.get('image_number', 'unknown')
99
- source_label = f"[РИСУНОК {img_num} - {doc_id}]"
100
  else:
101
- section = meta.get('section_id', '')
102
- source_label = f"[{doc_id} - {section}]"
103
-
104
- context_parts.append(f"{source_label}\n{n.text}")
105
 
106
- context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
 
 
 
 
 
 
 
 
107
 
108
  prompt = f"""Ты эксперт по технической документации.
109
 
 
24
  doc_id = meta.get('document_id', 'unknown')
25
 
26
  if doc_type == 'table':
27
+ table_id = meta.get('table_identifier', meta.get('table_number', 'unknown'))
28
  title = meta.get('table_title', '')
29
+ section = meta.get('section', '')
30
+ source = f"📊 {doc_id} - {table_id}"
31
+ if title:
32
+ source += f": {title}"
33
+ if section:
34
+ source += f" ({section})"
35
+ sources.append(source)
36
  elif doc_type == 'image':
37
  img_num = meta.get('image_number', 'unknown')
38
  sources.append(f"🖼️ {doc_id} - Рисунок {img_num}")
 
40
  section = meta.get('section_id', '')
41
  sources.append(f"📄 {doc_id} - Раздел {section}")
42
 
43
+ return "\n".join(sources) # Don't use set() to preserve order
44
 
45
  def preprocess_query(question):
46
  import re
47
 
48
  question_lower = question.lower()
49
 
50
+ # Enhanced table detection with appendix
51
+ table_patterns = [
52
+ r'табли[цу]\w*\s+([а-яa-z0-9\.]+)(?:\s+(?:из\s+)?приложени[яеий]\s+(\d+|[а-я]))?',
53
+ r'табли[цу]\w*\s+(?:№|номер)?\s*([а-яa-z0-9\.]+)',
54
+ ]
55
+
56
  doc_match = re.search(r'(гост|нп|му)[^\s]*\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
57
 
58
  enhanced_query = question
59
 
60
+ for pattern in table_patterns:
61
+ table_match = re.search(pattern, question_lower)
62
+ if table_match:
63
+ table_num = table_match.group(1).upper()
64
+ enhanced_query += f" таблица номер {table_num}"
65
+
66
+ # Add appendix context if mentioned
67
+ if len(table_match.groups()) > 1 and table_match.group(2):
68
+ appendix_num = table_match.group(2).upper()
69
+ enhanced_query += f" приложение {appendix_num}"
70
+ break
71
 
72
  if doc_match:
73
  doc_id = f"{doc_match.group(1).upper()} {doc_match.group(2)}"
74
  enhanced_query += f" документ {doc_id}"
75
+ # Add variations for better matching
76
+ enhanced_query += f" {doc_match.group(1).upper()}Р {doc_match.group(2)}"
77
 
78
  return enhanced_query
79
 
80
  def answer_question(question, query_engine, reranker):
81
  try:
82
+ log_message(f"\n{'='*70}")
83
  log_message(f"Query: {question}")
84
+ log_message(f"{'='*70}")
85
 
86
  enhanced_query = preprocess_query(question)
87
  if enhanced_query != question:
88
  log_message(f"Enhanced query: {enhanced_query}")
89
 
90
  retrieved = query_engine.retriever.retrieve(enhanced_query)
91
+ log_message(f"\n📥 INITIAL RETRIEVAL: {len(retrieved)} nodes")
92
+
93
+ # Detailed logging
94
+ doc_ids = {}
95
+ for n in retrieved:
96
+ doc_id = n.metadata.get('document_id', 'unknown')
97
+ if doc_id not in doc_ids:
98
+ doc_ids[doc_id] = {'tables': [], 'text': 0, 'images': 0}
99
+
100
+ if n.metadata.get('type') == 'table':
101
+ table_id = n.metadata.get('table_identifier', n.metadata.get('table_number', ''))
102
+ doc_ids[doc_id]['tables'].append(table_id)
103
+ elif n.metadata.get('type') == 'image':
104
+ doc_ids[doc_id]['images'] += 1
105
+ else:
106
+ doc_ids[doc_id]['text'] += 1
107
 
108
+ for doc_id, counts in doc_ids.items():
109
+ log_message(f" 📄 {doc_id}:")
110
+ if counts['tables']:
111
+ log_message(f" Tables: {', '.join(set(counts['tables']))}")
112
+ if counts['text']:
113
+ log_message(f" Text chunks: {counts['text']}")
114
+ if counts['images']:
115
+ log_message(f" Images: {counts['images']}")
116
 
117
  reranked = rerank_nodes(question, retrieved, reranker, top_k=25)
118
+ log_message(f"\n🔄 AFTER RERANKING: {len(reranked)} nodes")
119
 
120
+ # Detailed reranking results
121
+ doc_ids_reranked = {}
 
 
 
 
 
122
  for n in reranked:
123
+ doc_id = n.metadata.get('document_id', 'unknown')
124
+ if doc_id not in doc_ids_reranked:
125
+ doc_ids_reranked[doc_id] = {'tables': [], 'text': 0, 'images': 0}
126
 
127
+ if n.metadata.get('type') == 'table':
128
+ table_id = n.metadata.get('table_identifier', n.metadata.get('table_number', ''))
129
+ doc_ids_reranked[doc_id]['tables'].append(table_id)
130
+ elif n.metadata.get('type') == 'image':
131
+ doc_ids_reranked[doc_id]['images'] += 1
 
 
 
 
132
  else:
133
+ doc_ids_reranked[doc_id]['text'] += 1
 
 
 
134
 
135
+ for doc_id, counts in doc_ids_reranked.items():
136
+ log_message(f" 📄 {doc_id}:")
137
+ if counts['tables']:
138
+ log_message(f" Tables: {', '.join(set(counts['tables']))}")
139
+ if counts['text']:
140
+ log_message(f" Text chunks: {counts['text']}")
141
+ if counts['images']:
142
+ log_message(f" Images: {counts['images']}")
143
+ context = "\n\n" + ("="*70 + "\n\n").join(doc_ids_reranked)
144
 
145
  prompt = f"""Ты эксперт по технической документации.
146