MrSimple07 committed on
Commit
0b6ee4f
·
1 Parent(s): c7a9dbd

simplest version

Browse files
Files changed (2) hide show
  1. documents_prep.py +86 -72
  2. utils.py +12 -28
documents_prep.py CHANGED
@@ -38,6 +38,21 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def chunk_table_by_rows(table_data, doc_id, max_rows=30):
42
  headers = table_data.get('headers', [])
43
  rows = table_data.get('data', [])
@@ -45,38 +60,37 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
45
  table_title = table_data.get('table_title', '')
46
  section = table_data.get('section', '')
47
 
48
- # Enhanced table identification
 
 
49
  table_num_clean = str(table_num).strip()
50
 
51
- # Create unique table identifier with section context
 
52
  if 'приложени' in section.lower():
53
- # Extract appendix number
54
- import re
55
  appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
56
  if appendix_match:
57
  appendix_num = appendix_match.group(1).upper()
58
- table_identifier = f"{table_num_clean} (Приложение {appendix_num})"
59
  else:
60
- table_identifier = f"{table_num_clean} ({section[:30]})"
61
  else:
62
  table_identifier = table_num_clean
63
 
64
  if not rows:
65
- log_message(f" ⚠️ Table {table_identifier} ({doc_id}): Empty table, skipping")
66
  return []
67
 
68
- log_message(f" 📊 Processing Table {table_identifier} ({doc_id}): {len(rows)} rows, {len(headers)} columns")
69
 
70
- # For small tables
71
  if len(rows) <= max_rows:
72
- content = format_table_content(table_data, headers, rows, table_identifier)
73
  chunk_size = len(content)
74
 
75
  metadata = {
76
  'type': 'table',
77
  'document_id': doc_id,
78
  'table_number': table_num_clean,
79
- 'table_identifier': table_identifier, # NEW: unique identifier
80
  'table_title': table_title,
81
  'section': section,
82
  'total_rows': len(rows),
@@ -84,27 +98,24 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
84
  'is_complete_table': True
85
  }
86
 
87
- log_message(f" Single chunk created:")
88
- log_message(f" Metadata: {metadata}")
89
 
90
  return [Document(text=content, metadata=metadata)]
91
 
92
- # For large tables with chunking
93
  chunks = []
94
  overlap = 3
95
- chunk_num = 0
96
 
97
  for i in range(0, len(rows), max_rows - overlap):
98
  chunk_rows = rows[i:min(i+max_rows, len(rows))]
99
-
100
- chunk_info = f"Часть {chunk_num+1}: строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
101
 
102
  content = format_table_content(
103
  table_data,
104
  headers,
105
- chunk_rows,
 
106
  table_identifier,
107
- chunk_info=chunk_info
108
  )
109
 
110
  chunk_size = len(content)
@@ -113,7 +124,7 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
113
  'type': 'table',
114
  'document_id': doc_id,
115
  'table_number': table_num_clean,
116
- 'table_identifier': table_identifier, # NEW
117
  'table_title': table_title,
118
  'section': section,
119
  'chunk_id': chunk_num,
@@ -121,82 +132,77 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
121
  'row_end': i + len(chunk_rows),
122
  'total_rows': len(rows),
123
  'chunk_size': chunk_size,
124
- 'total_chunks': ((len(rows) - overlap) // (max_rows - overlap)) + 1,
125
  'is_complete_table': False
126
  }
127
 
128
- chunks.append(Document(text=content, metadata=metadata))
129
-
130
- log_message(f" Chunk {chunk_num+1} created:")
131
- log_message(f" Rows: {i}-{i+len(chunk_rows)}, Size: {chunk_size} chars")
132
- log_message(f" Metadata: {metadata}")
133
 
134
- chunk_num += 1
135
-
136
- log_message(f" ✓ Table {table_identifier} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
137
 
138
  return chunks
139
 
140
 
141
- def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
142
- doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
143
  table_num = table_data.get('table_number', 'unknown')
144
  table_title = table_data.get('table_title', '')
145
  section = table_data.get('section', '')
146
 
147
- # Use enhanced identifier
148
  content = f"ДОКУМЕНТ: {doc_id}\n"
149
  content += f"ТАБЛИЦА: {table_identifier}\n"
150
- content += f"ПОЛНОЕ НАЗВАНИЕ: {table_identifier}\n"
151
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
 
 
 
 
 
 
 
 
152
  if table_title:
153
  content += f"НАЗВАНИЕ: {table_title}\n"
154
  if section:
155
  content += f"РАЗДЕЛ: {section}\n"
156
- content += f"{'='*70}\n\n"
157
 
158
- # Enhanced search keywords
159
- content += f"Это таблица {table_identifier} из документа {doc_id}. "
160
- content += f"Идентификатор таблицы: {table_identifier}. "
161
- content += f"Номер: {table_num}. "
162
- content += f"Документ: {doc_id}. "
163
 
164
- if section:
165
- content += f"Находится в разделе: {section}. "
166
- if 'приложени' in section.lower():
167
- content += f"Таблица из приложения. "
168
 
169
  if table_title:
170
- content += f"Название таблицы: {table_title}. "
171
- content += f"Таблица о: {table_title}. "
172
 
173
- content += f"Поиск: таблица {table_identifier} {doc_id}. "
 
 
 
 
 
174
 
175
  if chunk_info:
176
- content += f"\n{chunk_info}\n"
177
 
178
- content += f"\n\nСОДЕРЖИМОЕ ТАБЛИЦЫ {table_identifier}:\n"
179
- content += f"="*70 + "\n\n"
180
 
181
  if headers:
182
- header_str = ' | '.join(str(h) for h in headers)
183
- content += f"ЗАГОЛОВКИ СТОЛБЦОВ:\n{header_str}\n\n"
184
 
185
- content += f"ДАННЫЕ ТАБЛИЦЫ:\n"
186
  for idx, row in enumerate(rows, 1):
187
  if isinstance(row, dict):
188
  parts = [f"{k}: {v}" for k, v in row.items()
189
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
190
  if parts:
191
  content += f"{idx}. {' | '.join(parts)}\n"
192
  elif isinstance(row, list):
193
- parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
194
  if parts:
195
  content += f"{idx}. {' | '.join(parts)}\n"
196
 
197
- content += f"\n{'='*70}\n"
198
- content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
199
-
200
  return content
201
 
202
  def load_json_documents(repo_id, hf_token, json_dir):
@@ -328,7 +334,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
328
  return documents
329
 
330
  def extract_sections_from_json(json_path):
331
- """Extract sections from a single JSON file"""
332
  documents = []
333
 
334
  try:
@@ -336,8 +341,8 @@ def extract_sections_from_json(json_path):
336
  data = json.load(f)
337
 
338
  doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
 
339
 
340
- # Extract all section levels
341
  for section in data.get('sections', []):
342
  if section.get('section_text', '').strip():
343
  documents.append(Document(
@@ -345,11 +350,11 @@ def extract_sections_from_json(json_path):
345
  metadata={
346
  'type': 'text',
347
  'document_id': doc_id,
348
- 'section_id': section.get('section_id', '')
 
349
  }
350
  ))
351
 
352
- # Subsections
353
  for subsection in section.get('subsections', []):
354
  if subsection.get('subsection_text', '').strip():
355
  documents.append(Document(
@@ -357,11 +362,11 @@ def extract_sections_from_json(json_path):
357
  metadata={
358
  'type': 'text',
359
  'document_id': doc_id,
360
- 'section_id': subsection.get('subsection_id', '')
 
361
  }
362
  ))
363
 
364
- # Sub-subsections
365
  for sub_sub in subsection.get('sub_subsections', []):
366
  if sub_sub.get('sub_subsection_text', '').strip():
367
  documents.append(Document(
@@ -369,7 +374,8 @@ def extract_sections_from_json(json_path):
369
  metadata={
370
  'type': 'text',
371
  'document_id': doc_id,
372
- 'section_id': sub_sub.get('sub_subsection_id', '')
 
373
  }
374
  ))
375
 
@@ -380,13 +386,14 @@ def extract_sections_from_json(json_path):
380
 
381
 
382
  def load_table_documents(repo_id, hf_token, table_dir):
383
- """Load and chunk tables"""
384
  log_message("Loading tables...")
385
 
386
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
387
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
388
 
389
  all_chunks = []
 
 
390
  for file_path in table_files:
391
  try:
392
  local_path = hf_hub_download(
@@ -399,21 +406,28 @@ def load_table_documents(repo_id, hf_token, table_dir):
399
  with open(local_path, 'r', encoding='utf-8') as f:
400
  data = json.load(f)
401
 
402
- # Extract file-level document_id
403
- file_doc_id = data.get('document_id', data.get('document', 'unknown'))
404
 
405
  for sheet in data.get('sheets', []):
406
- # Use sheet-level document_id if available, otherwise use file-level
407
- sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
408
 
409
- # CRITICAL: Pass document_id to chunk function
410
  chunks = chunk_table_by_rows(sheet, sheet_doc_id)
411
  all_chunks.extend(chunks)
412
 
 
 
 
 
413
  except Exception as e:
414
  log_message(f"Error loading {file_path}: {e}")
415
 
416
- log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
 
 
 
 
417
  return all_chunks
418
 
419
 
 
38
  return chunked
39
 
40
 
41
def normalize_doc_id(doc_id):
    """Normalize a document ID so equal IDs compare equal as strings.

    Falsy values and the placeholder ``'unknown'`` are returned unchanged.
    Anything else is converted to ``str`` and canonicalised:

    * leading/trailing whitespace is removed and inner whitespace runs
      (including non-breaking spaces) collapse to a single space;
    * every spelling of the "ГОСТ Р" prefix becomes ``"ГОСТ Р"``, and a
      number glued to it is separated (``"ГОСТ Р50571"`` ->
      ``"ГОСТ Р 50571"``);
    * spaces around the hyphen of the "НП-" prefix are removed
      (``"НП - 001"`` -> ``"НП-001"``).

    Args:
        doc_id: Raw document identifier; any type accepted by ``str()``.

    Returns:
        The normalized string, or ``doc_id`` itself when it is falsy or
        equal to ``'unknown'``.
    """
    if not doc_id or doc_id == 'unknown':
        return doc_id

    # Local import: the function stays self-contained whether or not the
    # module imports ``re`` at file level.
    import re

    # Collapse whitespace first so the prefix rules see single spaces only.
    doc_id = re.sub(r'\s+', ' ', str(doc_id).strip())

    # Canonical prefix is "ГОСТ Р" followed by a space before the number.
    doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
    doc_id = re.sub(r'(?<=ГОСТ Р)(?=\d)', ' ', doc_id)

    # "НП-" must be glued to what follows, with no space on either side.
    doc_id = re.sub(r'НП\s*-\s*', 'НП-', doc_id, flags=re.IGNORECASE)

    return doc_id
54
+
55
+
56
  def chunk_table_by_rows(table_data, doc_id, max_rows=30):
57
  headers = table_data.get('headers', [])
58
  rows = table_data.get('data', [])
 
60
  table_title = table_data.get('table_title', '')
61
  section = table_data.get('section', '')
62
 
63
+ # NORMALIZE document ID
64
+ doc_id = normalize_doc_id(doc_id)
65
+
66
  table_num_clean = str(table_num).strip()
67
 
68
+ # Create section-aware identifier
69
+ import re
70
  if 'приложени' in section.lower():
 
 
71
  appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
72
  if appendix_match:
73
  appendix_num = appendix_match.group(1).upper()
74
+ table_identifier = f"{table_num_clean} Приложение {appendix_num}"
75
  else:
76
+ table_identifier = table_num_clean
77
  else:
78
  table_identifier = table_num_clean
79
 
80
  if not rows:
 
81
  return []
82
 
83
+ log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
84
 
 
85
  if len(rows) <= max_rows:
86
+ content = format_table_content(table_data, headers, rows, doc_id, table_identifier)
87
  chunk_size = len(content)
88
 
89
  metadata = {
90
  'type': 'table',
91
  'document_id': doc_id,
92
  'table_number': table_num_clean,
93
+ 'table_identifier': table_identifier,
94
  'table_title': table_title,
95
  'section': section,
96
  'total_rows': len(rows),
 
98
  'is_complete_table': True
99
  }
100
 
101
+ log_message(f" Chunk: 1/1, {chunk_size} chars, doc={doc_id}, table={table_identifier}")
 
102
 
103
  return [Document(text=content, metadata=metadata)]
104
 
 
105
  chunks = []
106
  overlap = 3
 
107
 
108
  for i in range(0, len(rows), max_rows - overlap):
109
  chunk_rows = rows[i:min(i+max_rows, len(rows))]
110
+ chunk_num = i // (max_rows - overlap)
 
111
 
112
  content = format_table_content(
113
  table_data,
114
  headers,
115
+ chunk_rows,
116
+ doc_id,
117
  table_identifier,
118
+ chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
119
  )
120
 
121
  chunk_size = len(content)
 
124
  'type': 'table',
125
  'document_id': doc_id,
126
  'table_number': table_num_clean,
127
+ 'table_identifier': table_identifier,
128
  'table_title': table_title,
129
  'section': section,
130
  'chunk_id': chunk_num,
 
132
  'row_end': i + len(chunk_rows),
133
  'total_rows': len(rows),
134
  'chunk_size': chunk_size,
135
+ 'total_chunks': (len(rows) + max_rows - overlap - 1) // (max_rows - overlap),
136
  'is_complete_table': False
137
  }
138
 
139
+ log_message(f" Chunk: {chunk_num+1}, rows {i}-{i+len(chunk_rows)}, {chunk_size} chars")
 
 
 
 
140
 
141
+ chunks.append(Document(text=content, metadata=metadata))
 
 
142
 
143
  return chunks
144
 
145
 
146
def _cell_has_value(value):
    """Return True when a table cell holds data worth rendering."""
    if value is None:
        return False
    # Numeric zero is real table data; only NaN counts as a missing number.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value).lower() != 'nan'
    return bool(value) and str(value).strip().lower() not in ('nan', 'none', '', 'null')


def format_table_content(table_data, headers, rows, doc_id, table_identifier, chunk_info=""):
    """Render a table (or a slice of its rows) as searchable plain text.

    The text consists of a key/value header block, a one-line summary that
    repeats the document and table identifiers, and then the numbered rows.

    Args:
        table_data: Mapping read for ``'table_number'``, ``'table_title'``
            and ``'section'``; missing keys fall back to ``'unknown'`` / ``''``.
        headers: Column headers; rendered on one line when non-empty.
        rows: Iterable of rows. ``dict`` rows render as ``key: value`` pairs,
            ``list`` rows as bare values; rows of any other type are skipped.
        doc_id: Document identifier inserted into the header and summary.
        table_identifier: Table label inserted into the header and summary.
        chunk_info: Optional note appended to the summary line.

    Returns:
        The formatted text as a single string.
    """
    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')
    rule = '=' * 70

    # Spelling variants of the ID without the spaces around " Р ".
    # str() keeps the membership test safe for None / non-string IDs.
    doc_text = str(doc_id)
    doc_variations = [doc_text]
    if 'Р' in doc_text:
        doc_variations.append(doc_text.replace(' Р ', ' Р'))
        doc_variations.append(doc_text.replace(' Р ', 'Р'))

    pieces = [
        f"ДОКУМЕНТ: {doc_id}\n",
        f"ТАБЛИЦА: {table_identifier}\n",
    ]
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # generated text is identical from run to run (set order is not).
    pieces.extend(f"ДОКУМЕНТ_ВАРИАНТ: {var}\n" for var in dict.fromkeys(doc_variations))

    if table_title:
        pieces.append(f"НАЗВАНИЕ: {table_title}\n")
    if section:
        pieces.append(f"РАЗДЕЛ: {section}\n")
    pieces.append(f"{rule}\n\n")

    # Summary line repeating the identifiers in sentence form.
    pieces.append(f"Документ {doc_id}. ")
    pieces.append(f"Таблица {table_identifier}. ")
    pieces.append(f"Номер таблицы {table_num}. ")
    if table_title:
        pieces.append(f"Название: {table_title}. ")
    if section:
        pieces.append(f"Раздел: {section}. ")
    pieces.append(f"Таблицы документа {doc_id}. ")
    pieces.append(f"Содержание {doc_id}. ")
    if chunk_info:
        pieces.append(f"{chunk_info}. ")

    pieces.append(f"\n\nДАННЫЕ ТАБЛИЦЫ {table_identifier}:\n{rule}\n\n")

    if headers:
        pieces.append(f"СТОЛБЦЫ: {' | '.join(str(h) for h in headers)}\n\n")

    # Row numbers follow the position in ``rows`` even when a row is
    # skipped because all of its cells are empty.
    for idx, row in enumerate(rows, 1):
        if isinstance(row, dict):
            cells = [f"{k}: {v}" for k, v in row.items() if _cell_has_value(v)]
        elif isinstance(row, list):
            cells = [str(v) for v in row if _cell_has_value(v)]
        else:
            continue
        if cells:
            pieces.append(f"{idx}. {' | '.join(cells)}\n")

    return ''.join(pieces)
207
 
208
  def load_json_documents(repo_id, hf_token, json_dir):
 
334
  return documents
335
 
336
  def extract_sections_from_json(json_path):
 
337
  documents = []
338
 
339
  try:
 
341
  data = json.load(f)
342
 
343
  doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
344
+ doc_id = normalize_doc_id(doc_id) # NORMALIZE
345
 
 
346
  for section in data.get('sections', []):
347
  if section.get('section_text', '').strip():
348
  documents.append(Document(
 
350
  metadata={
351
  'type': 'text',
352
  'document_id': doc_id,
353
+ 'section_id': section.get('section_id', ''),
354
+ 'chunk_size': len(section['section_text'])
355
  }
356
  ))
357
 
 
358
  for subsection in section.get('subsections', []):
359
  if subsection.get('subsection_text', '').strip():
360
  documents.append(Document(
 
362
  metadata={
363
  'type': 'text',
364
  'document_id': doc_id,
365
+ 'section_id': subsection.get('subsection_id', ''),
366
+ 'chunk_size': len(subsection['subsection_text'])
367
  }
368
  ))
369
 
 
370
  for sub_sub in subsection.get('sub_subsections', []):
371
  if sub_sub.get('sub_subsection_text', '').strip():
372
  documents.append(Document(
 
374
  metadata={
375
  'type': 'text',
376
  'document_id': doc_id,
377
+ 'section_id': sub_sub.get('sub_subsection_id', ''),
378
+ 'chunk_size': len(sub_sub['sub_subsection_text'])
379
  }
380
  ))
381
 
 
386
 
387
 
388
  def load_table_documents(repo_id, hf_token, table_dir):
 
389
  log_message("Loading tables...")
390
 
391
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
392
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
393
 
394
  all_chunks = []
395
+ doc_table_count = {}
396
+
397
  for file_path in table_files:
398
  try:
399
  local_path = hf_hub_download(
 
406
  with open(local_path, 'r', encoding='utf-8') as f:
407
  data = json.load(f)
408
 
409
+ file_doc_id = normalize_doc_id(data.get('document_id', data.get('document', 'unknown')))
 
410
 
411
  for sheet in data.get('sheets', []):
412
+ sheet_doc_id = normalize_doc_id(sheet.get('document_id', sheet.get('document', file_doc_id)))
 
413
 
 
414
  chunks = chunk_table_by_rows(sheet, sheet_doc_id)
415
  all_chunks.extend(chunks)
416
 
417
+ if sheet_doc_id not in doc_table_count:
418
+ doc_table_count[sheet_doc_id] = 0
419
+ doc_table_count[sheet_doc_id] += len(chunks)
420
+
421
  except Exception as e:
422
  log_message(f"Error loading {file_path}: {e}")
423
 
424
+ log_message(f"\n{'='*60}")
425
+ log_message("TABLE LOADING SUMMARY:")
426
+ for doc_id, count in sorted(doc_table_count.items()):
427
+ log_message(f" {doc_id}: {count} table chunks")
428
+ log_message(f"TOTAL: {len(all_chunks)} table chunks")
429
+ log_message(f"{'='*60}\n")
430
+
431
  return all_chunks
432
 
433
 
utils.py CHANGED
@@ -41,33 +41,19 @@ def preprocess_query(question):
41
 
42
  question_lower = question.lower()
43
 
44
- # Enhanced table detection with appendix
45
- table_patterns = [
46
- r'табли[цу]\w*\s+([а-яa-z0-9\.]+)(?:\s+(?:из\s+)?приложени[яеий]\s+(\d+|[а-я]))?',
47
- r'табли[цу]\w*\s+(?:№|номер)?\s*([а-яa-z0-9\.]+)',
48
- ]
49
-
50
- doc_match = re.search(r'(гост|нп|му)[^\s]*\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
51
 
52
  enhanced_query = question
53
 
54
- for pattern in table_patterns:
55
- table_match = re.search(pattern, question_lower)
56
- if table_match:
57
- table_num = table_match.group(1).upper()
58
- enhanced_query += f" таблица номер {table_num}"
59
-
60
- # Add appendix context if mentioned
61
- if len(table_match.groups()) > 1 and table_match.group(2):
62
- appendix_num = table_match.group(2).upper()
63
- enhanced_query += f" приложение {appendix_num}"
64
- break
65
-
66
  if doc_match:
67
- doc_id = f"{doc_match.group(1).upper()} {doc_match.group(2)}"
68
- enhanced_query += f" документ {doc_id}"
69
- # Add variations for better matching
70
- enhanced_query += f" {doc_match.group(1).upper()}Р {doc_match.group(2)}"
 
 
 
71
 
72
  return enhanced_query
73
 
@@ -119,7 +105,7 @@ def answer_question(question, query_engine, reranker):
119
  context_parts.append(f"{source_label}\n{n.text}")
120
 
121
  context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
122
-
123
  prompt = f"""Ты эксперт по технической документации.
124
 
125
  КОНТЕКСТ:
@@ -129,10 +115,8 @@ def answer_question(question, query_engine, reranker):
129
 
130
  ИНСТРУКЦИИ:
131
  1. Используй ТОЛЬКО контекст выше
132
- 2. Если спрашивают содержание таблицы - ОБЯЗАТЕЛЬНО приведи ВСЕ данные из таблицы
133
- 3. Укажи источник: документ и номер таблицы
134
- 4. Если таблица разбита на части - объедини информацию
135
- 5. Если информации нет - четко скажи об этом
136
 
137
  ОТВЕТ:"""
138
 
 
41
 
42
  question_lower = question.lower()
43
 
44
+ # Extract document ID and normalize
45
+ doc_match = re.search(r'(гост|нп|му)\s*р?\s*[№-]*\s*([0-9\.-]+)', question_lower)
 
 
 
 
 
46
 
47
  enhanced_query = question
48
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if doc_match:
50
+ doc_type = doc_match.group(1).upper()
51
+ doc_num = doc_match.group(2)
52
+
53
+ # Add normalized versions
54
+ enhanced_query += f" {doc_type} Р {doc_num}"
55
+ enhanced_query += f" {doc_type}Р {doc_num}"
56
+ enhanced_query += f" {doc_type} {doc_num}"
57
 
58
  return enhanced_query
59
 
 
105
  context_parts.append(f"{source_label}\n{n.text}")
106
 
107
  context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
108
+ from config import CUSTOM_PROMPT
109
  prompt = f"""Ты эксперт по технической документации.
110
 
111
  КОНТЕКСТ:
 
115
 
116
  ИНСТРУКЦИИ:
117
  1. Используй ТОЛЬКО контекст выше
118
+ 2. Укажи источник: документ и номер таблицы
119
+ 3. Если информации нет - четко скажи об этом
 
 
120
 
121
  ОТВЕТ:"""
122