MrSimple07 committed on
Commit
0fa3553
·
1 Parent(s): affe7a3

old version of documents prep

Browse files
Files changed (1) hide show
  1. documents_prep.py +25 -102
documents_prep.py CHANGED
@@ -34,26 +34,6 @@ def chunk_text_documents(documents):
34
 
35
  return chunked
36
 
37
- def normalize_connection_type(s):
38
- # Replace Cyrillic with Latin
39
- s = s.replace('С', 'C').replace('с', 'c')
40
- s = s.replace('У', 'U').replace('у', 'u')
41
- s = s.replace('Т', 'T').replace('т', 't')
42
- s= s.replace('С-', 'C-').replace('с-', 'c-')
43
- s = s.replace('У-', 'U-').replace('у-', 'u-')
44
- s = s.replace('Т-', 'T-').replace('т-', 't-')
45
- # REMOVE ALL HYPHENS for consistent tokenization
46
- s = s.replace('-', '')
47
- return s
48
-
49
- def extract_connection_type(text):
50
- import re
51
- # Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
52
- match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
53
- if match:
54
- normalized = normalize_connection_type(match.group(0))
55
- return normalized
56
- return ''
57
 
58
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
59
  headers = table_data.get('headers', [])
@@ -61,7 +41,6 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
61
  table_num = table_data.get('table_number', 'unknown')
62
  table_title = table_data.get('table_title', '')
63
  section = table_data.get('section', '')
64
- table_description = table_data.get('table_description', '')
65
 
66
  table_num_clean = str(table_num).strip()
67
 
@@ -81,13 +60,8 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
81
 
82
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
83
 
84
- # Calculate base metadata size - NOW INCLUDING DESCRIPTION
85
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
86
-
87
- # ADD DESCRIPTION HERE if it exists
88
- if table_description:
89
- base_content += f"ОПИСАНИЕ: {table_description}\n\n"
90
-
91
  base_size = len(base_content)
92
  available_space = max_chars - base_size - 200
93
 
@@ -105,9 +79,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
105
  'section': section,
106
  'total_rows': len(rows),
107
  'chunk_size': len(content),
108
- 'is_complete_table': True,
109
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
110
-
111
  }
112
 
113
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
@@ -141,8 +113,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
141
  'row_end': current_rows[-1]['_idx'],
142
  'total_rows': len(rows),
143
  'chunk_size': len(content),
144
- 'is_complete_table': False,
145
- 'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
146
  }
147
 
148
  chunks.append(Document(text=content, metadata=metadata))
@@ -184,62 +155,44 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
184
 
185
  return chunks
186
 
 
187
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
188
- content = f"ДОКУМЕНТ: {doc_id}\n"
189
- content += f"ТАБЛИЦА: {table_identifier}\n"
190
 
191
- if table_title:
192
- # Normalize the title text itself for better searchability
193
- normalized_title = normalize_connection_type(table_title)
194
- content += f"НАЗВАНИЕ ТАБЛИЦЫ: {normalized_title}\n"
195
-
196
- # Extract and store the normalized connection type
197
- connection_type = extract_connection_type(table_title)
198
- if connection_type:
199
- content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
 
201
- if table_num and table_num != table_identifier:
202
- content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
203
 
204
  if section:
205
- content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
206
 
207
- content += f"\n{'='*70}\n"
208
 
209
  if headers:
210
- content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
211
- for i, h in enumerate(headers, 1):
212
- # NORMALIZE HEADERS TOO
213
- normalized_header = normalize_connection_type(h)
214
- content += f" {i}. {normalized_header}\n"
215
- content += "\n"
216
-
217
- content += "ДАННЫЕ ТАБЛИЦЫ:\n"
218
  return content
219
 
220
 
221
  def format_single_row(row, idx):
222
- """Format a single row with normalization"""
223
  if isinstance(row, dict):
224
- # NORMALIZE VALUES IN ROWS
225
- parts = []
226
- for k, v in row.items():
227
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
228
- normalized_v = normalize_connection_type(str(v))
229
- parts.append(f"{k}: {normalized_v}")
230
  if parts:
231
  return f"{idx}. {' | '.join(parts)}\n"
232
  elif isinstance(row, list):
233
- # NORMALIZE LIST VALUES
234
- parts = []
235
- for v in row:
236
- if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']:
237
- normalized_v = normalize_connection_type(str(v))
238
- parts.append(normalized_v)
239
  if parts:
240
  return f"{idx}. {' | '.join(parts)}\n"
241
  return ""
242
 
 
243
  def format_table_rows(rows):
244
  """Format multiple rows"""
245
  content = ""
@@ -440,8 +393,6 @@ def load_table_documents(repo_id, hf_token, table_dir):
440
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
441
 
442
  all_chunks = []
443
- connection_type_sources = {} # Track which table each type comes from
444
-
445
  for file_path in table_files:
446
  try:
447
  local_path = hf_hub_download(
@@ -458,35 +409,18 @@ def load_table_documents(repo_id, hf_token, table_dir):
458
 
459
  for sheet in data.get('sheets', []):
460
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
461
- table_num = sheet.get('table_number', 'unknown')
462
- table_title = sheet.get('table_title', '')
463
 
 
464
  chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
465
  all_chunks.extend(chunks)
466
 
467
- # Track connection type source
468
- conn_type = extract_connection_type(table_title)
469
- if conn_type:
470
- if conn_type not in connection_type_sources:
471
- connection_type_sources[conn_type] = []
472
- connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
473
-
474
  except Exception as e:
475
  log_message(f"Error loading {file_path}: {e}")
476
 
477
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
478
-
479
- log_message("="*60)
480
- log_message("CONNECTION TYPES AND THEIR SOURCES:")
481
- for conn_type in sorted(connection_type_sources.keys()):
482
- sources = connection_type_sources[conn_type]
483
- log_message(f" {conn_type}: {len(sources)} tables")
484
- for src in sources:
485
- log_message(f" - {src}")
486
- log_message("="*60)
487
-
488
  return all_chunks
489
 
 
490
  def load_image_documents(repo_id, hf_token, image_dir):
491
  """Load image descriptions"""
492
  log_message("Loading images...")
@@ -534,7 +468,9 @@ def load_image_documents(repo_id, hf_token, image_dir):
534
 
535
  return documents
536
 
 
537
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
 
538
  log_message("="*60)
539
  log_message("STARTING DOCUMENT LOADING")
540
  log_message("="*60)
@@ -546,19 +482,6 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
546
  # Load tables (already chunked)
547
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
548
 
549
- # NEW: Analyze connection types in tables
550
- connection_types = {}
551
- for chunk in table_chunks:
552
- conn_type = chunk.metadata.get('connection_type', '')
553
- if conn_type:
554
- connection_types[conn_type] = connection_types.get(conn_type, 0) + 1
555
-
556
- log_message("="*60)
557
- log_message("CONNECTION TYPES FOUND IN TABLES:")
558
- for conn_type, count in sorted(connection_types.items()):
559
- log_message(f" {conn_type}: {count} chunks")
560
- log_message("="*60)
561
-
562
  # Load images (no chunking needed)
563
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
564
 
 
34
 
35
  return chunked
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
39
  headers = table_data.get('headers', [])
 
41
  table_num = table_data.get('table_number', 'unknown')
42
  table_title = table_data.get('table_title', '')
43
  section = table_data.get('section', '')
 
44
 
45
  table_num_clean = str(table_num).strip()
46
 
 
60
 
61
  log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
62
 
63
+ # Calculate base metadata size
64
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
 
 
 
 
 
65
  base_size = len(base_content)
66
  available_space = max_chars - base_size - 200
67
 
 
79
  'section': section,
80
  'total_rows': len(rows),
81
  'chunk_size': len(content),
82
+ 'is_complete_table': True
 
 
83
  }
84
 
85
  log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
 
113
  'row_end': current_rows[-1]['_idx'],
114
  'total_rows': len(rows),
115
  'chunk_size': len(content),
116
+ 'is_complete_table': False
 
117
  }
118
 
119
  chunks.append(Document(text=content, metadata=metadata))
 
155
 
156
  return chunks
157
 
158
+
159
  def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
160
+ content = f"ТАБЛИЦА {table_identifier} из документа {doc_id}\n"
 
161
 
162
+ # Add table type/number prominently for matching
163
+ if table_num:
164
+ content += f"ТИП: {table_num}\n"
 
 
 
 
 
 
165
 
166
+ if table_title:
167
+ content += f"НАЗВАНИЕ: {table_title}\n"
168
 
169
  if section:
170
+ content += f"РАЗДЕЛ: {section}\n"
171
 
172
+ content += f"{'='*70}\n"
173
 
174
  if headers:
175
+ header_str = ' | '.join(str(h) for h in headers)
176
+ content += f"ЗАГОЛОВКИ: {header_str}\n\n"
177
+
178
+ content += "ДАННЫЕ:\n"
 
 
 
 
179
  return content
180
 
181
 
182
  def format_single_row(row, idx):
183
+ """Format a single row"""
184
  if isinstance(row, dict):
185
+ parts = [f"{k}: {v}" for k, v in row.items()
186
+ if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
 
 
 
187
  if parts:
188
  return f"{idx}. {' | '.join(parts)}\n"
189
  elif isinstance(row, list):
190
+ parts = [str(v) for v in row if v and str(v).strip() and str(v).lower() not in ['nan', 'none', '']]
 
 
 
 
 
191
  if parts:
192
  return f"{idx}. {' | '.join(parts)}\n"
193
  return ""
194
 
195
+
196
  def format_table_rows(rows):
197
  """Format multiple rows"""
198
  content = ""
 
393
  table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
394
 
395
  all_chunks = []
 
 
396
  for file_path in table_files:
397
  try:
398
  local_path = hf_hub_download(
 
409
 
410
  for sheet in data.get('sheets', []):
411
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
 
 
412
 
413
+ # Use the consistent MAX_CHARS_TABLE from config
414
  chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
415
  all_chunks.extend(chunks)
416
 
 
 
 
 
 
 
 
417
  except Exception as e:
418
  log_message(f"Error loading {file_path}: {e}")
419
 
420
  log_message(f"✓ Loaded {len(all_chunks)} table chunks")
 
 
 
 
 
 
 
 
 
 
421
  return all_chunks
422
 
423
+
424
  def load_image_documents(repo_id, hf_token, image_dir):
425
  """Load image descriptions"""
426
  log_message("Loading images...")
 
468
 
469
  return documents
470
 
471
+
472
  def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
473
+ """Main loader - combines all document types"""
474
  log_message("="*60)
475
  log_message("STARTING DOCUMENT LOADING")
476
  log_message("="*60)
 
482
  # Load tables (already chunked)
483
  table_chunks = load_table_documents(repo_id, hf_token, table_dir)
484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  # Load images (no chunking needed)
486
  image_docs = load_image_documents(repo_id, hf_token, image_dir)
487