MrSimple07 committed on
Commit
aafe88b
·
1 Parent(s): 05c597d

new debug functions + 2000 chunk size

Browse files
Files changed (1) hide show
  1. documents_prep.py +19 -17
documents_prep.py CHANGED
@@ -174,31 +174,33 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
174
  content += f"ТАБЛИЦА: {table_identifier}\n"
175
 
176
  # Extract and emphasize the connection type if present
177
- connection_type = ''
178
  if table_title:
179
  content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
180
 
181
- # Parse type from title - ADD MORE VARIANTS
182
  import re
183
- type_match = re.search(r'[СУUTC]-?\s*\d+(?:-\d+)?', table_title)
184
  if type_match:
185
- connection_type = type_match.group(0).replace(' ', '')
186
- # Normalize: always use С (Cyrillic)
187
- connection_type = connection_type.replace('C', 'С').replace('c', 'С')
188
  content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
189
- # ADD SEARCHABLE KEYWORDS
190
- content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
191
 
192
- # Also check table_identifier for type
193
- if not connection_type and table_identifier:
194
- import re
195
- type_match = re.search(r'[СУUTC]-?\s*\d+', table_identifier)
196
- if type_match:
197
- connection_type = type_match.group(0).replace(' ', '')
198
- connection_type = connection_type.replace('C', 'С')
199
- content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
200
- content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
201
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
 
204
  def format_single_row(row, idx):
 
174
  content += f"ТАБЛИЦА: {table_identifier}\n"
175
 
176
  # Extract and emphasize the connection type if present
 
177
  if table_title:
178
  content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
179
 
180
+ # Parse type from title (e.g., "С-25" from "Тип сварного соединения С-25")
181
  import re
182
+ type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
183
  if type_match:
184
+ connection_type = type_match.group(0)
 
 
185
  content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
 
 
186
 
187
+ if table_num and table_num != table_identifier:
188
+ content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
189
+
190
+ if section:
191
+ content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
 
 
 
 
192
 
193
+ content += f"\n{'='*70}\n"
194
+
195
+ # Add headers with better formatting
196
+ if headers:
197
+ content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
198
+ for i, h in enumerate(headers, 1):
199
+ content += f" {i}. {h}\n"
200
+ content += "\n"
201
+
202
+ content += "ДАННЫЕ ТАБЛИЦЫ:\n"
203
+ return content
204
 
205
 
206
  def format_single_row(row, idx):