Spaces:
Sleeping
Sleeping
Commit
·
aafe88b
1
Parent(s):
05c597d
new debug functions + 2000 chunk size
Browse files- documents_prep.py +19 -17
documents_prep.py
CHANGED
|
@@ -174,31 +174,33 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
|
|
| 174 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 175 |
|
| 176 |
# Extract and emphasize the connection type if present
|
| 177 |
-
connection_type = ''
|
| 178 |
if table_title:
|
| 179 |
content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
|
| 180 |
|
| 181 |
-
# Parse type from title
|
| 182 |
import re
|
| 183 |
-
type_match = re.search(r'[СУUTC]-?\
|
| 184 |
if type_match:
|
| 185 |
-
connection_type = type_match.group(0)
|
| 186 |
-
# Normalize: always use С (Cyrillic)
|
| 187 |
-
connection_type = connection_type.replace('C', 'С').replace('c', 'С')
|
| 188 |
content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
|
| 189 |
-
# ADD SEARCHABLE KEYWORDS
|
| 190 |
-
content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
connection_type = type_match.group(0).replace(' ', '')
|
| 198 |
-
connection_type = connection_type.replace('C', 'С')
|
| 199 |
-
content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
|
| 200 |
-
content += f"КЛЮЧЕВЫЕ СЛОВА: {connection_type} тип сварного соединения\n"
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
|
| 204 |
def format_single_row(row, idx):
|
|
|
|
| 174 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 175 |
|
| 176 |
# Extract and emphasize the connection type if present
|
|
|
|
| 177 |
if table_title:
|
| 178 |
content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
|
| 179 |
|
| 180 |
+
# Parse type from title (e.g., "С-25" from "Тип сварного соединения С-25")
|
| 181 |
import re
|
| 182 |
+
type_match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
|
| 183 |
if type_match:
|
| 184 |
+
connection_type = type_match.group(0)
|
|
|
|
|
|
|
| 185 |
content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
|
|
|
|
|
|
|
| 186 |
|
| 187 |
+
if table_num and table_num != table_identifier:
|
| 188 |
+
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
| 189 |
+
|
| 190 |
+
if section:
|
| 191 |
+
content += f"РАЗДЕЛ ДОКУМЕНТА: {section}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
+
content += f"\n{'='*70}\n"
|
| 194 |
+
|
| 195 |
+
# Add headers with better formatting
|
| 196 |
+
if headers:
|
| 197 |
+
content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
|
| 198 |
+
for i, h in enumerate(headers, 1):
|
| 199 |
+
content += f" {i}. {h}\n"
|
| 200 |
+
content += "\n"
|
| 201 |
+
|
| 202 |
+
content += "ДАННЫЕ ТАБЛИЦЫ:\n"
|
| 203 |
+
return content
|
| 204 |
|
| 205 |
|
| 206 |
def format_single_row(row, idx):
|