Spaces:
Sleeping
Sleeping
Commit
·
0b6ee4f
1
Parent(s):
c7a9dbd
simplest version
Browse files- documents_prep.py +86 -72
- utils.py +12 -28
documents_prep.py
CHANGED
|
@@ -38,6 +38,21 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
| 42 |
headers = table_data.get('headers', [])
|
| 43 |
rows = table_data.get('data', [])
|
|
@@ -45,38 +60,37 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 45 |
table_title = table_data.get('table_title', '')
|
| 46 |
section = table_data.get('section', '')
|
| 47 |
|
| 48 |
-
#
|
|
|
|
|
|
|
| 49 |
table_num_clean = str(table_num).strip()
|
| 50 |
|
| 51 |
-
# Create
|
|
|
|
| 52 |
if 'приложени' in section.lower():
|
| 53 |
-
# Extract appendix number
|
| 54 |
-
import re
|
| 55 |
appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
|
| 56 |
if appendix_match:
|
| 57 |
appendix_num = appendix_match.group(1).upper()
|
| 58 |
-
table_identifier = f"{table_num_clean}
|
| 59 |
else:
|
| 60 |
-
table_identifier =
|
| 61 |
else:
|
| 62 |
table_identifier = table_num_clean
|
| 63 |
|
| 64 |
if not rows:
|
| 65 |
-
log_message(f" ⚠️ Table {table_identifier} ({doc_id}): Empty table, skipping")
|
| 66 |
return []
|
| 67 |
|
| 68 |
-
log_message(f" 📊 Processing
|
| 69 |
|
| 70 |
-
# For small tables
|
| 71 |
if len(rows) <= max_rows:
|
| 72 |
-
content = format_table_content(table_data, headers, rows, table_identifier)
|
| 73 |
chunk_size = len(content)
|
| 74 |
|
| 75 |
metadata = {
|
| 76 |
'type': 'table',
|
| 77 |
'document_id': doc_id,
|
| 78 |
'table_number': table_num_clean,
|
| 79 |
-
'table_identifier': table_identifier,
|
| 80 |
'table_title': table_title,
|
| 81 |
'section': section,
|
| 82 |
'total_rows': len(rows),
|
|
@@ -84,27 +98,24 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 84 |
'is_complete_table': True
|
| 85 |
}
|
| 86 |
|
| 87 |
-
log_message(f"
|
| 88 |
-
log_message(f" Metadata: {metadata}")
|
| 89 |
|
| 90 |
return [Document(text=content, metadata=metadata)]
|
| 91 |
|
| 92 |
-
# For large tables with chunking
|
| 93 |
chunks = []
|
| 94 |
overlap = 3
|
| 95 |
-
chunk_num = 0
|
| 96 |
|
| 97 |
for i in range(0, len(rows), max_rows - overlap):
|
| 98 |
chunk_rows = rows[i:min(i+max_rows, len(rows))]
|
| 99 |
-
|
| 100 |
-
chunk_info = f"Часть {chunk_num+1}: строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
|
| 101 |
|
| 102 |
content = format_table_content(
|
| 103 |
table_data,
|
| 104 |
headers,
|
| 105 |
-
chunk_rows,
|
|
|
|
| 106 |
table_identifier,
|
| 107 |
-
chunk_info=
|
| 108 |
)
|
| 109 |
|
| 110 |
chunk_size = len(content)
|
|
@@ -113,7 +124,7 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 113 |
'type': 'table',
|
| 114 |
'document_id': doc_id,
|
| 115 |
'table_number': table_num_clean,
|
| 116 |
-
'table_identifier': table_identifier,
|
| 117 |
'table_title': table_title,
|
| 118 |
'section': section,
|
| 119 |
'chunk_id': chunk_num,
|
|
@@ -121,82 +132,77 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 121 |
'row_end': i + len(chunk_rows),
|
| 122 |
'total_rows': len(rows),
|
| 123 |
'chunk_size': chunk_size,
|
| 124 |
-
'total_chunks': (
|
| 125 |
'is_complete_table': False
|
| 126 |
}
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
log_message(f" Chunk {chunk_num+1} created:")
|
| 131 |
-
log_message(f" Rows: {i}-{i+len(chunk_rows)}, Size: {chunk_size} chars")
|
| 132 |
-
log_message(f" Metadata: {metadata}")
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
log_message(f" ✓ Table {table_identifier} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
|
| 137 |
|
| 138 |
return chunks
|
| 139 |
|
| 140 |
|
| 141 |
-
def format_table_content(table_data, headers, rows, table_identifier, chunk_info=""):
|
| 142 |
-
doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
|
| 143 |
table_num = table_data.get('table_number', 'unknown')
|
| 144 |
table_title = table_data.get('table_title', '')
|
| 145 |
section = table_data.get('section', '')
|
| 146 |
|
| 147 |
-
#
|
| 148 |
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 149 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
if table_title:
|
| 153 |
content += f"НАЗВАНИЕ: {table_title}\n"
|
| 154 |
if section:
|
| 155 |
content += f"РАЗДЕЛ: {section}\n"
|
| 156 |
-
content += f"{'='*70}\n\n"
|
| 157 |
|
| 158 |
-
|
| 159 |
-
content += f"Это таблица {table_identifier} из документа {doc_id}. "
|
| 160 |
-
content += f"Идентификатор таблицы: {table_identifier}. "
|
| 161 |
-
content += f"Номер: {table_num}. "
|
| 162 |
-
content += f"Документ: {doc_id}. "
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
|
| 169 |
if table_title:
|
| 170 |
-
content += f"
|
| 171 |
-
content += f"Таблица о: {table_title}. "
|
| 172 |
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
if chunk_info:
|
| 176 |
-
content += f"
|
| 177 |
|
| 178 |
-
content += f"\n\n
|
| 179 |
-
content += f"="*70 + "\n\n"
|
| 180 |
|
| 181 |
if headers:
|
| 182 |
-
|
| 183 |
-
content += f"ЗАГОЛОВКИ СТОЛБЦОВ:\n{header_str}\n\n"
|
| 184 |
|
| 185 |
-
content += f"ДАННЫЕ ТАБЛИЦЫ:\n"
|
| 186 |
for idx, row in enumerate(rows, 1):
|
| 187 |
if isinstance(row, dict):
|
| 188 |
parts = [f"{k}: {v}" for k, v in row.items()
|
| 189 |
-
if v and str(v).strip()
|
| 190 |
if parts:
|
| 191 |
content += f"{idx}. {' | '.join(parts)}\n"
|
| 192 |
elif isinstance(row, list):
|
| 193 |
-
parts = [str(v) for v in row
|
|
|
|
| 194 |
if parts:
|
| 195 |
content += f"{idx}. {' | '.join(parts)}\n"
|
| 196 |
|
| 197 |
-
content += f"\n{'='*70}\n"
|
| 198 |
-
content += f"КОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 199 |
-
|
| 200 |
return content
|
| 201 |
|
| 202 |
def load_json_documents(repo_id, hf_token, json_dir):
|
|
@@ -328,7 +334,6 @@ def load_json_documents(repo_id, hf_token, json_dir):
|
|
| 328 |
return documents
|
| 329 |
|
| 330 |
def extract_sections_from_json(json_path):
|
| 331 |
-
"""Extract sections from a single JSON file"""
|
| 332 |
documents = []
|
| 333 |
|
| 334 |
try:
|
|
@@ -336,8 +341,8 @@ def extract_sections_from_json(json_path):
|
|
| 336 |
data = json.load(f)
|
| 337 |
|
| 338 |
doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
|
|
|
|
| 339 |
|
| 340 |
-
# Extract all section levels
|
| 341 |
for section in data.get('sections', []):
|
| 342 |
if section.get('section_text', '').strip():
|
| 343 |
documents.append(Document(
|
|
@@ -345,11 +350,11 @@ def extract_sections_from_json(json_path):
|
|
| 345 |
metadata={
|
| 346 |
'type': 'text',
|
| 347 |
'document_id': doc_id,
|
| 348 |
-
'section_id': section.get('section_id', '')
|
|
|
|
| 349 |
}
|
| 350 |
))
|
| 351 |
|
| 352 |
-
# Subsections
|
| 353 |
for subsection in section.get('subsections', []):
|
| 354 |
if subsection.get('subsection_text', '').strip():
|
| 355 |
documents.append(Document(
|
|
@@ -357,11 +362,11 @@ def extract_sections_from_json(json_path):
|
|
| 357 |
metadata={
|
| 358 |
'type': 'text',
|
| 359 |
'document_id': doc_id,
|
| 360 |
-
'section_id': subsection.get('subsection_id', '')
|
|
|
|
| 361 |
}
|
| 362 |
))
|
| 363 |
|
| 364 |
-
# Sub-subsections
|
| 365 |
for sub_sub in subsection.get('sub_subsections', []):
|
| 366 |
if sub_sub.get('sub_subsection_text', '').strip():
|
| 367 |
documents.append(Document(
|
|
@@ -369,7 +374,8 @@ def extract_sections_from_json(json_path):
|
|
| 369 |
metadata={
|
| 370 |
'type': 'text',
|
| 371 |
'document_id': doc_id,
|
| 372 |
-
'section_id': sub_sub.get('sub_subsection_id', '')
|
|
|
|
| 373 |
}
|
| 374 |
))
|
| 375 |
|
|
@@ -380,13 +386,14 @@ def extract_sections_from_json(json_path):
|
|
| 380 |
|
| 381 |
|
| 382 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 383 |
-
"""Load and chunk tables"""
|
| 384 |
log_message("Loading tables...")
|
| 385 |
|
| 386 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 387 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 388 |
|
| 389 |
all_chunks = []
|
|
|
|
|
|
|
| 390 |
for file_path in table_files:
|
| 391 |
try:
|
| 392 |
local_path = hf_hub_download(
|
|
@@ -399,21 +406,28 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 399 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 400 |
data = json.load(f)
|
| 401 |
|
| 402 |
-
|
| 403 |
-
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 404 |
|
| 405 |
for sheet in data.get('sheets', []):
|
| 406 |
-
|
| 407 |
-
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 408 |
|
| 409 |
-
# CRITICAL: Pass document_id to chunk function
|
| 410 |
chunks = chunk_table_by_rows(sheet, sheet_doc_id)
|
| 411 |
all_chunks.extend(chunks)
|
| 412 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
except Exception as e:
|
| 414 |
log_message(f"Error loading {file_path}: {e}")
|
| 415 |
|
| 416 |
-
log_message(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
return all_chunks
|
| 418 |
|
| 419 |
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def normalize_doc_id(doc_id):
    """Normalize a document ID so equivalent spellings compare equal.

    Falsy values and the literal ``'unknown'`` are returned unchanged.
    Any other value is converted to ``str`` and then:

    * leading/trailing whitespace is stripped and internal whitespace runs
      are collapsed to a single space;
    * every ``ГОСТ``/``Р`` prefix variant (any case, any spacing) becomes
      ``ГОСТ Р``, followed by exactly one space when a number comes next;
    * ``НП`` followed by a dash has the whitespace around the dash removed.

    Args:
        doc_id: Raw document identifier (usually a string; may be ``None``).

    Returns:
        The normalized identifier, or ``doc_id`` itself for falsy values
        and ``'unknown'``.
    """
    if not doc_id or doc_id == 'unknown':
        return doc_id

    import re

    # Collapse tabs, non-breaking spaces and repeated blanks first so the
    # prefix rules below only ever have to deal with single spaces.
    doc_id = re.sub(r'\s+', ' ', str(doc_id).strip())

    # "ГОСТР", "гост р", "ГОСТ  Р" -> "ГОСТ Р"
    doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
    # Guarantee one space between the prefix and the number ("ГОСТ Р50" ->
    # "ГОСТ Р 50"). The digit lookahead leaves prefixes such as "ГОСТ РВ"
    # untouched.
    doc_id = re.sub(r'ГОСТ Р\s*(?=\d)', 'ГОСТ Р ', doc_id)

    # "НП - 104", "НП -104", "НП- 104" -> "НП-104"
    doc_id = re.sub(r'НП\s*-\s*', 'НП-', doc_id, flags=re.IGNORECASE)

    return doc_id
|
| 54 |
+
|
| 55 |
+
|
| 56 |
def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
| 57 |
headers = table_data.get('headers', [])
|
| 58 |
rows = table_data.get('data', [])
|
|
|
|
| 60 |
table_title = table_data.get('table_title', '')
|
| 61 |
section = table_data.get('section', '')
|
| 62 |
|
| 63 |
+
# NORMALIZE document ID
|
| 64 |
+
doc_id = normalize_doc_id(doc_id)
|
| 65 |
+
|
| 66 |
table_num_clean = str(table_num).strip()
|
| 67 |
|
| 68 |
+
# Create section-aware identifier
|
| 69 |
+
import re
|
| 70 |
if 'приложени' in section.lower():
|
|
|
|
|
|
|
| 71 |
appendix_match = re.search(r'приложени[еия]\s*(\d+|[а-яА-Я])', section.lower())
|
| 72 |
if appendix_match:
|
| 73 |
appendix_num = appendix_match.group(1).upper()
|
| 74 |
+
table_identifier = f"{table_num_clean} Приложение {appendix_num}"
|
| 75 |
else:
|
| 76 |
+
table_identifier = table_num_clean
|
| 77 |
else:
|
| 78 |
table_identifier = table_num_clean
|
| 79 |
|
| 80 |
if not rows:
|
|
|
|
| 81 |
return []
|
| 82 |
|
| 83 |
+
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 84 |
|
|
|
|
| 85 |
if len(rows) <= max_rows:
|
| 86 |
+
content = format_table_content(table_data, headers, rows, doc_id, table_identifier)
|
| 87 |
chunk_size = len(content)
|
| 88 |
|
| 89 |
metadata = {
|
| 90 |
'type': 'table',
|
| 91 |
'document_id': doc_id,
|
| 92 |
'table_number': table_num_clean,
|
| 93 |
+
'table_identifier': table_identifier,
|
| 94 |
'table_title': table_title,
|
| 95 |
'section': section,
|
| 96 |
'total_rows': len(rows),
|
|
|
|
| 98 |
'is_complete_table': True
|
| 99 |
}
|
| 100 |
|
| 101 |
+
log_message(f" Chunk: 1/1, {chunk_size} chars, doc={doc_id}, table={table_identifier}")
|
|
|
|
| 102 |
|
| 103 |
return [Document(text=content, metadata=metadata)]
|
| 104 |
|
|
|
|
| 105 |
chunks = []
|
| 106 |
overlap = 3
|
|
|
|
| 107 |
|
| 108 |
for i in range(0, len(rows), max_rows - overlap):
|
| 109 |
chunk_rows = rows[i:min(i+max_rows, len(rows))]
|
| 110 |
+
chunk_num = i // (max_rows - overlap)
|
|
|
|
| 111 |
|
| 112 |
content = format_table_content(
|
| 113 |
table_data,
|
| 114 |
headers,
|
| 115 |
+
chunk_rows,
|
| 116 |
+
doc_id,
|
| 117 |
table_identifier,
|
| 118 |
+
chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
|
| 119 |
)
|
| 120 |
|
| 121 |
chunk_size = len(content)
|
|
|
|
| 124 |
'type': 'table',
|
| 125 |
'document_id': doc_id,
|
| 126 |
'table_number': table_num_clean,
|
| 127 |
+
'table_identifier': table_identifier,
|
| 128 |
'table_title': table_title,
|
| 129 |
'section': section,
|
| 130 |
'chunk_id': chunk_num,
|
|
|
|
| 132 |
'row_end': i + len(chunk_rows),
|
| 133 |
'total_rows': len(rows),
|
| 134 |
'chunk_size': chunk_size,
|
| 135 |
+
'total_chunks': (len(rows) + max_rows - overlap - 1) // (max_rows - overlap),
|
| 136 |
'is_complete_table': False
|
| 137 |
}
|
| 138 |
|
| 139 |
+
log_message(f" Chunk: {chunk_num+1}, rows {i}-{i+len(chunk_rows)}, {chunk_size} chars")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
+
chunks.append(Document(text=content, metadata=metadata))
|
|
|
|
|
|
|
| 142 |
|
| 143 |
return chunks
|
| 144 |
|
| 145 |
|
| 146 |
+
def format_table_content(table_data, headers, rows, doc_id, table_identifier, chunk_info=""):
    """Render a table (or a slice of its rows) as searchable plain text.

    The text consists of a labelled header block (document, table,
    document-ID spelling variants, optional title and section), a sentence
    style paragraph repeating the same identifiers, and then the column
    headers followed by one numbered line per non-empty row.

    Args:
        table_data: Mapping read for ``'table_number'`` (default
            ``'unknown'``), ``'table_title'`` and ``'section'`` (default
            ``''``).
        headers: Column headers; rendered with ``str`` and joined by
            ``' | '``. Skipped when empty.
        rows: Rows to render. ``dict`` rows are written as ``key: value``
            pairs and ``list`` rows as bare values; rows of any other type
            are ignored. Row numbers follow the position in ``rows``
            (starting at 1), including skipped rows.
        doc_id: Document identifier to embed in the text.
        table_identifier: Table identifier to embed in the text.
        chunk_info: Optional free-text note appended to the paragraph.

    Returns:
        The formatted text as a single string.
    """
    blank_markers = {'nan', 'none', '', 'null'}

    def has_value(cell):
        # Emptiness is decided from the rendered text only, so numeric zeros
        # and False are kept; None renders as 'none' and is still dropped.
        return str(cell).strip().lower() not in blank_markers

    table_num = table_data.get('table_number', 'unknown')
    table_title = table_data.get('table_title', '')
    section = table_data.get('section', '')
    separator = '=' * 70

    # Work on the string form so a non-string doc_id (e.g. None) cannot
    # raise on the membership test below.
    doc_text = str(doc_id)
    doc_variations = [doc_text]
    if 'Р' in doc_text:
        doc_variations.append(doc_text.replace(' Р ', ' Р'))
        doc_variations.append(doc_text.replace(' Р ', 'Р'))

    pieces = [
        f"ДОКУМЕНТ: {doc_id}\n",
        f"ТАБЛИЦА: {table_identifier}\n",
    ]
    # dict.fromkeys removes duplicates while keeping insertion order, which
    # makes the output identical from run to run (a set would not).
    pieces.extend(
        f"ДОКУМЕНТ_ВАРИАНТ: {variant}\n" for variant in dict.fromkeys(doc_variations)
    )

    if table_title:
        pieces.append(f"НАЗВАНИЕ: {table_title}\n")
    if section:
        pieces.append(f"РАЗДЕЛ: {section}\n")

    pieces.append(f"{separator}\n\n")

    # Sentence-style restatement of the identifiers.
    pieces.append(f"Документ {doc_id}. ")
    pieces.append(f"Таблица {table_identifier}. ")
    pieces.append(f"Номер таблицы {table_num}. ")

    if table_title:
        pieces.append(f"Название: {table_title}. ")
    if section:
        pieces.append(f"Раздел: {section}. ")

    pieces.append(f"Таблицы документа {doc_id}. ")
    pieces.append(f"Содержание {doc_id}. ")

    if chunk_info:
        pieces.append(f"{chunk_info}. ")

    pieces.append(f"\n\nДАННЫЕ ТАБЛИЦЫ {table_identifier}:\n{separator}\n\n")

    if headers:
        header_line = ' | '.join(str(h) for h in headers)
        pieces.append(f"СТОЛБЦЫ: {header_line}\n\n")

    for idx, row in enumerate(rows, 1):
        if isinstance(row, dict):
            parts = [f"{k}: {v}" for k, v in row.items() if has_value(v)]
        elif isinstance(row, list):
            parts = [str(v) for v in row if has_value(v)]
        else:
            continue
        if parts:
            row_line = ' | '.join(parts)
            pieces.append(f"{idx}. {row_line}\n")

    return ''.join(pieces)
|
| 207 |
|
| 208 |
def load_json_documents(repo_id, hf_token, json_dir):
|
|
|
|
| 334 |
return documents
|
| 335 |
|
| 336 |
def extract_sections_from_json(json_path):
|
|
|
|
| 337 |
documents = []
|
| 338 |
|
| 339 |
try:
|
|
|
|
| 341 |
data = json.load(f)
|
| 342 |
|
| 343 |
doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
|
| 344 |
+
doc_id = normalize_doc_id(doc_id) # NORMALIZE
|
| 345 |
|
|
|
|
| 346 |
for section in data.get('sections', []):
|
| 347 |
if section.get('section_text', '').strip():
|
| 348 |
documents.append(Document(
|
|
|
|
| 350 |
metadata={
|
| 351 |
'type': 'text',
|
| 352 |
'document_id': doc_id,
|
| 353 |
+
'section_id': section.get('section_id', ''),
|
| 354 |
+
'chunk_size': len(section['section_text'])
|
| 355 |
}
|
| 356 |
))
|
| 357 |
|
|
|
|
| 358 |
for subsection in section.get('subsections', []):
|
| 359 |
if subsection.get('subsection_text', '').strip():
|
| 360 |
documents.append(Document(
|
|
|
|
| 362 |
metadata={
|
| 363 |
'type': 'text',
|
| 364 |
'document_id': doc_id,
|
| 365 |
+
'section_id': subsection.get('subsection_id', ''),
|
| 366 |
+
'chunk_size': len(subsection['subsection_text'])
|
| 367 |
}
|
| 368 |
))
|
| 369 |
|
|
|
|
| 370 |
for sub_sub in subsection.get('sub_subsections', []):
|
| 371 |
if sub_sub.get('sub_subsection_text', '').strip():
|
| 372 |
documents.append(Document(
|
|
|
|
| 374 |
metadata={
|
| 375 |
'type': 'text',
|
| 376 |
'document_id': doc_id,
|
| 377 |
+
'section_id': sub_sub.get('sub_subsection_id', ''),
|
| 378 |
+
'chunk_size': len(sub_sub['sub_subsection_text'])
|
| 379 |
}
|
| 380 |
))
|
| 381 |
|
|
|
|
| 386 |
|
| 387 |
|
| 388 |
def load_table_documents(repo_id, hf_token, table_dir):
|
|
|
|
| 389 |
log_message("Loading tables...")
|
| 390 |
|
| 391 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 392 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 393 |
|
| 394 |
all_chunks = []
|
| 395 |
+
doc_table_count = {}
|
| 396 |
+
|
| 397 |
for file_path in table_files:
|
| 398 |
try:
|
| 399 |
local_path = hf_hub_download(
|
|
|
|
| 406 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 407 |
data = json.load(f)
|
| 408 |
|
| 409 |
+
file_doc_id = normalize_doc_id(data.get('document_id', data.get('document', 'unknown')))
|
|
|
|
| 410 |
|
| 411 |
for sheet in data.get('sheets', []):
|
| 412 |
+
sheet_doc_id = normalize_doc_id(sheet.get('document_id', sheet.get('document', file_doc_id)))
|
|
|
|
| 413 |
|
|
|
|
| 414 |
chunks = chunk_table_by_rows(sheet, sheet_doc_id)
|
| 415 |
all_chunks.extend(chunks)
|
| 416 |
|
| 417 |
+
if sheet_doc_id not in doc_table_count:
|
| 418 |
+
doc_table_count[sheet_doc_id] = 0
|
| 419 |
+
doc_table_count[sheet_doc_id] += len(chunks)
|
| 420 |
+
|
| 421 |
except Exception as e:
|
| 422 |
log_message(f"Error loading {file_path}: {e}")
|
| 423 |
|
| 424 |
+
log_message(f"\n{'='*60}")
|
| 425 |
+
log_message("TABLE LOADING SUMMARY:")
|
| 426 |
+
for doc_id, count in sorted(doc_table_count.items()):
|
| 427 |
+
log_message(f" {doc_id}: {count} table chunks")
|
| 428 |
+
log_message(f"TOTAL: {len(all_chunks)} table chunks")
|
| 429 |
+
log_message(f"{'='*60}\n")
|
| 430 |
+
|
| 431 |
return all_chunks
|
| 432 |
|
| 433 |
|
utils.py
CHANGED
|
@@ -41,33 +41,19 @@ def preprocess_query(question):
|
|
| 41 |
|
| 42 |
question_lower = question.lower()
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
r'табли[цу]\w*\s+([а-яa-z0-9\.]+)(?:\s+(?:из\s+)?приложени[яеий]\s+(\d+|[а-я]))?',
|
| 47 |
-
r'табли[цу]\w*\s+(?:№|номер)?\s*([а-яa-z0-9\.]+)',
|
| 48 |
-
]
|
| 49 |
-
|
| 50 |
-
doc_match = re.search(r'(гост|нп|му)[^\s]*\s*[рp№-]*\s*([0-9\.-]+)', question_lower)
|
| 51 |
|
| 52 |
enhanced_query = question
|
| 53 |
|
| 54 |
-
for pattern in table_patterns:
|
| 55 |
-
table_match = re.search(pattern, question_lower)
|
| 56 |
-
if table_match:
|
| 57 |
-
table_num = table_match.group(1).upper()
|
| 58 |
-
enhanced_query += f" таблица номер {table_num}"
|
| 59 |
-
|
| 60 |
-
# Add appendix context if mentioned
|
| 61 |
-
if len(table_match.groups()) > 1 and table_match.group(2):
|
| 62 |
-
appendix_num = table_match.group(2).upper()
|
| 63 |
-
enhanced_query += f" приложение {appendix_num}"
|
| 64 |
-
break
|
| 65 |
-
|
| 66 |
if doc_match:
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
return enhanced_query
|
| 73 |
|
|
@@ -119,7 +105,7 @@ def answer_question(question, query_engine, reranker):
|
|
| 119 |
context_parts.append(f"{source_label}\n{n.text}")
|
| 120 |
|
| 121 |
context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
|
| 122 |
-
|
| 123 |
prompt = f"""Ты эксперт по технической документации.
|
| 124 |
|
| 125 |
КОНТЕКСТ:
|
|
@@ -129,10 +115,8 @@ def answer_question(question, query_engine, reranker):
|
|
| 129 |
|
| 130 |
ИНСТРУКЦИИ:
|
| 131 |
1. Используй ТОЛЬКО контекст выше
|
| 132 |
-
2.
|
| 133 |
-
3.
|
| 134 |
-
4. Если таблица разбита на части - объедини информацию
|
| 135 |
-
5. Если информации нет - четко скажи об этом
|
| 136 |
|
| 137 |
ОТВЕТ:"""
|
| 138 |
|
|
|
|
| 41 |
|
| 42 |
question_lower = question.lower()
|
| 43 |
|
| 44 |
+
# Extract document ID and normalize
|
| 45 |
+
doc_match = re.search(r'(гост|нп|му)\s*р?\s*[№-]*\s*([0-9\.-]+)', question_lower)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
enhanced_query = question
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
if doc_match:
|
| 50 |
+
doc_type = doc_match.group(1).upper()
|
| 51 |
+
doc_num = doc_match.group(2)
|
| 52 |
+
|
| 53 |
+
# Add normalized versions
|
| 54 |
+
enhanced_query += f" {doc_type} Р {doc_num}"
|
| 55 |
+
enhanced_query += f" {doc_type}Р {doc_num}"
|
| 56 |
+
enhanced_query += f" {doc_type} {doc_num}"
|
| 57 |
|
| 58 |
return enhanced_query
|
| 59 |
|
|
|
|
| 105 |
context_parts.append(f"{source_label}\n{n.text}")
|
| 106 |
|
| 107 |
context = "\n\n" + ("="*70 + "\n\n").join(context_parts)
|
| 108 |
+
from config import CUSTOM_PROMPT
|
| 109 |
prompt = f"""Ты эксперт по технической документации.
|
| 110 |
|
| 111 |
КОНТЕКСТ:
|
|
|
|
| 115 |
|
| 116 |
ИНСТРУКЦИИ:
|
| 117 |
1. Используй ТОЛЬКО контекст выше
|
| 118 |
+
2. Укажи источник: документ и номер таблицы
|
| 119 |
+
3. Если информации нет - четко скажи об этом
|
|
|
|
|
|
|
| 120 |
|
| 121 |
ОТВЕТ:"""
|
| 122 |
|