Spaces:
Sleeping
Sleeping
Commit ·
9c77451
1
Parent(s): 60178fd
fixing normalizing hyphens
Browse files
- documents_prep.py +28 -24
- index_retriever.py +13 -6
- utils.py +18 -2
documents_prep.py
CHANGED
|
@@ -39,16 +39,17 @@ def normalize_connection_type(s):
|
|
| 39 |
s = s.replace('С', 'C').replace('с', 'c')
|
| 40 |
s = s.replace('У', 'U').replace('у', 'u')
|
| 41 |
s = s.replace('Т', 'T').replace('т', 't')
|
| 42 |
-
# REMOVE HYPHENS for consistent tokenization
|
| 43 |
s = s.replace('-', '')
|
| 44 |
return s
|
| 45 |
|
| 46 |
def extract_connection_type(text):
|
| 47 |
import re
|
| 48 |
-
# Match with or without
|
| 49 |
-
match = re.search(r'[СCс]-?\d+(?:-\d+)
|
| 50 |
if match:
|
| 51 |
-
|
|
|
|
| 52 |
return ''
|
| 53 |
|
| 54 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
|
@@ -181,23 +182,17 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 181 |
return chunks
|
| 182 |
|
| 183 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 184 |
-
# Start with clear identification
|
| 185 |
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 186 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 187 |
|
| 188 |
-
# Extract and emphasize the connection type if present
|
| 189 |
if table_title:
|
| 190 |
content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
|
| 191 |
|
| 192 |
-
#
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
# NORMALIZE: Convert Cyrillic to Latin for consistency
|
| 198 |
-
connection_type_normalized = connection_type.replace('С', 'C').replace('У', 'U').replace('Т', 'T')
|
| 199 |
-
# Show BOTH in content for searchability
|
| 200 |
-
content += f"ТИП СОЕДИНЕНИЯ: {connection_type} ({connection_type_normalized})\n"
|
| 201 |
|
| 202 |
if table_num and table_num != table_identifier:
|
| 203 |
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
|
@@ -207,7 +202,6 @@ def format_table_header(doc_id, table_identifier, table_num, table_title, sectio
|
|
| 207 |
|
| 208 |
content += f"\n{'='*70}\n"
|
| 209 |
|
| 210 |
-
# Add headers with better formatting
|
| 211 |
if headers:
|
| 212 |
content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
|
| 213 |
for i, h in enumerate(headers, 1):
|
|
@@ -432,6 +426,8 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 432 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 433 |
|
| 434 |
all_chunks = []
|
|
|
|
|
|
|
| 435 |
for file_path in table_files:
|
| 436 |
try:
|
| 437 |
local_path = hf_hub_download(
|
|
@@ -448,27 +444,35 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 448 |
|
| 449 |
for sheet in data.get('sheets', []):
|
| 450 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
|
|
|
|
|
|
| 451 |
|
| 452 |
-
# Use the consistent MAX_CHARS_TABLE from config
|
| 453 |
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
|
| 454 |
all_chunks.extend(chunks)
|
| 455 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
except Exception as e:
|
| 457 |
log_message(f"Error loading {file_path}: {e}")
|
| 458 |
|
| 459 |
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 460 |
|
| 461 |
log_message("="*60)
|
| 462 |
-
log_message("CONNECTION
|
| 463 |
-
for
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
log_message(f"
|
| 468 |
log_message("="*60)
|
|
|
|
| 469 |
return all_chunks
|
| 470 |
|
| 471 |
-
|
| 472 |
def load_image_documents(repo_id, hf_token, image_dir):
|
| 473 |
"""Load image descriptions"""
|
| 474 |
log_message("Loading images...")
|
|
|
|
| 39 |
s = s.replace('С', 'C').replace('с', 'c')
|
| 40 |
s = s.replace('У', 'U').replace('у', 'u')
|
| 41 |
s = s.replace('Т', 'T').replace('т', 't')
|
| 42 |
+
# REMOVE ALL HYPHENS for consistent tokenization
|
| 43 |
s = s.replace('-', '')
|
| 44 |
return s
|
| 45 |
|
| 46 |
def extract_connection_type(text):
|
| 47 |
import re
|
| 48 |
+
# Match pattern with or without hyphens: C-25, C-25-1, С25, etc.
|
| 49 |
+
match = re.search(r'[СCс]-?\d+(?:-\d+)*', text)
|
| 50 |
if match:
|
| 51 |
+
normalized = normalize_connection_type(match.group(0))
|
| 52 |
+
return normalized
|
| 53 |
return ''
|
| 54 |
|
| 55 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
|
|
|
| 182 |
return chunks
|
| 183 |
|
| 184 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
|
|
|
| 185 |
content = f"ДОКУМЕНТ: {doc_id}\n"
|
| 186 |
content += f"ТАБЛИЦА: {table_identifier}\n"
|
| 187 |
|
|
|
|
| 188 |
if table_title:
|
| 189 |
content += f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n"
|
| 190 |
|
| 191 |
+
# Extract and normalize connection type
|
| 192 |
+
connection_type = extract_connection_type(table_title)
|
| 193 |
+
if connection_type:
|
| 194 |
+
# Show normalized version for searchability
|
| 195 |
+
content += f"ТИП СОЕДИНЕНИЯ: {connection_type}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
if table_num and table_num != table_identifier:
|
| 198 |
content += f"НОМЕР ТАБЛИЦЫ: {table_num}\n"
|
|
|
|
| 202 |
|
| 203 |
content += f"\n{'='*70}\n"
|
| 204 |
|
|
|
|
| 205 |
if headers:
|
| 206 |
content += "СТОЛБЦЫ ТАБЛИЦЫ:\n"
|
| 207 |
for i, h in enumerate(headers, 1):
|
|
|
|
| 426 |
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 427 |
|
| 428 |
all_chunks = []
|
| 429 |
+
connection_type_sources = {} # Track which table each type comes from
|
| 430 |
+
|
| 431 |
for file_path in table_files:
|
| 432 |
try:
|
| 433 |
local_path = hf_hub_download(
|
|
|
|
| 444 |
|
| 445 |
for sheet in data.get('sheets', []):
|
| 446 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 447 |
+
table_num = sheet.get('table_number', 'unknown')
|
| 448 |
+
table_title = sheet.get('table_title', '')
|
| 449 |
|
|
|
|
| 450 |
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE)
|
| 451 |
all_chunks.extend(chunks)
|
| 452 |
|
| 453 |
+
# Track connection type source
|
| 454 |
+
conn_type = extract_connection_type(table_title)
|
| 455 |
+
if conn_type:
|
| 456 |
+
if conn_type not in connection_type_sources:
|
| 457 |
+
connection_type_sources[conn_type] = []
|
| 458 |
+
connection_type_sources[conn_type].append(f"{sheet_doc_id} Table {table_num}")
|
| 459 |
+
|
| 460 |
except Exception as e:
|
| 461 |
log_message(f"Error loading {file_path}: {e}")
|
| 462 |
|
| 463 |
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 464 |
|
| 465 |
log_message("="*60)
|
| 466 |
+
log_message("CONNECTION TYPES AND THEIR SOURCES:")
|
| 467 |
+
for conn_type in sorted(connection_type_sources.keys()):
|
| 468 |
+
sources = connection_type_sources[conn_type]
|
| 469 |
+
log_message(f" {conn_type}: {len(sources)} tables")
|
| 470 |
+
for src in sources:
|
| 471 |
+
log_message(f" - {src}")
|
| 472 |
log_message("="*60)
|
| 473 |
+
|
| 474 |
return all_chunks
|
| 475 |
|
|
|
|
| 476 |
def load_image_documents(repo_id, hf_token, image_dir):
|
| 477 |
"""Load image descriptions"""
|
| 478 |
log_message("Loading images...")
|
index_retriever.py
CHANGED
|
@@ -11,25 +11,32 @@ from config import CUSTOM_PROMPT, PROMPT_SIMPLE_POISK
|
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
|
| 14 |
-
|
| 15 |
-
connection_types = {}
|
| 16 |
table_count = 0
|
|
|
|
| 17 |
for doc in documents:
|
| 18 |
if doc.metadata.get('type') == 'table':
|
| 19 |
table_count += 1
|
| 20 |
conn_type = doc.metadata.get('connection_type', '')
|
| 21 |
if conn_type:
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
log_message("="*60)
|
| 25 |
log_message(f"INDEXING {table_count} TABLE CHUNKS")
|
| 26 |
-
log_message("CONNECTION TYPES IN INDEX:")
|
| 27 |
-
for conn_type
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
| 29 |
log_message("="*60)
|
| 30 |
|
| 31 |
return VectorStoreIndex.from_documents(documents)
|
| 32 |
|
|
|
|
| 33 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 34 |
if not nodes or not reranker:
|
| 35 |
return nodes[:top_k]
|
|
|
|
| 11 |
def create_vector_index(documents):
|
| 12 |
log_message("Строю векторный индекс")
|
| 13 |
|
| 14 |
+
connection_type_sources = {}
|
|
|
|
| 15 |
table_count = 0
|
| 16 |
+
|
| 17 |
for doc in documents:
|
| 18 |
if doc.metadata.get('type') == 'table':
|
| 19 |
table_count += 1
|
| 20 |
conn_type = doc.metadata.get('connection_type', '')
|
| 21 |
if conn_type:
|
| 22 |
+
table_id = f"{doc.metadata.get('document_id', 'unknown')} Table {doc.metadata.get('table_number', 'N/A')}"
|
| 23 |
+
if conn_type not in connection_type_sources:
|
| 24 |
+
connection_type_sources[conn_type] = []
|
| 25 |
+
connection_type_sources[conn_type].append(table_id)
|
| 26 |
|
| 27 |
log_message("="*60)
|
| 28 |
log_message(f"INDEXING {table_count} TABLE CHUNKS")
|
| 29 |
+
log_message("CONNECTION TYPES IN INDEX WITH SOURCES:")
|
| 30 |
+
for conn_type in sorted(connection_type_sources.keys()):
|
| 31 |
+
sources = list(set(connection_type_sources[conn_type])) # Unique sources
|
| 32 |
+
log_message(f" {conn_type}: {len(connection_type_sources[conn_type])} chunks from {len(sources)} tables")
|
| 33 |
+
for src in sources:
|
| 34 |
+
log_message(f" - {src}")
|
| 35 |
log_message("="*60)
|
| 36 |
|
| 37 |
return VectorStoreIndex.from_documents(documents)
|
| 38 |
|
| 39 |
+
|
| 40 |
def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5):
|
| 41 |
if not nodes or not reranker:
|
| 42 |
return nodes[:top_k]
|
utils.py
CHANGED
|
@@ -173,6 +173,16 @@ def deduplicate_nodes(nodes):
|
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 177 |
if query_engine is None:
|
| 178 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
|
@@ -180,8 +190,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 180 |
try:
|
| 181 |
start_time = time.time()
|
| 182 |
|
| 183 |
-
#
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
log_message(f"user query: {question}")
|
| 186 |
|
| 187 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
|
|
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
|
| 176 |
+
def normalize_query(query):
|
| 177 |
+
"""Normalize query to match stored format"""
|
| 178 |
+
import re
|
| 179 |
+
# Replace Cyrillic connection types with Latin
|
| 180 |
+
query = query.replace('С-', 'C-').replace('с-', 'c-')
|
| 181 |
+
query = query.replace('У-', 'U-').replace('у-', 'u-')
|
| 182 |
+
query = query.replace('Т-', 'T-').replace('т-', 't-')
|
| 183 |
+
return query
|
| 184 |
+
|
| 185 |
+
|
| 186 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 187 |
if query_engine is None:
|
| 188 |
return "<div style='background-color: #e53e3e; color: white; padding: 20px; border-radius: 10px;'>Система не инициализирована</div>", "", ""
|
|
|
|
| 190 |
try:
|
| 191 |
start_time = time.time()
|
| 192 |
|
| 193 |
+
# NORMALIZE QUERY: Convert Cyrillic to Latin
|
| 194 |
+
normalized_question = normalize_query(question)
|
| 195 |
+
log_message(f"Original query: {question}")
|
| 196 |
+
if normalized_question != question:
|
| 197 |
+
log_message(f"Normalized query: {normalized_question}")
|
| 198 |
+
|
| 199 |
+
# Use normalized query for retrieval
|
| 200 |
+
retrieved_nodes = query_engine.retriever.retrieve(normalized_question)
|
| 201 |
log_message(f"user query: {question}")
|
| 202 |
|
| 203 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|