Spaces:
Sleeping
Sleeping
Commit
·
8d6a517
1
Parent(s):
6ff1953
Added debugging functions for the С-25 connection type
Browse files- documents_prep.py +38 -14
- utils.py +30 -0
documents_prep.py
CHANGED
|
@@ -34,6 +34,11 @@ def chunk_text_documents(documents):
|
|
| 34 |
|
| 35 |
return chunked
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 39 |
headers = table_data.get('headers', [])
|
|
@@ -41,6 +46,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 41 |
table_num = table_data.get('table_number', 'unknown')
|
| 42 |
table_title = table_data.get('table_title', '')
|
| 43 |
section = table_data.get('section', '')
|
|
|
|
| 44 |
|
| 45 |
table_num_clean = str(table_num).strip()
|
| 46 |
|
|
@@ -60,8 +66,13 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 60 |
|
| 61 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 62 |
|
| 63 |
-
# Calculate base metadata size
|
| 64 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
base_size = len(base_content)
|
| 66 |
available_space = max_chars - base_size - 200
|
| 67 |
|
|
@@ -79,7 +90,9 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 79 |
'section': section,
|
| 80 |
'total_rows': len(rows),
|
| 81 |
'chunk_size': len(content),
|
| 82 |
-
'is_complete_table': True
|
|
|
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
@@ -155,27 +168,38 @@ def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_ro
|
|
| 155 |
|
| 156 |
return chunks
|
| 157 |
|
| 158 |
-
|
| 159 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
if table_num:
|
| 164 |
-
content += f"ТИП: {table_num}\n"
|
| 165 |
|
|
|
|
| 166 |
if table_title:
|
| 167 |
-
content += f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
if section:
|
| 170 |
-
content += f"
|
| 171 |
|
| 172 |
-
content += f"{'='*70}\n"
|
| 173 |
|
|
|
|
| 174 |
if headers:
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
content += "
|
| 179 |
return content
|
| 180 |
|
| 181 |
|
|
|
|
| 34 |
|
| 35 |
return chunked
|
| 36 |
|
| 37 |
+
def extract_connection_type(text):
    """Extract a welded-connection type designation (e.g. "С-25", "У-14") from *text*.

    The pattern matches a Cyrillic С/У or Latin U/T/C prefix, an optional
    hyphen, digits, and an optional "-NN" suffix (e.g. "С-25-1").

    Args:
        text: String to scan; '' or None are tolerated.

    Returns:
        The first matching designation, or '' when none is found.
    """
    import re  # local import keeps the helper self-contained

    # Guard falsy input: re.search(None) would raise TypeError.
    if not text:
        return ''
    match = re.search(r'[СУUTC]-?\d+(?:-\d+)?', text)
    return match.group(0) if match else ''
|
| 42 |
|
| 43 |
def chunk_table_by_content(table_data, doc_id, max_chars=MAX_CHARS_TABLE, max_rows=MAX_ROWS_TABLE):
|
| 44 |
headers = table_data.get('headers', [])
|
|
|
|
| 46 |
table_num = table_data.get('table_number', 'unknown')
|
| 47 |
table_title = table_data.get('table_title', '')
|
| 48 |
section = table_data.get('section', '')
|
| 49 |
+
table_description = table_data.get('table_description', '') # NEW
|
| 50 |
|
| 51 |
table_num_clean = str(table_num).strip()
|
| 52 |
|
|
|
|
| 66 |
|
| 67 |
log_message(f" 📊 Processing: {doc_id} - {table_identifier} ({len(rows)} rows)")
|
| 68 |
|
| 69 |
+
# Calculate base metadata size - NOW INCLUDING DESCRIPTION
|
| 70 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 71 |
+
|
| 72 |
+
# ADD DESCRIPTION HERE if it exists
|
| 73 |
+
if table_description:
|
| 74 |
+
base_content += f"ОПИСАНИЕ: {table_description}\n\n"
|
| 75 |
+
|
| 76 |
base_size = len(base_content)
|
| 77 |
available_space = max_chars - base_size - 200
|
| 78 |
|
|
|
|
| 90 |
'section': section,
|
| 91 |
'total_rows': len(rows),
|
| 92 |
'chunk_size': len(content),
|
| 93 |
+
'is_complete_table': True,
|
| 94 |
+
'connection_type': extract_connection_type(table_title) if table_title else '' # NEW
|
| 95 |
+
|
| 96 |
}
|
| 97 |
|
| 98 |
log_message(f" Single chunk: {len(content)} chars, {len(rows)} rows")
|
|
|
|
| 168 |
|
| 169 |
return chunks
|
| 170 |
|
|
|
|
| 171 |
def format_table_header(doc_id, table_identifier, table_num, table_title, section, headers):
    """Build the metadata preamble that precedes a table's data rows.

    Emits document/table identification, the table title with the connection
    type parsed out of it (when present), the table number and section, a
    separator rule, numbered column headers, and finally the
    "ДАННЫЕ ТАБЛИЦЫ:" marker after which row data is appended by the caller.
    """
    import re

    parts = [
        f"ДОКУМЕНТ: {doc_id}\n",
        f"ТАБЛИЦА: {table_identifier}\n",
    ]

    if table_title:
        parts.append(f"НАЗВАНИЕ ТАБЛИЦЫ: {table_title}\n")
        # Surface the type designation (e.g. "С-25") on its own labelled
        # line so it is directly searchable/retrievable.
        hit = re.search(r'[СУUTC]-?\d+(?:-\d+)?', table_title)
        if hit:
            parts.append(f"ТИП СОЕДИНЕНИЯ: {hit.group(0)}\n")

    # Avoid repeating the identifier when the raw number equals it.
    if table_num and table_num != table_identifier:
        parts.append(f"НОМЕР ТАБЛИЦЫ: {table_num}\n")

    if section:
        parts.append(f"РАЗДЕЛ ДОКУМЕНТА: {section}\n")

    parts.append(f"\n{'='*70}\n")

    if headers:
        parts.append("СТОЛБЦЫ ТАБЛИЦЫ:\n")
        parts.extend(f" {i}. {h}\n" for i, h in enumerate(headers, 1))
        parts.append("\n")

    parts.append("ДАННЫЕ ТАБЛИЦЫ:\n")
    return "".join(parts)
|
| 204 |
|
| 205 |
|
utils.py
CHANGED
|
@@ -172,6 +172,28 @@ def deduplicate_nodes(nodes):
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 177 |
if query_engine is None:
|
|
@@ -186,6 +208,14 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
|
|
| 186 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 187 |
|
| 188 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 190 |
|
| 191 |
# Simple reranking
|
|
|
|
| 172 |
|
| 173 |
return unique_nodes
|
| 174 |
|
| 175 |
+
def debug_search_tables(vector_index, search_term="С-25"):
    """Log and return every table node whose content or title contains *search_term*.

    Scans all nodes in the index docstore, keeps only those whose metadata
    marks them as tables, and collects a small summary dict per match.
    Results are logged between '=' rules and also returned for inspection.
    """
    matching = []
    for node in vector_index.docstore.docs.values():
        if node.metadata.get('type') != 'table':
            continue
        title = node.metadata.get('table_title', '')
        # Match against either the rendered content or the title metadata.
        if search_term in node.get_content() or search_term in title:
            matching.append({
                'doc_id': node.metadata.get('document_id'),
                'table_num': node.metadata.get('table_number'),
                'title': title[:100],
            })

    log_message(f"\n{'='*60}")
    log_message(f"DEBUG: Found {len(matching)} tables containing '{search_term}'")
    for entry in matching:
        log_message(f" • {entry['doc_id']} - Table {entry['table_num']}: {entry['title']}")
    log_message(f"{'='*60}\n")

    return matching
|
| 197 |
|
| 198 |
def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
|
| 199 |
if query_engine is None:
|
|
|
|
| 208 |
log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
|
| 209 |
|
| 210 |
unique_retrieved = deduplicate_nodes(retrieved_nodes)
|
| 211 |
+
|
| 212 |
+
# DEBUG: Log what was retrieved
|
| 213 |
+
log_message(f"RETRIEVED: {len(unique_retrieved)} nodes")
|
| 214 |
+
for i, node in enumerate(unique_retrieved): # All debug
|
| 215 |
+
table_num = node.metadata.get('table_number', 'N/A')
|
| 216 |
+
table_title = node.metadata.get('table_title', 'N/A')
|
| 217 |
+
doc_id = node.metadata.get('document_id', 'N/A')
|
| 218 |
+
log_message(f" [{i+1}] {doc_id} - Table {table_num}: {table_title[:50]}")
|
| 219 |
log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
|
| 220 |
|
| 221 |
# Simple reranking
|