Spaces:
Sleeping
Sleeping
Commit
·
05822e9
1
Parent(s):
7565a55
max chars = 2000 + removed normalize_doc_id
Browse files
- documents_prep.py +4 -25
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 256
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -38,21 +38,6 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def normalize_doc_id(doc_id):
|
| 42 |
-
"""Normalize document ID for consistent matching"""
|
| 43 |
-
if not doc_id or doc_id == 'unknown':
|
| 44 |
-
return doc_id
|
| 45 |
-
|
| 46 |
-
doc_id = str(doc_id).strip()
|
| 47 |
-
|
| 48 |
-
# Normalize spacing: "ГОСТ Р" variations
|
| 49 |
-
import re
|
| 50 |
-
doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
|
| 51 |
-
doc_id = re.sub(r'НП\s*-', 'НП-', doc_id, flags=re.IGNORECASE)
|
| 52 |
-
|
| 53 |
-
return doc_id
|
| 54 |
-
|
| 55 |
-
|
| 56 |
def chunk_table_by_content(table_data, doc_id, max_chars=2000):
|
| 57 |
"""Chunk tables by content size instead of rows"""
|
| 58 |
headers = table_data.get('headers', [])
|
|
@@ -60,9 +45,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
|
|
| 60 |
table_num = table_data.get('table_number', 'unknown')
|
| 61 |
table_title = table_data.get('table_title', '')
|
| 62 |
section = table_data.get('section', '')
|
| 63 |
-
|
| 64 |
-
# NORMALIZE document ID
|
| 65 |
-
doc_id = normalize_doc_id(doc_id)
|
| 66 |
table_num_clean = str(table_num).strip()
|
| 67 |
|
| 68 |
# Create section-aware identifier
|
|
@@ -85,7 +68,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
|
|
| 85 |
# Calculate base metadata size (everything except row data)
|
| 86 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 87 |
base_size = len(base_content)
|
| 88 |
-
available_space = max_chars - base_size - 200
|
| 89 |
|
| 90 |
# If entire table fits, return as one chunk
|
| 91 |
full_rows_content = format_table_rows(rows)
|
|
@@ -239,10 +222,7 @@ def format_table_footer(table_identifier, doc_id):
|
|
| 239 |
"""Format table footer"""
|
| 240 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 241 |
|
| 242 |
-
|
| 243 |
-
# Update load_table_documents to use new function
|
| 244 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 245 |
-
"""Load and chunk tables by content size"""
|
| 246 |
log_message("Loading tables...")
|
| 247 |
|
| 248 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
@@ -266,8 +246,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 266 |
for sheet in data.get('sheets', []):
|
| 267 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 268 |
|
| 269 |
-
|
| 270 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2500)
|
| 271 |
all_chunks.extend(chunks)
|
| 272 |
|
| 273 |
except Exception as e:
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1500
|
| 11 |
CHUNK_OVERLAP = 256
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def chunk_table_by_content(table_data, doc_id, max_chars=2000):
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
|
|
|
| 45 |
table_num = table_data.get('table_number', 'unknown')
|
| 46 |
table_title = table_data.get('table_title', '')
|
| 47 |
section = table_data.get('section', '')
|
| 48 |
+
|
|
|
|
|
|
|
| 49 |
table_num_clean = str(table_num).strip()
|
| 50 |
|
| 51 |
# Create section-aware identifier
|
|
|
|
| 68 |
# Calculate base metadata size (everything except row data)
|
| 69 |
base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
|
| 70 |
base_size = len(base_content)
|
| 71 |
+
available_space = max_chars - base_size - 200
|
| 72 |
|
| 73 |
# If entire table fits, return as one chunk
|
| 74 |
full_rows_content = format_table_rows(rows)
|
|
|
|
| 222 |
"""Format table footer"""
|
| 223 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 224 |
|
|
|
|
|
|
|
| 225 |
def load_table_documents(repo_id, hf_token, table_dir):
|
|
|
|
| 226 |
log_message("Loading tables...")
|
| 227 |
|
| 228 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
|
|
| 246 |
for sheet in data.get('sheets', []):
|
| 247 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 248 |
|
| 249 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2000)
|
|
|
|
| 250 |
all_chunks.extend(chunks)
|
| 251 |
|
| 252 |
except Exception as e:
|