MrSimple07 committed on
Commit
05822e9
·
1 Parent(s): 7565a55

max chars = 2000 + removed normalize_doc_id

Browse files
Files changed (1) hide show
  1. documents_prep.py +4 -25
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 1024
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
@@ -38,21 +38,6 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def normalize_doc_id(doc_id):
42
- """Normalize document ID for consistent matching"""
43
- if not doc_id or doc_id == 'unknown':
44
- return doc_id
45
-
46
- doc_id = str(doc_id).strip()
47
-
48
- # Normalize spacing: "ГОСТ Р" variations
49
- import re
50
- doc_id = re.sub(r'ГОСТ\s*Р', 'ГОСТ Р', doc_id, flags=re.IGNORECASE)
51
- doc_id = re.sub(r'НП\s*-', 'НП-', doc_id, flags=re.IGNORECASE)
52
-
53
- return doc_id
54
-
55
-
56
  def chunk_table_by_content(table_data, doc_id, max_chars=2000):
57
  """Chunk tables by content size instead of rows"""
58
  headers = table_data.get('headers', [])
@@ -60,9 +45,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
60
  table_num = table_data.get('table_number', 'unknown')
61
  table_title = table_data.get('table_title', '')
62
  section = table_data.get('section', '')
63
-
64
- # NORMALIZE document ID
65
- doc_id = normalize_doc_id(doc_id)
66
  table_num_clean = str(table_num).strip()
67
 
68
  # Create section-aware identifier
@@ -85,7 +68,7 @@ def chunk_table_by_content(table_data, doc_id, max_chars=2000):
85
  # Calculate base metadata size (everything except row data)
86
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
87
  base_size = len(base_content)
88
- available_space = max_chars - base_size - 200 # Reserve 200 chars for footer
89
 
90
  # If entire table fits, return as one chunk
91
  full_rows_content = format_table_rows(rows)
@@ -239,10 +222,7 @@ def format_table_footer(table_identifier, doc_id):
239
  """Format table footer"""
240
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
241
 
242
-
243
- # Update load_table_documents to use new function
244
  def load_table_documents(repo_id, hf_token, table_dir):
245
- """Load and chunk tables by content size"""
246
  log_message("Loading tables...")
247
 
248
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -266,8 +246,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
266
  for sheet in data.get('sheets', []):
267
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
268
 
269
- # Use content-based chunking instead of row-based
270
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2500)
271
  all_chunks.extend(chunks)
272
 
273
  except Exception as e:
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1500
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
 
38
  return chunked
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def chunk_table_by_content(table_data, doc_id, max_chars=2000):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
 
45
  table_num = table_data.get('table_number', 'unknown')
46
  table_title = table_data.get('table_title', '')
47
  section = table_data.get('section', '')
48
+
 
 
49
  table_num_clean = str(table_num).strip()
50
 
51
  # Create section-aware identifier
 
68
  # Calculate base metadata size (everything except row data)
69
  base_content = format_table_header(doc_id, table_identifier, table_num, table_title, section, headers)
70
  base_size = len(base_content)
71
+ available_space = max_chars - base_size - 200
72
 
73
  # If entire table fits, return as one chunk
74
  full_rows_content = format_table_rows(rows)
 
222
  """Format table footer"""
223
  return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
224
 
 
 
225
  def load_table_documents(repo_id, hf_token, table_dir):
 
226
  log_message("Loading tables...")
227
 
228
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
 
246
  for sheet in data.get('sheets', []):
247
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
 
249
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=2000)
 
250
  all_chunks.extend(chunks)
251
 
252
  except Exception as e: