MrSimple07 commited on
Commit
7f19939
·
1 Parent(s): 2d1ebe6

chunk size = 1000, max_chars = 1500

Browse files
Files changed (1) hide show
  1. documents_prep.py +3 -3
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 2000
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=3000):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
@@ -246,7 +246,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
246
  for sheet in data.get('sheets', []):
247
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
 
249
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=3000)
250
  all_chunks.extend(chunks)
251
 
252
  except Exception as e:
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1000
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1500):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
 
246
  for sheet in data.get('sheets', []):
247
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
 
249
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1500)
250
  all_chunks.extend(chunks)
251
 
252
  except Exception as e: