MrSimple07 committed on
Commit
2b217eb
·
1 Parent(s): 634c04c

chunk size = 2048 + rows=15

Browse files
Files changed (1) hide show
  1. documents_prep.py +5 -4
documents_prep.py CHANGED
@@ -6,13 +6,14 @@ from llama_index.core import Document
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
- CHUNK_SIZE = 2048
 
10
  CHUNK_OVERLAP = 128
11
 
12
  def chunk_text_documents(documents):
13
  text_splitter = SentenceSplitter(
14
  chunk_size=CHUNK_SIZE,
15
- chunk_overlap=CHUNK_OVERLAP,
16
  )
17
 
18
  chunked = []
@@ -22,7 +23,7 @@ def chunk_text_documents(documents):
22
  chunk.metadata.update({
23
  'chunk_id': i,
24
  'total_chunks': len(chunks),
25
- 'chunk_size': len(chunk.text)
26
  })
27
  chunked.append(chunk)
28
 
@@ -37,7 +38,7 @@ def chunk_text_documents(documents):
37
  return chunked
38
 
39
 
40
- def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=15, max_chars=3000):
41
  """
42
  Chunk tables by rows with fallback to character limit.
43
  Keeps 3-4 rows together, but splits individual rows if they're too large.
 
6
  from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
+ # Configuration
10
+ CHUNK_SIZE = 1500
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
14
  text_splitter = SentenceSplitter(
15
  chunk_size=CHUNK_SIZE,
16
+ chunk_overlap=CHUNK_OVERLAP
17
  )
18
 
19
  chunked = []
 
23
  chunk.metadata.update({
24
  'chunk_id': i,
25
  'total_chunks': len(chunks),
26
+ 'chunk_size': len(chunk.text) # Add chunk size
27
  })
28
  chunked.append(chunk)
29
 
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=10, max_chars=2000):
42
  """
43
  Chunk tables by rows with fallback to character limit.
44
  Keeps 3-4 rows together, but splits individual rows if they're too large.