MrSimple07 commited on
Commit
7f19939
·
1 Parent(s): 2d1ebe6

chunk size = 1000, max_chars = 1500

Browse files
Files changed (1) hide show
  1. documents_prep.py +3 -3
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 2000
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_content(table_data, doc_id, max_chars=3000):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
@@ -246,7 +246,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
246
  for sheet in data.get('sheets', []):
247
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
 
249
- chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=3000)
250
  all_chunks.extend(chunks)
251
 
252
  except Exception as e:
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1000
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_content(table_data, doc_id, max_chars=1500):
42
  """Chunk tables by content size instead of rows"""
43
  headers = table_data.get('headers', [])
44
  rows = table_data.get('data', [])
 
246
  for sheet in data.get('sheets', []):
247
  sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
248
 
249
+ chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1500)
250
  all_chunks.extend(chunks)
251
 
252
  except Exception as e: