Spaces:
Sleeping
Sleeping
Commit
·
7f19939
1
Parent(s):
2d1ebe6
chunk size = 1000, max_chars = 1500
Browse files - documents_prep.py +3 -3
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 256
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def chunk_table_by_content(table_data, doc_id, max_chars=
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
|
@@ -246,7 +246,7 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 246 |
for sheet in data.get('sheets', []):
|
| 247 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 248 |
|
| 249 |
-
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=
|
| 250 |
all_chunks.extend(chunks)
|
| 251 |
|
| 252 |
except Exception as e:
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1000
|
| 11 |
CHUNK_OVERLAP = 256
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def chunk_table_by_content(table_data, doc_id, max_chars=1500):
|
| 42 |
"""Chunk tables by content size instead of rows"""
|
| 43 |
headers = table_data.get('headers', [])
|
| 44 |
rows = table_data.get('data', [])
|
|
|
|
| 246 |
for sheet in data.get('sheets', []):
|
| 247 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 248 |
|
| 249 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1500)
|
| 250 |
all_chunks.extend(chunks)
|
| 251 |
|
| 252 |
except Exception as e:
|