Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files
- split_files_to_excel.py +7 -4
split_files_to_excel.py
CHANGED
|
@@ -359,10 +359,13 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
| 359 |
is_first_chunk = True # Keep track of the first chunk in the document
|
| 360 |
to_encode += doc.page_content
|
| 361 |
# if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
|
|
|
|
|
|
|
|
|
| 366 |
#print(f"to_encode:\n{to_encode}")
|
| 367 |
encoded = tokenizer.encode(to_encode)#encode the current document
|
| 368 |
if len(encoded) < min_chunk_size and not skip_next:
|
|
|
|
| 359 |
is_first_chunk = True # Keep track of the first chunk in the document
|
| 360 |
to_encode += doc.page_content
|
| 361 |
# if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
|
| 362 |
+
try:
|
| 363 |
+
if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
|
| 364 |
+
# print('SAME DOC')
|
| 365 |
+
skip_next = True
|
| 366 |
+
to_encode += documents[i+1].page_content
|
| 367 |
+
except Exception as e:
|
| 368 |
+
print(e)
|
| 369 |
#print(f"to_encode:\n{to_encode}")
|
| 370 |
encoded = tokenizer.encode(to_encode)#encode the current document
|
| 371 |
if len(encoded) < min_chunk_size and not skip_next:
|