Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

YchKhan committed on May 15, 2024

Commit

9767141

verified ·

1 Parent(s): 9ea18b7

Update split_files_to_excel.py

Files changed (1) hide show

split_files_to_excel.py CHANGED Viewed

@@ -359,10 +359,13 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
             is_first_chunk = True  # Keep track of the first chunk in the document
         to_encode += doc.page_content
         # if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
-        if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
-            # print('SAME DOC')
-            skip_next = True
-            to_encode += documents[i+1].page_content
         #print(f"to_encode:\n{to_encode}")
         encoded = tokenizer.encode(to_encode)#encode the current document
         if len(encoded) < min_chunk_size and not skip_next:

             is_first_chunk = True  # Keep track of the first chunk in the document
         to_encode += doc.page_content
         # if last chunk < min_chunk_size we add it to the previous chunk for the splitting.
+        try:
+            if (documents[i+1] is documents[-1] or documents[i+1].metadata['source'] != documents[i+2].metadata['source']) and len(tokenizer.encode(documents[i+1].page_content)) < min_chunk_size: # if the next doc is the last doc of the current file or the last of the corpus
+                # print('SAME DOC')
+                skip_next = True
+                to_encode += documents[i+1].page_content
+        except Exception as e:
+            print(e)
         #print(f"to_encode:\n{to_encode}")
         encoded = tokenizer.encode(to_encode)#encode the current document
         if len(encoded) < min_chunk_size and not skip_next: