Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

YchKhan committed on May 15, 2024

Commit

5fb1f69

verified ·

1 Parent(s): 9767141

Update split_files_to_excel.py

Files changed (1) hide show

split_files_to_excel.py CHANGED Viewed

@@ -181,7 +181,7 @@ def create_documents(source, snippets, font_sizes):
 ## Group Chunks docx or pdf
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
-def group_chunks_by_section(chunks, min_chunk_size=512):
     filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
     #print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
     new_chunks = []
@@ -580,7 +580,7 @@ def split_in_df(files):
         if file_path.endswith('.zip'):
             extracted_files = extract_zip(file_path)
             processed_files.extend(extracted_files)
-            base_folders.append(os.path.splitext(os.path.basename(file_path))[0])
         else:
             processed_files.append(file_path)
             base_folders.append("")

 ## Group Chunks docx or pdf
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
+def group_chunks_by_section(chunks, min_chunk_size=64):
     filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
     #print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
     new_chunks = []
         if file_path.endswith('.zip'):
             extracted_files = extract_zip(file_path)
             processed_files.extend(extracted_files)
+            base_folders.extend([os.path.splitext(os.path.basename(file_path))[0] * len(extracted_files)])
         else:
             processed_files.append(file_path)
             base_folders.append("")