Spaces: Sleeping
Update split_files_to_excel.py
Browse files
- split_files_to_excel.py +2 -2
split_files_to_excel.py
CHANGED
|
@@ -181,7 +181,7 @@ def create_documents(source, snippets, font_sizes):
|
|
| 181 |
## Group Chunks docx or pdf
|
| 182 |
|
| 183 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 184 |
-
def group_chunks_by_section(chunks, min_chunk_size=
|
| 185 |
filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
|
| 186 |
#print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
|
| 187 |
new_chunks = []
|
|
@@ -580,7 +580,7 @@ def split_in_df(files):
|
|
| 580 |
if file_path.endswith('.zip'):
|
| 581 |
extracted_files = extract_zip(file_path)
|
| 582 |
processed_files.extend(extracted_files)
|
| 583 |
-
base_folders.
|
| 584 |
else:
|
| 585 |
processed_files.append(file_path)
|
| 586 |
base_folders.append("")
|
|
|
|
| 181 |
## Group Chunks docx or pdf
|
| 182 |
|
| 183 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 184 |
+
def group_chunks_by_section(chunks, min_chunk_size=64):
|
| 185 |
filtered_chunks = [chunk for chunk in chunks if chunk.metadata['category'] != 'PageBreak']# Add more filters if needed
|
| 186 |
#print(f"filtered = {len(filtered_chunks)} - before = {len(chunks)}")
|
| 187 |
new_chunks = []
|
|
|
|
| 580 |
if file_path.endswith('.zip'):
|
| 581 |
extracted_files = extract_zip(file_path)
|
| 582 |
processed_files.extend(extracted_files)
|
| 583 |
+
# Record the archive's base name (file name without extension) once per
# extracted file so base_folders stays index-aligned with processed_files.
# The repetition must apply to the list, not to the string: `[name * n]`
# would produce a single element holding the name concatenated n times.
base_folders.extend([os.path.splitext(os.path.basename(file_path))[0]] * len(extracted_files))
|
| 584 |
else:
|
| 585 |
processed_files.append(file_path)
|
| 586 |
base_folders.append("")
|