Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +18 -1
split_files_to_excel.py
CHANGED
|
@@ -477,8 +477,25 @@ def build_index(docs, index, output_folder):
|
|
| 477 |
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
| 478 |
|
| 479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
def split_in_df(files):
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
df = pd.DataFrame()
|
| 483 |
for document in documents:
|
| 484 |
filename = document.metadata['filename']
|
|
|
|
| 477 |
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
| 478 |
|
| 479 |
|
| 480 |
+
def extract_zip(zip_path):
    """Extract a zip archive into the current working directory.

    Args:
        zip_path: Path to the ``.zip`` archive to unpack.

    Returns:
        A list with the path of every extracted *file*, relative to the
        current working directory, in archive order. Directory entries are
        created on disk but are not reported, so callers can treat every
        returned item as a readable file.

    Errors raised by ``zipfile`` (for example on a missing or corrupt
    archive) are propagated unchanged.
    """
    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            # extract() sanitizes the member name (drops leading slashes and
            # ".." components) and returns the path it actually wrote, which
            # is the only name guaranteed to exist on disk afterwards.
            extracted_path = zip_ref.extract(file_info)
            if file_info.is_dir():
                # Folder entries are not documents; keep them out of the list.
                continue
            # Report paths relative to the working directory, matching the
            # relative member names this function has always returned.
            extracted_files.append(os.path.relpath(extracted_path))
    return extracted_files
|
| 487 |
+
|
| 488 |
def split_in_df(files):
|
| 489 |
+
print("Processing zip files...")
|
| 490 |
+
for file_path in files:
|
| 491 |
+
if file_path.endswith('.zip'):
|
| 492 |
+
extracted_files = extract_zip(file_path)
|
| 493 |
+
processed_files.extend(extracted_files)
|
| 494 |
+
else:
|
| 495 |
+
processed_files.append(file_path)
|
| 496 |
+
print("Finished processing zip files\Splitting files into chunks...")
|
| 497 |
+
documents = split_doc_in_chunks(processed_files)
|
| 498 |
+
print("Finished splitting")
|
| 499 |
df = pd.DataFrame()
|
| 500 |
for document in documents:
|
| 501 |
filename = document.metadata['filename']
|