Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files
- split_files_to_excel.py +6 -5
split_files_to_excel.py
CHANGED
|
@@ -471,19 +471,20 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
|
|
| 471 |
docs = []
|
| 472 |
for i, filename in enumerate(input_folder):
|
| 473 |
path = filename#os.path.join(input_folder, filename)
|
| 474 |
-
print(f"Treating file {i}/{len(input_folder)}")
|
| 475 |
# Select the appropriate document loader
|
| 476 |
chunks=[]
|
| 477 |
if path.endswith(".pdf"):
|
| 478 |
try:
|
| 479 |
print("Treatment of pdf file", path)
|
| 480 |
raw_chunks = split_pdf(path, input_folder)
|
| 481 |
-
for raw_chunk in raw_chunks:
|
| 482 |
-
print(f"BASE zzzzz LIST : {base_folders} = i = {
|
| 483 |
-
raw_chunk.metadata["Base Folder"] = base_folders[
|
| 484 |
sb_chunks = group_chunks_by_section(raw_chunks)
|
| 485 |
if nb_pages > 0:
|
| 486 |
for sb_chunk in sb_chunks:
|
|
|
|
| 487 |
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
| 488 |
chunks.append(sb_chunk)
|
| 489 |
else:
|
|
@@ -602,7 +603,7 @@ def split_in_df(files, nb_pages):
|
|
| 602 |
else:
|
| 603 |
processed_files.append(file_path)
|
| 604 |
base_folders.append("")
|
| 605 |
-
print(f"BASE FOLDERS LIST : {base_folders}")
|
| 606 |
print("Finished processing zip files\nSplitting files into chunks...")
|
| 607 |
documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
|
| 608 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
|
|
|
| 471 |
docs = []
|
| 472 |
for i, filename in enumerate(input_folder):
|
| 473 |
path = filename#os.path.join(input_folder, filename)
|
| 474 |
+
print(f"Treating file {i+1}/{len(input_folder)}")
|
| 475 |
# Select the appropriate document loader
|
| 476 |
chunks=[]
|
| 477 |
if path.endswith(".pdf"):
|
| 478 |
try:
|
| 479 |
print("Treatment of pdf file", path)
|
| 480 |
raw_chunks = split_pdf(path, input_folder)
|
| 481 |
+
for j, raw_chunk in enumerate(raw_chunks):
|
| 482 |
+
print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
|
| 483 |
+
raw_chunk.metadata["Base Folder"] = base_folders[j]
|
| 484 |
sb_chunks = group_chunks_by_section(raw_chunks)
|
| 485 |
if nb_pages > 0:
|
| 486 |
for sb_chunk in sb_chunks:
|
| 487 |
+
print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
|
| 488 |
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
| 489 |
chunks.append(sb_chunk)
|
| 490 |
else:
|
|
|
|
| 603 |
else:
|
| 604 |
processed_files.append(file_path)
|
| 605 |
base_folders.append("")
|
| 606 |
+
print(f"BASE FOLDERS LIST : {base_folders}, FILES LIST : {processed_files}")
|
| 607 |
print("Finished processing zip files\nSplitting files into chunks...")
|
| 608 |
documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
|
| 609 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|