Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +17 -8
split_files_to_excel.py
CHANGED
|
@@ -455,7 +455,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
| 455 |
|
| 456 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 457 |
|
| 458 |
-
def split_doc_in_chunks(input_folder):
|
| 459 |
docs = []
|
| 460 |
for i, filename in enumerate(input_folder):
|
| 461 |
path = filename#os.path.join(input_folder, filename)
|
|
@@ -465,8 +465,10 @@ def split_doc_in_chunks(input_folder):
|
|
| 465 |
if path.endswith(".pdf"):
|
| 466 |
try:
|
| 467 |
print("Treatment of pdf file", path)
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
| 470 |
print(f"Document splitted in {len(chunks)} chunks")
|
| 471 |
# for chunk in chunks:
|
| 472 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
|
@@ -475,9 +477,11 @@ def split_doc_in_chunks(input_folder):
|
|
| 475 |
elif path.endswith(".docx"):
|
| 476 |
try:
|
| 477 |
print ("Treatment of docx file", path)
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
|
|
|
| 481 |
print(f"Document splitted in {len(chunks)} chunks")
|
| 482 |
#if "cards-Jan 2022-SP.docx" in path:
|
| 483 |
#for chunk in chunks:
|
|
@@ -496,6 +500,7 @@ def split_doc_in_chunks(input_folder):
|
|
| 496 |
chunk.metadata["filename"] = filename.split("/")[-1]
|
| 497 |
chunk.metadata["file_directory"] = filename.split("/")[:-1]
|
| 498 |
chunk.metadata["filetype"] = filename.split(".")[-1]
|
|
|
|
| 499 |
if "page" in chunk.metadata:
|
| 500 |
counter[chunk.metadata['page']] += 1
|
| 501 |
for i in range(len(chunks)):
|
|
@@ -566,15 +571,18 @@ def extract_zip(zip_path):
|
|
| 566 |
|
| 567 |
def split_in_df(files):
|
| 568 |
processed_files = []
|
|
|
|
| 569 |
print("Processing zip files...")
|
| 570 |
for file_path in files:
|
| 571 |
if file_path.endswith('.zip'):
|
| 572 |
extracted_files = extract_zip(file_path)
|
| 573 |
processed_files.extend(extracted_files)
|
|
|
|
| 574 |
else:
|
| 575 |
processed_files.append(file_path)
|
| 576 |
-
|
| 577 |
-
|
|
|
|
| 578 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
| 579 |
print("Finished splitting")
|
| 580 |
df = pd.DataFrame()
|
|
@@ -590,6 +598,7 @@ def split_in_df(files):
|
|
| 590 |
|
| 591 |
doc_data["Token_Length"] = re_doc.metadata['token_length']
|
| 592 |
doc_data["Titles"] = re_doc.metadata['titles'] if 'titles' in re_doc.metadata else ""
|
|
|
|
| 593 |
|
| 594 |
# for key, value in zip(metadata_keys, metadata_values):
|
| 595 |
# doc_data[key] = value
|
|
|
|
| 455 |
|
| 456 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 457 |
|
| 458 |
+
def split_doc_in_chunks(input_folder, base_folders):
|
| 459 |
docs = []
|
| 460 |
for i, filename in enumerate(input_folder):
|
| 461 |
path = filename#os.path.join(input_folder, filename)
|
|
|
|
| 465 |
if path.endswith(".pdf"):
|
| 466 |
try:
|
| 467 |
print("Treatment of pdf file", path)
|
| 468 |
+
raw_chunks = split_pdf(path, input_folder)
|
| 469 |
+
for raw_chunk in raw_chunks:
|
| 470 |
+
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
| 471 |
+
chunks = group_chunks_by_section(raw_chunks)
|
| 472 |
print(f"Document splitted in {len(chunks)} chunks")
|
| 473 |
# for chunk in chunks:
|
| 474 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
|
|
|
| 477 |
elif path.endswith(".docx"):
|
| 478 |
try:
|
| 479 |
print ("Treatment of docx file", path)
|
| 480 |
+
raw_chunks = split_docx(path, input_folder)
|
| 481 |
+
for raw_chunk in raw_chunks:
|
| 482 |
+
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
| 483 |
+
#print(f"RAW :\n***\n{raw_chunks}")
|
| 484 |
+
chunks = group_chunks_by_section(raw_chunks)
|
| 485 |
print(f"Document splitted in {len(chunks)} chunks")
|
| 486 |
#if "cards-Jan 2022-SP.docx" in path:
|
| 487 |
#for chunk in chunks:
|
|
|
|
| 500 |
chunk.metadata["filename"] = filename.split("/")[-1]
|
| 501 |
chunk.metadata["file_directory"] = filename.split("/")[:-1]
|
| 502 |
chunk.metadata["filetype"] = filename.split(".")[-1]
|
| 503 |
+
chunk.metadata["Base Folder"] = base_folders[i]
|
| 504 |
if "page" in chunk.metadata:
|
| 505 |
counter[chunk.metadata['page']] += 1
|
| 506 |
for i in range(len(chunks)):
|
|
|
|
| 571 |
|
| 572 |
def split_in_df(files):
|
| 573 |
processed_files = []
|
| 574 |
+
base_folders = []
|
| 575 |
print("Processing zip files...")
|
| 576 |
for file_path in files:
|
| 577 |
if file_path.endswith('.zip'):
|
| 578 |
extracted_files = extract_zip(file_path)
|
| 579 |
processed_files.extend(extracted_files)
|
| 580 |
+
base_folders.append(os.path.splitext(os.path.basename(file_path))[0])
|
| 581 |
else:
|
| 582 |
processed_files.append(file_path)
|
| 583 |
+
base_folders.append("")
|
| 584 |
+
print("Finished processing zip files\nSplitting files into chunks...")
|
| 585 |
+
documents = split_doc_in_chunks(processed_files, base_folders)
|
| 586 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
| 587 |
print("Finished splitting")
|
| 588 |
df = pd.DataFrame()
|
|
|
|
| 598 |
|
| 599 |
doc_data["Token_Length"] = re_doc.metadata['token_length']
|
| 600 |
doc_data["Titles"] = re_doc.metadata['titles'] if 'titles' in re_doc.metadata else ""
|
| 601 |
+
doc_data["Base Folder"] = re_doc.metadata["Base Folder"]
|
| 602 |
|
| 603 |
# for key, value in zip(metadata_keys, metadata_values):
|
| 604 |
# doc_data[key] = value
|