Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files
- split_files_to_excel.py +14 -6
split_files_to_excel.py
CHANGED
|
@@ -68,7 +68,7 @@ text_splitter = CharacterTextSplitter(
|
|
| 68 |
|
| 69 |
def function_split_call(fi_input, dropdown, choice, chunk_size):
|
| 70 |
if choice == "Intelligent split":
|
| 71 |
-
return split_in_df(fi_input)
|
| 72 |
elif choice == "Non intelligent split":
|
| 73 |
return non_intelligent_split(fi_input, chunk_size)
|
| 74 |
else:
|
|
@@ -78,7 +78,7 @@ def change_textbox(dropdown,radio):
|
|
| 78 |
if len(dropdown) == 0 :
|
| 79 |
dropdown = ["introduction", "objective", "summary", "conclusion"]
|
| 80 |
if radio == "Intelligent split by keywords":
|
| 81 |
-
return gr.Dropdown(dropdown, multiselect=True, visible=True, allow_custom_value=True), gr.Number(visible=
|
| 82 |
elif radio == "Non intelligent split":
|
| 83 |
return gr.Dropdown(dropdown, visible=False),gr.Number(label="Chunk size", value=1000, interactive=True, visible=True)
|
| 84 |
else:
|
|
@@ -464,7 +464,7 @@ def split_chunks_by_tokens_period(documents, max_length=170, overlap=10, min_chu
|
|
| 464 |
|
| 465 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 466 |
|
| 467 |
-
def split_doc_in_chunks(input_folder, base_folders):
|
| 468 |
docs = []
|
| 469 |
for i, filename in enumerate(input_folder):
|
| 470 |
path = filename#os.path.join(input_folder, filename)
|
|
@@ -478,7 +478,15 @@ def split_doc_in_chunks(input_folder, base_folders):
|
|
| 478 |
for raw_chunk in raw_chunks:
|
| 479 |
print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
| 480 |
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
print(f"Document splitted in {len(chunks)} chunks")
|
| 483 |
# for chunk in chunks:
|
| 484 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
|
@@ -579,7 +587,7 @@ def extract_zip(zip_path):
|
|
| 579 |
zip_ref.extract(file_info.filename)
|
| 580 |
return extracted_files
|
| 581 |
|
| 582 |
-
def split_in_df(files):
|
| 583 |
processed_files = []
|
| 584 |
base_folders = []
|
| 585 |
print("Processing zip files...")
|
|
@@ -593,7 +601,7 @@ def split_in_df(files):
|
|
| 593 |
base_folders.append("")
|
| 594 |
print(f"BASE FOLDERS LIST : {base_folders}")
|
| 595 |
print("Finished processing zip files\nSplitting files into chunks...")
|
| 596 |
-
documents = split_doc_in_chunks(processed_files, base_folders)
|
| 597 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
| 598 |
print("Finished splitting")
|
| 599 |
df = pd.DataFrame()
|
|
|
|
| 68 |
|
| 69 |
def function_split_call(fi_input, dropdown, choice, chunk_size):
|
| 70 |
if choice == "Intelligent split":
|
| 71 |
+
return split_in_df(fi_input, nb_pages)
|
| 72 |
elif choice == "Non intelligent split":
|
| 73 |
return non_intelligent_split(fi_input, chunk_size)
|
| 74 |
else:
|
|
|
|
| 78 |
if len(dropdown) == 0 :
|
| 79 |
dropdown = ["introduction", "objective", "summary", "conclusion"]
|
| 80 |
if radio == "Intelligent split by keywords":
|
| 81 |
+
return gr.Dropdown(dropdown, multiselect=True, visible=True, allow_custom_value=True), gr.Number(label="First pages to keep (0 for all)", value=2, interactive=True, visible=True)
|
| 82 |
elif radio == "Non intelligent split":
|
| 83 |
return gr.Dropdown(dropdown, visible=False),gr.Number(label="Chunk size", value=1000, interactive=True, visible=True)
|
| 84 |
else:
|
|
|
|
| 464 |
|
| 465 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 466 |
|
| 467 |
+
def split_doc_in_chunks(input_folder, base_folders, nb_pages):
|
| 468 |
docs = []
|
| 469 |
for i, filename in enumerate(input_folder):
|
| 470 |
path = filename#os.path.join(input_folder, filename)
|
|
|
|
| 478 |
for raw_chunk in raw_chunks:
|
| 479 |
print(f"BASE zzzzz LIST : {base_folders} = i = {i}")
|
| 480 |
raw_chunk.metadata["Base Folder"] = base_folders[i]
|
| 481 |
+
sb_chunks = group_chunks_by_section(raw_chunks)
|
| 482 |
+
if nb_pages > 0:
|
| 483 |
+
for sb_chunk in sb_chunks:
|
| 484 |
+
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
| 485 |
+
chunks.append(sb_chunk)
|
| 486 |
+
else:
|
| 487 |
+
break
|
| 488 |
+
else:
|
| 489 |
+
chunks = sb_chunks
|
| 490 |
print(f"Document splitted in {len(chunks)} chunks")
|
| 491 |
# for chunk in chunks:
|
| 492 |
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
|
|
|
| 587 |
zip_ref.extract(file_info.filename)
|
| 588 |
return extracted_files
|
| 589 |
|
| 590 |
+
def split_in_df(files, nb_pages):
|
| 591 |
processed_files = []
|
| 592 |
base_folders = []
|
| 593 |
print("Processing zip files...")
|
|
|
|
| 601 |
base_folders.append("")
|
| 602 |
print(f"BASE FOLDERS LIST : {base_folders}")
|
| 603 |
print("Finished processing zip files\nSplitting files into chunks...")
|
| 604 |
+
documents = split_doc_in_chunks(processed_files, base_folders, nb_pages)
|
| 605 |
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
| 606 |
print("Finished splitting")
|
| 607 |
df = pd.DataFrame()
|