Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +16 -6
split_files_to_excel.py
CHANGED
|
@@ -493,9 +493,9 @@ def split_doc_in_chunks(input_folder):
|
|
| 493 |
return docs
|
| 494 |
|
| 495 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 496 |
-
def resplit_by_end_of_sentence(docs):
|
| 497 |
print("ββ\nResplitting docs by end of sentence\nββ")
|
| 498 |
-
resized_docs = split_chunks_by_tokens_period(docs,
|
| 499 |
try:
|
| 500 |
# add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
|
| 501 |
cur_source = ""
|
|
@@ -553,11 +553,12 @@ def split_in_df(files):
|
|
| 553 |
processed_files.append(file_path)
|
| 554 |
print("Finished processing zip files\Splitting files into chunks...")
|
| 555 |
documents = split_doc_in_chunks(processed_files)
|
|
|
|
| 556 |
print("Finished splitting")
|
| 557 |
df = pd.DataFrame()
|
| 558 |
-
for
|
| 559 |
-
filename =
|
| 560 |
-
content =
|
| 561 |
|
| 562 |
# metadata = document.metadata
|
| 563 |
# metadata_keys = list(metadata.keys())
|
|
@@ -836,4 +837,13 @@ def non_intelligent_split(files, chunk_size = 1000):
|
|
| 836 |
|
| 837 |
df.to_excel("dataframe_keywords.xlsx", index=False)
|
| 838 |
|
| 839 |
-
return "dataframe_keywords.xlsx"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
return docs
|
| 494 |
|
| 495 |
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
|
| 496 |
+
def resplit_by_end_of_sentence(docs, max_len, overlap, min_len):
|
| 497 |
print("ββ\nResplitting docs by end of sentence\nββ")
|
| 498 |
+
resized_docs = split_chunks_by_tokens_period(docs, max_len, overlap, min_len)
|
| 499 |
try:
|
| 500 |
# add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
|
| 501 |
cur_source = ""
|
|
|
|
| 553 |
processed_files.append(file_path)
|
| 554 |
print("Finished processing zip files\Splitting files into chunks...")
|
| 555 |
documents = split_doc_in_chunks(processed_files)
|
| 556 |
+
re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
|
| 557 |
print("Finished splitting")
|
| 558 |
df = pd.DataFrame()
|
| 559 |
+
for re_doc in re_docs:
|
| 560 |
+
filename = re_doc.metadata['filename']
|
| 561 |
+
content = re_doc.page_content
|
| 562 |
|
| 563 |
# metadata = document.metadata
|
| 564 |
# metadata_keys = list(metadata.keys())
|
|
|
|
| 837 |
|
| 838 |
df.to_excel("dataframe_keywords.xlsx", index=False)
|
| 839 |
|
| 840 |
+
return "dataframe_keywords.xlsx"
|
| 841 |
+
|
| 842 |
+
|
| 843 |
+
def function_split_call(fi_input, dropdown, choice, chunk_size):
    """Route the input files to the splitting routine selected by ``choice``.

    Args:
        fi_input: The files to split; forwarded unchanged to the selected
            splitter.
        dropdown: Forwarded to ``split_by_keywords`` only; ignored by the
            other two modes.
        choice: Mode selector. ``"Intelligent split"`` and
            ``"Non intelligent split"`` are matched exactly; every other
            value falls through to the keyword-based split.
        chunk_size: Forwarded to ``non_intelligent_split`` only.

    Returns:
        Whatever the selected splitter returns.
        NOTE(review): presumably the path of the generated Excel file --
        confirm against the three splitters.
    """
    if choice == "Intelligent split":
        return split_in_df(fi_input)

    if choice == "Non intelligent split":
        return non_intelligent_split(fi_input, chunk_size)

    # No explicit label is checked for the third mode: any other value of
    # ``choice`` is treated as a request for the keyword-based split.
    return split_by_keywords(fi_input, dropdown)
|