Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

YchKhan committed on Apr 30, 2024

Commit

b012677

1 Parent(s): 1a7b560

Update split_files_to_excel.py

Files changed (1) hide show

split_files_to_excel.py CHANGED Viewed

@@ -493,9 +493,9 @@ def split_doc_in_chunks(input_folder):
     return docs
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
-def resplit_by_end_of_sentence(docs):
     print("❌❌\nResplitting docs by end of sentence\n❌❌")
-    resized_docs = split_chunks_by_tokens_period(docs, max_length=200, overlap=40, min_chunk_size=20)
     try:
         # add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
         cur_source = ""
@@ -553,11 +553,12 @@ def split_in_df(files):
             processed_files.append(file_path)
     print("Finished processing zip files\Splitting files into chunks...")
     documents = split_doc_in_chunks(processed_files)
     print("Finished splitting")
     df = pd.DataFrame()
-    for document in documents:
-        filename = document.metadata['filename']
-        content = document.page_content
         # metadata = document.metadata
         # metadata_keys = list(metadata.keys())
@@ -836,4 +837,13 @@ def non_intelligent_split(files, chunk_size = 1000):
     df.to_excel("dataframe_keywords.xlsx", index=False)
-    return "dataframe_keywords.xlsx"

     return docs
 # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
+def resplit_by_end_of_sentence(docs, max_len, overlap, min_len):
     print("❌❌\nResplitting docs by end of sentence\n❌❌")
+    resized_docs = split_chunks_by_tokens_period(docs, max_len, overlap, min_len)
     try:
         # add chunk title to all resplitted chunks #todo move this to split_chunks_by_tokens_period(inject_title = True) with a boolean parameter
         cur_source = ""
             processed_files.append(file_path)
     print("Finished processing zip files\Splitting files into chunks...")
     documents = split_doc_in_chunks(processed_files)
+    re_docs = resplit_by_end_of_sentence(documents, 1000, 100, 1500)
     print("Finished splitting")
     df = pd.DataFrame()
+    for re_doc in re_docs:
+        filename = re_doc.metadata['filename']
+        content = re_doc.page_content
         # metadata = document.metadata
         # metadata_keys = list(metadata.keys())
     df.to_excel("dataframe_keywords.xlsx", index=False)
+    return "dataframe_keywords.xlsx"
+def function_split_call(fi_input, dropdown, choice, chunk_size):
+    if choice == "Intelligent split":
+        return split_in_df(fi_input)
+    elif choice == "Non intelligent split":
+        return non_intelligent_split(fi_input, chunk_size)
+    else:
+        return split_by_keywords(fi_input,dropdown)