Spaces:
Sleeping
Sleeping
Update split_files_to_excel.py
Browse files- split_files_to_excel.py +26 -1
split_files_to_excel.py
CHANGED
|
@@ -20,6 +20,9 @@ from unstructured.partition.auto import partition
|
|
| 20 |
|
| 21 |
from transformers import AutoTokenizer
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
MODEL = "thenlper/gte-base"
|
| 24 |
CHUNK_SIZE = 1000
|
| 25 |
CHUNK_OVERLAP = 200
|
|
@@ -471,4 +474,26 @@ def build_index(docs, index, output_folder):
|
|
| 471 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 472 |
index.save_local(temp_dir)
|
| 473 |
for f in os.listdir(temp_dir):
|
| 474 |
-
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
from transformers import AutoTokenizer
|
| 22 |
|
| 23 |
+
import pandas as pd
|
| 24 |
+
|
| 25 |
+
|
| 26 |
MODEL = "thenlper/gte-base"
|
| 27 |
CHUNK_SIZE = 1000
|
| 28 |
CHUNK_OVERLAP = 200
|
|
|
|
| 474 |
with tempfile.TemporaryDirectory() as temp_dir:
|
| 475 |
index.save_local(temp_dir)
|
| 476 |
for f in os.listdir(temp_dir):
|
| 477 |
+
output_folder.upload_file(f, os.path.join(temp_dir, f))
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def split_in_df(files):
    """Split *files* into chunks and export them to an Excel workbook.

    Each chunk returned by ``split_doc_in_chunks`` becomes one row: the
    chunk's ``page_content`` is stored in the ``Content`` column and every
    entry of its ``metadata`` mapping gets a column of its own. Chunks that
    lack a metadata key present on other chunks get an empty cell there.

    Args:
        files: Passed through unchanged to ``split_doc_in_chunks``.

    Returns:
        The path of the written workbook, ``"dataframe.xlsx"`` (relative to
        the current working directory).
    """
    output_path = "dataframe.xlsx"
    documents = split_doc_in_chunks(files)

    # Collect the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row re-copied the whole
    # frame on every iteration.
    # A metadata key named 'Content' overrides the chunk text, exactly as
    # the original key-by-key assignment did.
    rows = [
        {'Content': document.page_content, **document.metadata}
        for document in documents
    ]
    df = pd.DataFrame(rows)

    df.to_excel(output_path, index=False)

    return output_path
|