Spaces: Sleeping
Update split_files_to_excel.py
Browse files
- split_files_to_excel.py +20 -20
split_files_to_excel.py
CHANGED
|
@@ -475,27 +475,27 @@ def split_doc_in_chunks(input_folder, base_folders, nb_pages):
|
|
| 475 |
# Select the appropriate document loader
|
| 476 |
chunks=[]
|
| 477 |
if path.endswith(".pdf"):
|
| 478 |
-
try:
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
else:
|
| 491 |
-
break
|
| 492 |
else:
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
print("
|
|
|
|
|
|
|
| 499 |
elif path.endswith(".docx"):
|
| 500 |
try:
|
| 501 |
print ("Treatment of docx file", path)
|
|
|
|
| 475 |
# Select the appropriate document loader
|
| 476 |
chunks=[]
|
| 477 |
if path.endswith(".pdf"):
|
| 478 |
+
# try:
|
| 479 |
+
print("Treatment of pdf file", path)
|
| 480 |
+
raw_chunks = split_pdf(path, input_folder)
|
| 481 |
+
for j, raw_chunk in enumerate(raw_chunks):
|
| 482 |
+
print(f"BASE zzzzz LIST : {base_folders} = i = {j}")
|
| 483 |
+
raw_chunk.metadata["Base Folder"] = base_folders[j]
|
| 484 |
+
sb_chunks = group_chunks_by_section(raw_chunks)
|
| 485 |
+
if nb_pages > 0:
|
| 486 |
+
for sb_chunk in sb_chunks:
|
| 487 |
+
print(f"CHUNK PAGENUM = {sb_chunk.metadata['page_number']}")
|
| 488 |
+
if int(sb_chunk.metadata["page_number"])<nb_pages:
|
| 489 |
+
chunks.append(sb_chunk)
|
|
|
|
|
|
|
| 490 |
else:
|
| 491 |
+
break
|
| 492 |
+
else:
|
| 493 |
+
chunks = sb_chunks
|
| 494 |
+
print(f"Document splitted in {len(chunks)} chunks")
|
| 495 |
+
# for chunk in chunks:
|
| 496 |
+
# print(f"\n\n____\n\n\nPDF CONTENT: \n{chunk.page_content}\ntitle: {chunk.metadata['title']}\nFile Name: {chunk.metadata['filename']}\n\n")
|
| 497 |
+
# except Exception as e:
|
| 498 |
+
# print("Error while splitting the pdf file: ", e)
|
| 499 |
elif path.endswith(".docx"):
|
| 500 |
try:
|
| 501 |
print ("Treatment of docx file", path)
|