Update split_files_to_excel.py
split_files_to_excel.py CHANGED: +139 -4
@@ -25,7 +25,8 @@ from pypdf import PdfReader
 
 import pandas as pd
 
-
+import requests
+import json
 
 MODEL = "thenlper/gte-base"
 CHUNK_SIZE = 1000

@@ -530,12 +531,42 @@ def split_in_df(files):
 # -------------------------------------------------------------------------------- SPLIT FILES BY KEYWORDS
 
 def split_by_keywords(files, key_words, words_limit=1000):
+    processed_files = []
     extracted_content = []
-
     tabLine = []
-    for file in files:
 
-
+    # For each file: keep PDFs as they are, extract ZIP archives, and convert .doc/.docx to PDF
+    try:
+        for f in files:
+            not_duplicate = True
+            for p in processed_files:
+                if f[:f.rfind('.')] == p[:p.rfind('.')]:
+                    not_duplicate = False
+            if not_duplicate:
+                if f.endswith('.zip'):
+                    extracted_files = extract_zip(f)
+                    print(f"Those are my extracted files: {extracted_files}")
+
+                    for doc in extracted_files:
+                        if doc.endswith('.doc') or doc.endswith('.docx'):
+                            processed_files.append(transform_to_pdf(doc))
+
+                        if doc.endswith('.pdf'):
+                            processed_files.append(doc)
+
+                if f.endswith('.pdf'):
+                    processed_files.append(f)
+
+                if f.endswith('.doc') or f.endswith('.docx'):
+                    processed_files.append(transform_to_pdf(f))
+
+    except Exception as ex:
+        print(f"Error occurred while processing files: {ex}")
+
+    # Extract the content of each processed file
+    for file in processed_files:
+
+        try:
             file_name = file
             file = PdfReader(file)
             pdfNumberPages = len(file.pages)
@@ -629,6 +660,9 @@ def split_by_keywords(files, key_words, words_limit=1000):
             tabLine.append([file_name, selectedText, key])
             print(f"Selected line in keywords is: {line}")
 
+        except Exception as ex:
+            print(f"Error occurred while extracting content: {ex}")
+
     for r in tabLine:
         text_joined = ''.join(r[1])
         text_joined = r[2] + " : \n " + text_joined
@@ -654,3 +688,104 @@ def split_by_keywords(files, key_words, words_limit=1000):
 
     return "dataframe_keywords.xlsx"
 
+# -------------------------------------------------------------------------------- NON INTELLIGENT SPLIT
+
+def transform_to_pdf(doc):
+    instructions = {'parts': [{'file': 'document'}]}
+
+    response = requests.request(
+        'POST',
+        'https://api.pspdfkit.com/build',
+        headers={'Authorization': 'Bearer pdf_live_nS6tyylSW57PNw9TIEKKL3Tt16NmLCazlQWQ9D33t0Q'},
+        files={'document': open(doc, 'rb')},
+        data={'instructions': json.dumps(instructions)},
+        stream=True
+    )
+
+    pdf_name = doc[:doc.find(".doc")] + ".pdf"
+
+    if response.ok:
+        with open(pdf_name, 'wb') as fd:
+            for chunk in response.iter_content(chunk_size=8096):
+                fd.write(chunk)
+        return pdf_name
+
+    else:
+        print(response.text)
+        exit()
+        return None
+
+
+def non_intelligent_split(files, chunk_size=1000):
+    extracted_content = []
+    processed_files = []
+
+
+    # For each file: keep PDFs as they are, extract ZIP archives, and convert .doc/.docx to PDF
+    try:
+        for f in files:
+            not_duplicate = True
+            for p in processed_files:
+                if f[:f.rfind('.')] == p[:p.rfind('.')]:
+                    not_duplicate = False
+            if not_duplicate:
+                if f.endswith('.zip'):
+                    extracted_files = extract_zip(f)
+                    print(f"Those are my extracted files: {extracted_files}")
+
+                    for doc in extracted_files:
+                        if doc.endswith('.doc') or doc.endswith('.docx'):
+                            processed_files.append(transform_to_pdf(doc))
+
+                        if doc.endswith('.pdf'):
+                            processed_files.append(doc)
+
+                if f.endswith('.pdf'):
+                    processed_files.append(f)
+
+                if f.endswith('.doc') or f.endswith('.docx'):
+                    processed_files.append(transform_to_pdf(f))
+
+    except Exception as ex:
+        print(f"Error occurred while processing files: {ex}")
+
+    # Extract content from each processed file
+    try:
+        for f in processed_files:
+            print(f"my filename is: {f}")
+            file = PdfReader(f)
+            pdfNumberPages = len(file.pages)
+            selectedText = ""
+
+            for pdfPage in range(0, pdfNumberPages):
+                load_page = file.get_page(pdfPage)
+                text = load_page.extract_text()
+                lines = text.split("\n")
+                sizeOfLines = 0
+
+                for index, line in enumerate(lines):
+                    sizeOfLines += len(line)
+                    selectedText += " " + line
+                    if sizeOfLines >= chunk_size:
+                        textContent = f"Page {str(pdfPage)} : {selectedText}"
+                        extracted_content.append([f, textContent])
+                        sizeOfLines = 0
+                        selectedText = ""
+
+            textContent = f"Page {str(pdfNumberPages)} : {selectedText}"
+            extracted_content.append([f, textContent])
+    except Exception as ex:
+        print(f"Error occurred while extracting content from processed files: {ex}")
+
+    df = pd.DataFrame()
+    for content in extracted_content:
+        filename = content[0]
+        text = content[1]
+
+        doc_data = {'Filename': filename[filename.rfind("/")+1:], 'Content': text}
+
+        df = pd.concat([df, pd.DataFrame([doc_data])], ignore_index=True)
+
+    df.to_excel("dataframe_keywords.xlsx", index=False)
+
+    return "dataframe_keywords.xlsx"
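
For reference, a minimal usage sketch of the two entry points this commit touches. The file names are placeholders, and it assumes the module is importable as split_files_to_excel with its existing helpers (extract_zip, the keyword matching) available:

    # Hypothetical driver (file names are examples, not part of the commit)
    from split_files_to_excel import split_by_keywords, non_intelligent_split

    # Keyword-driven split: rows are anchored on lines matching the given keywords
    split_by_keywords(["specs.pdf", "contracts.zip"], ["Article", "Annex"], words_limit=1000)

    # Fixed-size split: lines accumulate until ~chunk_size characters, one chunk per Excel row
    non_intelligent_split(["report.docx"], chunk_size=1000)

Both paths write their rows to dataframe_keywords.xlsx and return that file name.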
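
Worth noting: both new preprocessing loops detect duplicates by comparing file names with the extension stripped, so report.docx and its converted report.pdf count as the same document. A compact equivalent of that check, shown only as a sketch (is_new_document is a hypothetical helper, not in the commit):

    import os

    def is_new_document(path, processed_files):
        # "report.docx" and "report.pdf" share the stem "report", so they collide
        stem = os.path.splitext(path)[0]
        return all(os.path.splitext(p)[0] != stem for p in processed_files)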
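
transform_to_pdf drives the conversion through PSPDFKit's hosted Build API: the document is uploaded as multipart form data next to a JSON instructions payload, and the converted PDF is streamed back to disk in 8096-byte chunks. A slightly safer variant of the same call would close the uploaded file handle deterministically and raise instead of calling exit(); a sketch of that shape, where the environment-variable name is an assumption and not part of the commit:

    import json
    import os
    import requests

    def transform_to_pdf_safe(doc):
        # Same PSPDFKit /build request as the commit, but the upload handle is
        # closed deterministically and failures raise instead of exiting
        instructions = {'parts': [{'file': 'document'}]}
        with open(doc, 'rb') as fh:
            response = requests.post(
                'https://api.pspdfkit.com/build',
                headers={'Authorization': f"Bearer {os.environ['PSPDFKIT_API_KEY']}"},  # assumed env var
                files={'document': fh},
                data={'instructions': json.dumps(instructions)},
                stream=True,
            )
        pdf_name = doc[:doc.find(".doc")] + ".pdf"
        if not response.ok:
            raise RuntimeError(response.text)
        with open(pdf_name, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=8096):
                fd.write(chunk)
        return pdf_name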