Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
import PyPDF2
|
| 2 |
import pandas as pd
|
| 3 |
import os
|
| 4 |
-
|
| 5 |
import streamlit as st
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
def convert_pdf_to_excel(pdf_file):
|
| 9 |
-
# Use tabula to extract tables from PDF
|
| 10 |
inputpdf = PyPDF2.PdfReader(pdf_file)
|
| 11 |
pages_no = len(inputpdf.pages)
|
| 12 |
whole_data = []
|
|
@@ -19,6 +18,9 @@ def convert_pdf_to_excel(pdf_file):
|
|
| 19 |
for each_table in [i for i in page_content.split('Delivery Schedule Sheet') if i]:
|
| 20 |
data = each_table.split('\n')
|
| 21 |
each_table_data = []
|
|
|
|
|
|
|
|
|
|
| 22 |
for index in range(len(data)):
|
| 23 |
if data[index].strip() == 'Part No.':
|
| 24 |
each_table_data.append(data[index+1].replace('Part Color Code',""))
|
|
@@ -29,14 +31,46 @@ def convert_pdf_to_excel(pdf_file):
|
|
| 29 |
|
| 30 |
if 'Part Name' in data[index].strip():
|
| 31 |
each_table_data.append(data[index+1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
whole_data.append(each_table_data)
|
| 33 |
|
| 34 |
whole_data = pd.DataFrame(whole_data)
|
| 35 |
-
whole_data.columns = ["Part No.","Part Color Code","Part Name"]
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def main():
|
| 42 |
st.title("PDF to Excel Converter")
|
|
@@ -48,22 +82,54 @@ def main():
|
|
| 48 |
st.write("Uploaded PDF file:", uploaded_file.name)
|
| 49 |
|
| 50 |
# Convert PDF to Excel
|
| 51 |
-
|
| 52 |
|
| 53 |
# Download link for the Excel file
|
| 54 |
# st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
|
| 55 |
|
| 56 |
-
if os.path.exists(
|
| 57 |
-
with open(
|
| 58 |
excel_bytes = f.read()
|
| 59 |
st.download_button(
|
| 60 |
label="Download Excel file",
|
| 61 |
data=excel_bytes,
|
| 62 |
-
file_name=
|
| 63 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 64 |
)
|
| 65 |
else:
|
| 66 |
st.error("Error: Converted Excel file not found")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
| 69 |
main()
|
|
|
|
| 1 |
import PyPDF2
|
| 2 |
import pandas as pd
|
| 3 |
import os
|
| 4 |
+
import ast
|
| 5 |
import streamlit as st
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
def convert_pdf_to_excel(pdf_file):
|
|
|
|
| 9 |
inputpdf = PyPDF2.PdfReader(pdf_file)
|
| 10 |
pages_no = len(inputpdf.pages)
|
| 11 |
whole_data = []
|
|
|
|
| 18 |
for each_table in [i for i in page_content.split('Delivery Schedule Sheet') if i]:
|
| 19 |
data = each_table.split('\n')
|
| 20 |
each_table_data = []
|
| 21 |
+
date_qty = []
|
| 22 |
+
row_start_index = 0
|
| 23 |
+
row_stop_index = 0
|
| 24 |
for index in range(len(data)):
|
| 25 |
if data[index].strip() == 'Part No.':
|
| 26 |
each_table_data.append(data[index+1].replace('Part Color Code',""))
|
|
|
|
| 31 |
|
| 32 |
if 'Part Name' in data[index].strip():
|
| 33 |
each_table_data.append(data[index+1])
|
| 34 |
+
|
| 35 |
+
if data[index].strip() == 'ADJ':
|
| 36 |
+
row_start_index = index + 1
|
| 37 |
+
|
| 38 |
+
if data[index].strip() == 'Total':
|
| 39 |
+
row_stop_index = index
|
| 40 |
+
|
| 41 |
+
if row_start_index>0 and row_stop_index>0:
|
| 42 |
+
for index in range(row_start_index,row_stop_index):
|
| 43 |
+
if '/' in data[index].strip():
|
| 44 |
+
date_qty.append([data[index].strip()[-5:].strip(),data[index+1].strip()])
|
| 45 |
+
if not date_qty:
|
| 46 |
+
date_qty = [["",""]]
|
| 47 |
+
each_table_data.append(date_qty)
|
| 48 |
whole_data.append(each_table_data)
|
| 49 |
|
| 50 |
whole_data = pd.DataFrame(whole_data)
|
| 51 |
+
whole_data.columns = ["Part No.","Part Color Code","Part Name",'Date Qty']
|
| 52 |
+
extracted_file = "Data Extracted.xlsx"
|
| 53 |
+
data_for_mapping = "Data Mapping.xlsx"
|
| 54 |
+
extracted_data_for_mapping = whole_data.drop('Date Qty',axis=1)
|
| 55 |
+
extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(subset=["Part No.","Part Color Code","Part Name"])
|
| 56 |
+
whole_data.to_excel(extracted_file, index=False)
|
| 57 |
+
extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
|
| 58 |
+
return extracted_file,data_for_mapping
|
| 59 |
+
|
| 60 |
+
def _load_frame(source):
    """Return *source* as a DataFrame that is safe to modify.

    A DataFrame is copied so the caller's object is never mutated; any other
    value (a path or a file-like object) is handed to ``pd.read_excel``.
    """
    if isinstance(source, pd.DataFrame):
        return source.copy()
    return pd.read_excel(source)


def _parse_date_qty(cell):
    """Convert the text form of a ``[[date, qty], ...]`` list back to a list."""
    # Cells read back from Excel are strings; an in-memory frame may already
    # hold real lists, which are passed through untouched.
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


def map_data_to_template(excel_file, mapping_file):
    """Join the extracted schedule rows with the item-code mapping.

    Args:
        excel_file: Extracted data. A path or file-like object readable by
            ``pd.read_excel``, or an already-loaded DataFrame. Must contain
            the columns ``'Part No.'`` and ``'Date Qty'``; each ``'Date Qty'``
            cell holds a list of ``[date, qty]`` pairs (or its string form,
            which is what comes back after a round-trip through Excel).
        mapping_file: Mapping data, same accepted kinds as ``excel_file``.
            Must contain ``'Customer Part no as per pdf'`` and ``'Item Code'``.

    Returns:
        DataFrame with the columns ``'Item Code'``, ``'SchDate'`` and
        ``'Qty'``, one row per ``[date, qty]`` pair. The merge uses the pandas
        default (inner join), so parts without a mapping entry are dropped.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError / SyntaxError: If a ``'Date Qty'`` string is not a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    # Load the Excel file and the mapping file.
    extracted_data = _load_frame(excel_file)
    mapping_data = _load_frame(mapping_file).rename(
        columns={'Customer Part no as per pdf': 'Part No.'}
    )

    # Perform the mapping: first restore the list of [date, qty] pairs.
    extracted_data['Date Qty'] = extracted_data['Date Qty'].apply(_parse_date_qty)

    # explode() repeats the source row's index label for every pair it emits.
    # Resetting the index gives each row a unique label, so the two-column
    # assignment below never has to align on duplicate labels.
    extracted_data = extracted_data.explode('Date Qty').reset_index(drop=True)
    extracted_data[['SchDate', 'Qty']] = pd.DataFrame(
        extracted_data['Date Qty'].to_list(), index=extracted_data.index
    )
    extracted_data = extracted_data.drop('Date Qty', axis=1)

    mapped_data = extracted_data.merge(mapping_data, on=['Part No.'])
    return mapped_data[['Item Code', 'SchDate', 'Qty']]
|
| 74 |
|
| 75 |
def main():
|
| 76 |
st.title("PDF to Excel Converter")
|
|
|
|
| 82 |
st.write("Uploaded PDF file:", uploaded_file.name)
|
| 83 |
|
| 84 |
# Convert PDF to Excel
|
| 85 |
+
extracted_file,data_for_mapping = convert_pdf_to_excel(uploaded_file)
|
| 86 |
|
| 87 |
# Download link for the Excel file
|
| 88 |
# st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
|
| 89 |
|
| 90 |
+
if os.path.exists(data_for_mapping):
|
| 91 |
+
with open(data_for_mapping, "rb") as f:
|
| 92 |
excel_bytes = f.read()
|
| 93 |
st.download_button(
|
| 94 |
label="Download Excel file",
|
| 95 |
data=excel_bytes,
|
| 96 |
+
file_name=data_for_mapping,
|
| 97 |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 98 |
)
|
| 99 |
else:
|
| 100 |
st.error("Error: Converted Excel file not found")
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
st.markdown("## Upload the Data Master file with Item Code mapping")
|
| 104 |
+
mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx","ods"])
|
| 105 |
+
|
| 106 |
+
if mapping_uploaded_file is not None:
|
| 107 |
+
st.write("Uploaded Mapping Excel file:", mapping_uploaded_file.name)
|
| 108 |
+
|
| 109 |
+
# Perform data mapping
|
| 110 |
+
mapped_data = map_data_to_template(extracted_file, mapping_uploaded_file)
|
| 111 |
+
|
| 112 |
+
# Provide a link to download the final Excel file after mapping
|
| 113 |
+
st.markdown("### Final Excel File After Mapping")
|
| 114 |
+
|
| 115 |
+
final_excel_file = 'Final Data.xlsx'
|
| 116 |
+
mapped_data.to_excel(final_excel_file, index=False)
|
| 117 |
+
|
| 118 |
+
if os.path.exists(final_excel_file):
|
| 119 |
+
with open(final_excel_file, "rb") as f:
|
| 120 |
+
excel_bytes = f.read()
|
| 121 |
+
st.download_button(
|
| 122 |
+
label="Download Excel file",
|
| 123 |
+
data=excel_bytes,
|
| 124 |
+
file_name=final_excel_file,
|
| 125 |
+
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 126 |
+
)
|
| 127 |
+
else:
|
| 128 |
+
st.error("Error: Converted Excel file not found")
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
|
| 134 |
if __name__ == "__main__":
|
| 135 |
main()
|