Spaces:

hardik27
/

dataextraction

Sleeping

App Files Files Community

hardik27 committed on Apr 3, 2024

Commit

f20a244

verified ·

1 Parent(s): f2ce0d3

Create app.py

Browse files

Files changed (1) hide show

app.py +93 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import PyPDF2
+import pandas as pd
+import os
+import streamlit as st
+import pandas as pd
+import tabula
+def convert_pdf_to_excel(pdf_file):
+    # Use tabula to extract tables from PDF
+    inputpdf = PyPDF2.PdfReader(pdf_file)
+    pages_no = len(inputpdf.pages)
+    whole_data = []
+    for i in range(pages_no):
+        inputpdf = PyPDF2.PdfReader(pdf_file)
+        # output = PyPDF2.PdfWriter()
+        # output.add_page(inputpdf.pages[i])
+        pageObj = inputpdf.pages[i]
+        page_content = pageObj.extract_text()
+        for each_table in [i for i in page_content.split('Delivery Schedule Sheet') if i]:
+            data = each_table.split('\n')
+            each_table_data = []
+            for index in range(len(data)):
+                if data[index].strip() == 'Part No.':
+                    each_table_data.append(data[index+1].replace('Part Color Code',""))
+                    if 'Part Name' not in data[index+2]:
+                        each_table_data.append(data[index+2].replace('Part Color Code',""))
+                    else:
+                        each_table_data.append("")
+                if 'Part Name' in data[index].strip():
+                    each_table_data.append(data[index+1])
+            whole_data.append(each_table_data)
+    whole_data = pd.DataFrame(whole_data)
+    whole_data.columns = ["Part No.","Part Color Code","Part Name"]
+    # whole_data.to_csv("Extracted_Data.csv",index=False)
+    # Convert each table into a DataFrame
+    # dfs = []
+    # for table in tables:
+    #     dfs.append(table)
+    # # Concatenate all DataFrames into a single DataFrame
+    # result = pd.concat(dfs)
+    # Convert DataFrame to Excel
+    # excel_file = pdf_file.name.replace('.pdf', '.xlsx')
+    # result.to_excel(excel_file, index=False)
+    excel_file = pdf_file.name.replace('.pdf', '.xlsx')
+    whole_data.to_excel(excel_file, index=False)
+    return excel_file
+    # whole_data.to_csv(excel_file,index=False)
+    # return excel_file
+def main():
+    st.title("PDF to Excel Converter")
+    # File uploader
+    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+    if uploaded_file is not None:
+        st.write("Uploaded PDF file:", uploaded_file.name)
+        # Convert PDF to Excel
+        excel_file = convert_pdf_to_excel(uploaded_file)
+        # Download link for the Excel file
+        # st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
+        if os.path.exists(excel_file):
+            with open(excel_file, "rb") as f:
+                excel_bytes = f.read()
+            st.download_button(
+                label="Download Excel file",
+                data=excel_bytes,
+                file_name=excel_file,
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+        else:
+            st.error("Error: Converted Excel file not found")
+# Script entry point: build the page only when this file is run as the main module.
+if __name__ == "__main__":
+    main()
+# Commented-out leftovers that took the PDF from a typed or hard-coded local
+# path instead of an upload; they are not executed.
+# file_name = input("Give Complete file location")
+# file_name = '/home/hardik/Downloads/data extraction/HSCI.2.20231121154327.WG.IFORD001.0492.4348.5M09-01.pdf'
+# pdf_in_file = open(file_name,'rb')