Spaces:

jithenderchoudary
/

Bhel_Po

Build error

App Files Files Community

jithenderchoudary committed on Nov 30, 2024

Commit

91fd973

verified ·

1 Parent(s): 71c8832

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -38

app.py CHANGED Viewed

@@ -1,76 +1,70 @@
 import pdfplumber
-import re
 import PyPDF2
-import pandas as pd
 import re
 import gradio as gr
 # Function to extract tables from PDF using pdfplumber
 def extract_table_from_pdf(pdf_path):
     with pdfplumber.open(pdf_path) as pdf:
         first_page = pdf.pages[0]
         table = first_page.extract_table()
         return table
-# Function to extract total amount from the PDF using regex
-def extract_total_from_pdf(pdf_path):
     with open(pdf_path, 'rb') as f:
         pdf = PyPDF2.PdfReader(f)
         page = pdf.pages[0]
         text = page.extract_text()
-    total_amount = re.search(r'Total\s+(\d+\.\d{2})', text)
-    if total_amount:
-        return total_amount.group(1)
-    return None
-# Function to extract item details using regex
-def extract_items_from_pdf(pdf_path):
-    with open(pdf_path, 'rb') as f:
-        pdf = PyPDF2.PdfReader(f)
-        page = pdf.pages[0]
-        text = page.extract_text()
-    # Regex to extract item code, description, and quantity
-    item_code_pattern = r'\b(\d{3,})\b'
-    description_pattern = r'Description\s*[:\s]*(.*?)(?=\s+Quantity|$)'
-    quantity_pattern = r'Quantity\s*[:\s]*(\d+)'
     item_codes = re.findall(item_code_pattern, text)
     descriptions = re.findall(description_pattern, text)
-    quantities = re.findall(quantity_pattern, text)
     # Return data as a dictionary
-    items_data = {'Item Code': item_codes, 'Description': descriptions, 'Quantity': quantities}
-    return items_data
-# Function to extract data and create a DataFrame
 def process_po(pdf_path):
-    items_data = extract_items_from_pdf(pdf_path)
-    total_amount = extract_total_from_pdf(pdf_path)
-    # Create a DataFrame for the extracted data
-    df = pd.DataFrame(items_data)
-    # Add total amount as a new column in the DataFrame
-    df['Total Amount'] = total_amount
-    return df
-# Gradio Interface
 def gradio_interface(pdf_file):
     """
     Interface function for Gradio to process the PDF and return the Excel file.
     """
-    return extract_data(pdf_file.name)
 # Define Gradio interface
 interface = gr.Interface(
     fn=gradio_interface,
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Accurate Excel"),
-    title="Accurate BHEL PO Data Extractor",
-    description="Upload a PDF to extract accurate Material Numbers and related data into an Excel file."
 )
 if __name__ == "__main__":

 import pdfplumber
 import PyPDF2
 import re
+import pandas as pd
 import gradio as gr
+import os
 # Pull the table that pdfplumber detects on the opening page of a PDF
 def extract_table_from_pdf(pdf_path):
     """Return the table found on the first page of the PDF at ``pdf_path``.

     The result is whatever ``extract_table()`` yields for that page; only
     page index 0 is inspected.
     """
     with pdfplumber.open(pdf_path) as document:
         # Assuming the table is on the first page
         return document.pages[0].extract_table()
# Function to parse PO fields out of text that has already been extracted
def parse_po_text(text):
    """Extract material numbers, descriptions and IGST rates from PO text.

    Args:
        text: Raw text of a PO page. ``None`` or an empty string is treated
            as "no text" and produces empty columns.

    Returns:
        A dict with the keys ``'Material Number'``, ``'Description'`` and
        ``'IGST'``. Every value is a list and all three lists have the same
        length; a column with fewer matches is padded with ``None`` so the
        dict can be handed to ``pandas.DataFrame`` directly.
    """
    text = text or ""

    # Define regex patterns for the fields to extract
    item_code_pattern = r'(\d{6,})'  # Fallback pattern for Material Number
    description_pattern = r'Material Number: (\d+)\s*HSN Code:(.*?)\s*IGST'  # Material Description
    igst_pattern = r'IGST\s*[:\s]*(\d{1,2}\s*%)'  # Extract IGST value

    # The description pattern has two capture groups, so findall() returns
    # (material_number, description) tuples. Unpack them instead of storing
    # the raw tuples in the 'Description' column.
    labelled_items = re.findall(description_pattern, text)
    descriptions = [description.strip() for _, description in labelled_items]

    if labelled_items:
        # Numbers taken from the "Material Number:" label line up one-to-one
        # with the descriptions and cannot be confused with other long digit
        # runs in the document.
        item_codes = [number for number, _ in labelled_items]
    else:
        # No labelled record found: fall back to any run of 6+ digits.
        item_codes = re.findall(item_code_pattern, text)

    igsts = re.findall(igst_pattern, text)

    columns = {
        'Material Number': item_codes,
        'Description': descriptions,
        'IGST': igsts,
    }
    # Pad the shorter columns so that all of them have the same length.
    row_count = max(len(values) for values in columns.values())
    return {
        name: values + [None] * (row_count - len(values))
        for name, values in columns.items()
    }


# Function to extract data using regex from raw text
def extract_data_from_text(pdf_path):
    """Read the first page of the PDF at ``pdf_path`` and parse its PO fields.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The dict produced by ``parse_po_text`` for the first page's text.
        A PDF without pages, or a page without extractable text, yields
        empty columns instead of raising.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PyPDF2.PdfReader(f)
        # Only the first page is inspected; guard against a PDF with no pages.
        text = pdf.pages[0].extract_text() if len(pdf.pages) else ""
    # Return data as a dictionary
    return parse_po_text(text)
# Function to derive the Excel output path from the PDF path
def derive_excel_path(pdf_path):
    """Return the path of the Excel file that belongs to ``pdf_path``.

    The final extension (whatever its case, or none at all) is replaced by
    ``_extracted.xlsx``. Only the last path component's extension is touched,
    so a ``.pdf`` substring elsewhere in the path is left alone and the
    result can never be equal to the input path.
    """
    root, _extension = os.path.splitext(pdf_path)
    return root + '_extracted.xlsx'


# Function to process PO and generate Excel file
def process_po(pdf_path):
    """Extract the PO data from ``pdf_path`` and write it to an Excel file.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        Path of the Excel file that was written, as computed by
        ``derive_excel_path``.
    """
    extracted_data = extract_data_from_text(pdf_path)
    # Create DataFrame. Each column is wrapped in a Series so that columns of
    # different lengths are aligned and padded with missing values instead of
    # making the DataFrame constructor raise.
    df = pd.DataFrame({
        name: pd.Series(values, dtype=object)
        for name, values in extracted_data.items()
    })
    # Save the DataFrame to Excel
    excel_path = derive_excel_path(pdf_path)
    df.to_excel(excel_path, index=False)
    return excel_path
# Gradio Interface function
def gradio_interface(pdf_file):
    """
    Interface function for Gradio to process the PDF and return the Excel file.

    Args:
        pdf_file: The value delivered by the upload component. It may be
            ``None`` (nothing uploaded), an object exposing the file path as
            ``.name``, or the file path itself as a string.

    Returns:
        Path of the generated Excel file, or ``None`` when nothing was uploaded.
    """
    if pdf_file is None:
        # Nothing uploaded: return an empty result rather than crashing.
        return None
    # NOTE(review): which of the two forms arrives depends on the installed
    # Gradio version / the File component's `type` setting - accept both.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    return process_po(pdf_path)
 # Assemble the Gradio app: a PDF upload goes in, an Excel download comes out
 interface = gr.Interface(
     title="BHEL PO Data Extractor",
     description="Upload a BHEL Purchase Order (PO) PDF to extract material numbers, descriptions, and IGST information into an Excel file.",
     fn=gradio_interface,
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.File(label="Download Accurate Excel"),
 )
 if __name__ == "__main__":