jithenderchoudary committed on
Commit
91fd973
·
verified ·
1 Parent(s): 71c8832

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -38
app.py CHANGED
@@ -1,76 +1,70 @@
1
  import pdfplumber
2
- import re
3
  import PyPDF2
4
- import pandas as pd
5
  import re
 
6
  import gradio as gr
 
7
 
8
  # Function to extract tables from PDF using pdfplumber
9
def extract_table_from_pdf(pdf_path):
    """Extract the table found on the first page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        Whatever ``extract_table()`` yields for the first page, or ``None``
        when the document contains no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page-less document would otherwise raise IndexError below.
        if not pdf.pages:
            return None
        # Only the first page is inspected.
        return pdf.pages[0].extract_table()
14
 
15
- # Function to extract total amount from the PDF using regex
16
def extract_total_from_pdf(pdf_path):
    """Read the total amount printed on the first page of a PDF.

    The first page's text is extracted with PyPDF2 and searched for the word
    ``Total`` followed by whitespace and a number with exactly two decimals.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The matched amount as a string, or ``None`` when nothing matches.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        first_page_text = reader.pages[0].extract_text()

    match = re.search(r'Total\s+(\d+\.\d{2})', first_page_text)
    return match.group(1) if match else None
26
-
27
- # Function to extract item details using regex
28
def extract_items_from_pdf(pdf_path):
    """Collect item codes, descriptions and quantities from a PDF's first page.

    Each column is produced by its own regular-expression scan over the page
    text, so the returned lists are independent of one another and are not
    guaranteed to have the same length.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys ``'Item Code'``, ``'Description'`` and
        ``'Quantity'``, each mapped to the list of matches for that column.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        first_page_text = PyPDF2.PdfReader(pdf_stream).pages[0].extract_text()

    # Column name -> pattern whose single capture group is the column value.
    column_patterns = {
        'Item Code': r'\b(\d{3,})\b',
        'Description': r'Description\s*[:\s]*(.*?)(?=\s+Quantity|$)',
        'Quantity': r'Quantity\s*[:\s]*(\d+)',
    }
    return {
        column: re.findall(pattern, first_page_text)
        for column, pattern in column_patterns.items()
    }
 
 
 
 
 
46
 
47
- # Function to extract data and create a DataFrame
48
def process_po(pdf_path):
    """Build a DataFrame of the items and total amount extracted from a PDF.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A ``pandas.DataFrame`` with one column per key returned by
        ``extract_items_from_pdf`` plus a ``'Total Amount'`` column holding
        the value returned by ``extract_total_from_pdf`` on every row.
    """
    items_data = extract_items_from_pdf(pdf_path)
    total_amount = extract_total_from_pdf(pdf_path)

    # The columns come from independent regex scans and may differ in length.
    # Building from Series pads the shorter ones with NaN, whereas a dict of
    # unequal lists makes the DataFrame constructor raise ValueError.
    df = pd.DataFrame({
        column: pd.Series(values, dtype=object)
        for column, values in items_data.items()
    })

    # A scalar assignment repeats the total on every row.
    df['Total Amount'] = total_amount

    return df
59
 
60
- # Gradio Interface
61
def gradio_interface(pdf_file):
    """
    Interface function for Gradio to process the PDF and return the Excel file.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Either an object
            whose ``name`` attribute is the path of the uploaded file, a
            plain path string, or ``None`` when nothing was uploaded.

    Returns:
        Path of the Excel workbook written next to the uploaded PDF, or
        ``None`` when no file was supplied.
    """
    import os  # local import: the module's import block does not provide it

    if pdf_file is None:
        return None

    pdf_path = getattr(pdf_file, 'name', pdf_file)

    # process_po returns a DataFrame, but the gr.File output needs a file on
    # disk, so the table is saved as a workbook and its path is returned.
    df = process_po(pdf_path)
    excel_path = os.path.splitext(pdf_path)[0] + '_extracted.xlsx'
    df.to_excel(excel_path, index=False)
    return excel_path
66
 
67
  # Define Gradio interface
68
# Web UI: one PDF upload in, one downloadable Excel file out.
interface = gr.Interface(
    title="Accurate BHEL PO Data Extractor",
    description="Upload a PDF to extract accurate Material Numbers and related data into an Excel file.",
    fn=gradio_interface,  # invoked with the uploaded file on submit
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Accurate Excel"),
)
75
 
76
  if __name__ == "__main__":
 
1
  import pdfplumber
 
2
  import PyPDF2
 
3
  import re
4
+ import pandas as pd
5
  import gradio as gr
6
+ import os
7
 
8
  # Function to extract tables from PDF using pdfplumber
9
def extract_table_from_pdf(pdf_path):
    """Extract the table found on the first page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        Whatever ``extract_table()`` yields for the first page, or ``None``
        when the document contains no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page-less document would otherwise raise IndexError below.
        if not pdf.pages:
            return None
        # Assuming the table is on the first page
        return pdf.pages[0].extract_table()
15
 
16
+ # Function to extract data using regex from raw text
17
def extract_data_from_text(pdf_path):
    """Extract material numbers, descriptions and IGST values from a PDF.

    Only the first page is read. Its text is extracted with PyPDF2 and scanned
    with three independent regular expressions, so the returned lists are not
    guaranteed to have the same length.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys ``'Material Number'``, ``'Description'`` and
        ``'IGST'``, each mapped to a list of matched strings.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PyPDF2.PdfReader(f)
        # Treat a page-less PDF, or a page with no extractable text, as
        # empty text so the regex scans below do not fail.
        text = (pdf.pages[0].extract_text() or '') if len(pdf.pages) else ''

    # Define regex patterns for the fields to extract
    # NOTE(review): this matches any run of 6+ digits, not only material
    # numbers - confirm against real PO text.
    item_code_pattern = r'(\d{6,})'  # Pattern for Material Number
    description_pattern = r'Material Number: (\d+)\s*HSN Code:(.*?)\s*IGST'  # Material Description
    igst_pattern = r'IGST\s*[:\s]*(\d{1,2}\s*%)'  # Extract IGST value

    item_codes = re.findall(item_code_pattern, text)
    # description_pattern has two capture groups, so findall yields
    # (material number, text between "HSN Code:" and "IGST") tuples; keep only
    # the text so the column holds strings rather than tuples.
    descriptions = [
        description
        for _material_number, description in re.findall(description_pattern, text)
    ]
    igsts = re.findall(igst_pattern, text)

    # Return data as a dictionary
    extracted_data = {
        'Material Number': item_codes,
        'Description': descriptions,
        'IGST': igsts,
    }

    return extracted_data
40
 
41
+ # Function to process PO and generate Excel file
42
def process_po(pdf_path):
    """Extract data from a PO PDF and save it as an Excel workbook.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        Path of the written workbook: ``pdf_path`` with its file extension
        replaced by ``_extracted.xlsx``.
    """
    extracted_data = extract_data_from_text(pdf_path)

    # The columns come from independent regex scans and may differ in length.
    # Building from Series pads the shorter ones with NaN, whereas a dict of
    # unequal lists makes the DataFrame constructor raise ValueError.
    df = pd.DataFrame({
        column: pd.Series(values, dtype=object)
        for column, values in extracted_data.items()
    })

    # splitext only touches the final extension. A plain str.replace('.pdf')
    # would rewrite every occurrence in the path and leave '.PDF' or
    # extension-less names unchanged, making the output path equal the input.
    excel_path = os.path.splitext(pdf_path)[0] + '_extracted.xlsx'
    df.to_excel(excel_path, index=False)

    return excel_path
53
 
54
+ # Gradio Interface function
55
def gradio_interface(pdf_file):
    """
    Interface function for Gradio to process the PDF and return the Excel file.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Either an object
            whose ``name`` attribute is the path of the uploaded file, a
            plain path string, or ``None`` when nothing was uploaded.

    Returns:
        The path returned by ``process_po``, or ``None`` when no file was
        supplied.
    """
    # Submitting without an upload passes None; return nothing rather than
    # failing on the attribute access.
    if pdf_file is None:
        return None

    # Accept both file-like wrappers (with .name) and plain path strings.
    pdf_path = getattr(pdf_file, 'name', pdf_file)
    return process_po(pdf_path)
60
 
61
  # Define Gradio interface
62
# Web UI: one PDF upload in, one downloadable Excel file out.
interface = gr.Interface(
    title="BHEL PO Data Extractor",
    description="Upload a BHEL Purchase Order (PO) PDF to extract material numbers, descriptions, and IGST information into an Excel file.",
    fn=gradio_interface,  # invoked with the uploaded file on submit
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download Accurate Excel"),
)
69
 
70
  if __name__ == "__main__":