jithenderchoudary committed on
Commit
e09caec
·
verified ·
1 Parent(s): b0c8573

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -5
app.py CHANGED
@@ -7,13 +7,16 @@ import re
7
  def extract_po_to_excel(pdf_file):
8
  # Regular expressions to match key fields
9
  item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
10
- data_pattern = re.compile(r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)')
 
 
11
 
12
  # Initialize list to store extracted data
13
  extracted_data = []
14
 
15
  # Load PDF
16
  with fitz.open(pdf_file.name) as pdf:
 
17
  for page_num in range(pdf.page_count):
18
  page = pdf[page_num]
19
  text = page.get_text("text")
@@ -25,18 +28,22 @@ def extract_po_to_excel(pdf_file):
25
 
26
  # Process each line and add it to the data list
27
  for match in matches:
28
- pos, item_code, unit, delivery_date, quantity, basic_price, discount, currency, amount = match
29
  extracted_data.append({
30
- "Position": pos,
31
- "Item Code": item_code,
32
- "Unit": unit,
 
33
  "Delivery Date": delivery_date,
34
  "Quantity": quantity,
35
  "Basic Price": basic_price,
36
  "Discount": discount,
37
  "Currency": currency,
 
 
38
  "Amount": amount
39
  })
 
40
 
41
  # Create DataFrame
42
  df = pd.DataFrame(extracted_data)
@@ -67,3 +74,4 @@ if __name__ == "__main__":
67
 
68
 
69
 
 
 
7
  def extract_po_to_excel(pdf_file):
8
  # Regular expressions to match key fields
9
  item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
10
+ data_pattern = re.compile(
11
+ r'(\d+)\s+(\d+)\s+([\w\s\-().]+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)\s+Central GST 9%\s+([\d.]+)\s+([\d.]+)\s+State GST 9%\s+([\d.]+)\s+([\d.]+)'
12
+ )
13
 
14
  # Initialize list to store extracted data
15
  extracted_data = []
16
 
17
  # Load PDF
18
  with fitz.open(pdf_file.name) as pdf:
19
+ sno = 1 # Start serial number from 1
20
  for page_num in range(pdf.page_count):
21
  page = pdf[page_num]
22
  text = page.get_text("text")
 
28
 
29
  # Process each line and add it to the data list
30
  for match in matches:
31
+ pos, item_code, description, delivery_date, quantity, basic_price, discount, currency, amount, cgst_rate, cgst_amount, sgst_rate, sgst_amount = match
32
  extracted_data.append({
33
+ "Sno.": sno,
34
+ "Pos.": pos,
35
+ "Item Code": f"{item_code}, {description}",
36
+ "Unit": "NOS", # assuming NOS as unit if consistent
37
  "Delivery Date": delivery_date,
38
  "Quantity": quantity,
39
  "Basic Price": basic_price,
40
  "Discount": discount,
41
  "Currency": currency,
42
+ "Central GST": f"{cgst_rate} ({cgst_amount})",
43
+ "State GST": f"{sgst_rate} ({sgst_amount})",
44
  "Amount": amount
45
  })
46
+ sno += 1 # Increment serial number
47
 
48
  # Create DataFrame
49
  df = pd.DataFrame(extracted_data)
 
74
 
75
 
76
 
77
+