jithenderchoudary committed on
Commit
eb705ff
·
verified ·
1 Parent(s): e09caec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -27
app.py CHANGED
@@ -5,14 +5,10 @@ import tempfile
5
  import re
6
 
7
  def extract_po_to_excel(pdf_file):
8
- # Regular expressions to match key fields
9
- item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
10
- data_pattern = re.compile(
11
- r'(\d+)\s+(\d+)\s+([\w\s\-().]+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)\s+Central GST 9%\s+([\d.]+)\s+([\d.]+)\s+State GST 9%\s+([\d.]+)\s+([\d.]+)'
12
- )
13
-
14
- # Initialize list to store extracted data
15
  extracted_data = []
 
 
 
16
 
17
  # Load PDF
18
  with fitz.open(pdf_file.name) as pdf:
@@ -20,30 +16,42 @@ def extract_po_to_excel(pdf_file):
20
  for page_num in range(pdf.page_count):
21
  page = pdf[page_num]
22
  text = page.get_text("text")
23
-
24
- # Find the table start position
25
- if item_pattern.search(text):
26
- # Find all matching data lines
27
- matches = data_pattern.findall(text)
 
 
 
28
 
29
- # Process each line and add it to the data list
30
- for match in matches:
31
- pos, item_code, description, delivery_date, quantity, basic_price, discount, currency, amount, cgst_rate, cgst_amount, sgst_rate, sgst_amount = match
32
- extracted_data.append({
 
33
  "Sno.": sno,
34
- "Pos.": pos,
35
  "Item Code": f"{item_code}, {description}",
36
- "Unit": "NOS", # assuming NOS as unit if consistent
37
  "Delivery Date": delivery_date,
38
  "Quantity": quantity,
39
  "Basic Price": basic_price,
40
- "Discount": discount,
41
- "Currency": currency,
42
- "Central GST": f"{cgst_rate} ({cgst_amount})",
43
- "State GST": f"{sgst_rate} ({sgst_amount})",
44
- "Amount": amount
45
- })
46
- sno += 1 # Increment serial number
 
 
 
 
 
 
 
 
47
 
48
  # Create DataFrame
49
  df = pd.DataFrame(extracted_data)
@@ -73,5 +81,3 @@ if __name__ == "__main__":
73
 
74
 
75
 
76
-
77
-
 
5
  import re
6
 
7
  def extract_po_to_excel(pdf_file):
 
 
 
 
 
 
 
8
  extracted_data = []
9
+ pos_pattern = re.compile(r'Pos\.\s*(\d+)', re.IGNORECASE)
10
+ item_code_pattern = re.compile(r'(\d{9,})\s+(.*)\s+(\d+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)', re.IGNORECASE)
11
+ gst_pattern = re.compile(r'Central GST\s+9%\s+([\d.]+)\s+([\d.]+)\s+State GST\s+9%\s+([\d.]+)\s+([\d.]+)')
12
 
13
  # Load PDF
14
  with fitz.open(pdf_file.name) as pdf:
 
16
  for page_num in range(pdf.page_count):
17
  page = pdf[page_num]
18
  text = page.get_text("text")
19
+ lines = text.splitlines()
20
+
21
+ current_pos = None
22
+ for line in lines:
23
+ # Match position and item details
24
+ pos_match = pos_pattern.match(line)
25
+ if pos_match:
26
+ current_pos = pos_match.group(1)
27
 
28
+ # Match item code details
29
+ item_match = item_code_pattern.search(line)
30
+ if item_match:
31
+ item_code, description, unit, delivery_date, quantity, basic_price, amount = item_match.groups()
32
+ data_entry = {
33
  "Sno.": sno,
34
+ "Pos.": current_pos,
35
  "Item Code": f"{item_code}, {description}",
36
+ "Unit": unit,
37
  "Delivery Date": delivery_date,
38
  "Quantity": quantity,
39
  "Basic Price": basic_price,
40
+ "Discount": "0.0000",
41
+ "Currency": "INR",
42
+ "Amount": amount,
43
+ "Central GST": "",
44
+ "State GST": ""
45
+ }
46
+ extracted_data.append(data_entry)
47
+ sno += 1
48
+
49
+ # Match GST details and update the last entry
50
+ gst_match = gst_pattern.search(line)
51
+ if gst_match and extracted_data:
52
+ cgst_rate, cgst_amount, sgst_rate, sgst_amount = gst_match.groups()
53
+ extracted_data[-1]["Central GST"] = f"{cgst_rate} ({cgst_amount})"
54
+ extracted_data[-1]["State GST"] = f"{sgst_rate} ({sgst_amount})"
55
 
56
  # Create DataFrame
57
  df = pd.DataFrame(extracted_data)
 
81
 
82
 
83