SathvikGanta committed on
Commit
9d67179
·
verified ·
1 Parent(s): 22ebb3e

Update parse_bhel_po.py

Browse files
Files changed (1) hide show
  1. parse_bhel_po.py +30 -27
parse_bhel_po.py CHANGED
@@ -3,66 +3,70 @@ import pandas as pd
3
  import re
4
 
5
  def extract_po_details(file_obj):
6
- data = []
 
 
7
  with pdfplumber.open(file_obj) as pdf:
8
  for page_number, page in enumerate(pdf.pages, start=1):
9
  text = page.extract_text()
10
  if not text:
11
  print(f"[DEBUG] No text found on page {page_number}. Skipping.")
12
  continue
13
-
14
- print(f"[DEBUG] Processing text from page {page_number}")
15
 
16
- # Split text by lines to process each line individually
17
- lines = text.splitlines()
18
- current_record = {}
19
 
20
  for line_number, line in enumerate(lines, start=1):
21
  print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")
22
 
23
- # Identify 'Sl No'
24
  sl_no_match = re.match(r'(\d+)\s+', line)
25
- if sl_no_match and not current_record.get("Sl No"):
26
  current_record["Sl No"] = sl_no_match.group(1)
27
 
28
- # Match Material Description if not already matched
29
- if 'Material Description' not in current_record:
30
- material_desc_match = re.search(r'BPS\s+\d+\s+Material Number:\s*\d+\s*HSN Code:\s*\d+\s*IGST\s*:\s*\d+%', line)
31
- if material_desc_match:
32
  current_record["Material Description"] = material_desc_match.group(0)
33
-
 
 
 
34
  # Match Unit
35
- unit_match = re.search(r'\b(No|Kg|Pack)\b', line)
36
- if unit_match and not current_record.get("Unit"):
37
- current_record["Unit"] = unit_match.group(0)
 
38
 
39
  # Match Quantity
40
- quantity_match = re.search(r'\bQuantity\s+(\d+)\b', line)
41
- if quantity_match and not current_record.get("Quantity"):
42
  current_record["Quantity"] = quantity_match.group(1)
43
 
44
  # Match Delivery Quantity
45
- dely_qty_match = re.search(r'\bDely Qty\s+(\d+)\b', line)
46
- if dely_qty_match and not current_record.get("Dely Qty"):
47
  current_record["Dely Qty"] = dely_qty_match.group(1)
48
 
49
  # Match Delivery Date
50
- dely_date_match = re.search(r'\d{2}\.\d{2}\.\d{4}', line)
51
- if dely_date_match and not current_record.get("Dely Date"):
52
  current_record["Dely Date"] = dely_date_match.group(0)
53
 
54
  # Match Unit Rate
55
  unit_rate_match = re.search(r'Unit Rate\s+([\d.]+)', line)
56
- if unit_rate_match and not current_record.get("Unit Rate"):
57
  current_record["Unit Rate"] = unit_rate_match.group(1)
58
 
59
  # Match Value
60
  value_match = re.search(r'Value\s+([\d.]+)', line)
61
- if value_match and not current_record.get("Value"):
62
  current_record["Value"] = value_match.group(1)
63
-
64
  # Check if we have a complete record
65
  if len(current_record) == 8:
 
66
  data.append([
67
  current_record["Sl No"],
68
  current_record["Material Description"],
@@ -73,8 +77,7 @@ def extract_po_details(file_obj):
73
  current_record["Unit Rate"],
74
  current_record["Value"]
75
  ])
76
- print(f"[DEBUG] Extracted record: {current_record}")
77
- current_record = {} # Reset for the next record
78
 
79
  # Create DataFrame if data was extracted
80
  if data:
 
3
  import re
4
 
5
  def extract_po_details(file_obj):
6
+ data = [] # Store completed rows here
7
+ current_record = {} # Temporary storage for each record until complete
8
+
9
  with pdfplumber.open(file_obj) as pdf:
10
  for page_number, page in enumerate(pdf.pages, start=1):
11
  text = page.extract_text()
12
  if not text:
13
  print(f"[DEBUG] No text found on page {page_number}. Skipping.")
14
  continue
 
 
15
 
16
+ print(f"[DEBUG] Processing text from page {page_number}")
17
+ lines = text.splitlines() # Process line by line for better control
 
18
 
19
  for line_number, line in enumerate(lines, start=1):
20
  print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")
21
 
22
+ # Match Sl No (if it’s the start of a new item entry)
23
  sl_no_match = re.match(r'(\d+)\s+', line)
24
+ if sl_no_match and "Sl No" not in current_record:
25
  current_record["Sl No"] = sl_no_match.group(1)
26
 
27
+ # Material Description (Multi-line support)
28
+ material_desc_match = re.search(r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)', line)
29
+ if material_desc_match:
30
+ if "Material Description" not in current_record:
31
  current_record["Material Description"] = material_desc_match.group(0)
32
+ else:
33
+ # Append if description spans multiple lines
34
+ current_record["Material Description"] += " " + material_desc_match.group(0)
35
+
36
  # Match Unit
37
+ if "Unit" not in current_record:
38
+ unit_match = re.search(r'\b(No|Kg|Pack)\b', line)
39
+ if unit_match:
40
+ current_record["Unit"] = unit_match.group(0)
41
 
42
  # Match Quantity
43
+ quantity_match = re.search(r'(\d+)\s+Quantity', line)
44
+ if quantity_match and "Quantity" not in current_record:
45
  current_record["Quantity"] = quantity_match.group(1)
46
 
47
  # Match Delivery Quantity
48
+ dely_qty_match = re.search(r'(\d+)\s+Dely Qty', line)
49
+ if dely_qty_match and "Dely Qty" not in current_record:
50
  current_record["Dely Qty"] = dely_qty_match.group(1)
51
 
52
  # Match Delivery Date
53
+ dely_date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', line)
54
+ if dely_date_match and "Dely Date" not in current_record:
55
  current_record["Dely Date"] = dely_date_match.group(0)
56
 
57
  # Match Unit Rate
58
  unit_rate_match = re.search(r'Unit Rate\s+([\d.]+)', line)
59
+ if unit_rate_match and "Unit Rate" not in current_record:
60
  current_record["Unit Rate"] = unit_rate_match.group(1)
61
 
62
  # Match Value
63
  value_match = re.search(r'Value\s+([\d.]+)', line)
64
+ if value_match and "Value" not in current_record:
65
  current_record["Value"] = value_match.group(1)
66
+
67
  # Check if we have a complete record
68
  if len(current_record) == 8:
69
+ print(f"[DEBUG] Complete record found: {current_record}")
70
  data.append([
71
  current_record["Sl No"],
72
  current_record["Material Description"],
 
77
  current_record["Unit Rate"],
78
  current_record["Value"]
79
  ])
80
+ current_record = {} # Reset for next record
 
81
 
82
  # Create DataFrame if data was extracted
83
  if data: