Spaces:

jithenderchoudary
/

poext

Sleeping

App Files Files Community

jithenderchoudary committed on Nov 5, 2024

Commit

eb705ff

verified ·

1 Parent(s): e09caec

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -27

app.py CHANGED Viewed

@@ -5,14 +5,10 @@ import tempfile
 import re
 def extract_po_to_excel(pdf_file):
-    # Regular expressions to match key fields
-    item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
-    data_pattern = re.compile(
-        r'(\d+)\s+(\d+)\s+([\w\s\-().]+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)\s+Central GST 9%\s+([\d.]+)\s+([\d.]+)\s+State GST 9%\s+([\d.]+)\s+([\d.]+)'
-    )
-    # Initialize list to store extracted data
     extracted_data = []
     # Load PDF
     with fitz.open(pdf_file.name) as pdf:
@@ -20,30 +16,42 @@ def extract_po_to_excel(pdf_file):
         for page_num in range(pdf.page_count):
             page = pdf[page_num]
             text = page.get_text("text")
-            # Find the table start position
-            if item_pattern.search(text):
-                # Find all matching data lines
-                matches = data_pattern.findall(text)
-                # Process each line and add it to the data list
-                for match in matches:
-                    pos, item_code, description, delivery_date, quantity, basic_price, discount, currency, amount, cgst_rate, cgst_amount, sgst_rate, sgst_amount = match
-                    extracted_data.append({
                         "Sno.": sno,
-                        "Pos.": pos,
                         "Item Code": f"{item_code}, {description}",
-                        "Unit": "NOS",  # assuming NOS as unit if consistent
                         "Delivery Date": delivery_date,
                         "Quantity": quantity,
                         "Basic Price": basic_price,
-                        "Discount": discount,
-                        "Currency": currency,
-                        "Central GST": f"{cgst_rate} ({cgst_amount})",
-                        "State GST": f"{sgst_rate} ({sgst_amount})",
-                        "Amount": amount
-                    })
-                    sno += 1  # Increment serial number
     # Create DataFrame
     df = pd.DataFrame(extracted_data)
@@ -73,5 +81,3 @@ if __name__ == "__main__":

 import re
 def extract_po_to_excel(pdf_file):
     extracted_data = []
+    pos_pattern = re.compile(r'Pos\.\s*(\d+)', re.IGNORECASE)
+    item_code_pattern = re.compile(r'(\d{9,})\s+(.*)\s+(\d+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)', re.IGNORECASE)
+    gst_pattern = re.compile(r'Central GST\s+9%\s+([\d.]+)\s+([\d.]+)\s+State GST\s+9%\s+([\d.]+)\s+([\d.]+)')
     # Load PDF
     with fitz.open(pdf_file.name) as pdf:
         for page_num in range(pdf.page_count):
             page = pdf[page_num]
             text = page.get_text("text")
+            lines = text.splitlines()
+            current_pos = None
+            for line in lines:
+                # Match position and item details
+                pos_match = pos_pattern.match(line)
+                if pos_match:
+                    current_pos = pos_match.group(1)
+                # Match item code details
+                item_match = item_code_pattern.search(line)
+                if item_match:
+                    item_code, description, unit, delivery_date, quantity, basic_price, amount = item_match.groups()
+                    data_entry = {
                         "Sno.": sno,
+                        "Pos.": current_pos,
                         "Item Code": f"{item_code}, {description}",
+                        "Unit": unit,
                         "Delivery Date": delivery_date,
                         "Quantity": quantity,
                         "Basic Price": basic_price,
+                        "Discount": "0.0000",
+                        "Currency": "INR",
+                        "Amount": amount,
+                        "Central GST": "",
+                        "State GST": ""
+                    }
+                    extracted_data.append(data_entry)
+                    sno += 1
+                # Match GST details and update the last entry
+                gst_match = gst_pattern.search(line)
+                if gst_match and extracted_data:
+                    cgst_rate, cgst_amount, sgst_rate, sgst_amount = gst_match.groups()
+                    extracted_data[-1]["Central GST"] = f"{cgst_rate} ({cgst_amount})"
+                    extracted_data[-1]["State GST"] = f"{sgst_rate} ({sgst_amount})"
     # Create DataFrame
     df = pd.DataFrame(extracted_data)