Spaces:

jithenderchoudary
/

poext

Sleeping

App Files Files Community

jithenderchoudary committed on Nov 5, 2024

Commit

e09caec

verified ·

1 Parent(s): b0c8573

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -5

app.py CHANGED Viewed

@@ -7,13 +7,16 @@ import re
 def extract_po_to_excel(pdf_file):
     # Regular expressions to match key fields
     item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
-    data_pattern = re.compile(r'(\d+)\s+(\d+)\s+(\w+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)')
     # Initialize list to store extracted data
     extracted_data = []
     # Load PDF
     with fitz.open(pdf_file.name) as pdf:
         for page_num in range(pdf.page_count):
             page = pdf[page_num]
             text = page.get_text("text")
@@ -25,18 +28,22 @@ def extract_po_to_excel(pdf_file):
                 # Process each line and add it to the data list
                 for match in matches:
-                    pos, item_code, unit, delivery_date, quantity, basic_price, discount, currency, amount = match
                     extracted_data.append({
-                        "Position": pos,
-                        "Item Code": item_code,
-                        "Unit": unit,
                         "Delivery Date": delivery_date,
                         "Quantity": quantity,
                         "Basic Price": basic_price,
                         "Discount": discount,
                         "Currency": currency,
                         "Amount": amount
                     })
     # Create DataFrame
     df = pd.DataFrame(extracted_data)
@@ -67,3 +74,4 @@ if __name__ == "__main__":

 def extract_po_to_excel(pdf_file):
     # Regular expressions to match key fields
     item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
+    data_pattern = re.compile(
+        r'(\d+)\s+(\d+)\s+([\w\s\-().]+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)\s+Central GST 9%\s+([\d.]+)\s+([\d.]+)\s+State GST 9%\s+([\d.]+)\s+([\d.]+)'
+    )
     # Initialize list to store extracted data
     extracted_data = []
     # Load PDF
     with fitz.open(pdf_file.name) as pdf:
+        sno = 1  # Start serial number from 1
         for page_num in range(pdf.page_count):
             page = pdf[page_num]
             text = page.get_text("text")
                 # Process each line and add it to the data list
                 for match in matches:
+                    pos, item_code, description, delivery_date, quantity, basic_price, discount, currency, amount, cgst_rate, cgst_amount, sgst_rate, sgst_amount = match
                     extracted_data.append({
+                        "Sno.": sno,
+                        "Pos.": pos,
+                        "Item Code": f"{item_code}, {description}",
+                        "Unit": "NOS",  # assuming NOS as unit if consistent
                         "Delivery Date": delivery_date,
                         "Quantity": quantity,
                         "Basic Price": basic_price,
                         "Discount": discount,
                         "Currency": currency,
+                        "Central GST": f"{cgst_rate} ({cgst_amount})",
+                        "State GST": f"{sgst_rate} ({sgst_amount})",
                         "Amount": amount
                     })
+                    sno += 1  # Increment serial number
     # Create DataFrame
     df = pd.DataFrame(extracted_data)