Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,13 +7,16 @@ import re
|
|
| 7 |
def extract_po_to_excel(pdf_file):
|
| 8 |
# Regular expressions to match key fields
|
| 9 |
item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
|
| 10 |
-
data_pattern = re.compile(
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Initialize list to store extracted data
|
| 13 |
extracted_data = []
|
| 14 |
|
| 15 |
# Load PDF
|
| 16 |
with fitz.open(pdf_file.name) as pdf:
|
|
|
|
| 17 |
for page_num in range(pdf.page_count):
|
| 18 |
page = pdf[page_num]
|
| 19 |
text = page.get_text("text")
|
|
@@ -25,18 +28,22 @@ def extract_po_to_excel(pdf_file):
|
|
| 25 |
|
| 26 |
# Process each line and add it to the data list
|
| 27 |
for match in matches:
|
| 28 |
-
pos, item_code,
|
| 29 |
extracted_data.append({
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
|
|
|
| 33 |
"Delivery Date": delivery_date,
|
| 34 |
"Quantity": quantity,
|
| 35 |
"Basic Price": basic_price,
|
| 36 |
"Discount": discount,
|
| 37 |
"Currency": currency,
|
|
|
|
|
|
|
| 38 |
"Amount": amount
|
| 39 |
})
|
|
|
|
| 40 |
|
| 41 |
# Create DataFrame
|
| 42 |
df = pd.DataFrame(extracted_data)
|
|
@@ -67,3 +74,4 @@ if __name__ == "__main__":
|
|
| 67 |
|
| 68 |
|
| 69 |
|
|
|
|
|
|
| 7 |
def extract_po_to_excel(pdf_file):
|
| 8 |
# Regular expressions to match key fields
|
| 9 |
item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
|
| 10 |
+
data_pattern = re.compile(
|
| 11 |
+
r'(\d+)\s+(\d+)\s+([\w\s\-().]+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)\s+Central GST 9%\s+([\d.]+)\s+([\d.]+)\s+State GST 9%\s+([\d.]+)\s+([\d.]+)'
|
| 12 |
+
)
|
| 13 |
|
| 14 |
# Initialize list to store extracted data
|
| 15 |
extracted_data = []
|
| 16 |
|
| 17 |
# Load PDF
|
| 18 |
with fitz.open(pdf_file.name) as pdf:
|
| 19 |
+
sno = 1 # Start serial number from 1
|
| 20 |
for page_num in range(pdf.page_count):
|
| 21 |
page = pdf[page_num]
|
| 22 |
text = page.get_text("text")
|
|
|
|
| 28 |
|
| 29 |
# Process each line and add it to the data list
|
| 30 |
for match in matches:
|
| 31 |
+
pos, item_code, description, delivery_date, quantity, basic_price, discount, currency, amount, cgst_rate, cgst_amount, sgst_rate, sgst_amount = match
|
| 32 |
extracted_data.append({
|
| 33 |
+
"Sno.": sno,
|
| 34 |
+
"Pos.": pos,
|
| 35 |
+
"Item Code": f"{item_code}, {description}",
|
| 36 |
+
"Unit": "NOS", # assuming NOS as unit if consistent
|
| 37 |
"Delivery Date": delivery_date,
|
| 38 |
"Quantity": quantity,
|
| 39 |
"Basic Price": basic_price,
|
| 40 |
"Discount": discount,
|
| 41 |
"Currency": currency,
|
| 42 |
+
"Central GST": f"{cgst_rate} ({cgst_amount})",
|
| 43 |
+
"State GST": f"{sgst_rate} ({sgst_amount})",
|
| 44 |
"Amount": amount
|
| 45 |
})
|
| 46 |
+
sno += 1 # Increment serial number
|
| 47 |
|
| 48 |
# Create DataFrame
|
| 49 |
df = pd.DataFrame(extracted_data)
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
|
| 77 |
+
|