Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,14 +5,10 @@ import tempfile
|
|
| 5 |
import re
|
| 6 |
|
| 7 |
def extract_po_to_excel(pdf_file):
|
| 8 |
-
# Regular expressions to match key fields
|
| 9 |
-
item_pattern = re.compile(r'Pos\.\s*Item Code\s*Unit\s*Delivery Date\s*Quantity\s*Basic Price\s*Discount\s*Cur\.\s*Amount', re.IGNORECASE)
|
| 10 |
-
data_pattern = re.compile(
|
| 11 |
-
r'(\d+)\s+(\d+)\s+([\w\s\-().]+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+(\w+)\s+([\d.]+)\s+Central GST 9%\s+([\d.]+)\s+([\d.]+)\s+State GST 9%\s+([\d.]+)\s+([\d.]+)'
|
| 12 |
-
)
|
| 13 |
-
|
| 14 |
-
# Initialize list to store extracted data
|
| 15 |
extracted_data = []
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# Load PDF
|
| 18 |
with fitz.open(pdf_file.name) as pdf:
|
|
@@ -20,30 +16,42 @@ def extract_po_to_excel(pdf_file):
|
|
| 20 |
for page_num in range(pdf.page_count):
|
| 21 |
page = pdf[page_num]
|
| 22 |
text = page.get_text("text")
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 33 |
"Sno.": sno,
|
| 34 |
-
"Pos.":
|
| 35 |
"Item Code": f"{item_code}, {description}",
|
| 36 |
-
"Unit":
|
| 37 |
"Delivery Date": delivery_date,
|
| 38 |
"Quantity": quantity,
|
| 39 |
"Basic Price": basic_price,
|
| 40 |
-
"Discount":
|
| 41 |
-
"Currency":
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
}
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# Create DataFrame
|
| 49 |
df = pd.DataFrame(extracted_data)
|
|
@@ -73,5 +81,3 @@ if __name__ == "__main__":
|
|
| 73 |
|
| 74 |
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 5 |
import re
|
| 6 |
|
| 7 |
def extract_po_to_excel(pdf_file):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
extracted_data = []
|
| 9 |
+
pos_pattern = re.compile(r'Pos\.\s*(\d+)', re.IGNORECASE)
|
| 10 |
+
item_code_pattern = re.compile(r'(\d{9,})\s+(.*)\s+(\d+)\s+(\d{4}-\d{2}-\d{2})\s+([\d.]+)\s+([\d.]+)\s+INR\s+([\d.]+)', re.IGNORECASE)
|
| 11 |
+
gst_pattern = re.compile(r'Central GST\s+9%\s+([\d.]+)\s+([\d.]+)\s+State GST\s+9%\s+([\d.]+)\s+([\d.]+)')
|
| 12 |
|
| 13 |
# Load PDF
|
| 14 |
with fitz.open(pdf_file.name) as pdf:
|
|
|
|
| 16 |
for page_num in range(pdf.page_count):
|
| 17 |
page = pdf[page_num]
|
| 18 |
text = page.get_text("text")
|
| 19 |
+
lines = text.splitlines()
|
| 20 |
+
|
| 21 |
+
current_pos = None
|
| 22 |
+
for line in lines:
|
| 23 |
+
# Match position and item details
|
| 24 |
+
pos_match = pos_pattern.match(line)
|
| 25 |
+
if pos_match:
|
| 26 |
+
current_pos = pos_match.group(1)
|
| 27 |
|
| 28 |
+
# Match item code details
|
| 29 |
+
item_match = item_code_pattern.search(line)
|
| 30 |
+
if item_match:
|
| 31 |
+
item_code, description, unit, delivery_date, quantity, basic_price, amount = item_match.groups()
|
| 32 |
+
data_entry = {
|
| 33 |
"Sno.": sno,
|
| 34 |
+
"Pos.": current_pos,
|
| 35 |
"Item Code": f"{item_code}, {description}",
|
| 36 |
+
"Unit": unit,
|
| 37 |
"Delivery Date": delivery_date,
|
| 38 |
"Quantity": quantity,
|
| 39 |
"Basic Price": basic_price,
|
| 40 |
+
"Discount": "0.0000",
|
| 41 |
+
"Currency": "INR",
|
| 42 |
+
"Amount": amount,
|
| 43 |
+
"Central GST": "",
|
| 44 |
+
"State GST": ""
|
| 45 |
+
}
|
| 46 |
+
extracted_data.append(data_entry)
|
| 47 |
+
sno += 1
|
| 48 |
+
|
| 49 |
+
# Match GST details and update the last entry
|
| 50 |
+
gst_match = gst_pattern.search(line)
|
| 51 |
+
if gst_match and extracted_data:
|
| 52 |
+
cgst_rate, cgst_amount, sgst_rate, sgst_amount = gst_match.groups()
|
| 53 |
+
extracted_data[-1]["Central GST"] = f"{cgst_rate} ({cgst_amount})"
|
| 54 |
+
extracted_data[-1]["State GST"] = f"{sgst_rate} ({sgst_amount})"
|
| 55 |
|
| 56 |
# Create DataFrame
|
| 57 |
df = pd.DataFrame(extracted_data)
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
|
|
|
|
|
|