Spaces:
Sleeping
Sleeping
File size: 4,753 Bytes
import pdfplumber
import pandas as pd
import re
def extract_po_details(file_obj):
    """Extract purchase-order line items from a PDF into a DataFrame.

    The text of every page is scanned line by line. Each field is captured
    the first time its pattern matches (later matches are ignored until the
    record is emitted), except the material description, which is extended
    when further matching fragments appear. A row is emitted as soon as all
    eight fields are present, after which collection starts over. The
    in-progress record is NOT reset at page boundaries, so one item may be
    assembled from lines on consecutive pages.

    Args:
        file_obj: Passed straight to ``pdfplumber.open``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Sl No",
        "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date",
        "Unit Rate" and "Value". The four quantity/amount columns are
        converted with ``pd.to_numeric(errors='coerce')``, so unparseable
        text becomes NaN. If no complete record is found, an empty DataFrame
        with the same columns is returned.
    """
    # Single source of truth for column names and row ordering.
    columns = [
        "Sl No", "Material Description", "Unit", "Quantity",
        "Dely Qty", "Dely Date", "Unit Rate", "Value",
    ]
    numeric_columns = ["Quantity", "Dely Qty", "Unit Rate", "Value"]

    # Patterns are compiled once instead of being looked up for every line.
    sl_no_re = re.compile(r'(\d+)\s+')
    material_desc_re = re.compile(
        r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)'
    )
    # (field, pattern, capture group to store, strip thousands separators?)
    # The amount patterns accept commas after the first digit/dot so that
    # "1,234.50" is captured whole instead of being cut off at "1".
    searched_fields = (
        ("Unit", re.compile(r'\b(No|Kg|Pack)\b'), 0, False),
        ("Quantity", re.compile(r'(\d+)\s+Quantity'), 1, False),
        ("Dely Qty", re.compile(r'(\d+)\s+Dely Qty'), 1, False),
        ("Dely Date", re.compile(r'(\d{2}\.\d{2}\.\d{4})'), 0, False),
        ("Unit Rate", re.compile(r'Unit Rate\s+([\d.][\d.,]*)'), 1, True),
        ("Value", re.compile(r'Value\s+([\d.][\d.,]*)'), 1, True),
    )

    data = []  # Completed rows, each ordered like `columns`
    current_record = {}  # Fields gathered so far for the item in progress

    with pdfplumber.open(file_obj) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                continue

            print(f"[DEBUG] Processing text from page {page_number}")
            for line_number, line in enumerate(text.splitlines(), start=1):
                print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")

                # Serial number: only a leading run of digits followed by
                # whitespace counts, hence match() rather than search().
                if "Sl No" not in current_record:
                    sl_no_match = sl_no_re.match(line)
                    if sl_no_match:
                        current_record["Sl No"] = sl_no_match.group(1)

                # Material description: first fragment starts the value,
                # any further matching fragment is appended to it.
                material_desc_match = material_desc_re.search(line)
                if material_desc_match:
                    fragment = material_desc_match.group(0)
                    if "Material Description" in current_record:
                        current_record["Material Description"] += " " + fragment
                    else:
                        current_record["Material Description"] = fragment

                # Remaining fields: first match wins.
                for field, pattern, group, is_amount in searched_fields:
                    if field in current_record:
                        continue
                    found = pattern.search(line)
                    if not found:
                        continue
                    captured = found.group(group)
                    if is_amount:
                        # Drop thousands separators so to_numeric can parse.
                        captured = captured.replace(",", "")
                    current_record[field] = captured

                # Only the eight known keys are ever stored, so a full dict
                # means the record is complete.
                if len(current_record) == len(columns):
                    print(f"[DEBUG] Complete record found: {current_record}")
                    data.append([current_record[column] for column in columns])
                    current_record = {}  # Reset for next record

    if current_record:
        # Surface a half-filled trailing record instead of dropping it silently.
        print(f"[DEBUG] Discarding incomplete record at end of document: {current_record}")

    if not data:
        print("[DEBUG] No data extracted. Check if the PDF structure or regex is correct.")
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(data, columns=columns)
    # Convert numeric columns to appropriate types; bad values become NaN.
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    print(f"[DEBUG] Data extracted successfully:\n{df}")
    return df
|