import re

import pandas as pd
import pdfplumber

# Output column order; a record is complete once every one of these is set.
_COLUMNS = [
    "Sl No",
    "Material Description",
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
]

# Columns converted with pd.to_numeric after extraction.
_NUMERIC_COLUMNS = ["Quantity", "Dely Qty", "Unit Rate", "Value"]

# Patterns are compiled once instead of on every line of every page.
# NOTE(review): _SL_NO_RE matches any line that starts with digits, and
# _UNIT_RE matches the "No" in header text such as "Sl No" -- confirm against
# real PDFs that these do not pick up values from non-item lines.
_SL_NO_RE = re.compile(r'(\d+)\s+')
_MATERIAL_DESC_RE = re.compile(
    r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)'
)
_UNIT_RE = re.compile(r'\b(No|Kg|Pack)\b')
_QUANTITY_RE = re.compile(r'(\d+)\s+Quantity')
_DELY_QTY_RE = re.compile(r'(\d+)\s+Dely Qty')
_DELY_DATE_RE = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
# Amounts may carry thousands separators ("1,234.50"); the capture must start
# with a digit or dot (as before) but may now continue through commas.
_UNIT_RATE_RE = re.compile(r'Unit Rate\s+([\d.][\d.,]*)')
_VALUE_RE = re.compile(r'Value\s+([\d.][\d.,]*)')

# Fields taken verbatim from the first line that matches.
_PLAIN_FIELDS = (
    ("Unit", _UNIT_RE),
    ("Quantity", _QUANTITY_RE),
    ("Dely Qty", _DELY_QTY_RE),
    ("Dely Date", _DELY_DATE_RE),
)

# Fields whose captured text has thousands separators removed.
_AMOUNT_FIELDS = (
    ("Unit Rate", _UNIT_RATE_RE),
    ("Value", _VALUE_RE),
)


def _update_record(record, line):
    """Fill in, in place, any fields of *record* that *line* provides."""
    if "Sl No" not in record:
        # Anchored at the start of the line: the serial number leads the row.
        sl_no_match = _SL_NO_RE.match(line)
        if sl_no_match:
            record["Sl No"] = sl_no_match.group(1)

    # Unlike the other fields, every matching line contributes: later
    # matches are appended to the description already collected.
    desc_match = _MATERIAL_DESC_RE.search(line)
    if desc_match:
        if "Material Description" in record:
            record["Material Description"] += " " + desc_match.group(0)
        else:
            record["Material Description"] = desc_match.group(0)

    for field, pattern in _PLAIN_FIELDS:
        if field in record:
            continue
        match = pattern.search(line)
        if match:
            record[field] = match.group(1)

    for field, pattern in _AMOUNT_FIELDS:
        if field in record:
            continue
        match = pattern.search(line)
        if match:
            # Drop thousands separators so "1,234.50" is not read as "1".
            record[field] = match.group(1).replace(",", "")


def extract_po_details(file_obj) -> pd.DataFrame:
    """Extract purchase-order line items from a PDF into a DataFrame.

    The text of every page is scanned line by line. Fields are accumulated
    into a working record (first match wins, except the material
    description, which is appended to); once all eight fields are present
    the record is emitted as a row and a new record is started. The working
    record is carried across page boundaries.

    Args:
        file_obj: Whatever ``pdfplumber.open`` accepts for the PDF to read.

    Returns:
        A DataFrame with the columns "Sl No", "Material Description",
        "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate" and
        "Value". The last four numeric columns are converted with
        ``pd.to_numeric(errors='coerce')``, so unparseable values become
        NaN. If no complete record is found, an empty DataFrame with the
        same columns is returned.
    """
    rows = []  # Completed records, in column order
    current_record = {}  # Fields collected so far for the record in progress

    with pdfplumber.open(file_obj) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                continue

            print(f"[DEBUG] Processing text from page {page_number}")
            for line_number, line in enumerate(text.splitlines(), start=1):
                print(
                    f"[DEBUG] Processing line {line_number} "
                    f"on page {page_number}: {line}"
                )
                _update_record(current_record, line)

                if len(current_record) == len(_COLUMNS):
                    print(f"[DEBUG] Complete record found: {current_record}")
                    rows.append([current_record[name] for name in _COLUMNS])
                    current_record = {}  # Reset for next record

    if current_record:
        # Report the leftover partial record rather than dropping it silently.
        print(
            "[DEBUG] Discarding incomplete record at end of document: "
            f"{current_record}"
        )

    if not rows:
        print(
            "[DEBUG] No data extracted. "
            "Check if the PDF structure or regex is correct."
        )
        return pd.DataFrame(columns=_COLUMNS)

    df = pd.DataFrame(rows, columns=_COLUMNS)
    for column in _NUMERIC_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    print(f"[DEBUG] Data extracted successfully:\n{df}")
    return df