Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| import pandas as pd | |
| import re | |
# Output column order. A record is complete once it holds every one of these.
_PO_COLUMNS = [
    "Sl No",
    "Material Description",
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
]

# Columns converted with ``pd.to_numeric`` ("Sl No" is deliberately left as text).
_PO_NUMERIC_COLUMNS = ["Quantity", "Dely Qty", "Unit Rate", "Value"]

# Patterns are compiled once here instead of being looked up for every line.
_SL_NO_RE = re.compile(r'(\d+)\s+')
_MATERIAL_DESC_RE = re.compile(r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)')
# NOTE(review): "No" is also a whole word in headers such as "Sl No"; confirm
# against real PDFs that this does not pick up a header line as the unit.
_UNIT_RE = re.compile(r'\b(No|Kg|Pack)\b')
_DELY_DATE_RE = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
# Numeric patterns accept comma-grouped digits (e.g. "1,23,456.00"); without
# this the capture stops at the first comma and the number is truncated.
_QUANTITY_RE = re.compile(r'(\d+(?:,\d+)*)\s+Quantity')
_DELY_QTY_RE = re.compile(r'(\d+(?:,\d+)*)\s+Dely Qty')
_UNIT_RATE_RE = re.compile(r'Unit Rate\s+([\d.]+(?:,[\d.]+)*)')
_VALUE_RE = re.compile(r'Value\s+([\d.]+(?:,[\d.]+)*)')


def _store_once(record, key, match, numeric=False):
    """Store group 1 of *match* under *key* unless *key* is already set.

    When *numeric* is true, grouping commas are removed so that the value
    can later be parsed by ``pd.to_numeric``.
    """
    if match is None or key in record:
        return
    captured = match.group(1)
    record[key] = captured.replace(",", "") if numeric else captured


def _update_record(record, line):
    """Fill the still-missing fields of *record* from one line of page text.

    Fields are probed in output-column order. Every field keeps the first
    value seen, except "Material Description": each further match found
    before the record is complete is appended, separated by a space.
    """
    # The serial number is only recognised at the very start of the line.
    _store_once(record, "Sl No", _SL_NO_RE.match(line))

    # The pattern needs "BPS ... HSN Code ... IGST" on a single line; it does
    # not stitch together a description that is wrapped across lines.
    desc_match = _MATERIAL_DESC_RE.search(line)
    if desc_match:
        if "Material Description" in record:
            record["Material Description"] += " " + desc_match.group(1)
        else:
            record["Material Description"] = desc_match.group(1)

    _store_once(record, "Unit", _UNIT_RE.search(line))
    _store_once(record, "Quantity", _QUANTITY_RE.search(line), numeric=True)
    _store_once(record, "Dely Qty", _DELY_QTY_RE.search(line), numeric=True)
    _store_once(record, "Dely Date", _DELY_DATE_RE.search(line))
    _store_once(record, "Unit Rate", _UNIT_RATE_RE.search(line), numeric=True)
    _store_once(record, "Value", _VALUE_RE.search(line), numeric=True)


def extract_po_details(file_obj) -> pd.DataFrame:
    """Extract purchase-order line items from a PDF into a DataFrame.

    The PDF is read page by page with ``pdfplumber``. Each line of extracted
    text is scanned for the eight item fields; once all eight have been seen,
    the record is emitted as one row and a new record is started. A record
    under construction is carried over page boundaries.

    Args:
        file_obj: Whatever ``pdfplumber.open`` accepts (it is passed through
            unchanged).

    Returns:
        A DataFrame with the columns "Sl No", "Material Description", "Unit",
        "Quantity", "Dely Qty", "Dely Date", "Unit Rate" and "Value". The
        quantity, rate and value columns are converted to numbers, with
        unparseable entries becoming NaN. If no complete record was found,
        an empty DataFrame with the same columns is returned.
    """
    rows = []      # completed records, as lists in _PO_COLUMNS order
    record = {}    # fields collected so far for the record in progress

    with pdfplumber.open(file_obj) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                continue

            print(f"[DEBUG] Processing text from page {page_number}")
            for line_number, line in enumerate(text.splitlines(), start=1):
                print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")
                _update_record(record, line)

                # Only keys from _PO_COLUMNS are ever stored, so matching
                # lengths means every field is present.
                if len(record) == len(_PO_COLUMNS):
                    print(f"[DEBUG] Complete record found: {record}")
                    rows.append([record[column] for column in _PO_COLUMNS])
                    record = {}

    # Report a partially filled record instead of dropping it silently.
    if record:
        print(f"[DEBUG] Discarding incomplete trailing record: {record}")

    if not rows:
        print("[DEBUG] No data extracted. Check if the PDF structure or regex is correct.")
        return pd.DataFrame(columns=_PO_COLUMNS)

    df = pd.DataFrame(rows, columns=_PO_COLUMNS)
    for column in _PO_NUMERIC_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    print(f"[DEBUG] Data extracted successfully:\n{df}")
    return df