import pdfplumber
import pandas as pd
import re

# Output columns, in order. A record is emitted once every one of them is set.
_PO_COLUMNS = [
    "Sl No",
    "Material Description",
    "Unit",
    "Quantity",
    "Dely Qty",
    "Dely Date",
    "Unit Rate",
    "Value",
]
_PO_NUMERIC_COLUMNS = ("Quantity", "Dely Qty", "Unit Rate", "Value")

# Patterns are compiled once at import time rather than looked up per line.
_PO_SL_NO_RE = re.compile(r'(\d+)\s+')
_PO_MATERIAL_DESC_RE = re.compile(
    r'(BPS\s+\d+.*?HSN Code:\d+\s+IGST\s*:\s*\d+%)'
)

# (field, pattern, is_numeric) for fields where the first match wins.
# Numeric patterns accept thousands separators ("1,234.50"); the commas are
# stripped before the value is stored so pd.to_numeric can parse it.
# NOTE(review): the Unit pattern matches the whole word "No" anywhere in a
# line, so header text such as "Sl No" also satisfies it -- confirm against
# the real PDF layout.
_PO_FIRST_MATCH_FIELDS = (
    ("Unit", re.compile(r'\b(No|Kg|Pack)\b'), False),
    ("Quantity", re.compile(r'(\d[\d,]*)\s+Quantity'), True),
    ("Dely Qty", re.compile(r'(\d[\d,]*)\s+Dely Qty'), True),
    ("Dely Date", re.compile(r'(\d{2}\.\d{2}\.\d{4})'), False),
    ("Unit Rate", re.compile(r'Unit Rate\s+([\d.][\d.,]*)'), True),
    ("Value", re.compile(r'Value\s+([\d.][\d.,]*)'), True),
)


def _update_po_record(record, line):
    """Fill the fields still missing from *record* using one line of text.

    Mutates *record* in place. Every field except "Material Description"
    keeps the first value found; later matches are ignored until the caller
    starts a new record.
    """
    # Sl No: only considered when the line *starts* with digits + whitespace.
    if "Sl No" not in record:
        sl_no_match = _PO_SL_NO_RE.match(line)
        if sl_no_match:
            record["Sl No"] = sl_no_match.group(1)

    # The description pattern must match within a single line. If another
    # line matches before the record is complete, its text is appended.
    desc_match = _PO_MATERIAL_DESC_RE.search(line)
    if desc_match:
        if "Material Description" in record:
            record["Material Description"] += " " + desc_match.group(0)
        else:
            record["Material Description"] = desc_match.group(0)

    for field, pattern, is_numeric in _PO_FIRST_MATCH_FIELDS:
        if field in record:
            continue
        match = pattern.search(line)
        if not match:
            continue
        captured = match.group(1)
        record[field] = captured.replace(",", "") if is_numeric else captured


def extract_po_details(file_obj):
    """Extract purchase-order line items from a PDF into a DataFrame.

    Each page's text is scanned line by line. Fields are collected into a
    running record, which is emitted as a row as soon as all eight columns
    have a value. The running record is not reset at page boundaries, so an
    item may be completed by lines on a following page.

    Args:
        file_obj: Whatever ``pdfplumber.open`` accepts; it is passed through
            unchanged.

    Returns:
        A ``pandas.DataFrame`` with the columns "Sl No", "Material
        Description", "Unit", "Quantity", "Dely Qty", "Dely Date",
        "Unit Rate" and "Value". The four quantity/amount columns are
        converted with ``pd.to_numeric(errors='coerce')``, so unparseable
        values become NaN. If no complete record is found, an empty
        DataFrame with the same columns is returned.
    """
    rows = []  # Completed records, as lists in _PO_COLUMNS order
    record = {}  # Fields gathered so far for the record in progress

    with pdfplumber.open(file_obj) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                print(f"[DEBUG] No text found on page {page_number}. Skipping.")
                continue

            print(f"[DEBUG] Processing text from page {page_number}")

            for line_number, line in enumerate(text.splitlines(), start=1):
                print(f"[DEBUG] Processing line {line_number} on page {page_number}: {line}")

                _update_po_record(record, line)

                # Only keys from _PO_COLUMNS are ever stored, so a matching
                # length means every column has a value.
                if len(record) == len(_PO_COLUMNS):
                    print(f"[DEBUG] Complete record found: {record}")
                    rows.append([record[column] for column in _PO_COLUMNS])
                    record = {}  # Start collecting the next record

    # Surface a partially filled record instead of dropping it silently.
    if record:
        print(f"[DEBUG] Discarding incomplete trailing record: {record}")

    if not rows:
        print("[DEBUG] No data extracted. Check if the PDF structure or regex is correct.")
        return pd.DataFrame(columns=_PO_COLUMNS)

    df = pd.DataFrame(rows, columns=_PO_COLUMNS)

    # Captured values are strings; convert the numeric columns.
    for column in _PO_NUMERIC_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    print(f"[DEBUG] Data extracted successfully:\n{df}")
    return df