Spaces:

MakPr016
/

dynamic-parser

Running

File size: 36,415 Bytes

import pdfplumber
import os
import json
import io
import re
from google import genai
from google.genai import types

# Model identifier passed as ``model=`` to the Gemini client when extracting
# line items with the LLM.
GEMINI_MODEL = "gemini-2.5-pro"

# Lazily created genai client; populated on first use by _get_genai_client().
_client = None

# Lower-case substrings that mark a cell as an unfilled form placeholder.
# _is_placeholder() lower-cases its input before testing for these.
PLACEHOLDER_PATTERNS = ["click or tap", "click here", "enter text", "type here"]
# Descriptions (compared lower-cased, exact match) that are table furniture —
# header labels and totals/charges rows — rather than real line items.
SKIP_DESCS = {
    "total", "subtotal", "grand total", "amount", "description", "item description",
    "transportation price", "insurance price", "installation price", "training price",
    "other charges (specify)", "other charges", "total price",
    "total final and all-inclusive price",
}

# Header-cell classifiers used by _detect_header(). They are compiled WITHOUT
# re.IGNORECASE, so callers must lower-case the text first (as _detect_header does).
# These are substring searches: e.g. UNIT_RE also matches a "unit price" header.
DESC_RE = re.compile(r'(description|specifications|commodity|item\s*name|item\s*desc)')
QTY_RE  = re.compile(r'(qty|quant|quantity|total\s*qty|total\s*quantity)')
SR_RE   = re.compile(r'\b(sr|item\s*no|pos\.?)\b|^no\.?$')
UNIT_RE = re.compile(r'(unit|uom|pack\s*size|measure)')

# ---------------------------------------------------------------------------
# CATEGORY DEFINITIONS (ported from old parser)
# Maps a category name to the lower-case keywords that select it.
# Intended to be ordered by specificity. Whole-word boundary matching is
# applied by determine_item_category().
#
# Ordering is significant: determine_item_category() walks this dict in
# insertion order and returns the FIRST category with a matching keyword, so
# an earlier category always wins over a later one. Consequences visible in
# this table:
#   * "defibrillator" is listed under both "Cardiovascular Products" and
#     "Emergency & Trauma Care"; only the earlier entry can ever decide.
#   * "cpap mask" / "bipap mask" (Sleep Medicine) are preceded by the shorter
#     "cpap" / "bipap" (Respiratory & Anesthesia), which match first.
# NOTE(review): several very short keywords ("his", "cap", "tab", "amp",
# "cast", "ris", "lis") are also ordinary words or abbreviations and may
# classify unrelated descriptions — confirm against real data.
# ---------------------------------------------------------------------------
CATEGORY_DEFINITIONS = {
    "Pharmaceuticals & Biologics": [
        "tablet", "tab", "capsule", "cap", "syrup", "suspension", "susp", "injection", "inj", "vial", "ampoule", "amp",
        "drops", "gtt", "inhaler", "vaccine", "insulin", "dose", "drug", "medication", "ointment", "cream", "gel",
        "lotion", "suppository", "supp", "antibiotic", "antiviral", "analgesic", "anesthetic", "hormone", "steroid",
        "vitamin", "mineral", "supplement", "lozenge", "patch", "solution", "powder for suspension", "elixir", "serum",
        "antitoxin",
    ],
    "Surgical Products": [
        "scalpel", "forceps", "retractor", "clamp", "suture", "stapler", "surgical mesh", "hemostatic", "sealant",
        "surgical drape", "surgical gown", "laparoscopic", "robotic surgery", "electrosurgical", "surgical laser",
        "surgical blade", "trocar", "surgical clip", "surgical scissor", "needle holder",
    ],
    "Orthopedic & Spine": [
        "orthopedic", "spine", "joint replacement", "trauma fixation", "bone plate", "bone screw",
        "intramedullary rod", "bone nail", "spinal implant", "spinal fusion", "bone graft", "orthopedic brace",
        "cast", "arthroscopy", "fixator", "prosthesis", "bone drill", "bone saw",
    ],
    "Cardiovascular Products": [
        "cardiac stent", "pacemaker", "defibrillator", "icd", "heart valve", "vascular graft", "cardiac catheter",
        "guidewire", "cardiac balloon", "ablation", "coronary", "angioplasty", "introducer sheath",
    ],
    "Medical Imaging Equipment": [
        "mri", "ct scanner", "x-ray", "ultrasound", "mammography", "fluoroscopy", "pet scanner", "c-arm",
        "medical imaging", "transducer", "x-ray film", "contrast media", "lead apron",
    ],
    "Diagnostic Products": [
        "diagnostic", "test kit", "glucose test", "reagent", "immunoassay", "chemistry analyzer", "hematology",
        "microbiology", "culture media", "pregnancy test", "covid", "rapid test", "urinalysis", "penlight",
        "specula", "otoscope", "ophthalmoscope", "lancet", "glucometer strips", "test strip",
    ],
    "Patient Monitoring Equipment": [
        "vital signs", "ecg", "ekg", "pulse oximeter", "blood pressure monitor", "sphygmomanometer",
        "medical thermometer", "capnography", "fetal monitor", "telemetry", "spo2 sensor", "bp cuff",
        "temperature probe",
    ],
    "Respiratory & Anesthesia": [
        "ventilator", "anesthesia machine", "oxygen concentrator", "nebulizer", "cpap", "bipap", "respiratory",
        "endotracheal", "tracheostomy", "spirometer", "oxygen mask", "breathing circuit", "nasal cannula",
        "resuscitator", "laryngoscope",
    ],
    "Infusion & Vascular Access": [
        "infusion pump", "syringe pump", "iv set", "iv catheter", "venous", "picc", "iv port",
        "dialysis catheter", "administration set", "extension set", "stopcock", "giving set", "saline",
        "dextrose", "ringer", "sodium chloride", "water for injection",
    ],
    "Wound Care & Tissue Management": [
        "wound dressing", "bandage", "gauze", "medical tape", "plaster", "adhesive", "wound foam", "alginate",
        "hydrocolloid", "compression bandage", "ostomy", "skin substitute", "negative pressure",
    ],
    "Dialysis & Renal Care": [
        "hemodialysis", "peritoneal", "dialyzer", "blood line", "fistula needle", "dialysis concentrate",
        "bicarbonate",
    ],
    "Ophthalmic Products": [
        "intraocular", "intraocular lens", "phaco", "vitrectomy", "lasik", "contact lens", "viscoelastic",
        "ophthalmic solution", "eye drops",
    ],
    "Dental Products": [
        "dental implant", "orthodontic", "dental bracket", "dental wire", "dental drill", "dental handpiece",
        "dental cement", "dental composite", "amalgam", "impression material", "teeth whitening", "dental chair",
    ],
    "Neurology & Neurosurgery": [
        "neurostimulation", "spinal cord stimulator", "neuro coil", "flow diverter", "cranial", "shunt",
        "neuro electrode", "eeg", "emg",
    ],
    "Laboratory Equipment & Supplies": [
        "microscope", "lab centrifuge", "incubator", "autoclave", "pipette", "glassware", "test tube",
        "petri dish", "flask", "beaker", "microscope slide", "cover glass", "fume hood", "biosafety cabinet",
    ],
    "Personal Protective Equipment (PPE)": [
        "ppe", "n95", "face shield", "safety eyewear", "goggles", "protective apron", "shoe cover",
        "head cover", "coverall", "isolation gown", "hazmat", "surgical mask",
    ],
    "Sterilization & Disinfection": [
        "sterilization", "disinfectant", "antiseptic", "povidone", "iodine", "chlorhexidine", "alcohol swab",
        "hand sanitizer", "medical soap", "enzymatic cleaner", "detergent", "washer disinfector", "sterilizer",
        "sterilization indicator",
    ],
    "Hospital Furniture & Equipment": [
        "hospital bed", "examination table", "stretcher", "medical trolley", "medical cart", "medical cabinet",
        "bedside locker", "overbed table", "iv pole", "wheelchair",
    ],
    "Rehabilitation & Physical Therapy": [
        "rehabilitation", "physiotherapy", "walker", "walking cane", "crutch", "exercise band", "traction",
        "electrotherapy", "massage table", "orthosis",
    ],
    "Home Healthcare Products": [
        "home care", "blood glucose meter", "hearing aid", "mobility aid", "bathroom safety", "commode",
    ],
    "Emergency & Trauma Care": [
        "emergency kit", "trauma kit", "first aid", "aed", "defibrillator", "manual resuscitator",
        "suction unit", "immobilizer", "cervical collar", "splint", "tourniquet", "crash cart",
    ],
    "Maternal & Neonatal Care": [
        "maternal", "neonatal", "infant incubator", "infant warmer", "phototherapy", "breast pump",
        "obstetric", "birthing bed", "fetal doppler", "umbilical",
    ],
    "Urology Products": [
        "urology", "foley catheter", "urine bag", "urinary drainage", "ureteral stent", "stone basket",
    ],
    "Gastroenterology & Endoscopy": [
        "endoscope", "gastroscope", "colonoscope", "biopsy forceps", "polypectomy snare", "gastric balloon",
        "ercp",
    ],
    "Oncology Products": [
        "oncology", "chemotherapy", "radiotherapy", "brachytherapy", "port-a-cath", "cancer diagnostic",
    ],
    "Pain Management": [
        "pain management", "pca pump", "epidural", "nerve block", "tens unit",
    ],
    "Sleep Medicine": [
        "sleep apnea", "cpap mask", "bipap mask", "sleep tubing", "polysomnography",
    ],
    "Telemedicine & Digital Health": [
        "telemedicine", "telehealth", "remote monitor", "medical software", "health app",
    ],
    "Blood Management": [
        "blood bag", "blood transfusion", "blood bank", "blood warmer", "apheresis",
    ],
    "Mortuary & Pathology": [
        "mortuary", "autopsy", "body bag", "morgue fridge", "dissection table", "microtome",
        "tissue processor",
    ],
    "Environmental Control": [
        "medical gas", "medical vacuum", "medical air plant", "gas manifold", "gas outlet", "gas alarm",
    ],
    "Mobility & Accessibility": [
        "patient lift", "patient hoist", "wheelchair ramp", "stair lift", "transfer board",
    ],
    "Bariatric Products": [
        "bariatric bed", "bariatric wheelchair", "heavy duty scale",
    ],
    "Medical Textiles": [
        "hospital linen", "bed sheet", "pillow case", "medical blanket", "towel", "privacy curtain",
        "medical uniform", "scrub suit", "lab coat",
    ],
    "Infection Control Products": [
        "waste bin", "sharps container", "biohazard bag", "spill kit", "air purifier",
    ],
    "Medical Gases & Cryogenics": [
        "gas cylinder", "oxygen regulator", "flowmeter", "liquid oxygen", "nitrogen tank",
    ],
    "Nutrition & Feeding": [
        "enteral feeding", "clinical nutrition", "nasogastric tube", "feeding pump", "feeding set", "peg tube",
    ],
    "Specimen Collection & Transport": [
        "specimen container", "sample collection", "transport media", "transport swab", "urine container",
        "stool container", "cool box", "transport bag",
    ],
    "Medical Software & IT": [
        "emr", "ehr", "pacs", "ris", "lis", "his", "hospital information system",
    ],
    "Aesthetics & Dermatology": [
        "dermatology", "aesthetic laser", "ipl", "dermal filler", "botulinum", "botox", "chemical peel",
        "microdermabrasion",
    ],
    # Catch-all — must remain last
    "Medical Supplies & Consumables": [
        "syringe", "needle", "glove", "examination glove", "disposable", "consumable", "cotton wool",
        "alcohol prep", "urinal", "bedpan", "underpad", "tongue depressor", "applicator",
        "lubricant jelly", "cannula",
    ],
}


def determine_item_category(description: str, unit: str = "") -> str:
    """
    Return the best-matching category for a line item.

    The description and unit are joined, lower-cased and tested against every
    keyword in CATEGORY_DEFINITIONS using whole-word regex matching.  The
    categories are checked in definition order and the first hit wins.

    Args:
        description: Item description text.  ``None`` is treated as empty.
        unit: Optional unit / pack text.  ``None`` is treated as empty.

    Returns:
        The matching category name, or 'Medical Supplies & Consumables' if
        nothing matches.
    """
    # Values coming from parsed JSON (e.g. LLM output) can be null; treating
    # None as empty text avoids a TypeError that would abort the whole batch.
    text = f"{description or ''} {unit or ''}".lower()
    for category, keywords in CATEGORY_DEFINITIONS.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', text):
                return category
    return "Medical Supplies & Consumables"


# ---------------------------------------------------------------------------
# Remaining helpers (unchanged from original)
# ---------------------------------------------------------------------------

def _get_genai_client():
    """
    Return the module-wide genai client, creating it on first use.

    The API key is read from the GOOGLE_API_KEY environment variable.

    Raises:
        ValueError: if no client exists yet and GOOGLE_API_KEY is unset or empty.
    """
    global _client
    if _client is not None:
        return _client

    key = os.environ.get("GOOGLE_API_KEY")
    if key:
        _client = genai.Client(api_key=key)
        return _client
    raise ValueError("GOOGLE_API_KEY is not configured")


def _clean(cell):
    return str(cell).replace("\n", " ").strip() if cell else ""


def _is_placeholder(text):
    """Return True when the text contains any PLACEHOLDER_PATTERNS entry (case-insensitive)."""
    lowered = text.lower()
    for marker in PLACEHOLDER_PATTERNS:
        if marker in lowered:
            return True
    return False


def _parse_qty(s):
    q = re.sub(r"[^\d.]", "", s)
    if not q:
        return 0
    try:
        v = float(q)
        return int(v) if v.is_integer() else v
    except Exception:
        return 0


def _detect_header(table):
    """
    Locate the header row of a line-item table.

    Only the first six rows are inspected.  A row qualifies when its combined
    lower-cased text contains a description keyword plus a quantity or unit
    keyword.  Each non-empty cell of a qualifying row is then assigned to the
    first still-unassigned role it matches, tried in the order
    sr -> desc -> qty -> unit.

    Returns:
        (row_index, column_map, column_count) for the first qualifying row
        whose description column was identified; otherwise (-1, None, 0).
    """
    # Priority order in which a header cell is tested against the roles.
    role_patterns = (("sr", SR_RE), ("desc", DESC_RE), ("qty", QTY_RE), ("unit", UNIT_RE))

    for row_pos, row in enumerate(table[:6]):
        headers = [_clean(cell).lower() for cell in row]
        joined = " ".join(headers)

        if not DESC_RE.search(joined):
            continue
        if not (QTY_RE.search(joined) or UNIT_RE.search(joined)):
            continue

        columns = {"sr": -1, "desc": -1, "unit": -1, "qty": -1}
        for col_pos, header in enumerate(headers):
            if not header:
                continue
            for role, matcher in role_patterns:
                # A role keeps the first column that claimed it.
                if columns[role] == -1 and matcher.search(header):
                    columns[role] = col_pos
                    break

        if columns["desc"] != -1:
            return row_pos, columns, len(row)

    return -1, None, 0


def _remap_by_data_row(idx_map, table, header_idx):
    sample = next(
        (r for r in table[header_idx + 1:] if any(c is not None for c in r)),
        None
    )
    if not sample:
        return idx_map

    non_none = [i for i, c in enumerate(sample) if c is not None]
    if len(non_none) < 2:
        return idx_map

    remapped = {
        "sr":   non_none[0]  if len(non_none) > 0 else -1,
        "desc": non_none[1]  if len(non_none) > 1 else -1,
        "unit": non_none[-2] if len(non_none) > 2 else -1,
        "qty":  non_none[-1] if len(non_none) > 1 else -1,
    }
    return remapped


def _looks_like_item_continuation(table):
    """
    Guess whether a header-less table continues a previous line-item table.

    Looks at the first eight rows and counts those whose first non-empty cell
    is a bare number (optionally followed by a dot) and whose second non-empty
    cell is longer than three characters.  Two such rows are enough.
    """
    matches = 0
    for row in table[:8]:
        texts = [text for text in (_clean(cell) for cell in row) if text]
        if len(texts) < 2:
            continue
        first, second = texts[0], texts[1]
        if re.match(r'^\d+\.?$', first) and len(second) > 3:
            matches += 1
    return matches >= 2


def _extract_rows(rows, idx_map, num_cols, seen_srs, items):
    """
    Convert raw table rows into line-item dicts and append them to ``items``.

    Args:
        rows: Table rows; each is a list of cells (cells may be None).
        idx_map: Dict with "sr", "desc", "unit" and "qty" column indices,
            where -1 means the column is absent.
        num_cols: Each row is padded with "" or truncated to this many cells.
        seen_srs: Set of keys already emitted (serial number, or description
            when no serial number was found).  Mutated in place; rows whose
            key is already present are skipped.
        items: Output list, mutated in place.

    Rows are skipped when they are empty, contain a form placeholder, or have
    a description that is missing, shorter than three characters, or listed
    in SKIP_DESCS.
    """

    def _parse_description_parts(raw_desc):
        """Split a description into (cleaned_text, dosage, form)."""
        text = raw_desc.strip()
        if not text:
            return "", "", ""

        # Pull dosage-like fragments such as "156 Mg/5ml" or "500 mg".
        # Compound units are listed before their prefixes: with "mg" ahead of
        # "mg/ml" the alternation would stop at "mg" and leave "/ml" behind.
        dosage_match = re.search(
            r"\b\d+(?:\.\d+)?\s*(?:mg/ml|mcg/ml|g/ml|mcg|mg|iu|ml|g)\b(?:\s*/\s*\d+(?:\.\d+)?\s*ml\b)?",
            text,
            flags=re.IGNORECASE,
        )
        dosage = dosage_match.group(0) if dosage_match else ""

        # Common dosage forms that appear in descriptions.
        form_match = re.search(
            r"\b(tablet|tab|capsule|cap|suspension|syrup|injection|inj|vial|ampoule|amp|drops|inhaler|ointment|cream|gel|lotion|suppository|supp|solution|powder|elixir|serum)\b",
            text,
            flags=re.IGNORECASE,
        )
        form = form_match.group(0) if form_match else ""

        cleaned = text
        for fragment in (dosage, form):
            if fragment:
                # Remove whole tokens only.  An un-anchored substitution would
                # also eat the fragment inside other words, e.g. form "amp"
                # turning "Ampicillin" into "icillin".
                cleaned = re.sub(
                    r"\b" + re.escape(fragment) + r"\b",
                    "",
                    cleaned,
                    flags=re.IGNORECASE,
                )
        cleaned = re.sub(r"\s{2,}", " ", cleaned).strip(" ,.-")

        return cleaned, dosage, form

    def _parse_pack_from_unit(raw_unit):
        """Split a unit cell into (unit_type, pack_size, pack_unit)."""
        text = raw_unit.strip()
        if not text:
            return "", 0, ""

        # Match patterns like "Pack of 20 Tablet" or "Box of 100".
        pack_match = re.search(r"\b(pack|box|bottle|bag|tube|vial|ampoule|amp|ea|each|single unit)\b", text, flags=re.IGNORECASE)
        unit_type = pack_match.group(0) if pack_match else ""

        # First standalone number in the cell is taken as the pack size.
        qty_match = re.search(r"\b(\d+(?:\.\d+)?)\b", text)
        pack_size = 0
        if qty_match:
            try:
                pack_size_val = float(qty_match.group(1))
                pack_size = int(pack_size_val) if pack_size_val.is_integer() else pack_size_val
            except Exception:
                pack_size = 0

        # The word(s) following the number name the packed unit; when there is
        # no number the whole cell is searched instead.
        pack_unit = ""
        trailing = text
        if qty_match:
            trailing = text[qty_match.end():]
        if trailing:
            m = re.search(r"\b([a-zA-Z]+(?:\s+[a-zA-Z]+)?)\b", trailing)
            if m:
                pack_unit = m.group(1).strip()

        return unit_type.title() if unit_type else "", pack_size, pack_unit.title() if pack_unit else ""

    for row in rows:
        # Normalise every row to exactly num_cols cleaned cells.
        row_clean = [_clean(c) for c in row]
        row_clean = (row_clean + [""] * num_cols)[:num_cols]

        if not any(row_clean):
            continue
        if any(_is_placeholder(c) for c in row_clean):
            continue

        # Serial number: mapped column first, else a leading bare number.
        sr_val = None
        if idx_map["sr"] != -1 and idx_map["sr"] < len(row_clean):
            m = re.search(r'\d+', row_clean[idx_map["sr"]])
            if m:
                sr_val = int(m.group())
        if sr_val is None:
            non_empty = [c for c in row_clean if c]
            if non_empty and re.match(r'^\d+\.?$', non_empty[0]):
                sr_val = int(re.sub(r'\D', '', non_empty[0]))

        # Description: mapped column first, else the first non-numeric cell.
        desc = ""
        if idx_map["desc"] != -1 and idx_map["desc"] < len(row_clean):
            desc = row_clean[idx_map["desc"]]
        if not desc:
            for c in row_clean:
                if c and not re.match(r'^[\d.,]+$', c) and not _is_placeholder(c):
                    desc = c
                    break

        desc = desc.strip()
        if not desc or len(desc) < 3 or desc.lower() in SKIP_DESCS or _is_placeholder(desc):
            continue

        unit_val = ""
        if idx_map["unit"] != -1 and idx_map["unit"] < len(row_clean):
            unit_val = row_clean[idx_map["unit"]]

        qty_val = 0
        if idx_map["qty"] != -1 and idx_map["qty"] < len(row_clean):
            qty_val = _parse_qty(row_clean[idx_map["qty"]])

        # De-duplicate across tables/pages on serial number (or description).
        key = sr_val if sr_val is not None else desc
        if key in seen_srs:
            continue
        seen_srs.add(key)

        clean_desc, dosage, form = _parse_description_parts(desc)
        unit_type, pack_size, pack_unit = _parse_pack_from_unit(unit_val)

        # Classify on the cleaned description, falling back to the raw one.
        category = determine_item_category(clean_desc or desc, unit_val)

        items.append({
            "sr": sr_val if sr_val is not None else len(items) + 1,
            "description": clean_desc or desc,
            "dosage": dosage,
            "form": form.title() if form else "",
            "pack_size": pack_size,
            "pack_unit": pack_unit,
            "unit": unit_type,
            "qty": qty_val,
            "unit_price": None,
            "total_price": None,
            "brand": "",
            "expiry_date": "",
            "remarks": "",
            "category": category,
        })


def extract_line_items(pdf_bytes):
    """
    Extract line items from every table in a PDF.

    Tables with a recognisable header row are parsed with a column map derived
    from their first data row.  Once such a table has been seen, later
    header-less tables that look like numbered item rows are treated as
    continuations and parsed with a positional column guess.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        List of line-item dicts in document order (see _extract_rows for the
        keys); duplicates are suppressed across the whole document.
    """
    items = []
    seen_srs = set()
    # Set once a headed item table has been parsed; gates continuation handling.
    active_schema = None

    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables() or []:
                if len(table) < 2:
                    continue

                h_idx, idx_map, num_cols = _detect_header(table)

                if h_idx != -1 and idx_map and idx_map["desc"] != -1:
                    remapped = _remap_by_data_row(idx_map, table, h_idx)
                    active_schema = {"idx": remapped, "num_cols": num_cols}
                    _extract_rows(table[h_idx + 1:], remapped, num_cols, seen_srs, items)
                    continue

                if not (active_schema and _looks_like_item_continuation(table)):
                    continue

                actual_cols = max(len(r) for r in table)
                sample = next((r for r in table if any(c is not None for c in r)), None)
                sample_cells = sample or []
                none_ratio = sum(1 for c in sample_cells if c is None) / max(len(sample_cells), 1)

                if none_ratio > 0.4:
                    # Sparse row (many None cells): map roles onto the
                    # populated cells by position.
                    non_none = [i for i, c in enumerate(sample_cells) if c is not None]
                    count = len(non_none)
                    # unit/qty need enough populated cells that they cannot
                    # coincide with the description column; otherwise the
                    # description text would be parsed as unit or quantity.
                    remapped = {
                        "sr":   non_none[0]  if count > 0 else -1,
                        "desc": non_none[1]  if count > 1 else -1,
                        "unit": non_none[-2] if count > 3 else -1,
                        "qty":  non_none[-1] if count > 2 else -1,
                    }
                else:
                    # Dense row: assume the plain sr/desc/unit/qty layout.
                    remapped = {"sr": 0, "desc": 1, "unit": 2, "qty": 3}

                _extract_rows(table, remapped, actual_cols, seen_srs, items)

    return items


def _extract_line_items_from_llm(full_text, use_gemini: bool = True):
    if not use_gemini:
        return []

    system_prompt = (
        "You are an expert at parsing RFQ documents. Extract ALL line items / schedule of requirements from the text. "
        "Return a JSON array only. Each object must have exactly these keys: "
        '{"sr": integer, "description": "string", "unit": "string or empty string", "qty": number or 0, '
        '"unit_price": null, "total_price": null, "brand": "", "expiry_date": "", "remarks": "", "category": "string"}. '
        "For 'category', classify each item into the most appropriate medical supply category "
        "(e.g. 'Pharmaceuticals & Biologics', 'Surgical Products', 'Diagnostic Products', etc.). "
        "If no line items are found, return []. RETURN JSON ARRAY ONLY, no markdown, no preamble."
    )
    try:
        client = _get_genai_client()
        response = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=full_text[:30000],
            config=types.GenerateContentConfig(
                system_instruction=system_prompt,
                response_mime_type="application/json",
                temperature=0,
            ),
        )
        result = json.loads(response.text)
        if isinstance(result, list):
            # Apply local rule-based categorisation as a safety net in case
            # the LLM returns an empty or generic category string.
            for item in result:
                if not item.get("category") or item["category"] in ("string", ""):
                    item["category"] = determine_item_category(
                        item.get("description", ""),
                        item.get("unit", ""),
                    )
            return result
        return []
    except Exception:
        return []



# ---------------------------------------------------------------------------
# RULE-BASED STRUCTURE EXTRACTOR (no LLM)
# ---------------------------------------------------------------------------

# (pattern, section name) pairs used to detect which sections a document has.
# Patterns are case-insensitive substring searches applied to each text line;
# the list order is the match priority, since the scanning loop stops at the
# first pattern that adds a new section for a given line.
# NOTE(review): "compliance" appears in both the 'Declaration of Conformity'
# and 'Compliance & Delivery' patterns, and broad words such as "vendor" or
# "delivery" will match ordinary body text — confirm this is acceptable.
_SECTION_SIGNALS = [
    (re.compile(r'(quotation|quote|rfq|tender)\s*(submission|instruction|guideline)', re.I), 'Quotation Submission'),
    (re.compile(r'vendor|supplier|company\s*info|bidder\s*info', re.I),                     'Vendor Information'),
    (re.compile(r'declaration|conformity|compliance\s*statement|certif', re.I),              'Declaration of Conformity'),
    (re.compile(r'schedule\s*of\s*req|item\s*list|line\s*item|bill\s*of\s*material', re.I), 'Schedule of Requirements'),
    (re.compile(r'technical\s*(offer|proposal|spec)|financial\s*(offer|proposal)', re.I),   'Technical & Financial Offer'),
    (re.compile(r'delivery|compliance|lead\s*time|incoterm|warranty', re.I),                'Compliance & Delivery'),
]

# (pattern, field definition) pairs for rule-based form-field detection.
# Each pattern is a case-insensitive search; each definition dict carries
# id, label, type, section and required, plus an optional placeholder
# (text-like fields) or options (dropdowns).  Rules are grouped by the
# section the resulting field belongs to.
# NOTE(review): several patterns are not word-anchored (e.g. "tel", "make",
# "seal") and can match inside longer words — confirm against real documents.
_FIELD_RULES = [
    # --- Quotation Submission ---
    (re.compile(r'rfq\s*(number|no\.?|ref)', re.I),
     dict(id='rfq_number',        label='RFQ Number',              type='text',     section='Quotation Submission',   required=True,  placeholder='e.g. RFQ-2024-001')),
    (re.compile(r'(submission|closing|deadline|due)\s*(date|by)', re.I),
     dict(id='submission_date',   label='Submission Deadline',     type='date',     section='Quotation Submission',   required=True,  placeholder='DD/MM/YYYY')),
    (re.compile(r'validity\s*(period|days|of\s*offer)', re.I),
     dict(id='validity_period',   label='Validity Period (days)',  type='number',   section='Quotation Submission',   required=True,  placeholder='e.g. 90')),
    (re.compile(r'(submit|send|deliver).{0,30}(email|electronically|portal)', re.I),
     dict(id='submission_method', label='Submission Method',       type='dropdown', section='Quotation Submission',   required=True,  options=['Email', 'Portal', 'Hard Copy'])),
    (re.compile(r'\bcurrency\b', re.I),
     dict(id='currency',          label='Currency',                type='dropdown', section='Quotation Submission',   required=True,  options=['USD', 'EUR', 'GBP', 'LYD', 'AED', 'SAR'])),
    (re.compile(r'(price|quote|quotation).{0,20}(all.inclusive|include.*vat|include.*tax)', re.I),
     dict(id='price_inclusive',   label='Price Inclusive of All Taxes', type='checkbox', section='Quotation Submission', required=False)),
    (re.compile(r'payment\s*(terms?|condition|method)', re.I),
     dict(id='payment_terms',     label='Payment Terms',           type='text',     section='Quotation Submission',   required=False, placeholder='e.g. Net 30')),

    # --- Vendor Information ---
    (re.compile(r'(company|vendor|supplier|bidder|firm)\s*(name|full\s*name)', re.I),
     dict(id='company_name',      label='Company Name',            type='text',     section='Vendor Information',     required=True,  placeholder='Legal registered name')),
    (re.compile(r'(company|vendor|business|registered)\s*(address|location|headquarter)', re.I),
     dict(id='company_address',   label='Company Address',         type='textarea', section='Vendor Information',     required=True,  placeholder='Full postal address')),
    (re.compile(r'country\s*(of\s*)?(origin|registration|incorporation)', re.I),
     dict(id='country',           label='Country',                 type='text',     section='Vendor Information',     required=True,  placeholder='e.g. Libya')),
    (re.compile(r'contact\s*(person|name|individual|representative)', re.I),
     dict(id='contact_person',    label='Contact Person',          type='text',     section='Vendor Information',     required=True,  placeholder='Full name')),
    (re.compile(r'(phone|telephone|mobile|tel)\s*(number|no\.?)?', re.I),
     dict(id='phone',             label='Phone Number',            type='phone',    section='Vendor Information',     required=True,  placeholder='+xxx-xxx-xxxxxxx')),
    (re.compile(r'(email|e-mail)\s*(address)?', re.I),
     dict(id='email',             label='Email Address',           type='email',    section='Vendor Information',     required=True,  placeholder='vendor@company.com')),
    (re.compile(r'(vat|tax|gst|tin)\s*(number|no\.?|registration|id)', re.I),
     dict(id='vat_number',        label='VAT / Tax Number',        type='text',     section='Vendor Information',     required=False, placeholder='Tax registration number')),
    (re.compile(r'(commercial|trade|business)\s*(registr|licen|certif)', re.I),
     dict(id='trade_license',     label='Trade License / Registration', type='file', section='Vendor Information',   required=False)),
    (re.compile(r'bank\s*(name|details?|account|information)', re.I),
     dict(id='bank_name',         label='Bank Name',               type='text',     section='Vendor Information',     required=False, placeholder='Bank name')),
    (re.compile(r'iban|account\s*(number|no\.?)', re.I),
     dict(id='iban',              label='IBAN / Account Number',   type='text',     section='Vendor Information',     required=False, placeholder='IBAN or account number')),

    # --- Declaration of Conformity ---
    (re.compile(r'(authorized|authorised)\s*(signator|representative|person)', re.I),
     dict(id='authorized_signatory', label='Authorized Signatory Name', type='text', section='Declaration of Conformity', required=True, placeholder='Full name of signing authority')),
    (re.compile(r'(signature|sign\s*here|signed\s*by)', re.I),
     dict(id='signature',         label='Signature',               type='file',     section='Declaration of Conformity', required=True)),
    (re.compile(r'(stamp|seal|company\s*stamp)', re.I),
     dict(id='company_stamp',     label='Company Stamp',           type='file',     section='Declaration of Conformity', required=False)),
    (re.compile(r'(date\s*of\s*(sign|submission)|signed\s*on|date\s*signed)', re.I),
     dict(id='declaration_date',  label='Date of Declaration',     type='date',     section='Declaration of Conformity', required=True,  placeholder='DD/MM/YYYY')),

    # --- Technical & Financial Offer ---
    (re.compile(r'(brand|manufacturer|make)\s*(name|proposed|offered)?', re.I),
     dict(id='brand_offered',     label='Brand / Manufacturer',    type='text',     section='Technical & Financial Offer', required=False, placeholder='Proposed brand name')),
    (re.compile(r'(catalogue|catalog|model|part)\s*(number|no\.?|ref)', re.I),
     dict(id='catalogue_number',  label='Catalogue / Model Number',type='text',     section='Technical & Financial Offer', required=False, placeholder='e.g. CAT-12345')),
    (re.compile(r'(unit|item)\s*price', re.I),
     dict(id='unit_price',        label='Unit Price',              type='number',   section='Technical & Financial Offer', required=True,  placeholder='Price per unit')),
    (re.compile(r'(total|overall)\s*(price|amount|value)', re.I),
     dict(id='total_price',       label='Total Price',             type='number',   section='Technical & Financial Offer', required=True,  placeholder='Total quoted amount')),
    (re.compile(r'(country|place)\s*of\s*(manufacture|origin|production)', re.I),
     dict(id='country_of_origin', label='Country of Origin',      type='text',     section='Technical & Financial Offer', required=False, placeholder='e.g. Germany')),
    (re.compile(r'(registration|approval|certif).{0,20}(ministry|moh|fda|ce\b|iso)', re.I),
     dict(id='registration_cert', label='Regulatory Registration Certificate', type='file', section='Technical & Financial Offer', required=True)),
    (re.compile(r'(shelf\s*life|expiry|expiration)', re.I),
     dict(id='shelf_life',        label='Shelf Life / Expiry Date',type='text',     section='Technical & Financial Offer', required=False, placeholder='e.g. min. 18 months upon delivery')),

    # --- Compliance & Delivery ---
    (re.compile(r'(delivery\s*(date|time|schedule)|lead\s*time)', re.I),
     dict(id='delivery_lead_time',label='Delivery Lead Time',      type='text',     section='Compliance & Delivery',  required=True,  placeholder='e.g. 4-6 weeks after PO')),
    (re.compile(r'(delivery\s*(term|condition|location|address)|destination|ship\s*to)', re.I),
     dict(id='delivery_address',  label='Delivery Address / Terms',type='textarea', section='Compliance & Delivery',  required=True,  placeholder='Delivery destination and Incoterms')),
    (re.compile(r'\bincoterm', re.I),
     dict(id='incoterms',         label='Incoterms',               type='dropdown', section='Compliance & Delivery',  required=False, options=['EXW', 'FOB', 'CIF', 'DDP', 'DAP', 'CPT'])),
    (re.compile(r'warranty\s*(period|term|duration)?', re.I),
     dict(id='warranty',          label='Warranty Period',         type='text',     section='Compliance & Delivery',  required=False, placeholder='e.g. 12 months')),
    (re.compile(r'(after.?sales?|technical\s*support|maintenance\s*support)', re.I),
     dict(id='after_sales_support',label='After-Sales Support',    type='textarea', section='Compliance & Delivery',  required=False, placeholder='Describe support offered')),
    (re.compile(r'(packing|packaging)\s*(standard|requirement|specification)?', re.I),
     dict(id='packing_standard',  label='Packing Standard',        type='text',     section='Compliance & Delivery',  required=False, placeholder='e.g. Original manufacturer packaging')),
]

# Template of validation constraints with every constraint unset.
# NOTE(review): presumably attached to each generated field; the consumer is
# not visible in this part of the file — confirm before changing the keys.
_DEFAULT_FIELD_VALIDATION = {'min': None, 'max': None, 'pattern': None}

# Canonical section names in display order; the rule-based extractor sorts the
# sections it detects by their position in this list.
_KNOWN_SECTIONS = [
    'Quotation Submission',
    'Vendor Information',
    'Declaration of Conformity',
    'Schedule of Requirements',
    'Technical & Financial Offer',
    'Compliance & Delivery',
]


def _extract_structure_rule_based(full_text: str) -> dict:
    """
    Parse title, sections, and fields from raw PDF text without an LLM.
    Produces a best-effort result; quality depends on how legible the PDF text is.

    Args:
        full_text: Extracted document text. Lines starting with '---' are
            treated as page markers and never chosen as the title.

    Returns:
        A dict with keys:
          'title'       - first line longer than 5 characters among the first
                          15 non-empty, non-marker lines (truncated to 150
                          characters), or 'RFQ Document' if none qualifies.
          'description' - always ''.
          'sections'    - detected section names in _KNOWN_SECTIONS order;
                          'Schedule of Requirements' is always included.
          'fields'      - one entry per matched rule id from _FIELD_RULES, in
                          order of first match.
    """
    lines = [raw.strip() for raw in full_text.splitlines()]

    # Title: first substantive non-page-marker line
    title_candidates = [ln for ln in lines if ln and not ln.startswith('---')]
    title = next(
        (ln[:150] for ln in title_candidates[:15] if len(ln) > 5),
        'RFQ Document',
    )

    # Sections: scan every line for signals
    found_sections = []
    for line in lines:
        for pattern, section_name in _SECTION_SIGNALS:
            if section_name not in found_sections and pattern.search(line):
                found_sections.append(section_name)
                break
    # Add the mandatory section BEFORE sorting so it ends up in its canonical
    # position rather than being tacked on after later sections.
    if 'Schedule of Requirements' not in found_sections:
        found_sections.append('Schedule of Requirements')
    section_order = {name: idx for idx, name in enumerate(_KNOWN_SECTIONS)}
    found_sections.sort(key=lambda name: section_order.get(name, 99))

    # A rule can only ever produce a field if it is required or its section was
    # detected; both facts are fixed by now, so filter once up front instead of
    # re-testing (and re-running the regex) for every window.
    eligible_rules = [
        (pattern, field_def)
        for pattern, field_def in _FIELD_RULES
        if field_def.get('required', False) or field_def['section'] in found_sections
    ]

    # Fields: slide a 3-line window and match rules
    seen_ids = set()
    fields = []
    for start in range(len(lines)):
        window = ' '.join(lines[start:start + 3])
        for pattern, field_def in eligible_rules:
            field_id = field_def['id']
            # Cheap membership test first; skip the regex for emitted fields.
            if field_id in seen_ids or not pattern.search(window):
                continue
            seen_ids.add(field_id)
            fields.append({
                'id':            field_id,
                'label':         field_def['label'],
                'type':          field_def['type'],
                'section':       field_def['section'],
                'required':      field_def.get('required', False),
                'default_value': None,
                'placeholder':   field_def.get('placeholder', ''),
                # Copy so callers mutating the result cannot alter the
                # module-level rule definitions.
                'options':       list(field_def.get('options', [])),
                'validation':    _DEFAULT_FIELD_VALIDATION.copy(),
            })

    return {
        'title':       title,
        'description': '',
        'sections':    found_sections,
        'fields':      fields,
    }


def parse_rfq_pdf(pdf_bytes, use_gemini: bool = True):
    """
    Parse an RFQ PDF into title, description, sections, form fields and line items.

    Text is read from every page when the PDF has at most 10 pages, otherwise
    only from the first 5 and last 5 pages. Each page's text is prefixed with a
    "--- Page N ---" marker line.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        use_gemini: When True, the document structure is requested from the
            Gemini model (first 30000 characters of text). When False, it is
            derived locally by _extract_structure_rule_based.

    Returns:
        A dict with keys "title", "description", "sections", "line_items",
        "fields" and "gemini_used". "gemini_used" echoes the use_gemini flag;
        it does not indicate whether the Gemini call succeeded. If the Gemini
        call or response parsing fails, the title is "Error Parsing" and
        sections/fields are empty.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import logging

    page_chunks = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        total_pages = len(pdf.pages)
        pages_to_read = range(total_pages) if total_pages <= 10 else (
            list(range(5)) + list(range(total_pages - 5, total_pages))
        )
        for p_idx in pages_to_read:
            text = pdf.pages[p_idx].extract_text()
            if text:
                page_chunks.append(f"\n--- Page {p_idx + 1} ---\n{text}")
    full_text = "".join(page_chunks)

    # --- Main document structure extraction ---
    if use_gemini:
        system_prompt = """You are an expert RFQ Parser. Extract data from the RFQ text into the exact JSON structure below.

    JSON OUTPUT STRUCTURE:
    {
      "title": "string",
      "description": "string",
      "sections": [
         "Quotation Submission",
         "Vendor Information",
         "Declaration of Conformity",
         "Schedule of Requirements",
         "Technical & Financial Offer",
         "Compliance & Delivery"
      ],
      "fields": [
        {
          "id": "snake_case_id",
          "label": "Human Readable Label",
          "type": "file" | "text" | "number" | "date" | "dropdown" | "checkbox" | "email" | "phone" | "textarea",
          "section": "Quotation Submission" | "Vendor Information" | "Declaration of Conformity" | "Schedule of Requirements" | "Technical & Financial Offer" | "Compliance & Delivery",
          "required": boolean,
          "default_value": null,
          "placeholder": "Helpful hint",
          "options": ["Option1", "Option2"],
          "validation": {"min": null, "max": null, "pattern": null}
        }
      ]
    }
    """
        try:
            client = _get_genai_client()
            response = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=full_text[:30000],
                config=types.GenerateContentConfig(
                    system_instruction=system_prompt + "\nRETURN JSON ONLY.",
                    response_mime_type="application/json",
                    temperature=0,
                ),
            )
            llm_data = json.loads(response.text)
            # Valid JSON that is not an object (e.g. a list) would otherwise
            # crash on .get() below, outside this try block.
            if not isinstance(llm_data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(llm_data).__name__}"
                )
        except Exception:
            # Deliberate best-effort boundary: keep returning a placeholder
            # structure, but record the cause instead of discarding it.
            logging.getLogger(__name__).exception(
                "Gemini structure extraction failed; returning error placeholder"
            )
            llm_data = {"title": "Error Parsing", "description": "", "sections": [], "fields": []}
    else:
        llm_data = _extract_structure_rule_based(full_text)

    # --- Line item extraction ---
    line_items = extract_line_items(pdf_bytes)

    valid_items = [
        item for item in line_items
        if item.get("description") and not _is_placeholder(item["description"])
    ]

    if not valid_items:
        # use_gemini=False makes this return [] immediately (no API call)
        valid_items = _extract_line_items_from_llm(full_text, use_gemini=use_gemini)

    # `or` rather than a .get() default: the model may emit explicit nulls,
    # which a .get() default would pass through as None.
    return {
        "title": llm_data.get("title") or "RFQ Document",
        "description": llm_data.get("description") or "",
        "sections": llm_data.get("sections") or [],
        "line_items": valid_items,
        "fields": llm_data.get("fields") or [],
        "gemini_used": use_gemini,
    }