import io
import json
import re
from pathlib import Path
from typing import List, Optional, Dict, Any, Tuple
from functools import lru_cache

import pandas as pd
from fastapi import FastAPI, UploadFile, File, HTTPException, Query
from fastapi.responses import JSONResponse, StreamingResponse
import difflib
from fastapi.middleware.cors import CORSMiddleware
import asyncio

# ASGI application object that every route below is registered on.
app = FastAPI(
    title="RFQ ↔ Product Master Matcher (difflib hybrid - Optimized)",
)

# CORS is fully permissive: every origin, method and header is allowed and
# credentials are enabled.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],  # lock this down in prod
)

# ---------- Fixed Tender Template ----------
# Target column names, in output order, that every RFQ is mapped onto.
# The mixed naming ("quotation Price", "dosage form") is intentional: these
# strings are also used as keys into SYNONYMS and in API responses.
TEMPLATE_COLUMNS = [
    "id",
    "tender_id",
    "tender_code",
    "customer_id",
    "customer_name",
    "fy",
    "category",
    "code",
    "current_brand_description",
    "generic_name",
    "annual_volume_qty",
    "quotation Price",
    "dosage form",
]

# ---------- OPTIMIZED:  Compile regex patterns once at module level ----------
# A number (optionally decimal) followed by a dosage unit, e.g. "500 mg".
UNIT_PATTERN_COMPILED = re.compile(
    r'\b\d+(?:\.\d+)?\s*(?:mg|mcg|μg|µg|gm?|kg|iu|i\. u\.|kiu|miu|ml|l|dl|%|w/w|w/v|v/v|microgram|milligram|gram|kilogram|liter|milliliter)\b',
    flags=re.IGNORECASE,
)

# Whole-word dosage-form / packaging terms and their abbreviations.
FORMS_PATTERN_COMPILED = re.compile(
    r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj|syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments|gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers|solution|solutions|ampule|ampules|amp|amps|vial|vials|via|bottle|bottles|bot|bots|sachet|sachets|sac|sacs|suppository|suppositories|sup|sups|patch|patches|pat|pats|lotion|lotions|respule|respules|res|pfs|kit|kits|num|nums|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b',
    flags=re.IGNORECASE,
)

# "<digits> / <digits>" with optional spaces around the slash.
FRACTION_PATTERN = re.compile(r'\d+\s*/\s*\d+')

# Any bare integer or decimal number.
STANDALONE_NUM_PATTERN = re.compile(r'\b\d+(?:\.\d+)?\b')

# Concentration bases such as w/w, w/v and v/v.
WV_PATTERN = re.compile(r'\b[wv]\s*/\s*[wv]\b', flags=re.IGNORECASE)

# One or more whitespace characters (used to collapse runs into one space).
WHITESPACE_PATTERN = re.compile(r'\s+')

# Any character that is not a word character, whitespace, or one of . % / + -
NON_WORD_PATTERN = re.compile(r'[^\w\s. %/+-]')

# ---------- Normalization ----------

# OPTIMIZED: Use lru_cache for frequently repeated strings
@lru_cache(maxsize=10000)
def norm_base(s: str) -> str:
    """Return a lower-cased, punctuation-stripped, single-spaced form of *s*.

    Falsy input is treated as the empty string. "+" and "/" become spaces,
    every character matched by NON_WORD_PATTERN becomes a space, and runs of
    whitespace are collapsed before trimming. Results are memoized.
    """
    text = str(s or "").lower()
    for separator in ("+", "/"):
        text = text.replace(separator, " ")
    text = NON_WORD_PATTERN.sub(" ", text)
    return WHITESPACE_PATTERN.sub(" ", text).strip()


@lru_cache(maxsize=10000)
def extract_numbers(s: str) -> Tuple[str, ...]:
    """Return the sorted, de-duplicated numeric tokens found in *s*.

    The input is normalized with norm_base first. Both number+unit matches
    (UNIT_PATTERN_COMPILED) and bare numbers (STANDALONE_NUM_PATTERN) are
    collected. A tuple is returned so the cached value is immutable.
    """
    normalized = norm_base(s)
    found = {hit.strip() for hit in UNIT_PATTERN_COMPILED.findall(normalized)}
    found.update(hit.strip() for hit in STANDALONE_NUM_PATTERN.findall(normalized))
    return tuple(sorted(found))


@lru_cache(maxsize=10000)
def token_set(s: str) -> Tuple[str, ...]:
    """Return the non-empty space-separated tokens of norm_base(s), in order.

    Despite the name, the result is a tuple (duplicates are kept); callers
    wrap it in ``set`` when they need set semantics.
    """
    return tuple(filter(None, norm_base(s).split(" ")))


# ---------- Synonyms / detection ----------
# Header aliases used for column detection.
#
# Keys are either template column names (see TEMPLATE_COLUMNS) or the
# "__product_master_*__" logical names used for the product master.
# Values are candidate header spellings; they are normalized with norm_base
# before being compared to source headers.
#
# Alias order matters: the lookup functions in this file walk each list in
# order and stop at the first alias that matches, so earlier aliases win.
SYNONYMS:  Dict[str, List[str]] = {
    "generic_name": [
        "generic name", "generic", "molecule", "molecule name", "molecule with strength",
        "composition", "salt", "api", "active ingredient"
    ],
    "current_brand_description": ["brand name", "brand", "trade name", "product", "product name", "item", "item name", "drug name"],
    "annual_volume_qty": ["potential annual volume", "annual volume qty", "annual qty", "annual volume", "qty", "quantity", "rfq qty", "order qty", "excepted annual consumption qty_total", "annual consumption"],
    "quotation Price": ["offer price(unit wise) without taxes in rs", "offer price", "unit price", "quoted rate", "rate", "basic rate", "price per unit", "price"],
    "code": ["item code", "product code", "sku", "catalogue no", "catalog no", "catalog number", "code"],
    "customer_name": ["customer name", "hospital name", "hospital", "buyer", "consignee", "institution", "institute", "organisation", "organization"],
    "fy": ["fy", "financial year", "f.y.", "year"],
    "id": ["s no", "sr no", "serial", "s.no", "line id", "id"],
    "tender_id": ["tender id", "rfq id", "enquiry id"],
    "tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
    "category": ["category", "schedule", "section", "chapter", "dept"],
    "dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
    # Logical names below are only used for the product master file.
    "__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
    "__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
    "__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
}

# ---------- Header mapping ----------

def score_header(tcol: str, scol: str) -> float:
    """Score how well source header *scol* matches target header *tcol*.

    Both names are normalized with norm_base, then three signals are blended:
    word-level Jaccard overlap (weight 0.60), substring containment in either
    direction (0.25) and difflib's character ratio (0.15). Result is in [0, 1].
    """
    target, source = norm_base(tcol), norm_base(scol)
    target_words = set(target.split())
    source_words = set(source.split())

    if target_words and source_words:
        jaccard = len(target_words & source_words) / len(target_words | source_words)
    else:
        jaccard = 0.0

    containment = 1.0 if (target in source or source in target) else 0.0
    char_ratio = difflib.SequenceMatcher(None, target, source).ratio()
    return 0.60*jaccard + 0.25*containment + 0.15*char_ratio


def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
    """Map each target column to the best-matching source column, or None.

    For every target column three strategies are tried in order:
      1. a SYNONYMS alias equals a normalized source header;
      2. an alias contains, or is contained in, a normalized source header;
      3. the highest score_header() value, accepted only when >= 0.35.

    Several target columns may end up mapped to the same source column.
    """
    sources = [str(c) for c in src_cols]
    by_norm = {norm_base(c): c for c in sources}
    mapping: Dict[str, Optional[str]] = {}

    for target in target_cols:
        aliases = [norm_base(alias) for alias in SYNONYMS.get(target, [])]

        # 1) exact synonym: first alias (in list order) that is a source header
        exact = next((by_norm[a] for a in aliases if a in by_norm), None)
        if exact is not None:
            mapping[target] = exact
            continue

        # 2) contains any synonym: first alias with a hit, first source in order
        partial = next(
            (original
             for a in aliases
             for normalized, original in by_norm.items()
             if a in normalized or normalized in a),
            None,
        )
        if partial:
            mapping[target] = partial
            continue

        # 3) best score; the earliest source wins ties
        scored = [(score_header(target, col), col) for col in sources]
        top_score, top_col = max(scored, key=lambda pair: pair[0], default=(-1.0, None))
        mapping[target] = top_col if top_score >= 0.35 else None

    return mapping


def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
    """Find the column of *df* that plays the role *logical_name*.

    Aliases come from SYNONYMS[logical_name]. An exact normalized match is
    preferred, then a substring match in either direction, and finally the
    best score_header() value provided it reaches 0.35. Returns the original
    column label, or None when nothing qualifies.
    """
    columns = [str(c) for c in df.columns]
    by_norm = {norm_base(c): c for c in columns}
    aliases = [norm_base(alias) for alias in SYNONYMS.get(logical_name, [])]

    # exact first
    for alias in aliases:
        if alias in by_norm:
            return by_norm[alias]

    # contains next
    for alias in aliases:
        for normalized, original in by_norm.items():
            if alias in normalized or normalized in alias:
                return original

    # fallback: score (earliest column wins ties)
    scored = [(score_header(logical_name, col), col) for col in columns]
    top_score, top_col = max(scored, key=lambda pair: pair[0], default=(-1.0, None))
    return top_col if top_score >= 0.35 else None

# ---------- File reading ----------

def guess_delimiter(sample: str) -> str:
    """Guess the field delimiter of a delimited-text sample.

    Candidates are tab, semicolon, pipe and comma. The candidate occurring
    most often on the first non-blank line wins; if that line contains none
    of them, the whole sample is counted instead. Ties are resolved in the
    candidate order above, and "," is returned when no candidate is present.

    Counting (rather than returning the first candidate that merely appears)
    keeps a comma-separated file from being split on a stray ";" or "|"
    inside one of its fields.
    """
    candidates = ("\t", ";", "|", ",")
    first_line = next((line for line in sample.splitlines() if line.strip()), "")

    for text in (first_line, sample):
        # max() returns the first maximal item, so candidate order breaks ties.
        best = max(candidates, key=text.count)
        if text.count(best) > 0:
            return best
    return ","


def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without columns whose label starts with "Unnamed"."""
    named = []
    for label in df.columns:
        if str(label).startswith("Unnamed"):
            continue
        named.append(label)
    return df.loc[:, named]


def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Convert every column label of *df* to ``str`` in place and return *df*."""
    df.columns = list(map(str, df.columns))
    return df


def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
    """Pick the sheet and header row that best match the tender template.

    Every sheet is read once per candidate header row (0..max_header_rows
    inclusive). Each parse is scored by how many TEMPLATE_COLUMNS
    map_headers_auto() can map; the first parse with the highest score wins.

    Args:
        xl: An open workbook.
        max_header_rows: Highest zero-based row index tried as the header.

    Returns:
        A dict with keys "score", "df", "sheet", "header" and "mapping"
        describing the winning parse.

    Raises:
        ValueError: If no sheet/header combination yields a non-empty table.
    """
    best = {"score": -1, "df": None, "sheet": None,
            "header": None, "mapping": None}
    for sheet in xl.sheet_names:
        for header in range(max_header_rows + 1):
            try:
                df = pd.read_excel(xl, sheet_name=sheet, header=header)
                df = drop_unnamed_columns(df)
                if df.dropna(how="all").empty:
                    continue
                df = ensure_str_columns(df)
                m = map_headers_auto(df.columns.tolist(), TEMPLATE_COLUMNS)
                score = sum(1 for v in m.values() if v is not None)
                if score > best["score"]:
                    best = {"score": score, "df": df, "sheet": sheet,
                            "header": header, "mapping": m}
            except Exception:
                # Best effort: a header row that cannot be parsed is simply
                # skipped. Only Exception is caught so KeyboardInterrupt and
                # SystemExit still stop this potentially long scan.
                continue
    if best["df"] is None:
        raise ValueError("No readable tables found in the Excel workbook.")
    return best


def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
    """Parse an uploaded file into a DataFrame, dispatching on its extension.

    * .xlsx / .xls / .xlsm / .ods: the best sheet and header row are chosen
      with choose_best_sheet_and_header().
    * .csv / .tsv: decoded as UTF-8 (undecodable bytes dropped); the
      delimiter is guessed from the first 4096 characters.
    * .json: either a list of objects, or an object with a "data" list.

    Args:
        filename: Name of the upload; only its suffix is used. A missing
            name is treated as having no extension.
        data: Raw file content.

    Raises:
        ValueError: For an unsupported extension or an unexpected JSON shape.
    """
    # Uploads may arrive without a filename; treat that as "no extension"
    # so the caller gets the ValueError below instead of a TypeError.
    ext = Path(filename or "").suffix.lower()

    if ext in (".xlsx", ".xls", ".xlsm", ".ods"):
        xl = pd.ExcelFile(io.BytesIO(data))
        best = choose_best_sheet_and_header(xl)
        return best["df"]

    if ext in (".csv", ".tsv"):
        text = data.decode("utf-8", errors="ignore")
        delim = guess_delimiter(text[:4096])
        return pd.read_csv(io.StringIO(text), sep=delim, engine="python")

    if ext == ".json":
        js = json.loads(data.decode("utf-8", errors="ignore"))
        if isinstance(js, list):
            return pd.DataFrame(js)
        if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
            return pd.json_normalize(js["data"])
        raise ValueError(
            "Product master JSON must be a list of objects or an object with a 'data' array.")

    raise ValueError(f"Unsupported file type: {ext}")


def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Optional[str]]]:
    """Project an RFQ table onto TEMPLATE_COLUMNS.

    "Unnamed" columns are dropped and labels are stringified before headers
    are mapped with map_headers_auto(). The returned frame always contains
    every template column; unmapped ones are filled with pd.NA.

    Returns:
        (mapped_frame, mapping) where mapping is template column -> source
        column (or None).
    """
    cleaned = ensure_str_columns(drop_unnamed_columns(src_df))
    header_map = map_headers_auto(cleaned.columns.tolist(), TEMPLATE_COLUMNS)

    mapped = pd.DataFrame(index=cleaned.index)
    for template_col in TEMPLATE_COLUMNS:
        source_col = header_map.get(template_col)
        if source_col:
            mapped[template_col] = cleaned[str(source_col)]
        else:
            mapped[template_col] = pd.Series(
                [pd.NA] * len(cleaned), index=cleaned.index)
    return mapped, header_map

# ---------- OPTIMIZED: Molecule extraction with caching ----------

@lru_cache(maxsize=10000)
def extract_molecule_base(s: str) -> str:
    """Reduce a product description to its molecule name.

    After norm_base(), the following are blanked out in this order: dosage
    forms, number+unit pairs, fractions, bare numbers, and w/w-style ratios.
    Whitespace is then collapsed and trimmed.
    """
    # NOTE(review): norm_base() already turns "/" into a space, so the
    # fraction and w/v patterns presumably never match here — confirm.
    removal_order = (
        FORMS_PATTERN_COMPILED,   # dosage forms first
        UNIT_PATTERN_COMPILED,    # number + unit
        FRACTION_PATTERN,         # fractions and ratios
        STANDALONE_NUM_PATTERN,   # remaining bare numbers
        WV_PATTERN,               # w/w, w/v, v/v
    )
    text = norm_base(s)
    for pattern in removal_order:
        text = pattern.sub(' ', text)
    return WHITESPACE_PATTERN.sub(' ', text).strip()


# ---------- OPTIMIZED: Pre-computed product master ----------

class PrecomputedProductMaster:
    """Product master rows plus the derived values needed for matching.

    Rows whose molecule column is missing are discarded. All attributes are
    parallel lists: position ``i`` in each refers to the same row.
    """

    def __init__(self, pm_df: pd.DataFrame, molecule_col: str,
                 brand_id_col: Optional[str], brand_name_col: Optional[str]):
        rows = pm_df.dropna(subset=[molecule_col]).copy()
        row_count = len(rows)

        def as_text_or_blank(column: Optional[str]) -> list:
            # Stringified column values, or a list of None when the column
            # was not detected / does not exist.
            if column and column in rows.columns:
                return rows[column].astype(str).tolist()
            return [None] * row_count

        # Raw values as they appear in the product master.
        self.molecule_col = molecule_col
        self.mol_raw = rows[molecule_col].astype(str).tolist()
        self.brand_ids = as_text_or_blank(brand_id_col)
        self.brand_names = as_text_or_blank(brand_name_col)
        self.idxs = rows.index.tolist()  # original DataFrame index labels

        # Derived forms, computed once so matching never recomputes them.
        print(f"Pre-computing {len(self.mol_raw)} product master entries...")
        self.mol_norm = list(map(norm_base, self.mol_raw))
        self.mol_base = list(map(extract_molecule_base, self.mol_raw))
        self.mol_tokens = [set(token_set(base)) for base in self.mol_base]
        self.mol_numbers = [set(extract_numbers(raw)) for raw in self.mol_raw]
        print("Pre-computation complete!")

    def __len__(self):
        """Number of usable product master rows."""
        return len(self.mol_raw)


# ---------- OPTIMIZED: Fast pre-filter ----------

def quick_filter(g_tokens: set, pm_tokens: set, threshold: float = 0.15) -> bool:
    """Cheap candidate test: is the Jaccard overlap of the token sets >= threshold?

    Returns False when either set is empty.
    """
    if g_tokens and pm_tokens:
        shared = len(g_tokens & pm_tokens)
        combined = len(g_tokens | pm_tokens)
        return shared / combined >= threshold
    return False


# ---------- OPTIMIZED: Hybrid similarity with pre-computed data ----------

def hybrid_similarity_optimized(
    g_norm: str, g_base: str, g_tokens: set, g_numbers: set,
    pm_norm: str, pm_base: str, pm_tokens: set, pm_numbers: set
) -> Dict[str, float]:
    """Blend several similarity signals into one 0-100 score.

    The ``g_*`` arguments describe the RFQ generic name and the ``pm_*``
    arguments the product master entry; all are expected to be precomputed.

    Returns a dict with the rounded components:
      diff     - difflib ratio of the normalized strings
      jacc     - Jaccard overlap of the token sets
      num      - 100 when both number sets are non-empty and equal, else 0
      mol_base - similarity of the molecule bases
      score    - weighted combination of the above
    Identical normalized strings short-circuit to 100 for every component.
    """
    if g_norm == pm_norm:
        return dict.fromkeys(("diff", "jacc", "num", "mol_base", "score"), 100.0)

    # Character-level similarity of the full normalized text.
    diff = difflib.SequenceMatcher(None, g_norm, pm_norm).ratio() * 100.0

    # Token overlap.
    jacc = 0.0
    if g_tokens and pm_tokens:
        jacc = len(g_tokens & pm_tokens) / len(g_tokens | pm_tokens) * 100.0

    # Numbers only ever add a bonus; a mismatch is not penalized further.
    num_match = 0.0
    if g_numbers and pm_numbers and g_numbers == pm_numbers:
        num_match = 100.0

    # Molecule base: exact match, otherwise 40% characters + 60% word overlap.
    mol_base_score = 0.0
    if g_base and pm_base:
        if g_base == pm_base:
            mol_base_score = 100.0
        else:
            mol_base_score = difflib.SequenceMatcher(None, g_base, pm_base).ratio() * 100.0
            g_words, pm_words = set(g_base.split()), set(pm_base.split())
            if g_words and pm_words:
                word_jacc = len(g_words & pm_words) / len(g_words | pm_words) * 100.0
                mol_base_score = 0.40 * mol_base_score + 0.60 * word_jacc

    # A near-perfect molecule match gets a heavier molecule weight.
    if mol_base_score >= 95:
        w_base, w_diff, w_jacc = 0.60, 0.20, 0.15
    else:
        w_base, w_diff, w_jacc = 0.50, 0.25, 0.20
    score = w_base * mol_base_score + w_diff * diff + w_jacc * jacc + 0.05 * num_match

    components = (
        ("diff", diff),
        ("jacc", jacc),
        ("num", num_match),
        ("mol_base", mol_base_score),
        ("score", score),
    )
    return {name: round(value, 2) for name, value in components}


# ---------- OPTIMIZED: Batch matching ----------

def match_generic_to_product_master_optimized(
    generic_list: List[str],
    pm: PrecomputedProductMaster,
    min_score: float = 60.0,
    return_all: bool = False,
    batch_size: int = 100
) -> List[Dict[str, Any]]:
    """Find the single best product master entry for each generic name.

    Args:
        generic_list: Generic names to match; blank entries are skipped.
        pm: Precomputed product master.
        min_score: Minimum score for a result (ignored when return_all).
        return_all: When True, keep every best match regardless of score and
            attach the score breakdown under "_debug".
        batch_size: Number of names per batch; batches only affect progress
            logging.

    Returns:
        One dict per matched name. "row_index" is the position of the name
        within *generic_list*. Names for which no product master entry
        passes quick_filter() produce no result.
    """
    results: List[Dict[str, Any]] = []
    total = len(generic_list)

    for batch_start in range(0, total, batch_size):
        batch_end = min(batch_start + batch_size, total)

        if batch_start % 500 == 0:
            print(f"Processing RFQ rows {batch_start}-{batch_end} of {total}...")

        rows = enumerate(generic_list[batch_start:batch_end], start=batch_start)
        for row_index, raw_generic in rows:
            g_str = str(raw_generic or "").strip()
            if not g_str:
                continue

            # Derived forms of this generic name.
            g_norm = norm_base(g_str)
            g_base = extract_molecule_base(g_str)
            g_tokens = set(token_set(g_base))
            g_numbers = set(extract_numbers(g_str))

            best_score, best_pos, best_parts = -1.0, None, None
            candidates = zip(pm.mol_norm, pm.mol_base, pm.mol_tokens, pm.mol_numbers)
            for pos, (c_norm, c_base, c_tokens, c_numbers) in enumerate(candidates):
                # Skip obvious non-matches before the expensive comparison.
                if not quick_filter(g_tokens, c_tokens):
                    continue

                parts = hybrid_similarity_optimized(
                    g_norm, g_base, g_tokens, g_numbers,
                    c_norm, c_base, c_tokens, c_numbers
                )
                # Strict ">" keeps the earliest entry on ties.
                if parts["score"] > best_score:
                    best_score, best_pos, best_parts = parts["score"], pos, parts

            if best_pos is None:
                continue

            brand_name = pm.brand_names[best_pos]
            item = {
                "row_index": row_index,
                "generic_name": g_str,
                "matched_name": pm.mol_raw[best_pos],
                "matched_brand_name": brand_name,
                "match_percent": round(best_score, 2),
                "brand_id": pm.brand_ids[best_pos],
                "brand_name": brand_name,
                "master_row_index": int(pm.idxs[best_pos]),
            }

            if return_all:
                item["_debug"] = best_parts
            elif best_score < min_score:
                continue
            results.append(item)

    return results


# ---------- OPTIMIZED: Grouped matcher ----------

def match_generic_to_product_master_grouped_for_row_optimized(
    generic_value: str,
    pm: PrecomputedProductMaster,
    min_score: float = 60.0,
    top_n: int = 3
) -> List[Dict[str, Any]]:
    """Return up to *top_n* product master matches for one generic name.

    Every entry that passes quick_filter() and scores at least *min_score*
    is collected; results are ordered by descending score. A blank
    *generic_value* yields an empty list.
    """
    g_str = str(generic_value or "").strip()
    if not g_str:
        return []

    # Derived forms of this generic name.
    g_norm = norm_base(g_str)
    g_base = extract_molecule_base(g_str)
    g_tokens = set(token_set(g_base))
    g_numbers = set(extract_numbers(g_str))

    hits: List[Dict[str, Any]] = []
    candidates = zip(pm.mol_norm, pm.mol_base, pm.mol_tokens, pm.mol_numbers)
    for pos, (c_norm, c_base, c_tokens, c_numbers) in enumerate(candidates):
        if not quick_filter(g_tokens, c_tokens):
            continue

        parts = hybrid_similarity_optimized(
            g_norm, g_base, g_tokens, g_numbers,
            c_norm, c_base, c_tokens, c_numbers
        )
        if parts["score"] < min_score:
            continue

        hits.append({
            "matched_name": pm.mol_raw[pos],
            "brand_name": pm.brand_names[pos],
            "brand_id": pm.brand_ids[pos],
            "match_percent": round(parts["score"], 2),
            "_debug": parts
        })

    # Stable sort: equal scores keep product master order.
    hits.sort(key=lambda hit: hit["match_percent"], reverse=True)
    return hits[:top_n]


# ---------- OPTIMIZED Endpoints ----------

@app.post("/match-difflib")
async def match_with_difflib(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    min_score: float = Query(60.0, description="Minimum composite score (0-100)")
):
    """Return the best product master match for each non-empty RFQ generic name.

    Only matches scoring at least *min_score* are returned. "row_index" in
    each match is the position within the list of non-empty generic names,
    not the original RFQ row number.

    Raises:
        HTTPException 400: No RFQ header could be mapped to "generic_name",
            or no molecule column could be detected in the product master.
        HTTPException 500: Any other failure; the detail is the error text.
    """
    try:
        # RFQ
        rfq_bytes = await rfq_file.read()
        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
        mapped, mapping = build_mapped_rfq(rfq_df)

        # build_mapped_rfq() always creates every template column (filled
        # with NA when unmapped), so the header mapping - not the frame's
        # columns - tells us whether a generic-name column was found.
        if not mapping.get("generic_name"):
            raise HTTPException(
                status_code=400, detail="No 'generic_name' column found after mapping RFQ.")

        gen_series = mapped["generic_name"]
        gen_text = gen_series.astype(str)
        # Drop real NA values, blanks, and NA values stringified as "<NA>".
        nonempty_mask = (
            gen_series.notna()
            & gen_text.str.strip().ne("")
            & gen_text.str.lower().ne("<na>")
        )
        generic_list = gen_text[nonempty_mask].tolist()

        # Product master
        pm_bytes = await product_master_json.read()
        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")

        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        # Normalize the product master once, then match every generic name.
        pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)

        matches = match_generic_to_product_master_optimized(
            generic_list, pm,
            min_score=min_score,
            return_all=False
        )

        return JSONResponse({
            "rfq_rows": int(nonempty_mask.sum()),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "product_master_size": len(pm),
            "matches_returned": len(matches),
            "data": matches
        })
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/test-extract-base")
def test_extract_base(text: str):
    """Show each normalization stage for *text* (diagnostic endpoint)."""
    response = {"original": text}
    response["normalized"] = norm_base(text)
    response["molecule_base"] = extract_molecule_base(text)
    response["numbers_extracted"] = list(extract_numbers(text))
    response["tokens"] = list(token_set(text))
    return response


@app.post("/match-difflib-debug")
async def match_with_difflib_debug(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    sample: int = Query(5, ge=1, le=200),
    min_score: float = Query(60.0),
    sample_contains: str = Query("", description="Filter RFQ rows by substring (case-insensitive)")
):
    """
    Diagnostics: return the BEST match (with score breakdown) for the first
    *sample* RFQ generic names, optionally restricted to names containing
    *sample_contains*.

    Matches are returned regardless of *min_score* because the matcher is
    called with return_all=True.

    Raises:
        HTTPException 400: No molecule column detected in the product master.
        HTTPException 500: Any other failure; the detail is the error text.
    """
    try:
        # RFQ
        rfq_bytes = await rfq_file.read()
        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
        mapped, mapping = build_mapped_rfq(rfq_df)

        gen_series = mapped.get("generic_name", pd.Series([], dtype=object))
        gen_text = gen_series.astype(str)
        # Drop real NA values, blanks, and NA values stringified as "<NA>".
        nonempty_mask = (
            gen_series.notna()
            & gen_text.str.strip().ne("")
            & gen_text.str.lower().ne("<na>")
        )
        generic_list_all = gen_text[nonempty_mask]

        if sample_contains:
            # regex=False: the parameter is documented as a plain substring,
            # so characters such as "(" or "+" must not be read as a regex.
            flt = generic_list_all.str.contains(
                sample_contains, case=False, na=False, regex=False)
            generic_list = generic_list_all[flt].tolist()[:sample]
        else:
            generic_list = generic_list_all.tolist()[:sample]

        # Product master
        pm_bytes = await product_master_json.read()
        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")

        # Same guard as the other matching endpoints: without a molecule
        # column the pre-computation below cannot run.
        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)

        demo_matches = match_generic_to_product_master_optimized(
            generic_list, pm,
            min_score=min_score,
            return_all=True
        )

        return JSONResponse({
            "rfq_detected_headers": list(map(str, rfq_df.columns)),
            "template_mapping": mapping,
            "nonempty_generic_count": int(nonempty_mask.sum()),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "product_master_size": len(pm),
            "filter": sample_contains or None,
            "examples": demo_matches
        })
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/match-difflib-grouped")
async def match_with_difflib_grouped(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    min_score: float = Query(60.0, description="Minimum score to include"),
    top_n: int = Query(3, description="Max number of matches per RFQ row")
):
    """
    Return every RFQ row (mapped onto the template columns) together with a
    "matches" list holding up to *top_n* product master entries that score
    at least *min_score*. Rows without a generic name get an empty list.
    """
    try:
        # --- RFQ side ---
        rfq_bytes = await rfq_file.read()
        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
        mapped, mapping = build_mapped_rfq(rfq_df)

        # Make sure every template column exists before rows are read.
        missing_cols = [c for c in TEMPLATE_COLUMNS if c not in mapped.columns]
        for col in missing_cols:
            mapped[col] = pd.NA

        # --- Product master side ---
        pm_bytes = await product_master_json.read()
        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")

        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        detected = {
            "molecule_col": molecule_col,
            "brand_id_col": brand_id_col,
            "brand_name_col": brand_name_col
        }

        # Normalize the product master once, up front.
        pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)

        data_out = []
        rows_matched = 0
        total = len(mapped)

        print(f"Processing {total} RFQ rows against {len(pm)} products...")

        for idx, row in mapped.iterrows():
            if idx % 100 == 0:
                print(f"Processing RFQ row {idx}/{total}...")

            # Stringify each template cell; missing values become None.
            rfq_record = {}
            for col in TEMPLATE_COLUMNS:
                cell = row.get(col)
                rfq_record[col] = None if pd.isna(cell) else str(cell)

            matches = match_generic_to_product_master_grouped_for_row_optimized(
                generic_value=rfq_record.get("generic_name") or "",
                pm=pm,
                min_score=min_score,
                top_n=top_n
            )

            if matches:
                rows_matched += 1

            data_out.append({
                "row_index": int(idx),
                "rfq": rfq_record,
                "matches": matches
            })

        print(f"Completed!  {rows_matched}/{total} rows had matches.")

        return {
            "rfq_rows": int(len(mapped)),
            "product_master_detected": detected,
            "product_master_size": len(pm),
            "rows_with_matches": rows_matched,
            "data": data_out
        }
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/debug-score")
def debug_score(a: str, b: str):
    """Compare two strings and expose every intermediate value of the score."""

    def features(text: str):
        # (normalized, molecule base, base tokens, numbers) for one string.
        base = extract_molecule_base(text)
        return norm_base(text), base, set(token_set(base)), set(extract_numbers(text))

    a_norm, a_base, a_tokens, a_numbers = features(a)
    b_norm, b_base, b_tokens, b_numbers = features(b)

    similarity = hybrid_similarity_optimized(
        a_norm, a_base, a_tokens, a_numbers,
        b_norm, b_base, b_tokens, b_numbers
    )

    return {
        "a": a,
        "b": b,
        "a_normalized": a_norm,
        "b_normalized": b_norm,
        "a_base": a_base,
        "b_base": b_base,
        "a_tokens": list(a_tokens),
        "b_tokens": list(b_tokens),
        "quick_filter_pass": quick_filter(a_tokens, b_tokens),
        "similarity": similarity
    }


@app.get("/")
def root():
    """Status endpoint that also lists the available routes."""
    endpoints = {
        "/match-difflib": "Standard matching",
        "/match-difflib-grouped": "Grouped matching (recommended)",
        "/match-difflib-debug": "Debug mode",
        "/debug-score": "Test two strings",
        "/test-extract-base": "Test molecule extraction"
    }
    return {
        "status": "ok",
        "message": "OPTIMIZED version with pre-computation and batching",
        "endpoints": endpoints
    }


if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    # Keep-alive timeout raised to 600 seconds (10 minutes).
    # NOTE(review): presumably meant to accommodate long matching requests;
    # confirm this setting covers request duration and not only idle
    # keep-alive connections.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        timeout_keep_alive=600,
    )