Spaces:

aniket9909
/

tenderapi

Sleeping

App Files Files Community

aniket9909 committed on Nov 29, 2025

Commit

f14504f

verified ·

1 Parent(s): 197636a

Create app.py

Browse files

Files changed (1) hide show

app.py +566 -0

app.py ADDED Viewed

	@@ -0,0 +1,566 @@

+import io
+import json
+import re
+from pathlib import Path
+from typing import List, Optional, Dict, Any, Tuple
+import pandas as pd
+from fastapi import FastAPI, UploadFile, File, HTTPException, Query
+from fastapi.responses import JSONResponse
+import difflib
+from fastapi.middleware.cors import CORSMiddleware
+app = FastAPI(title="RFQ ↔ Product Master Matcher (difflib hybrid)")
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # lock this down in prod
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ---------- Fixed Tender Template ----------
+TEMPLATE_COLUMNS = [
+    "id", "tender_id", "tender_code", "customer_id", "customer_name", "fy", "category", "code",
+    "current_brand_description", "generic_name", "annual_volume_qty", "quotation Price", "dosage form"
+]
+# ---------- Normalization ----------
+# Regex alternation (a single capturing group) of the dosage-unit suffixes that
+# extract_numbers() looks for directly after a number.
+UNIT_PATTERN = r"(mg|mcg|g|iu|ml|%)"
+def norm_base(s: str) -> str:
+    s = str(s or "")
+    s = s.lower()
+    s = s.replace("+", " ").replace("/", " ")
+    # keep word chars, digits, ., %, /, +, -
+    s = re.sub(r"[^\w\s.%/+-]", " ", s)
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+def extract_numbers(s: str) -> List[str]:
+    s2 = norm_base(s)
+    num_unit = re.findall(rf"\b\d+(?:\.\d+)?\s*{UNIT_PATTERN}\b", s2)
+    nums = re.findall(r"\b\d+(?:\.\d+)?\b", s2)
+    return sorted(set([x.strip() for x in num_unit + nums]))
+def token_set(s: str) -> List[str]:
+    return [t for t in norm_base(s).split(" ") if t]
+# ---------- Synonyms / detection ----------
+# Header aliases per logical column.  Keys named after TEMPLATE_COLUMNS are
+# looked up by map_headers_auto(); the "__product_master_*__" keys are the
+# logical names the endpoints pass to detect_single_column().
+# Alias order matters: aliases are tried in list order and the first hit wins.
+SYNONYMS: Dict[str, List[str]] = {
+    # RFQ → template mapping
+    "generic_name": [
+        "generic name", "generic", "molecule", "molecule name", "molecule with strength",
+        "composition", "salt", "api", "active ingredient"
+    ],
+    "current_brand_description": ["brand name", "brand", "trade name", "product", "product name", "item", "item name", "drug name"],
+    "annual_volume_qty": ["potential annual volume", "annual volume qty", "annual qty", "annual volume", "qty", "quantity", "rfq qty", "order qty", "excepted annual consumption qty_total", "annual consumption"],
+    "quotation Price": ["offer price(unit wise) without taxes in rs", "offer price", "unit price", "quoted rate", "rate", "basic rate", "price per unit", "price"],
+    "code": ["item code", "product code", "sku", "catalogue no", "catalog no", "catalog number", "code"],
+    "customer_name": ["customer name", "hospital name", "hospital", "buyer", "consignee", "institution", "institute", "organisation", "organization"],
+    "fy": ["fy", "financial year", "f.y.", "year"],
+    "id": ["s no", "sr no", "serial", "s.no", "line id", "id"],
+    "tender_id": ["tender id", "rfq id", "enquiry id"],
+    "tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
+    "category": ["category", "schedule", "section", "chapter", "dept"],
+    "dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
+    # Product master detection (support your original schema)
+    "__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
+    "__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
+    "__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
+}
+# ---------- Header mapping ----------
+def score_header(tcol: str, scol: str) -> float:
+    tn, sn = norm_base(tcol), norm_base(scol)
+    tset, sset = set(tn.split()), set(sn.split())
+    jacc = (len(tset & sset) / len(tset | sset)) if (tset and sset) else 0.0
+    contains = 1.0 if (tn in sn or sn in tn) else 0.0
+    fuzzy = difflib.SequenceMatcher(None, tn, sn).ratio()
+    return 0.60*jacc + 0.25*contains + 0.15*fuzzy
+def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
+    src_cols = [str(c) for c in src_cols]
+    src_norm_map = {norm_base(c): c for c in src_cols}
+    mapping: Dict[str, Optional[str]] = {}
+    for tcol in target_cols:
+        # 1) exact synonym
+        for alias in SYNONYMS.get(tcol, []):
+            n = norm_base(alias)
+            if n in src_norm_map:
+                mapping[tcol] = src_norm_map[n]
+                break
+        else:
+            # 2) contains any synonym
+            hit = None
+            for alias in SYNONYMS.get(tcol, []):
+                n = norm_base(alias)
+                contain = [orig for nn, orig in src_norm_map.items()
+                           if (n in nn or nn in n)]
+                if contain:
+                    hit = contain[0]
+                    break
+            if hit:
+                mapping[tcol] = hit
+            else:
+                # 3) best score
+                best_src, best_score = None, -1.0
+                for scol in src_cols:
+                    sc = score_header(tcol, scol)
+                    if sc > best_score:
+                        best_score, best_src = sc, scol
+                mapping[tcol] = best_src if best_score >= 0.35 else None
+    return mapping
+def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
+    cols = [str(c) for c in df.columns]
+    norm_map = {norm_base(c): c for c in cols}
+    # exact first
+    for alias in SYNONYMS.get(logical_name, []):
+        n = norm_base(alias)
+        if n in norm_map:
+            return norm_map[n]
+    # contains next
+    for alias in SYNONYMS.get(logical_name, []):
+        n = norm_base(alias)
+        for nn, orig in norm_map.items():
+            if n in nn or nn in n:
+                return orig
+    # fallback: score
+    best_col, best_score = None, -1.0
+    for c in cols:
+        sc = score_header(logical_name, c)
+        if sc > best_score:
+            best_score, best_col = sc, c
+    return best_col if best_score >= 0.35 else None
+# ---------- File reading ----------
+def guess_delimiter(sample: str) -> str:
+    for d in ["\t", ";", "|", ","]:
+        if d in sample:
+            return d if d != "\t" else "\t"
+    return ","
+def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
+    keep = [c for c in df.columns if not str(c).startswith("Unnamed")]
+    return df.loc[:, keep]
+def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
+    df.columns = [str(c) for c in df.columns]
+    return df
+def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
+    best = {"score": -1, "df": None, "sheet": None,
+            "header": None, "mapping": None}
+    for sheet in xl.sheet_names:
+        for header in range(max_header_rows + 1):
+            try:
+                df = pd.read_excel(xl, sheet_name=sheet, header=header)
+                df = drop_unnamed_columns(df)
+                if df.dropna(how="all").empty:
+                    continue
+                df = ensure_str_columns(df)
+                m = map_headers_auto(df.columns.tolist(), TEMPLATE_COLUMNS)
+                score = sum(1 for v in m.values() if v is not None)
+                if score > best["score"]:
+                    best = {"score": score, "df": df, "sheet": sheet,
+                            "header": header, "mapping": m}
+            except:
+                continue
+    if best["df"] is None:
+        raise ValueError("No readable tables found in the Excel workbook.")
+    return best
+def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
+    ext = Path(filename).suffix.lower()
+    if ext in [".xlsx", ".xls", ".xlsm", ".ods"]:
+        xl = pd.ExcelFile(io.BytesIO(data))
+        best = choose_best_sheet_and_header(xl)
+        return best["df"]
+    if ext in [".csv", ".tsv"]:
+        text = data.decode("utf-8", errors="ignore")
+        delim = guess_delimiter(text[:4096])
+        return pd.read_csv(io.StringIO(text), sep=delim, engine="python")
+    if ext == ".json":
+        js = json.loads(data.decode("utf-8", errors="ignore"))
+        # Accept both raw list and your original object with "data"
+        if isinstance(js, list):
+            return pd.DataFrame(js)
+        if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
+            return pd.json_normalize(js["data"])
+        raise ValueError(
+            "Product master JSON must be a list of objects or an object with a 'data' array.")
+    raise ValueError(f"Unsupported file type: {ext}")
+def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Optional[str]]]:
+    src_df = ensure_str_columns(drop_unnamed_columns(src_df))
+    mapping = map_headers_auto(src_df.columns.tolist(), TEMPLATE_COLUMNS)
+    out = pd.DataFrame(index=src_df.index)
+    for tcol in TEMPLATE_COLUMNS:
+        src = mapping.get(tcol)
+        out[tcol] = src_df[str(src)] if src else pd.Series(
+            [pd.NA]*len(src_df), index=src_df.index)
+    return out, mapping
+# ---------- Hybrid difflib score ----------
+def hybrid_similarity(a: str, b: str) -> Dict[str, float]:
+    a_n, b_n = norm_base(a), norm_base(b)
+    if a_n == b_n:
+        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "score": 100.0}
+    diff = difflib.SequenceMatcher(None, a_n, b_n).ratio() * 100.0
+    aset, bset = set(token_set(a)), set(token_set(b))
+    jacc = (len(aset & bset) / len(aset | bset)
+            * 100.0) if (aset and bset) else 0.0
+    anums, bnums = extract_numbers(a), extract_numbers(b)
+    num_bonus = 100.0 if (anums and bnums and set(anums)
+                          == set(bnums)) else 0.0
+    score = 0.60*diff + 0.30*jacc + 0.10*num_bonus
+    return {
+        "diff": round(diff, 2),
+        "jacc": round(jacc, 2),
+        "num": 100.0 if num_bonus else 0.0,
+        "score": round(score, 2)
+    }
+def match_generic_to_product_master(
+    generic_list: List[str],
+    pm_df: pd.DataFrame,
+    molecule_col: str,
+    brand_id_col: Optional[str],
+    brand_name_col: Optional[str],
+    min_score: float = 80.0,
+    return_all: bool = False
+) -> List[Dict[str, Any]]:
+    subset = pm_df.dropna(subset=[molecule_col]).copy()
+    mol_raw = subset[molecule_col].astype(str).tolist()
+    # brand id/name fallbacks are handled by detect function below; arrays may be None
+    brand_ids = subset[brand_id_col].astype(str).tolist(
+    ) if brand_id_col and brand_id_col in subset.columns else [None]*len(subset)
+    # brand name: prefer brand_name; else brand; else product (detect_single_column will choose)
+    brand_names = subset[brand_name_col].astype(str).tolist(
+    ) if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
+    idxs = subset.index.tolist()
+    results = []
+    for i, g in enumerate(generic_list):
+        g_str = str(g or "").strip()
+        if not g_str:
+            continue
+        best_score, best_pos, best_parts = -1.0, None, None
+        for pos, cand in enumerate(mol_raw):
+            parts = hybrid_similarity(g_str, cand)
+            if parts["score"] > best_score:
+                best_score, best_pos, best_parts = parts["score"], pos, parts
+        if best_pos is None:
+            continue
+        item = {
+            "row_index": i,
+            "generic_name": g_str,
+            "matched_name": mol_raw[best_pos],
+            "match_percent": round(best_score, 2),
+            "brand_id": brand_ids[best_pos],
+            "brand_name": brand_names[best_pos],
+            "master_row_index": int(idxs[best_pos]),
+        }
+        if return_all:
+            item["_debug"] = best_parts
+            results.append(item)
+        else:
+            if best_score >= min_score:
+                results.append(item)
+    return results
+# ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
+def match_generic_to_product_master_grouped_for_row(
+    generic_value: str,
+    pm_df: pd.DataFrame,
+    molecule_col: str,
+    brand_id_col: Optional[str],
+    brand_name_col: Optional[str],
+    min_score: float = 60.0,
+    top_n: int = 3
+) -> List[Dict[str, Any]]:
+    """Compute matches for a *single* RFQ row's generic name."""
+    subset = pm_df.dropna(subset=[molecule_col]).copy()
+    mol_raw = subset[molecule_col].astype(str).tolist()
+    brand_ids = subset[brand_id_col].astype(str).tolist(
+    ) if brand_id_col and brand_id_col in subset.columns else [None]*len(subset)
+    brand_names = subset[brand_name_col].astype(str).tolist(
+    ) if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
+    g_str = str(generic_value or "").strip()
+    if not g_str:
+        return []
+    scored = []
+    for idx, cand in enumerate(mol_raw):
+        parts = hybrid_similarity(g_str, cand)
+        score = parts["score"]
+        if score >= min_score:
+            scored.append({
+                "matched_name": cand,
+                "match_percent": round(score, 2),
+                "brand_id": brand_ids[idx],
+                "brand_name": brand_names[idx]
+            })
+    scored.sort(key=lambda x: x["match_percent"], reverse=True)
+    return scored[:top_n]
+# ---------- Endpoints ----------
+@app.post("/match-difflib")
+async def match_with_difflib(
+    rfq_file: UploadFile = File(...),
+    product_master_json: UploadFile = File(...),
+    min_score: float = Query(
+        80.0, description="Minimum composite score (0-100)")
+):
+    try:
+        # RFQ
+        rfq_bytes = await rfq_file.read()
+        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
+        mapped, mapping = build_mapped_rfq(rfq_df)
+        if "generic_name" not in mapped.columns:
+            raise HTTPException(
+                status_code=400, detail="No 'generic_name' column found after mapping RFQ.")
+        gen_series = mapped["generic_name"]
+        nonempty_mask = gen_series.notna() & gen_series.astype(
+            str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
+        generic_list = gen_series[nonempty_mask].astype(str).tolist()
+        # Product master (supports your original JSON shape)
+        pm_bytes = await product_master_json.read()
+        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
+        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
+        molecule_col = detect_single_column(
+            pm_df, "__product_master_molecule__")
+        # brand id: prefer brand_id, else id
+        brand_id_col = detect_single_column(
+            pm_df, "__product_master_brand_id__")
+        # brand name: prefer brand_name, else brand, else product
+        brand_name_col = detect_single_column(
+            pm_df, "__product_master_brand_name__")
+        if not molecule_col:
+            raise HTTPException(
+                status_code=400, detail="Could not detect molecule column in product master JSON.")
+        matches = match_generic_to_product_master(
+            generic_list, pm_df,
+            molecule_col=molecule_col,
+            brand_id_col=brand_id_col,
+            brand_name_col=brand_name_col,
+            min_score=min_score,
+            return_all=False
+        )
+        return JSONResponse({
+            "rfq_rows": int(nonempty_mask.sum()),
+            "product_master_detected": {
+                "molecule_col": molecule_col,
+                "brand_id_col": brand_id_col,
+                "brand_name_col": brand_name_col
+            },
+            "matches_returned": len(matches),
+            "data": matches
+        })
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/match-difflib-debug")
+async def match_with_difflib_debug(
+    rfq_file: UploadFile = File(...),
+    product_master_json: UploadFile = File(...),
+    sample: int = Query(5, ge=1, le=200),
+    min_score: float = Query(80.0),
+    sample_contains: str = Query(
+        "", description="Filter RFQ rows by substring (case-insensitive)")
+):
+    """
+    Diagnostics: return BEST match (+%) for the first N RFQ rows, optionally filtered by text.
+    Always returns best match, even if below min_score, so you can inspect behavior.
+    """
+    try:
+        # RFQ
+        rfq_bytes = await rfq_file.read()
+        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
+        mapped, mapping = build_mapped_rfq(rfq_df)
+        gen_series = mapped.get("generic_name", pd.Series([], dtype=object))
+        nonempty_mask = gen_series.notna() & gen_series.astype(
+            str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
+        generic_list_all = gen_series[nonempty_mask].astype(str)
+        if sample_contains:
+            flt = generic_list_all.str.contains(
+                sample_contains, case=False, na=False)
+            generic_list = generic_list_all[flt].tolist()[:sample]
+        else:
+            generic_list = generic_list_all.tolist()[:sample]
+        # Product master
+        pm_bytes = await product_master_json.read()
+        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
+        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
+        molecule_col = detect_single_column(
+            pm_df, "__product_master_molecule__")
+        brand_id_col = detect_single_column(
+            pm_df, "__product_master_brand_id__")
+        brand_name_col = detect_single_column(
+            pm_df, "__product_master_brand_name__")
+        demo_matches = match_generic_to_product_master(
+            generic_list, pm_df,
+            molecule_col=molecule_col,
+            brand_id_col=brand_id_col,
+            brand_name_col=brand_name_col,
+            min_score=min_score,
+            return_all=True
+        )
+        return JSONResponse({
+            "rfq_detected_headers": list(map(str, rfq_df.columns)),
+            "template_mapping": mapping,
+            "nonempty_generic_count": int(nonempty_mask.sum()),
+            "product_master_detected": {
+                "molecule_col": molecule_col,
+                "brand_id_col": brand_id_col,
+                "brand_name_col": brand_name_col
+            },
+            "filter": sample_contains or None,
+            "examples": demo_matches
+        })
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# ---------- NEW: Grouped endpoint ----------
+@app.post("/match-difflib-grouped")
+async def match_with_difflib_grouped(
+    rfq_file: UploadFile = File(...),
+    product_master_json: UploadFile = File(...),
+    min_score: float = Query(60.0, description="Minimum score to include"),
+    top_n: int = Query(3, description="Max number of matches per RFQ row")
+):
+    """
+    Return ALL extracted RFQ rows (template-aligned fields), each with a `matches` array of
+    product master molecules (matched_name, match_percent, brand_id, brand_name) scoring ≥ min_score.
+    Rows with no matches still appear with an empty `matches` list.
+    """
+    try:
+        # RFQ
+        rfq_bytes = await rfq_file.read()
+        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
+        mapped, mapping = build_mapped_rfq(rfq_df)
+        # Ensure columns exist even if not mapped
+        for col in TEMPLATE_COLUMNS:
+            if col not in mapped.columns:
+                mapped[col] = pd.NA
+        # Product master
+        pm_bytes = await product_master_json.read()
+        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
+        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
+        molecule_col = detect_single_column(
+            pm_df, "__product_master_molecule__")
+        brand_id_col = detect_single_column(
+            pm_df, "__product_master_brand_id__")
+        brand_name_col = detect_single_column(
+            pm_df, "__product_master_brand_name__")
+        if not molecule_col:
+            raise HTTPException(
+                status_code=400, detail="Could not detect molecule column in product master JSON.")
+        # Build response data: include every RFQ row as extracted, plus matches
+        data_out = []
+        match_rows_with_any = 0
+        # Work only with the same index order; keep all rows
+        for idx, row in mapped.iterrows():
+            # serialize RFQ row (template-aligned)
+            rfq_record = {col: (None if pd.isna(row.get(col)) else str(
+                row.get(col))) for col in TEMPLATE_COLUMNS}
+            # compute matches based on this row's generic_name
+            g_val = rfq_record.get("generic_name") or ""
+            matches = match_generic_to_product_master_grouped_for_row(
+                generic_value=g_val,
+                pm_df=pm_df,
+                molecule_col=molecule_col,
+                brand_id_col=brand_id_col,
+                brand_name_col=brand_name_col,
+                min_score=min_score,
+                top_n=top_n
+            )
+            if matches:
+                match_rows_with_any += 1
+            data_out.append({
+                "row_index": int(idx),
+                # ALL extracted fields (id, generic_name, annual_volume_qty, etc.)
+                "rfq": rfq_record,
+                "matches": matches                 # zero or more matches
+            })
+        return {
+            "rfq_rows": int(len(mapped)),
+            "product_master_detected": {
+                "molecule_col": molecule_col,
+                "brand_id_col": brand_id_col,
+                "brand_name_col": brand_name_col
+            },
+            "rows_with_matches": match_rows_with_any,
+            "data": data_out
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/debug-score")
+def debug_score(a: str, b: str):
+    """Quick check for two strings."""
+    return hybrid_similarity(a, b)
+@app.get("/")
+def root():
+    return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}