Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,14 +3,16 @@ import json
|
|
| 3 |
import re
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import List, Optional, Dict, Any, Tuple
|
|
|
|
| 6 |
|
| 7 |
import pandas as pd
|
| 8 |
from fastapi import FastAPI, UploadFile, File, HTTPException, Query
|
| 9 |
-
from fastapi.responses import JSONResponse
|
| 10 |
import difflib
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 12 |
|
| 13 |
-
app = FastAPI(title="RFQ ↔ Product Master Matcher (difflib hybrid)")
|
| 14 |
|
| 15 |
app.add_middleware(
|
| 16 |
CORSMiddleware,
|
|
@@ -26,42 +28,52 @@ TEMPLATE_COLUMNS = [
|
|
| 26 |
"current_brand_description", "generic_name", "annual_volume_qty", "quotation Price", "dosage form"
|
| 27 |
]
|
| 28 |
|
| 29 |
-
# ----------
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def norm_base(s: str) -> str:
|
| 34 |
s = str(s or "")
|
| 35 |
s = s.lower()
|
| 36 |
s = s.replace("+", " ").replace("/", " ")
|
| 37 |
-
|
| 38 |
-
s =
|
| 39 |
-
s = re.sub(r"\s+", " ", s).strip()
|
| 40 |
return s
|
| 41 |
|
| 42 |
|
| 43 |
-
|
|
|
|
| 44 |
s2 = norm_base(s)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
num_unit = re.findall(
|
| 48 |
-
r'\d+(?:\.\d+)?\s*(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl|%)', s2, flags=re.IGNORECASE)
|
| 49 |
-
|
| 50 |
-
# Extract standalone numbers (e.g., "500")
|
| 51 |
-
nums = re.findall(r'\d+(?:\.\d+)?', s2)
|
| 52 |
-
|
| 53 |
-
# Combine and deduplicate
|
| 54 |
all_numbers = num_unit + nums
|
| 55 |
-
return sorted(set([x.strip() for x in all_numbers]))
|
| 56 |
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
# ---------- Synonyms / detection ----------
|
| 63 |
-
SYNONYMS:
|
| 64 |
-
# RFQ → template mapping
|
| 65 |
"generic_name": [
|
| 66 |
"generic name", "generic", "molecule", "molecule name", "molecule with strength",
|
| 67 |
"composition", "salt", "api", "active ingredient"
|
|
@@ -77,8 +89,6 @@ SYNONYMS: Dict[str, List[str]] = {
|
|
| 77 |
"tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
|
| 78 |
"category": ["category", "schedule", "section", "chapter", "dept"],
|
| 79 |
"dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
|
| 80 |
-
|
| 81 |
-
# Product master detection (support your original schema)
|
| 82 |
"__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
|
| 83 |
"__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
|
| 84 |
"__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
|
|
@@ -86,10 +96,9 @@ SYNONYMS: Dict[str, List[str]] = {
|
|
| 86 |
|
| 87 |
# ---------- Header mapping ----------
|
| 88 |
|
| 89 |
-
|
| 90 |
def score_header(tcol: str, scol: str) -> float:
|
| 91 |
tn, sn = norm_base(tcol), norm_base(scol)
|
| 92 |
-
tset, sset = set(tn.split()), set(sn.split())
|
| 93 |
jacc = (len(tset & sset) / len(tset | sset)) if (tset and sset) else 0.0
|
| 94 |
contains = 1.0 if (tn in sn or sn in tn) else 0.0
|
| 95 |
fuzzy = difflib.SequenceMatcher(None, tn, sn).ratio()
|
|
@@ -99,7 +108,7 @@ def score_header(tcol: str, scol: str) -> float:
|
|
| 99 |
def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
|
| 100 |
src_cols = [str(c) for c in src_cols]
|
| 101 |
src_norm_map = {norm_base(c): c for c in src_cols}
|
| 102 |
-
mapping:
|
| 103 |
for tcol in target_cols:
|
| 104 |
# 1) exact synonym
|
| 105 |
for alias in SYNONYMS.get(tcol, []):
|
|
@@ -144,7 +153,7 @@ def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
|
|
| 144 |
for nn, orig in norm_map.items():
|
| 145 |
if n in nn or nn in n:
|
| 146 |
return orig
|
| 147 |
-
# fallback:
|
| 148 |
best_col, best_score = None, -1.0
|
| 149 |
for c in cols:
|
| 150 |
sc = score_header(logical_name, c)
|
|
@@ -154,7 +163,6 @@ def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
|
|
| 154 |
|
| 155 |
# ---------- File reading ----------
|
| 156 |
|
| 157 |
-
|
| 158 |
def guess_delimiter(sample: str) -> str:
|
| 159 |
for d in ["\t", ";", "|", ","]:
|
| 160 |
if d in sample:
|
|
@@ -163,16 +171,16 @@ def guess_delimiter(sample: str) -> str:
|
|
| 163 |
|
| 164 |
|
| 165 |
def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 166 |
-
keep = [c for c in df.columns if not str(c).startswith("Unnamed")]
|
| 167 |
return df.loc[:, keep]
|
| 168 |
|
| 169 |
|
| 170 |
def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 171 |
-
df.columns = [str(c) for c in df.columns]
|
| 172 |
return df
|
| 173 |
|
| 174 |
|
| 175 |
-
def choose_best_sheet_and_header(xl:
|
| 176 |
best = {"score": -1, "df": None, "sheet": None,
|
| 177 |
"header": None, "mapping": None}
|
| 178 |
for sheet in xl.sheet_names:
|
|
@@ -183,12 +191,12 @@ def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
|
|
| 183 |
if df.dropna(how="all").empty:
|
| 184 |
continue
|
| 185 |
df = ensure_str_columns(df)
|
| 186 |
-
m = map_headers_auto(df.columns.tolist(), TEMPLATE_COLUMNS)
|
| 187 |
score = sum(1 for v in m.values() if v is not None)
|
| 188 |
if score > best["score"]:
|
| 189 |
-
best = {"score": score, "df":
|
| 190 |
"header": header, "mapping": m}
|
| 191 |
-
except:
|
| 192 |
continue
|
| 193 |
if best["df"] is None:
|
| 194 |
raise ValueError("No readable tables found in the Excel workbook.")
|
|
@@ -197,17 +205,16 @@ def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
|
|
| 197 |
|
| 198 |
def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
|
| 199 |
ext = Path(filename).suffix.lower()
|
| 200 |
-
if ext in [".xlsx", ".xls", ".xlsm", ".ods"]:
|
| 201 |
xl = pd.ExcelFile(io.BytesIO(data))
|
| 202 |
best = choose_best_sheet_and_header(xl)
|
| 203 |
return best["df"]
|
| 204 |
if ext in [".csv", ".tsv"]:
|
| 205 |
text = data.decode("utf-8", errors="ignore")
|
| 206 |
-
delim = guess_delimiter(text[:4096])
|
| 207 |
-
return pd.read_csv(io.StringIO(text), sep=delim, engine="python")
|
| 208 |
if ext == ".json":
|
| 209 |
js = json.loads(data.decode("utf-8", errors="ignore"))
|
| 210 |
-
# Accept both raw list and your original object with "data"
|
| 211 |
if isinstance(js, list):
|
| 212 |
return pd.DataFrame(js)
|
| 213 |
if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
|
|
@@ -227,106 +234,120 @@ def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Opti
|
|
| 227 |
[pd.NA]*len(src_df), index=src_df.index)
|
| 228 |
return out, mapping
|
| 229 |
|
| 230 |
-
# ----------
|
| 231 |
-
|
| 232 |
|
|
|
|
| 233 |
def extract_molecule_base(s: str) -> str:
|
| 234 |
"""Extract core molecule name by removing dosages, units, and forms."""
|
| 235 |
s_norm = norm_base(s)
|
| 236 |
-
|
| 237 |
# Step 1: Remove dosage forms FIRST
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
' ', s_norm, flags=re. IGNORECASE)
|
| 244 |
-
|
| 245 |
# Step 3: Remove fractions and ratios
|
| 246 |
-
s_norm =
|
| 247 |
-
|
| 248 |
# Step 4: Remove standalone numbers
|
| 249 |
-
s_norm =
|
| 250 |
-
|
| 251 |
# Step 5: Remove w/w, w/v, v/v
|
| 252 |
-
s_norm =
|
| 253 |
-
|
| 254 |
# Step 6: Clean up spaces
|
| 255 |
-
s_norm =
|
| 256 |
-
|
| 257 |
return s_norm
|
| 258 |
|
| 259 |
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
"""
|
| 262 |
-
Enhanced similarity
|
| 263 |
-
Different dosages of the same molecule should score 75-90%.
|
| 264 |
"""
|
| 265 |
-
a_n, b_n = norm_base(a), norm_base(b)
|
| 266 |
-
|
| 267 |
# Exact match = perfect score
|
| 268 |
-
if
|
| 269 |
-
return {"diff": 100.0, "jacc": 100.0, "num":
|
| 270 |
-
|
| 271 |
-
# 1.
|
| 272 |
-
diff = difflib.SequenceMatcher(None,
|
| 273 |
-
|
| 274 |
# 2. Token Jaccard similarity
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
* 100.0) if (aset and bset) else 0.0
|
| 278 |
-
|
| 279 |
# 3. Number matching (bonus only)
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
# 4. CORE IMPROVEMENT: Molecule base matching
|
| 285 |
-
a_mol_base = extract_molecule_base(a)
|
| 286 |
-
b_mol_base = extract_molecule_base(b)
|
| 287 |
-
|
| 288 |
mol_base_score = 0.0
|
| 289 |
-
|
| 290 |
-
if
|
| 291 |
-
|
| 292 |
-
if a_mol_base == b_mol_base:
|
| 293 |
mol_base_score = 100.0
|
| 294 |
else:
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
if base_tokens_a and base_tokens_b:
|
| 304 |
-
base_jacc = len(base_tokens_a & base_tokens_b) / \
|
| 305 |
-
len(base_tokens_a | base_tokens_b) * 100.0
|
| 306 |
-
|
| 307 |
-
# Weighted average favoring token overlap (handles multi-word molecules)
|
| 308 |
mol_base_score = 0.40 * mol_base_diff + 0.60 * base_jacc
|
| 309 |
else:
|
| 310 |
mol_base_score = mol_base_diff
|
| 311 |
-
|
| 312 |
-
# 5.
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
# Scenario 3: Different molecule → <60%
|
| 316 |
-
|
| 317 |
-
if mol_base_score >= 95:
|
| 318 |
-
# Perfect molecule match - prioritize heavily
|
| 319 |
-
score = (0.60 * mol_base_score + # 60% weight on molecule base
|
| 320 |
-
0.20 * diff + # 20% on full text
|
| 321 |
-
0.15 * jacc + # 15% on tokens
|
| 322 |
-
0.05 * num_match) # 5% bonus for exact dosage
|
| 323 |
else:
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
0.25 * diff + # 25% on full text
|
| 327 |
-
0.20 * jacc + # 20% on tokens
|
| 328 |
-
0.05 * num_match) # 5% bonus
|
| 329 |
-
|
| 330 |
return {
|
| 331 |
"diff": round(diff, 2),
|
| 332 |
"jacc": round(jacc, 2),
|
|
@@ -336,171 +357,172 @@ def hybrid_similarity(a: str, b: str) -> Dict[str, float]:
|
|
| 336 |
}
|
| 337 |
|
| 338 |
|
| 339 |
-
|
|
|
|
|
|
|
| 340 |
generic_list: List[str],
|
| 341 |
-
|
| 342 |
-
molecule_col: str,
|
| 343 |
-
brand_id_col: Optional[str],
|
| 344 |
-
brand_name_col: Optional[str],
|
| 345 |
min_score: float = 60.0,
|
| 346 |
-
return_all: bool = False
|
|
|
|
| 347 |
) -> List[Dict[str, Any]]:
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
mol_raw = subset[molecule_col].astype(str).tolist()
|
| 352 |
-
|
| 353 |
-
# brand id list
|
| 354 |
-
brand_ids = subset[brand_id_col].astype(str).tolist() \
|
| 355 |
-
if brand_id_col and brand_id_col in subset.columns else [None]*len(subset)
|
| 356 |
-
|
| 357 |
-
# brand/product name list (fallbacks handled automatically)
|
| 358 |
-
brand_names = subset[brand_name_col].astype(str).tolist() \
|
| 359 |
-
if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
|
| 360 |
-
|
| 361 |
-
idxs = subset.index.tolist()
|
| 362 |
results = []
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
results.append(item)
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
| 400 |
return results
|
| 401 |
|
| 402 |
|
| 403 |
-
# ----------
|
| 404 |
|
| 405 |
-
|
| 406 |
-
def match_generic_to_product_master_grouped_for_row(
|
| 407 |
generic_value: str,
|
| 408 |
-
|
| 409 |
-
molecule_col: str,
|
| 410 |
-
brand_id_col: Optional[str],
|
| 411 |
-
brand_name_col: Optional[str],
|
| 412 |
min_score: float = 60.0,
|
| 413 |
top_n: int = 3
|
| 414 |
) -> List[Dict[str, Any]]:
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
mol_raw = subset[molecule_col].astype(str).tolist()
|
| 419 |
-
|
| 420 |
-
brand_ids = subset[brand_id_col].astype(str).tolist() \
|
| 421 |
-
if brand_id_col and brand_id_col in subset.columns else [None]*len(subset)
|
| 422 |
-
|
| 423 |
-
brand_names = subset[brand_name_col].astype(str).tolist() \
|
| 424 |
-
if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
|
| 425 |
-
|
| 426 |
g_str = str(generic_value or "").strip()
|
| 427 |
if not g_str:
|
| 428 |
return []
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
scored = []
|
| 431 |
-
|
| 432 |
-
for idx
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
score = parts["score"]
|
| 437 |
-
|
| 438 |
if score >= min_score:
|
| 439 |
-
scored.append({
|
| 440 |
-
"matched_name":
|
| 441 |
-
"brand_name": brand_names[idx],
|
| 442 |
-
"brand_id": brand_ids[idx],
|
| 443 |
"match_percent": round(score, 2),
|
| 444 |
"_debug": parts
|
| 445 |
})
|
| 446 |
-
|
| 447 |
scored.sort(key=lambda x: x["match_percent"], reverse=True)
|
| 448 |
-
|
| 449 |
return scored[:top_n]
|
| 450 |
|
| 451 |
|
| 452 |
-
# ---------- Endpoints ----------
|
| 453 |
-
|
| 454 |
|
| 455 |
@app.post("/match-difflib")
|
| 456 |
async def match_with_difflib(
|
| 457 |
rfq_file: UploadFile = File(...),
|
| 458 |
product_master_json: UploadFile = File(...),
|
| 459 |
-
min_score: float = Query(
|
| 460 |
-
60.0, description="Minimum composite score (0-100)")
|
| 461 |
):
|
| 462 |
try:
|
| 463 |
# RFQ
|
| 464 |
rfq_bytes = await rfq_file.read()
|
| 465 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 466 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 467 |
-
|
| 468 |
-
if "generic_name" not in mapped.columns:
|
| 469 |
raise HTTPException(
|
| 470 |
status_code=400, detail="No 'generic_name' column found after mapping RFQ.")
|
| 471 |
-
|
| 472 |
gen_series = mapped["generic_name"]
|
| 473 |
nonempty_mask = gen_series.notna() & gen_series.astype(
|
| 474 |
str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
|
| 475 |
generic_list = gen_series[nonempty_mask].astype(str).tolist()
|
| 476 |
-
|
| 477 |
-
# Product master
|
| 478 |
-
pm_bytes = await product_master_json.read()
|
| 479 |
pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
|
| 480 |
pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
|
| 481 |
-
|
| 482 |
-
molecule_col = detect_single_column(
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
# brand name: prefer brand_name, else brand, else product
|
| 488 |
-
brand_name_col = detect_single_column(
|
| 489 |
-
pm_df, "__product_master_brand_name__")
|
| 490 |
-
|
| 491 |
-
if not molecule_col:
|
| 492 |
raise HTTPException(
|
| 493 |
status_code=400, detail="Could not detect molecule column in product master JSON.")
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
|
|
|
| 500 |
min_score=min_score,
|
| 501 |
return_all=False
|
| 502 |
)
|
| 503 |
-
|
| 504 |
return JSONResponse({
|
| 505 |
"rfq_rows": int(nonempty_mask.sum()),
|
| 506 |
"product_master_detected": {
|
|
@@ -508,6 +530,7 @@ async def match_with_difflib(
|
|
| 508 |
"brand_id_col": brand_id_col,
|
| 509 |
"brand_name_col": brand_name_col
|
| 510 |
},
|
|
|
|
| 511 |
"matches_returned": len(matches),
|
| 512 |
"data": matches
|
| 513 |
})
|
|
@@ -522,13 +545,13 @@ def test_extract_base(text: str):
|
|
| 522 |
"""Test molecule base extraction"""
|
| 523 |
normalized = norm_base(text)
|
| 524 |
mol_base = extract_molecule_base(text)
|
| 525 |
-
|
| 526 |
return {
|
| 527 |
"original": text,
|
| 528 |
"normalized": normalized,
|
| 529 |
"molecule_base": mol_base,
|
| 530 |
-
"numbers_extracted": extract_numbers(text),
|
| 531 |
-
"tokens": token_set(text)
|
| 532 |
}
|
| 533 |
|
| 534 |
|
|
@@ -536,54 +559,48 @@ def test_extract_base(text: str):
|
|
| 536 |
async def match_with_difflib_debug(
|
| 537 |
rfq_file: UploadFile = File(...),
|
| 538 |
product_master_json: UploadFile = File(...),
|
| 539 |
-
sample:
|
| 540 |
min_score: float = Query(60.0),
|
| 541 |
-
sample_contains:
|
| 542 |
-
"", description="Filter RFQ rows by substring (case-insensitive)")
|
| 543 |
):
|
| 544 |
"""
|
| 545 |
-
Diagnostics:
|
| 546 |
-
Always returns best match, even if below min_score, so you can inspect behavior.
|
| 547 |
"""
|
| 548 |
-
try:
|
| 549 |
# RFQ
|
| 550 |
rfq_bytes = await rfq_file.read()
|
| 551 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 552 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 553 |
-
|
| 554 |
-
gen_series = mapped.get("generic_name", pd.Series([], dtype=object))
|
| 555 |
nonempty_mask = gen_series.notna() & gen_series.astype(
|
| 556 |
str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
|
| 557 |
generic_list_all = gen_series[nonempty_mask].astype(str)
|
| 558 |
-
|
| 559 |
if sample_contains:
|
| 560 |
-
flt = generic_list_all.str.contains(
|
| 561 |
-
|
| 562 |
-
generic_list = generic_list_all[flt].tolist()[:sample]
|
| 563 |
else:
|
| 564 |
generic_list = generic_list_all.tolist()[:sample]
|
| 565 |
-
|
| 566 |
# Product master
|
| 567 |
pm_bytes = await product_master_json.read()
|
| 568 |
pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
|
| 569 |
pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
|
| 570 |
-
|
| 571 |
-
molecule_col = detect_single_column(
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
demo_matches =
|
| 579 |
-
generic_list,
|
| 580 |
-
molecule_col=molecule_col,
|
| 581 |
-
brand_id_col=brand_id_col,
|
| 582 |
-
brand_name_col=brand_name_col,
|
| 583 |
min_score=min_score,
|
| 584 |
return_all=True
|
| 585 |
)
|
| 586 |
-
|
| 587 |
return JSONResponse({
|
| 588 |
"rfq_detected_headers": list(map(str, rfq_df.columns)),
|
| 589 |
"template_mapping": mapping,
|
|
@@ -593,7 +610,8 @@ async def match_with_difflib_debug(
|
|
| 593 |
"brand_id_col": brand_id_col,
|
| 594 |
"brand_name_col": brand_name_col
|
| 595 |
},
|
| 596 |
-
"
|
|
|
|
| 597 |
"examples": demo_matches
|
| 598 |
})
|
| 599 |
except HTTPException:
|
|
@@ -601,8 +619,6 @@ async def match_with_difflib_debug(
|
|
| 601 |
except Exception as e:
|
| 602 |
raise HTTPException(status_code=500, detail=str(e))
|
| 603 |
|
| 604 |
-
# ---------- NEW: Grouped endpoint ----------
|
| 605 |
-
|
| 606 |
|
| 607 |
@app.post("/match-difflib-grouped")
|
| 608 |
async def match_with_difflib_grouped(
|
|
@@ -612,73 +628,78 @@ async def match_with_difflib_grouped(
|
|
| 612 |
top_n: int = Query(3, description="Max number of matches per RFQ row")
|
| 613 |
):
|
| 614 |
"""
|
| 615 |
-
Return ALL extracted RFQ rows
|
| 616 |
-
|
| 617 |
-
Rows with no matches still appear with an empty `matches` list.
|
| 618 |
"""
|
| 619 |
try:
|
| 620 |
# RFQ
|
| 621 |
rfq_bytes = await rfq_file.read()
|
| 622 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 623 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 624 |
-
|
| 625 |
for col in TEMPLATE_COLUMNS:
|
| 626 |
if col not in mapped.columns:
|
| 627 |
mapped[col] = pd.NA
|
| 628 |
-
|
| 629 |
# Product master
|
| 630 |
pm_bytes = await product_master_json.read()
|
| 631 |
pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
|
| 632 |
pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
|
| 633 |
-
|
| 634 |
-
molecule_col = detect_single_column(
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
brand_name_col = detect_single_column(
|
| 639 |
-
pm_df, "__product_master_brand_name__")
|
| 640 |
if not molecule_col:
|
| 641 |
raise HTTPException(
|
| 642 |
status_code=400, detail="Could not detect molecule column in product master JSON.")
|
| 643 |
-
|
| 644 |
-
#
|
|
|
|
|
|
|
|
|
|
| 645 |
data_out = []
|
| 646 |
match_rows_with_any = 0
|
| 647 |
-
|
| 648 |
-
|
|
|
|
|
|
|
| 649 |
for idx, row in mapped.iterrows():
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
|
|
|
| 655 |
g_val = rfq_record.get("generic_name") or ""
|
| 656 |
-
|
|
|
|
|
|
|
| 657 |
generic_value=g_val,
|
| 658 |
-
|
| 659 |
-
molecule_col=molecule_col,
|
| 660 |
-
brand_id_col=brand_id_col,
|
| 661 |
-
brand_name_col=brand_name_col,
|
| 662 |
min_score=min_score,
|
| 663 |
top_n=top_n
|
| 664 |
)
|
|
|
|
| 665 |
if matches:
|
| 666 |
match_rows_with_any += 1
|
| 667 |
-
|
| 668 |
data_out.append({
|
| 669 |
"row_index": int(idx),
|
| 670 |
-
|
| 671 |
-
"
|
| 672 |
-
"matches": matches # zero or more matches
|
| 673 |
})
|
| 674 |
-
|
|
|
|
|
|
|
| 675 |
return {
|
| 676 |
-
"rfq_rows":
|
| 677 |
"product_master_detected": {
|
| 678 |
"molecule_col": molecule_col,
|
| 679 |
"brand_id_col": brand_id_col,
|
| 680 |
"brand_name_col": brand_name_col
|
| 681 |
},
|
|
|
|
| 682 |
"rows_with_matches": match_rows_with_any,
|
| 683 |
"data": data_out
|
| 684 |
}
|
|
@@ -691,13 +712,52 @@ async def match_with_difflib_grouped(
|
|
| 691 |
@app.get("/debug-score")
|
| 692 |
def debug_score(a: str, b: str):
|
| 693 |
"""Quick check for two strings."""
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
|
| 697 |
-
@app.get("/")
|
| 698 |
def root():
|
| 699 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
|
| 701 |
if __name__ == "__main__":
|
| 702 |
import uvicorn
|
| 703 |
-
|
|
|
|
|
|
| 3 |
import re
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import List, Optional, Dict, Any, Tuple
|
| 6 |
+
from functools import lru_cache
|
| 7 |
|
| 8 |
import pandas as pd
|
| 9 |
from fastapi import FastAPI, UploadFile, File, HTTPException, Query
|
| 10 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 11 |
import difflib
|
| 12 |
from fastapi.middleware.cors import CORSMiddleware
|
| 13 |
+
import asyncio
|
| 14 |
|
| 15 |
+
app = FastAPI(title="RFQ ↔ Product Master Matcher (difflib hybrid - Optimized)")
|
| 16 |
|
| 17 |
app.add_middleware(
|
| 18 |
CORSMiddleware,
|
|
|
|
| 28 |
"current_brand_description", "generic_name", "annual_volume_qty", "quotation Price", "dosage form"
|
| 29 |
]
|
| 30 |
|
| 31 |
+
# ---------- OPTIMIZED: Compile regex patterns once at module level ----------
# NOTE: the scraped diff had corrupted these patterns ("(? :", "gm? |", "i\. u\.")
# which are invalid/altered regex syntax; restored to valid non-capturing groups.

# Number followed by a pharmaceutical unit, e.g. "500 mg", "5ml", "10 iu".
UNIT_PATTERN_COMPILED = re.compile(
    r'\b\d+(?:\.\d+)?\s*'
    r'(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl|%|w/w|w/v|v/v|'
    r'microgram|milligram|gram|kilogram|liter|milliliter)\b',
    re.IGNORECASE
)

# Dosage-form / packaging words stripped when reducing a description to the
# bare molecule name.
FORMS_PATTERN_COMPILED = re.compile(
    r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj|'
    r'syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments|'
    r'gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers|'
    r'solution|solutions|ampule|ampules|amp|amps|vial|vials|via|bottle|bottles|'
    r'bot|bots|sachet|sachets|sac|sacs|suppository|suppositories|sup|sups|'
    r'patch|patches|pat|pats|lotion|lotions|respule|respules|res|pfs|kit|kits|'
    r'num|nums|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b',
    re.IGNORECASE
)

FRACTION_PATTERN = re.compile(r'\d+\s*/\s*\d+')            # e.g. "875/125"
STANDALONE_NUM_PATTERN = re.compile(r'\b\d+(?:\.\d+)?\b')  # bare "500", "12.5"
WV_PATTERN = re.compile(r'\b[wv]\s*/\s*[wv]\b', re.IGNORECASE)  # w/w, w/v, v/v
WHITESPACE_PATTERN = re.compile(r'\s+')
# Drop punctuation except . % / + -  (space already covered by \s)
NON_WORD_PATTERN = re.compile(r'[^\w\s.%/+-]')
|
| 47 |
|
| 48 |
+
# ---------- Normalization ----------
|
| 49 |
+
|
| 50 |
+
# OPTIMIZED: Use lru_cache for frequently repeated strings
|
| 51 |
+
@lru_cache(maxsize=10000)
def norm_base(s: str) -> str:
    """Normalize *s* for matching: lowercase, treat '+' and '/' as spaces,
    drop punctuation other than . % / + -, and collapse whitespace runs.

    None/empty-safe: falsy input yields "". Cached because the same strings
    are normalized repeatedly during matching.
    """
    text = str(s or "").lower()
    for sep in ("+", "/"):
        text = text.replace(sep, " ")
    # Equivalent to NON_WORD_PATTERN / WHITESPACE_PATTERN substitutions.
    text = re.sub(r"[^\w\s.%/+-]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
|
| 59 |
|
| 60 |
|
| 61 |
+
@lru_cache(maxsize=10000)
def extract_numbers(s: str) -> Tuple[str, ...]:
    """Return a sorted, de-duplicated tuple of every number and number+unit
    token found in *s* after normalization.

    Returns a tuple (not a list) so the result is hashable for lru_cache.
    e.g. "Paracetamol 500 mg" -> ("500", "500 mg")
    """
    s2 = norm_base(s)
    num_unit = UNIT_PATTERN_COMPILED.findall(s2)   # "500 mg", "5 ml", ...
    nums = STANDALONE_NUM_PATTERN.findall(s2)      # bare "500", "12.5", ...
    # Set comprehension replaces the set([...]) round-trip (same behavior).
    return tuple(sorted({token.strip() for token in num_unit + nums}))
|
| 68 |
|
| 69 |
|
| 70 |
+
@lru_cache(maxsize=10000)
def token_set(s: str) -> Tuple[str, ...]:
    """Whitespace tokens of the normalized string, as a hashable tuple."""
    normalized = norm_base(s)
    return tuple(filter(None, normalized.split(" ")))
|
| 73 |
|
| 74 |
|
| 75 |
# ---------- Synonyms / detection ----------
|
| 76 |
+
SYNONYMS: Dict[str, List[str]] = {
|
|
|
|
| 77 |
"generic_name": [
|
| 78 |
"generic name", "generic", "molecule", "molecule name", "molecule with strength",
|
| 79 |
"composition", "salt", "api", "active ingredient"
|
|
|
|
| 89 |
"tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
|
| 90 |
"category": ["category", "schedule", "section", "chapter", "dept"],
|
| 91 |
"dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
|
|
|
|
|
|
|
| 92 |
"__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
|
| 93 |
"__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
|
| 94 |
"__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
|
|
|
|
| 96 |
|
| 97 |
# ---------- Header mapping ----------
|
| 98 |
|
|
|
|
| 99 |
def score_header(tcol: str, scol: str) -> float:
|
| 100 |
tn, sn = norm_base(tcol), norm_base(scol)
|
| 101 |
+
tset, sset = set(tn. split()), set(sn.split())
|
| 102 |
jacc = (len(tset & sset) / len(tset | sset)) if (tset and sset) else 0.0
|
| 103 |
contains = 1.0 if (tn in sn or sn in tn) else 0.0
|
| 104 |
fuzzy = difflib.SequenceMatcher(None, tn, sn).ratio()
|
|
|
|
| 108 |
def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
|
| 109 |
src_cols = [str(c) for c in src_cols]
|
| 110 |
src_norm_map = {norm_base(c): c for c in src_cols}
|
| 111 |
+
mapping: Dict[str, Optional[str]] = {}
|
| 112 |
for tcol in target_cols:
|
| 113 |
# 1) exact synonym
|
| 114 |
for alias in SYNONYMS.get(tcol, []):
|
|
|
|
| 153 |
for nn, orig in norm_map.items():
|
| 154 |
if n in nn or nn in n:
|
| 155 |
return orig
|
| 156 |
+
# fallback: score
|
| 157 |
best_col, best_score = None, -1.0
|
| 158 |
for c in cols:
|
| 159 |
sc = score_header(logical_name, c)
|
|
|
|
| 163 |
|
| 164 |
# ---------- File reading ----------
|
| 165 |
|
|
|
|
| 166 |
def guess_delimiter(sample: str) -> str:
|
| 167 |
for d in ["\t", ";", "|", ","]:
|
| 168 |
if d in sample:
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only columns whose label does not start with 'Unnamed'
    (pandas invents such labels for blank header cells)."""
    kept = [label for label in df.columns if not str(label).startswith("Unnamed")]
    return df.loc[:, kept]
|
| 176 |
|
| 177 |
|
| 178 |
def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Stringify all column labels (mutates *df*) and return it."""
    df.columns = [str(label) for label in df.columns]
    return df
|
| 181 |
|
| 182 |
|
| 183 |
+
def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
|
| 184 |
best = {"score": -1, "df": None, "sheet": None,
|
| 185 |
"header": None, "mapping": None}
|
| 186 |
for sheet in xl.sheet_names:
|
|
|
|
| 191 |
if df.dropna(how="all").empty:
|
| 192 |
continue
|
| 193 |
df = ensure_str_columns(df)
|
| 194 |
+
m = map_headers_auto(df.columns. tolist(), TEMPLATE_COLUMNS)
|
| 195 |
score = sum(1 for v in m.values() if v is not None)
|
| 196 |
if score > best["score"]:
|
| 197 |
+
best = {"score": score, "df": df, "sheet": sheet,
|
| 198 |
"header": header, "mapping": m}
|
| 199 |
+
except:
|
| 200 |
continue
|
| 201 |
if best["df"] is None:
|
| 202 |
raise ValueError("No readable tables found in the Excel workbook.")
|
|
|
|
| 205 |
|
| 206 |
def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
|
| 207 |
ext = Path(filename).suffix.lower()
|
| 208 |
+
if ext in [". xlsx", ".xls", ".xlsm", ". ods"]:
|
| 209 |
xl = pd.ExcelFile(io.BytesIO(data))
|
| 210 |
best = choose_best_sheet_and_header(xl)
|
| 211 |
return best["df"]
|
| 212 |
if ext in [".csv", ".tsv"]:
|
| 213 |
text = data.decode("utf-8", errors="ignore")
|
| 214 |
+
delim = guess_delimiter(text[: 4096])
|
| 215 |
+
return pd.read_csv(io. StringIO(text), sep=delim, engine="python")
|
| 216 |
if ext == ".json":
|
| 217 |
js = json.loads(data.decode("utf-8", errors="ignore"))
|
|
|
|
| 218 |
if isinstance(js, list):
|
| 219 |
return pd.DataFrame(js)
|
| 220 |
if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
|
|
|
|
| 234 |
[pd.NA]*len(src_df), index=src_df.index)
|
| 235 |
return out, mapping
|
| 236 |
|
| 237 |
+
# ---------- OPTIMIZED: Molecule extraction with caching ----------
|
|
|
|
| 238 |
|
| 239 |
+
@lru_cache(maxsize=10000)
def extract_molecule_base(s: str) -> str:
    """Extract core molecule name by removing dosages, units, and forms."""
    s_norm = norm_base(s)
    # Strip, in this exact order: dosage forms, number+unit pairs,
    # fractions/ratios, standalone numbers, then w/w-style ratio markers.
    for pattern in (FORMS_PATTERN_COMPILED,
                    UNIT_PATTERN_COMPILED,
                    FRACTION_PATTERN,
                    STANDALONE_NUM_PATTERN,
                    WV_PATTERN):
        s_norm = pattern.sub(' ', s_norm)
    # Collapse the holes left by the substitutions.
    return WHITESPACE_PATTERN.sub(' ', s_norm).strip()
|
| 263 |
|
| 264 |
|
| 265 |
+
# ---------- OPTIMIZED: Pre-computed product master ----------
|
| 266 |
+
|
| 267 |
+
class PrecomputedProductMaster:
    """Pre-compute all expensive operations once for the product master"""

    def __init__(self, pm_df: pd.DataFrame, molecule_col: str,
                 brand_id_col: Optional[str], brand_name_col: Optional[str]):
        subset = pm_df.dropna(subset=[molecule_col]).copy()
        n_rows = len(subset)

        # Raw columns lifted out of the product master frame.
        self.molecule_col = molecule_col
        self.mol_raw = subset[molecule_col].astype(str).tolist()
        if brand_id_col and brand_id_col in subset.columns:
            self.brand_ids = subset[brand_id_col].astype(str).tolist()
        else:
            self.brand_ids = [None] * n_rows
        if brand_name_col and brand_name_col in subset.columns:
            self.brand_names = subset[brand_name_col].astype(str).tolist()
        else:
            self.brand_names = [None] * n_rows
        self.idxs = subset.index.tolist()

        # Derived, normalized forms computed once up front so per-row matching
        # never re-normalizes the master list.
        print(f"Pre-computing {len(self.mol_raw)} product master entries...")
        self.mol_norm = [norm_base(m) for m in self.mol_raw]
        self.mol_base = [extract_molecule_base(m) for m in self.mol_raw]
        self.mol_tokens = [set(token_set(mb)) for mb in self.mol_base]
        self.mol_numbers = [set(extract_numbers(m)) for m in self.mol_raw]
        print("Pre-computation complete!")

    def __len__(self):
        return len(self.mol_raw)
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
# ---------- OPTIMIZED: Fast pre-filter ----------

def quick_filter(g_tokens: set, pm_tokens: set, threshold: float = 0.15) -> bool:
    """Cheap Jaccard-overlap gate used to skip obvious non-matches.

    Returns True when the token Jaccard similarity of the two sets reaches
    ``threshold``; an empty set on either side never passes.
    """
    if not (g_tokens and pm_tokens):
        return False
    shared = g_tokens & pm_tokens
    combined = g_tokens | pm_tokens
    return len(shared) / len(combined) >= threshold
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
# ---------- OPTIMIZED: Hybrid similarity with pre-computed data ----------

def hybrid_similarity_optimized(
    g_norm: str, g_base: str, g_tokens: set, g_numbers: set,
    pm_norm: str, pm_base: str, pm_tokens: set, pm_numbers: set
) -> Dict[str, float]:
    """
    Enhanced similarity using pre-computed normalized forms.

    Combines four signals into one composite 0-100 score:
      * diff     - difflib ratio over the full normalized strings
      * jacc     - token Jaccard similarity
      * num      - 100 only when both number sets are non-empty and identical
      * mol_base - similarity of the extracted molecule bases

    Returns a dict with each component plus the weighted ``score``,
    all rounded to 2 decimals.
    """
    # Exact normalized match = perfect score on every component.
    if g_norm == pm_norm:
        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "mol_base": 100.0, "score": 100.0}

    # 1. Full text difflib similarity.
    diff = difflib.SequenceMatcher(None, g_norm, pm_norm).ratio() * 100.0

    # 2. Token Jaccard similarity.
    jacc = (len(g_tokens & pm_tokens) / len(g_tokens | pm_tokens) * 100.0) if (g_tokens and pm_tokens) else 0.0

    # 3. Number matching (all-or-nothing bonus).
    num_match = 100.0 if (g_numbers and pm_numbers and g_numbers == pm_numbers) else 0.0

    # 4. Molecule base matching: exact hit scores 100, otherwise a blend of
    #    character-level difflib and token-level Jaccard on the base strings.
    mol_base_score = 0.0
    if g_base and pm_base:
        if g_base == pm_base:
            mol_base_score = 100.0
        else:
            mol_base_diff = difflib.SequenceMatcher(None, g_base, pm_base).ratio() * 100.0

            base_tokens_g = set(g_base.split())
            base_tokens_pm = set(pm_base.split())

            if base_tokens_g and base_tokens_pm:
                base_jacc = len(base_tokens_g & base_tokens_pm) / len(base_tokens_g | base_tokens_pm) * 100.0
                mol_base_score = 0.40 * mol_base_diff + 0.60 * base_jacc
            else:
                mol_base_score = mol_base_diff

    # 5. Scoring formula: weight the molecule base heavier when it is a
    #    near-certain match (>= 95).
    if mol_base_score >= 95:
        score = (0.60 * mol_base_score + 0.20 * diff + 0.15 * jacc + 0.05 * num_match)
    else:
        score = (0.50 * mol_base_score + 0.25 * diff + 0.20 * jacc + 0.05 * num_match)

    return {
        "diff": round(diff, 2),
        "jacc": round(jacc, 2),
        "num": round(num_match, 2),
        "mol_base": round(mol_base_score, 2),
        "score": round(score, 2),
    }
|
| 358 |
|
| 359 |
|
| 360 |
+
# ---------- OPTIMIZED: Batch matching ----------

def match_generic_to_product_master_optimized(
    generic_list: List[str],
    pm: PrecomputedProductMaster,
    min_score: float = 60.0,
    return_all: bool = False,
    batch_size: int = 100
) -> List[Dict[str, Any]]:
    """Optimized matching using pre-computed product master.

    For each generic name, finds the single best-scoring product-master
    entry. With ``return_all=True`` every best match is returned (including
    debug score parts); otherwise only matches reaching ``min_score``.
    Rows that are empty or survive no quick-filter candidate are skipped.
    """
    results: List[Dict[str, Any]] = []
    total = len(generic_list)

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)

        # Progress log roughly every 500 rows.
        if start % 500 == 0:
            print(f"Processing RFQ rows {start}-{end} of {total}...")

        for offset, raw_generic in enumerate(generic_list[start:end]):
            row_idx = start + offset
            g_str = str(raw_generic or "").strip()
            if not g_str:
                continue

            # Normalize this generic once, outside the product-master scan.
            g_norm = norm_base(g_str)
            g_base = extract_molecule_base(g_str)
            g_tokens = set(token_set(g_base))
            g_numbers = set(extract_numbers(g_str))

            best_score, best_pos, best_parts = -1.0, None, None

            for pos in range(len(pm)):
                # Cheap token-overlap gate before the full similarity pass.
                if not quick_filter(g_tokens, pm.mol_tokens[pos]):
                    continue

                parts = hybrid_similarity_optimized(
                    g_norm, g_base, g_tokens, g_numbers,
                    pm.mol_norm[pos], pm.mol_base[pos], pm.mol_tokens[pos], pm.mol_numbers[pos]
                )

                if parts["score"] > best_score:
                    best_score, best_pos, best_parts = parts["score"], pos, parts

            if best_pos is None:
                # No candidate survived the quick filter for this row.
                continue

            item = {
                "row_index": row_idx,
                "generic_name": g_str,
                "matched_name": pm.mol_raw[best_pos],
                "matched_brand_name": pm.brand_names[best_pos],
                "match_percent": round(best_score, 2),
                "brand_id": pm.brand_ids[best_pos],
                "brand_name": pm.brand_names[best_pos],
                "master_row_index": int(pm.idxs[best_pos]),
            }

            if return_all:
                # Debug mode: attach score breakdown and keep every best match.
                item["_debug"] = best_parts
                results.append(item)
            elif best_score >= min_score:
                results.append(item)

    return results
|
| 431 |
|
| 432 |
|
| 433 |
+
# ---------- OPTIMIZED: Grouped matcher ----------

def match_generic_to_product_master_grouped_for_row_optimized(
    generic_value: str,
    pm: PrecomputedProductMaster,
    min_score: float = 60.0,
    top_n: int = 3
) -> List[Dict[str, Any]]:
    """Optimized grouped matching for a single row.

    Returns up to ``top_n`` product-master candidates scoring at least
    ``min_score``, ordered by descending match percentage. An empty or
    blank ``generic_value`` yields an empty list.
    """
    g_str = str(generic_value or "").strip()
    if not g_str:
        return []

    # Normalize the RFQ value once.
    g_norm = norm_base(g_str)
    g_base = extract_molecule_base(g_str)
    g_tokens = set(token_set(g_base))
    g_numbers = set(extract_numbers(g_str))

    candidates: List[Dict[str, Any]] = []

    for pos in range(len(pm)):
        # Skip entries with almost no token overlap.
        if not quick_filter(g_tokens, pm.mol_tokens[pos]):
            continue

        # Full similarity calculation only for surviving candidates.
        parts = hybrid_similarity_optimized(
            g_norm, g_base, g_tokens, g_numbers,
            pm.mol_norm[pos], pm.mol_base[pos], pm.mol_tokens[pos], pm.mol_numbers[pos]
        )

        if parts["score"] >= min_score:
            candidates.append({
                "matched_name": pm.mol_raw[pos],
                "brand_name": pm.brand_names[pos],
                "brand_id": pm.brand_ids[pos],
                "match_percent": round(parts["score"], 2),
                "_debug": parts
            })

    # Stable sort keeps product-master order among equal scores.
    candidates.sort(key=lambda c: c["match_percent"], reverse=True)
    return candidates[:top_n]
|
| 478 |
|
| 479 |
|
| 480 |
+
# ---------- OPTIMIZED Endpoints ----------
|
|
|
|
| 481 |
|
| 482 |
@app.post("/match-difflib")
|
| 483 |
async def match_with_difflib(
|
| 484 |
rfq_file: UploadFile = File(...),
|
| 485 |
product_master_json: UploadFile = File(...),
|
| 486 |
+
min_score: float = Query(60.0, description="Minimum composite score (0-100)")
|
|
|
|
| 487 |
):
|
| 488 |
try:
|
| 489 |
# RFQ
|
| 490 |
rfq_bytes = await rfq_file.read()
|
| 491 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 492 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 493 |
+
|
| 494 |
+
if "generic_name" not in mapped. columns:
|
| 495 |
raise HTTPException(
|
| 496 |
status_code=400, detail="No 'generic_name' column found after mapping RFQ.")
|
| 497 |
+
|
| 498 |
gen_series = mapped["generic_name"]
|
| 499 |
nonempty_mask = gen_series.notna() & gen_series.astype(
|
| 500 |
str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
|
| 501 |
generic_list = gen_series[nonempty_mask].astype(str).tolist()
|
| 502 |
+
|
| 503 |
+
# Product master
|
| 504 |
+
pm_bytes = await product_master_json. read()
|
| 505 |
pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
|
| 506 |
pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
|
| 507 |
+
|
| 508 |
+
molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
|
| 509 |
+
brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
|
| 510 |
+
brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")
|
| 511 |
+
|
| 512 |
+
if not molecule_col:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
raise HTTPException(
|
| 514 |
status_code=400, detail="Could not detect molecule column in product master JSON.")
|
| 515 |
+
|
| 516 |
+
# OPTIMIZED: Pre-compute product master
|
| 517 |
+
pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)
|
| 518 |
+
|
| 519 |
+
# OPTIMIZED: Use optimized matching
|
| 520 |
+
matches = match_generic_to_product_master_optimized(
|
| 521 |
+
generic_list, pm,
|
| 522 |
min_score=min_score,
|
| 523 |
return_all=False
|
| 524 |
)
|
| 525 |
+
|
| 526 |
return JSONResponse({
|
| 527 |
"rfq_rows": int(nonempty_mask.sum()),
|
| 528 |
"product_master_detected": {
|
|
|
|
| 530 |
"brand_id_col": brand_id_col,
|
| 531 |
"brand_name_col": brand_name_col
|
| 532 |
},
|
| 533 |
+
"product_master_size": len(pm),
|
| 534 |
"matches_returned": len(matches),
|
| 535 |
"data": matches
|
| 536 |
})
|
|
|
|
| 545 |
"""Test molecule base extraction"""
|
| 546 |
normalized = norm_base(text)
|
| 547 |
mol_base = extract_molecule_base(text)
|
| 548 |
+
|
| 549 |
return {
|
| 550 |
"original": text,
|
| 551 |
"normalized": normalized,
|
| 552 |
"molecule_base": mol_base,
|
| 553 |
+
"numbers_extracted": list(extract_numbers(text)),
|
| 554 |
+
"tokens": list(token_set(text))
|
| 555 |
}
|
| 556 |
|
| 557 |
|
|
|
|
| 559 |
async def match_with_difflib_debug(
|
| 560 |
rfq_file: UploadFile = File(...),
|
| 561 |
product_master_json: UploadFile = File(...),
|
| 562 |
+
sample: int = Query(5, ge=1, le=200),
|
| 563 |
min_score: float = Query(60.0),
|
| 564 |
+
sample_contains: str = Query("", description="Filter RFQ rows by substring (case-insensitive)")
|
|
|
|
| 565 |
):
|
| 566 |
"""
|
| 567 |
+
Diagnostics: return BEST match (+%) for the first N RFQ rows, optionally filtered by text.
|
|
|
|
| 568 |
"""
|
| 569 |
+
try:
|
| 570 |
# RFQ
|
| 571 |
rfq_bytes = await rfq_file.read()
|
| 572 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 573 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 574 |
+
|
| 575 |
+
gen_series = mapped. get("generic_name", pd.Series([], dtype=object))
|
| 576 |
nonempty_mask = gen_series.notna() & gen_series.astype(
|
| 577 |
str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
|
| 578 |
generic_list_all = gen_series[nonempty_mask].astype(str)
|
| 579 |
+
|
| 580 |
if sample_contains:
|
| 581 |
+
flt = generic_list_all.str.contains(sample_contains, case=False, na=False)
|
| 582 |
+
generic_list = generic_list_all[flt]. tolist()[:sample]
|
|
|
|
| 583 |
else:
|
| 584 |
generic_list = generic_list_all.tolist()[:sample]
|
| 585 |
+
|
| 586 |
# Product master
|
| 587 |
pm_bytes = await product_master_json.read()
|
| 588 |
pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
|
| 589 |
pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
|
| 590 |
+
|
| 591 |
+
molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
|
| 592 |
+
brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
|
| 593 |
+
brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")
|
| 594 |
+
|
| 595 |
+
# OPTIMIZED: Pre-compute
|
| 596 |
+
pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)
|
| 597 |
+
|
| 598 |
+
demo_matches = match_generic_to_product_master_optimized(
|
| 599 |
+
generic_list, pm,
|
|
|
|
|
|
|
|
|
|
| 600 |
min_score=min_score,
|
| 601 |
return_all=True
|
| 602 |
)
|
| 603 |
+
|
| 604 |
return JSONResponse({
|
| 605 |
"rfq_detected_headers": list(map(str, rfq_df.columns)),
|
| 606 |
"template_mapping": mapping,
|
|
|
|
| 610 |
"brand_id_col": brand_id_col,
|
| 611 |
"brand_name_col": brand_name_col
|
| 612 |
},
|
| 613 |
+
"product_master_size": len(pm),
|
| 614 |
+
"filter": sample_contains or None,
|
| 615 |
"examples": demo_matches
|
| 616 |
})
|
| 617 |
except HTTPException:
|
|
|
|
| 619 |
except Exception as e:
|
| 620 |
raise HTTPException(status_code=500, detail=str(e))
|
| 621 |
|
|
|
|
|
|
|
| 622 |
|
| 623 |
@app.post("/match-difflib-grouped")
|
| 624 |
async def match_with_difflib_grouped(
|
|
|
|
| 628 |
top_n: int = Query(3, description="Max number of matches per RFQ row")
|
| 629 |
):
|
| 630 |
"""
|
| 631 |
+
Return ALL extracted RFQ rows with matches array.
|
| 632 |
+
OPTIMIZED version with pre-computation and batching.
|
|
|
|
| 633 |
"""
|
| 634 |
try:
|
| 635 |
# RFQ
|
| 636 |
rfq_bytes = await rfq_file.read()
|
| 637 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 638 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 639 |
+
|
| 640 |
for col in TEMPLATE_COLUMNS:
|
| 641 |
if col not in mapped.columns:
|
| 642 |
mapped[col] = pd.NA
|
| 643 |
+
|
| 644 |
# Product master
|
| 645 |
pm_bytes = await product_master_json.read()
|
| 646 |
pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
|
| 647 |
pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
|
| 648 |
+
|
| 649 |
+
molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
|
| 650 |
+
brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
|
| 651 |
+
brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")
|
| 652 |
+
|
|
|
|
|
|
|
| 653 |
if not molecule_col:
|
| 654 |
raise HTTPException(
|
| 655 |
status_code=400, detail="Could not detect molecule column in product master JSON.")
|
| 656 |
+
|
| 657 |
+
# OPTIMIZED: Pre-compute product master
|
| 658 |
+
pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)
|
| 659 |
+
|
| 660 |
+
# Build response data
|
| 661 |
data_out = []
|
| 662 |
match_rows_with_any = 0
|
| 663 |
+
total = len(mapped)
|
| 664 |
+
|
| 665 |
+
print(f"Processing {total} RFQ rows against {len(pm)} products...")
|
| 666 |
+
|
| 667 |
for idx, row in mapped.iterrows():
|
| 668 |
+
if idx % 100 == 0:
|
| 669 |
+
print(f"Processing RFQ row {idx}/{total}...")
|
| 670 |
+
|
| 671 |
+
rfq_record = {col: (None if pd.isna(row. get(col)) else str(row.get(col)))
|
| 672 |
+
for col in TEMPLATE_COLUMNS}
|
| 673 |
+
|
| 674 |
g_val = rfq_record.get("generic_name") or ""
|
| 675 |
+
|
| 676 |
+
# OPTIMIZED: Use optimized matching
|
| 677 |
+
matches = match_generic_to_product_master_grouped_for_row_optimized(
|
| 678 |
generic_value=g_val,
|
| 679 |
+
pm=pm,
|
|
|
|
|
|
|
|
|
|
| 680 |
min_score=min_score,
|
| 681 |
top_n=top_n
|
| 682 |
)
|
| 683 |
+
|
| 684 |
if matches:
|
| 685 |
match_rows_with_any += 1
|
| 686 |
+
|
| 687 |
data_out.append({
|
| 688 |
"row_index": int(idx),
|
| 689 |
+
"rfq": rfq_record,
|
| 690 |
+
"matches": matches
|
|
|
|
| 691 |
})
|
| 692 |
+
|
| 693 |
+
print(f"Completed! {match_rows_with_any}/{total} rows had matches.")
|
| 694 |
+
|
| 695 |
return {
|
| 696 |
+
"rfq_rows": int(len(mapped)),
|
| 697 |
"product_master_detected": {
|
| 698 |
"molecule_col": molecule_col,
|
| 699 |
"brand_id_col": brand_id_col,
|
| 700 |
"brand_name_col": brand_name_col
|
| 701 |
},
|
| 702 |
+
"product_master_size": len(pm),
|
| 703 |
"rows_with_matches": match_rows_with_any,
|
| 704 |
"data": data_out
|
| 705 |
}
|
|
|
|
| 712 |
@app.get("/debug-score")
def debug_score(a: str, b: str):
    """Quick check for two strings."""
    # Normalize both inputs exactly the way the matchers do.
    a_norm, b_norm = norm_base(a), norm_base(b)
    a_base, b_base = extract_molecule_base(a), extract_molecule_base(b)
    a_tokens, b_tokens = set(token_set(a_base)), set(token_set(b_base))
    a_numbers, b_numbers = set(extract_numbers(a)), set(extract_numbers(b))

    result = hybrid_similarity_optimized(
        a_norm, a_base, a_tokens, a_numbers,
        b_norm, b_base, b_tokens, b_numbers
    )

    # Echo every intermediate form so scoring can be inspected end-to-end.
    return {
        "a": a,
        "b": b,
        "a_normalized": a_norm,
        "b_normalized": b_norm,
        "a_base": a_base,
        "b_base": b_base,
        "a_tokens": list(a_tokens),
        "b_tokens": list(b_tokens),
        "quick_filter_pass": quick_filter(a_tokens, b_tokens),
        "similarity": result
    }
|
| 743 |
|
| 744 |
|
| 745 |
+
@app.get("/")
def root():
    """Health check plus a short directory of available endpoints."""
    return {
        "status": "ok",
        "message": "OPTIMIZED version with pre-computation and batching",
        "endpoints": {
            "/match-difflib": "Standard matching",
            "/match-difflib-grouped": "Grouped matching (recommended)",
            "/match-difflib-debug": "Debug mode",
            "/debug-score": "Test two strings",
            "/test-extract-base": "Test molecule extraction"
        }
    }
|
| 758 |
+
|
| 759 |
|
| 760 |
if __name__ == "__main__":
    import uvicorn
    # NOTE: timeout_keep_alive keeps idle keep-alive connections open for
    # 600 s (10 minutes); it is not a per-request processing timeout.
    uvicorn.run(app, host="0.0.0.0", port=7860, timeout_keep_alive=600)
|