aniket9909 commited on
Commit
40ce7ee
·
verified ·
1 Parent(s): cee2f03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -299
app.py CHANGED
@@ -3,14 +3,16 @@ import json
3
  import re
4
  from pathlib import Path
5
  from typing import List, Optional, Dict, Any, Tuple
 
6
 
7
  import pandas as pd
8
  from fastapi import FastAPI, UploadFile, File, HTTPException, Query
9
- from fastapi.responses import JSONResponse
10
  import difflib
11
  from fastapi.middleware.cors import CORSMiddleware
 
12
 
13
- app = FastAPI(title="RFQ ↔ Product Master Matcher (difflib hybrid)")
14
 
15
  app.add_middleware(
16
  CORSMiddleware,
@@ -26,42 +28,52 @@ TEMPLATE_COLUMNS = [
26
  "current_brand_description", "generic_name", "annual_volume_qty", "quotation Price", "dosage form"
27
  ]
28
 
29
- # ---------- Normalization ----------
30
- UNIT_PATTERN = r"(mg|mcg|μg|µg|g|gm|kg|iu|i\.u\.|kiu|miu|ml|l|dl|%|w/w|w/v|v/v|microgram|milligram|gram|kilogram|liter|milliliter)"
 
 
 
 
 
 
 
 
31
 
 
 
 
 
 
32
 
 
 
 
 
33
  def norm_base(s: str) -> str:
34
  s = str(s or "")
35
  s = s.lower()
36
  s = s.replace("+", " ").replace("/", " ")
37
- # keep word chars, digits, ., %, /, +, -
38
- s = re.sub(r"[^\w\s.%/+-]", " ", s)
39
- s = re.sub(r"\s+", " ", s).strip()
40
  return s
41
 
42
 
43
- def extract_numbers(s: str) -> List[str]:
 
44
  s2 = norm_base(s)
45
-
46
- # Extract number+unit combinations (e.g., "500mg", "500 mg")
47
- num_unit = re.findall(
48
- r'\d+(?:\.\d+)?\s*(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl|%)', s2, flags=re.IGNORECASE)
49
-
50
- # Extract standalone numbers (e.g., "500")
51
- nums = re.findall(r'\d+(?:\.\d+)?', s2)
52
-
53
- # Combine and deduplicate
54
  all_numbers = num_unit + nums
55
- return sorted(set([x.strip() for x in all_numbers]))
56
 
57
 
58
- def token_set(s: str) -> List[str]:
59
- return [t for t in norm_base(s).split(" ") if t]
 
60
 
61
 
62
  # ---------- Synonyms / detection ----------
63
- SYNONYMS: Dict[str, List[str]] = {
64
- # RFQ → template mapping
65
  "generic_name": [
66
  "generic name", "generic", "molecule", "molecule name", "molecule with strength",
67
  "composition", "salt", "api", "active ingredient"
@@ -77,8 +89,6 @@ SYNONYMS: Dict[str, List[str]] = {
77
  "tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
78
  "category": ["category", "schedule", "section", "chapter", "dept"],
79
  "dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
80
-
81
- # Product master detection (support your original schema)
82
  "__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
83
  "__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
84
  "__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
@@ -86,10 +96,9 @@ SYNONYMS: Dict[str, List[str]] = {
86
 
87
  # ---------- Header mapping ----------
88
 
89
-
90
  def score_header(tcol: str, scol: str) -> float:
91
  tn, sn = norm_base(tcol), norm_base(scol)
92
- tset, sset = set(tn.split()), set(sn.split())
93
  jacc = (len(tset & sset) / len(tset | sset)) if (tset and sset) else 0.0
94
  contains = 1.0 if (tn in sn or sn in tn) else 0.0
95
  fuzzy = difflib.SequenceMatcher(None, tn, sn).ratio()
@@ -99,7 +108,7 @@ def score_header(tcol: str, scol: str) -> float:
99
  def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
100
  src_cols = [str(c) for c in src_cols]
101
  src_norm_map = {norm_base(c): c for c in src_cols}
102
- mapping: Dict[str, Optional[str]] = {}
103
  for tcol in target_cols:
104
  # 1) exact synonym
105
  for alias in SYNONYMS.get(tcol, []):
@@ -144,7 +153,7 @@ def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
144
  for nn, orig in norm_map.items():
145
  if n in nn or nn in n:
146
  return orig
147
- # fallback: score
148
  best_col, best_score = None, -1.0
149
  for c in cols:
150
  sc = score_header(logical_name, c)
@@ -154,7 +163,6 @@ def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
154
 
155
  # ---------- File reading ----------
156
 
157
-
158
  def guess_delimiter(sample: str) -> str:
159
  for d in ["\t", ";", "|", ","]:
160
  if d in sample:
@@ -163,16 +171,16 @@ def guess_delimiter(sample: str) -> str:
163
 
164
 
165
  def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
166
- keep = [c for c in df.columns if not str(c).startswith("Unnamed")]
167
  return df.loc[:, keep]
168
 
169
 
170
  def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
171
- df.columns = [str(c) for c in df.columns]
172
  return df
173
 
174
 
175
- def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
176
  best = {"score": -1, "df": None, "sheet": None,
177
  "header": None, "mapping": None}
178
  for sheet in xl.sheet_names:
@@ -183,12 +191,12 @@ def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
183
  if df.dropna(how="all").empty:
184
  continue
185
  df = ensure_str_columns(df)
186
- m = map_headers_auto(df.columns.tolist(), TEMPLATE_COLUMNS)
187
  score = sum(1 for v in m.values() if v is not None)
188
  if score > best["score"]:
189
- best = {"score": score, "df": df, "sheet": sheet,
190
  "header": header, "mapping": m}
191
- except:
192
  continue
193
  if best["df"] is None:
194
  raise ValueError("No readable tables found in the Excel workbook.")
@@ -197,17 +205,16 @@ def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
197
 
198
  def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
199
  ext = Path(filename).suffix.lower()
200
- if ext in [".xlsx", ".xls", ".xlsm", ".ods"]:
201
  xl = pd.ExcelFile(io.BytesIO(data))
202
  best = choose_best_sheet_and_header(xl)
203
  return best["df"]
204
  if ext in [".csv", ".tsv"]:
205
  text = data.decode("utf-8", errors="ignore")
206
- delim = guess_delimiter(text[:4096])
207
- return pd.read_csv(io.StringIO(text), sep=delim, engine="python")
208
  if ext == ".json":
209
  js = json.loads(data.decode("utf-8", errors="ignore"))
210
- # Accept both raw list and your original object with "data"
211
  if isinstance(js, list):
212
  return pd.DataFrame(js)
213
  if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
@@ -227,106 +234,120 @@ def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Opti
227
  [pd.NA]*len(src_df), index=src_df.index)
228
  return out, mapping
229
 
230
- # ---------- Hybrid difflib score ----------
231
-
232
 
 
233
  def extract_molecule_base(s: str) -> str:
234
  """Extract core molecule name by removing dosages, units, and forms."""
235
  s_norm = norm_base(s)
236
-
237
  # Step 1: Remove dosage forms FIRST
238
- forms = r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj|syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments|gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers|solution|solutions|ampule|ampules|amp|amps|vial|vials|via|bottle|bottles|bot|bots|sachet|sachets|sac|sacs|suppository|suppositories|sup|sups|patch|patches|pat|pats|lotion|lotions|respule|respules|res|pfs|kit|kits|num|nums|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b'
239
- s_norm = re.sub(forms, ' ', s_norm, flags=re.IGNORECASE)
240
-
241
- # Step 2: Remove number+unit patterns (handles "500mg" and "500 mg")
242
- s_norm = re.sub(r'\b\d+(?:\.\d+)?\s*(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl|%|w/w|w/v|v/v)\b',
243
- ' ', s_norm, flags=re. IGNORECASE)
244
-
245
  # Step 3: Remove fractions and ratios
246
- s_norm = re.sub(r'\d+\s*/\s*\d+', ' ', s_norm)
247
-
248
  # Step 4: Remove standalone numbers
249
- s_norm = re.sub(r'\b\d+(?:\.\d+)?\b', ' ', s_norm)
250
-
251
  # Step 5: Remove w/w, w/v, v/v
252
- s_norm = re.sub(r'\b[wv]\s*/\s*[wv]\b', ' ', s_norm, flags=re.IGNORECASE)
253
-
254
  # Step 6: Clean up spaces
255
- s_norm = re.sub(r'\s+', ' ', s_norm).strip()
256
-
257
  return s_norm
258
 
259
 
260
- def hybrid_similarity(a: str, b: str) -> Dict[str, float]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  """
262
- Enhanced similarity that STRONGLY prioritizes molecule base name.
263
- Different dosages of the same molecule should score 75-90%.
264
  """
265
- a_n, b_n = norm_base(a), norm_base(b)
266
-
267
  # Exact match = perfect score
268
- if a_n == b_n:
269
- return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "mol_base": 100.0, "score": 100.0}
270
-
271
- # 1. Full text difflib similarity
272
- diff = difflib.SequenceMatcher(None, a_n, b_n).ratio() * 100.0
273
-
274
  # 2. Token Jaccard similarity
275
- aset, bset = set(token_set(a)), set(token_set(b))
276
- jacc = (len(aset & bset) / len(aset | bset)
277
- * 100.0) if (aset and bset) else 0.0
278
-
279
  # 3. Number matching (bonus only)
280
- anums, bnums = extract_numbers(a), extract_numbers(b)
281
- num_match = 100.0 if (anums and bnums and set(anums)
282
- == set(bnums)) else 0.0
283
-
284
- # 4. CORE IMPROVEMENT: Molecule base matching
285
- a_mol_base = extract_molecule_base(a)
286
- b_mol_base = extract_molecule_base(b)
287
-
288
  mol_base_score = 0.0
289
-
290
- if a_mol_base and b_mol_base:
291
- # Exact base match (e.g., "amoxicillin" == "amoxicillin")
292
- if a_mol_base == b_mol_base:
293
  mol_base_score = 100.0
294
  else:
295
- # Fuzzy base match
296
- mol_base_diff = difflib.SequenceMatcher(
297
- None, a_mol_base, b_mol_base).ratio() * 100.0
298
-
299
- # Token overlap for molecule base
300
- base_tokens_a = set(a_mol_base.split())
301
- base_tokens_b = set(b_mol_base.split())
302
-
303
- if base_tokens_a and base_tokens_b:
304
- base_jacc = len(base_tokens_a & base_tokens_b) / \
305
- len(base_tokens_a | base_tokens_b) * 100.0
306
-
307
- # Weighted average favoring token overlap (handles multi-word molecules)
308
  mol_base_score = 0.40 * mol_base_diff + 0.60 * base_jacc
309
  else:
310
  mol_base_score = mol_base_diff
311
-
312
- # 5. ADJUSTED SCORING FORMULA
313
- # Scenario 1: Same molecule, same dosage → 95-100%
314
- # Scenario 2: Same molecule, different dosage 75-90%
315
- # Scenario 3: Different molecule → <60%
316
-
317
- if mol_base_score >= 95:
318
- # Perfect molecule match - prioritize heavily
319
- score = (0.60 * mol_base_score + # 60% weight on molecule base
320
- 0.20 * diff + # 20% on full text
321
- 0.15 * jacc + # 15% on tokens
322
- 0.05 * num_match) # 5% bonus for exact dosage
323
  else:
324
- # Partial molecule match - still favor base
325
- score = (0.50 * mol_base_score + # 50% weight on molecule base
326
- 0.25 * diff + # 25% on full text
327
- 0.20 * jacc + # 20% on tokens
328
- 0.05 * num_match) # 5% bonus
329
-
330
  return {
331
  "diff": round(diff, 2),
332
  "jacc": round(jacc, 2),
@@ -336,171 +357,172 @@ def hybrid_similarity(a: str, b: str) -> Dict[str, float]:
336
  }
337
 
338
 
339
- def match_generic_to_product_master(
 
 
340
  generic_list: List[str],
341
- pm_df: pd.DataFrame,
342
- molecule_col: str,
343
- brand_id_col: Optional[str],
344
- brand_name_col: Optional[str],
345
  min_score: float = 60.0,
346
- return_all: bool = False
 
347
  ) -> List[Dict[str, Any]]:
348
-
349
- subset = pm_df.dropna(subset=[molecule_col]).copy()
350
-
351
- mol_raw = subset[molecule_col].astype(str).tolist()
352
-
353
- # brand id list
354
- brand_ids = subset[brand_id_col].astype(str).tolist() \
355
- if brand_id_col and brand_id_col in subset.columns else [None]*len(subset)
356
-
357
- # brand/product name list (fallbacks handled automatically)
358
- brand_names = subset[brand_name_col].astype(str).tolist() \
359
- if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
360
-
361
- idxs = subset.index.tolist()
362
  results = []
363
-
364
- for i, g in enumerate(generic_list):
365
- g_str = str(g or "").strip()
366
- if not g_str:
367
- continue
368
-
369
- best_score, best_pos, best_parts = -1.0, None, None
370
-
371
- for pos, mol in enumerate(mol_raw):
372
-
373
- # 🔥 Match ONLY against molecule name (ignore brand)
374
- parts = hybrid_similarity(g_str, mol)
375
-
376
- if parts["score"] > best_score:
377
- best_score, best_pos, best_parts = parts["score"], pos, parts
378
-
379
- if best_pos is None:
380
- continue
381
-
382
- item = {
383
- "row_index": i,
384
- "generic_name": g_str,
385
- "matched_name": mol_raw[best_pos],
386
- "matched_brand_name": brand_names[best_pos],
387
- "match_percent": round(best_score, 2),
388
- "brand_id": brand_ids[best_pos],
389
- "brand_name": brand_names[best_pos],
390
- "master_row_index": int(idxs[best_pos]),
391
- }
392
-
393
- if return_all:
394
- item["_debug"] = best_parts
395
- results.append(item)
396
- else:
397
- if best_score >= min_score:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  results.append(item)
399
-
 
 
 
400
  return results
401
 
402
 
403
- # ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
404
 
405
-
406
- def match_generic_to_product_master_grouped_for_row(
407
  generic_value: str,
408
- pm_df: pd.DataFrame,
409
- molecule_col: str,
410
- brand_id_col: Optional[str],
411
- brand_name_col: Optional[str],
412
  min_score: float = 60.0,
413
  top_n: int = 3
414
  ) -> List[Dict[str, Any]]:
415
-
416
- subset = pm_df.dropna(subset=[molecule_col]).copy()
417
-
418
- mol_raw = subset[molecule_col].astype(str).tolist()
419
-
420
- brand_ids = subset[brand_id_col].astype(str).tolist() \
421
- if brand_id_col and brand_id_col in subset.columns else [None]*len(subset)
422
-
423
- brand_names = subset[brand_name_col].astype(str).tolist() \
424
- if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
425
-
426
  g_str = str(generic_value or "").strip()
427
  if not g_str:
428
  return []
429
-
 
 
 
 
 
 
430
  scored = []
431
-
432
- for idx, mol in enumerate(mol_raw):
433
-
434
- # 🔥 Match ONLY against molecule name (ignore brand)
435
- parts = hybrid_similarity(g_str, mol)
 
 
 
 
 
 
436
  score = parts["score"]
437
-
438
  if score >= min_score:
439
- scored.append({
440
- "matched_name": mol,
441
- "brand_name": brand_names[idx],
442
- "brand_id": brand_ids[idx],
443
  "match_percent": round(score, 2),
444
  "_debug": parts
445
  })
446
-
447
  scored.sort(key=lambda x: x["match_percent"], reverse=True)
448
-
449
  return scored[:top_n]
450
 
451
 
452
- # ---------- Endpoints ----------
453
-
454
 
455
  @app.post("/match-difflib")
456
  async def match_with_difflib(
457
  rfq_file: UploadFile = File(...),
458
  product_master_json: UploadFile = File(...),
459
- min_score: float = Query(
460
- 60.0, description="Minimum composite score (0-100)")
461
  ):
462
  try:
463
  # RFQ
464
  rfq_bytes = await rfq_file.read()
465
  rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
466
  mapped, mapping = build_mapped_rfq(rfq_df)
467
-
468
- if "generic_name" not in mapped.columns:
469
  raise HTTPException(
470
  status_code=400, detail="No 'generic_name' column found after mapping RFQ.")
471
-
472
  gen_series = mapped["generic_name"]
473
  nonempty_mask = gen_series.notna() & gen_series.astype(
474
  str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
475
  generic_list = gen_series[nonempty_mask].astype(str).tolist()
476
-
477
- # Product master (supports your original JSON shape)
478
- pm_bytes = await product_master_json.read()
479
  pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
480
  pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
481
-
482
- molecule_col = detect_single_column(
483
- pm_df, "__product_master_molecule__")
484
- # brand id: prefer brand_id, else id
485
- brand_id_col = detect_single_column(
486
- pm_df, "__product_master_brand_id__")
487
- # brand name: prefer brand_name, else brand, else product
488
- brand_name_col = detect_single_column(
489
- pm_df, "__product_master_brand_name__")
490
-
491
- if not molecule_col:
492
  raise HTTPException(
493
  status_code=400, detail="Could not detect molecule column in product master JSON.")
494
-
495
- matches = match_generic_to_product_master(
496
- generic_list, pm_df,
497
- molecule_col=molecule_col,
498
- brand_id_col=brand_id_col,
499
- brand_name_col=brand_name_col,
 
500
  min_score=min_score,
501
  return_all=False
502
  )
503
-
504
  return JSONResponse({
505
  "rfq_rows": int(nonempty_mask.sum()),
506
  "product_master_detected": {
@@ -508,6 +530,7 @@ async def match_with_difflib(
508
  "brand_id_col": brand_id_col,
509
  "brand_name_col": brand_name_col
510
  },
 
511
  "matches_returned": len(matches),
512
  "data": matches
513
  })
@@ -522,13 +545,13 @@ def test_extract_base(text: str):
522
  """Test molecule base extraction"""
523
  normalized = norm_base(text)
524
  mol_base = extract_molecule_base(text)
525
-
526
  return {
527
  "original": text,
528
  "normalized": normalized,
529
  "molecule_base": mol_base,
530
- "numbers_extracted": extract_numbers(text),
531
- "tokens": token_set(text)
532
  }
533
 
534
 
@@ -536,54 +559,48 @@ def test_extract_base(text: str):
536
  async def match_with_difflib_debug(
537
  rfq_file: UploadFile = File(...),
538
  product_master_json: UploadFile = File(...),
539
- sample: int = Query(5, ge=1, le=200),
540
  min_score: float = Query(60.0),
541
- sample_contains: str = Query(
542
- "", description="Filter RFQ rows by substring (case-insensitive)")
543
  ):
544
  """
545
- Diagnostics: return BEST match (+%) for the first N RFQ rows, optionally filtered by text.
546
- Always returns best match, even if below min_score, so you can inspect behavior.
547
  """
548
- try:
549
  # RFQ
550
  rfq_bytes = await rfq_file.read()
551
  rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
552
  mapped, mapping = build_mapped_rfq(rfq_df)
553
-
554
- gen_series = mapped.get("generic_name", pd.Series([], dtype=object))
555
  nonempty_mask = gen_series.notna() & gen_series.astype(
556
  str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
557
  generic_list_all = gen_series[nonempty_mask].astype(str)
558
-
559
  if sample_contains:
560
- flt = generic_list_all.str.contains(
561
- sample_contains, case=False, na=False)
562
- generic_list = generic_list_all[flt].tolist()[:sample]
563
  else:
564
  generic_list = generic_list_all.tolist()[:sample]
565
-
566
  # Product master
567
  pm_bytes = await product_master_json.read()
568
  pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
569
  pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
570
-
571
- molecule_col = detect_single_column(
572
- pm_df, "__product_master_molecule__")
573
- brand_id_col = detect_single_column(
574
- pm_df, "__product_master_brand_id__")
575
- brand_name_col = detect_single_column(
576
- pm_df, "__product_master_brand_name__")
577
-
578
- demo_matches = match_generic_to_product_master(
579
- generic_list, pm_df,
580
- molecule_col=molecule_col,
581
- brand_id_col=brand_id_col,
582
- brand_name_col=brand_name_col,
583
  min_score=min_score,
584
  return_all=True
585
  )
586
-
587
  return JSONResponse({
588
  "rfq_detected_headers": list(map(str, rfq_df.columns)),
589
  "template_mapping": mapping,
@@ -593,7 +610,8 @@ async def match_with_difflib_debug(
593
  "brand_id_col": brand_id_col,
594
  "brand_name_col": brand_name_col
595
  },
596
- "filter": sample_contains or None,
 
597
  "examples": demo_matches
598
  })
599
  except HTTPException:
@@ -601,8 +619,6 @@ async def match_with_difflib_debug(
601
  except Exception as e:
602
  raise HTTPException(status_code=500, detail=str(e))
603
 
604
- # ---------- NEW: Grouped endpoint ----------
605
-
606
 
607
  @app.post("/match-difflib-grouped")
608
  async def match_with_difflib_grouped(
@@ -612,73 +628,78 @@ async def match_with_difflib_grouped(
612
  top_n: int = Query(3, description="Max number of matches per RFQ row")
613
  ):
614
  """
615
- Return ALL extracted RFQ rows (template-aligned fields), each with a `matches` array of
616
- product master molecules (matched_name, match_percent, brand_id, brand_name) scoring ≥ min_score.
617
- Rows with no matches still appear with an empty `matches` list.
618
  """
619
  try:
620
  # RFQ
621
  rfq_bytes = await rfq_file.read()
622
  rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
623
  mapped, mapping = build_mapped_rfq(rfq_df)
624
- # Ensure columns exist even if not mapped
625
  for col in TEMPLATE_COLUMNS:
626
  if col not in mapped.columns:
627
  mapped[col] = pd.NA
628
-
629
  # Product master
630
  pm_bytes = await product_master_json.read()
631
  pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
632
  pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))
633
-
634
- molecule_col = detect_single_column(
635
- pm_df, "__product_master_molecule__")
636
- brand_id_col = detect_single_column(
637
- pm_df, "__product_master_brand_id__")
638
- brand_name_col = detect_single_column(
639
- pm_df, "__product_master_brand_name__")
640
  if not molecule_col:
641
  raise HTTPException(
642
  status_code=400, detail="Could not detect molecule column in product master JSON.")
643
-
644
- # Build response data: include every RFQ row as extracted, plus matches
 
 
 
645
  data_out = []
646
  match_rows_with_any = 0
647
-
648
- # Work only with the same index order; keep all rows
 
 
649
  for idx, row in mapped.iterrows():
650
- # serialize RFQ row (template-aligned)
651
- rfq_record = {col: (None if pd.isna(row.get(col)) else str(
652
- row.get(col))) for col in TEMPLATE_COLUMNS}
653
-
654
- # compute matches based on this row's generic_name
 
655
  g_val = rfq_record.get("generic_name") or ""
656
- matches = match_generic_to_product_master_grouped_for_row(
 
 
657
  generic_value=g_val,
658
- pm_df=pm_df,
659
- molecule_col=molecule_col,
660
- brand_id_col=brand_id_col,
661
- brand_name_col=brand_name_col,
662
  min_score=min_score,
663
  top_n=top_n
664
  )
 
665
  if matches:
666
  match_rows_with_any += 1
667
-
668
  data_out.append({
669
  "row_index": int(idx),
670
- # ALL extracted fields (id, generic_name, annual_volume_qty, etc.)
671
- "rfq": rfq_record,
672
- "matches": matches # zero or more matches
673
  })
674
-
 
 
675
  return {
676
- "rfq_rows": int(len(mapped)),
677
  "product_master_detected": {
678
  "molecule_col": molecule_col,
679
  "brand_id_col": brand_id_col,
680
  "brand_name_col": brand_name_col
681
  },
 
682
  "rows_with_matches": match_rows_with_any,
683
  "data": data_out
684
  }
@@ -691,13 +712,52 @@ async def match_with_difflib_grouped(
691
  @app.get("/debug-score")
692
  def debug_score(a: str, b: str):
693
  """Quick check for two strings."""
694
- return hybrid_similarity(a, b)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
 
696
 
697
- @app.get("/")
698
  def root():
699
- return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
 
 
 
 
 
 
 
 
 
 
 
700
 
701
  if __name__ == "__main__":
702
  import uvicorn
703
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
3
  import re
4
  from pathlib import Path
5
  from typing import List, Optional, Dict, Any, Tuple
6
+ from functools import lru_cache
7
 
8
  import pandas as pd
9
  from fastapi import FastAPI, UploadFile, File, HTTPException, Query
10
+ from fastapi.responses import JSONResponse, StreamingResponse
11
  import difflib
12
  from fastapi.middleware.cors import CORSMiddleware
13
+ import asyncio
14
 
15
+ app = FastAPI(title="RFQ ↔ Product Master Matcher (difflib hybrid - Optimized)")
16
 
17
  app.add_middleware(
18
  CORSMiddleware,
 
28
  "current_brand_description", "generic_name", "annual_volume_qty", "quotation Price", "dosage form"
29
  ]
30
 
31
+ # ---------- OPTIMIZED: Compile regex patterns once at module level ----------
32
+ UNIT_PATTERN_COMPILED = re.compile(
33
+ r'\b\d+(? :\.\d+)?\s*(?:mg|mcg|μg|µg|gm? |kg|iu|i\. u\.|kiu|miu|ml|l|dl|%|w/w|w/v|v/v|microgram|milligram|gram|kilogram|liter|milliliter)\b',
34
+ re.IGNORECASE
35
+ )
36
+
37
+ FORMS_PATTERN_COMPILED = re. compile(
38
+ r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj|syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments|gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers|solution|solutions|ampule|ampules|amp|amps|vial|vials|via|bottle|bottles|bot|bots|sachet|sachets|sac|sacs|suppository|suppositories|sup|sups|patch|patches|pat|pats|lotion|lotions|respule|respules|res|pfs|kit|kits|num|nums|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b',
39
+ re.IGNORECASE
40
+ )
41
 
42
+ FRACTION_PATTERN = re.compile(r'\d+\s*/\s*\d+')
43
+ STANDALONE_NUM_PATTERN = re.compile(r'\b\d+(? :\.\d+)?\b')
44
+ WV_PATTERN = re.compile(r'\b[wv]\s*/\s*[wv]\b', re.IGNORECASE)
45
+ WHITESPACE_PATTERN = re.compile(r'\s+')
46
+ NON_WORD_PATTERN = re.compile(r'[^\w\s. %/+-]')
47
 
48
+ # ---------- Normalization ----------
49
+
50
+ # OPTIMIZED: Use lru_cache for frequently repeated strings
51
+ @lru_cache(maxsize=10000)
52
  def norm_base(s: str) -> str:
53
  s = str(s or "")
54
  s = s.lower()
55
  s = s.replace("+", " ").replace("/", " ")
56
+ s = NON_WORD_PATTERN. sub(" ", s)
57
+ s = WHITESPACE_PATTERN.sub(" ", s).strip()
 
58
  return s
59
 
60
 
61
+ @lru_cache(maxsize=10000)
62
+ def extract_numbers(s: str) -> Tuple[str, ... ]: # Return tuple for hashability
63
  s2 = norm_base(s)
64
+ num_unit = UNIT_PATTERN_COMPILED.findall(s2)
65
+ nums = STANDALONE_NUM_PATTERN.findall(s2)
 
 
 
 
 
 
 
66
  all_numbers = num_unit + nums
67
+ return tuple(sorted(set([x. strip() for x in all_numbers])))
68
 
69
 
70
+ @lru_cache(maxsize=10000)
71
+ def token_set(s: str) -> Tuple[str, ...]: # Return tuple for hashability
72
+ return tuple(t for t in norm_base(s).split(" ") if t)
73
 
74
 
75
  # ---------- Synonyms / detection ----------
76
+ SYNONYMS: Dict[str, List[str]] = {
 
77
  "generic_name": [
78
  "generic name", "generic", "molecule", "molecule name", "molecule with strength",
79
  "composition", "salt", "api", "active ingredient"
 
89
  "tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
90
  "category": ["category", "schedule", "section", "chapter", "dept"],
91
  "dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
 
 
92
  "__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
93
  "__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
94
  "__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
 
96
 
97
  # ---------- Header mapping ----------
98
 
 
99
  def score_header(tcol: str, scol: str) -> float:
100
  tn, sn = norm_base(tcol), norm_base(scol)
101
+ tset, sset = set(tn. split()), set(sn.split())
102
  jacc = (len(tset & sset) / len(tset | sset)) if (tset and sset) else 0.0
103
  contains = 1.0 if (tn in sn or sn in tn) else 0.0
104
  fuzzy = difflib.SequenceMatcher(None, tn, sn).ratio()
 
108
  def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
109
  src_cols = [str(c) for c in src_cols]
110
  src_norm_map = {norm_base(c): c for c in src_cols}
111
+ mapping: Dict[str, Optional[str]] = {}
112
  for tcol in target_cols:
113
  # 1) exact synonym
114
  for alias in SYNONYMS.get(tcol, []):
 
153
  for nn, orig in norm_map.items():
154
  if n in nn or nn in n:
155
  return orig
156
+ # fallback: score
157
  best_col, best_score = None, -1.0
158
  for c in cols:
159
  sc = score_header(logical_name, c)
 
163
 
164
  # ---------- File reading ----------
165
 
 
166
  def guess_delimiter(sample: str) -> str:
167
  for d in ["\t", ";", "|", ","]:
168
  if d in sample:
 
171
 
172
 
173
  def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
174
+ keep = [c for c in df. columns if not str(c).startswith("Unnamed")]
175
  return df.loc[:, keep]
176
 
177
 
178
  def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
179
+ df. columns = [str(c) for c in df.columns]
180
  return df
181
 
182
 
183
+ def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
184
  best = {"score": -1, "df": None, "sheet": None,
185
  "header": None, "mapping": None}
186
  for sheet in xl.sheet_names:
 
191
  if df.dropna(how="all").empty:
192
  continue
193
  df = ensure_str_columns(df)
194
+ m = map_headers_auto(df.columns. tolist(), TEMPLATE_COLUMNS)
195
  score = sum(1 for v in m.values() if v is not None)
196
  if score > best["score"]:
197
+ best = {"score": score, "df": df, "sheet": sheet,
198
  "header": header, "mapping": m}
199
+ except:
200
  continue
201
  if best["df"] is None:
202
  raise ValueError("No readable tables found in the Excel workbook.")
 
205
 
206
  def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
207
  ext = Path(filename).suffix.lower()
208
+ if ext in [". xlsx", ".xls", ".xlsm", ". ods"]:
209
  xl = pd.ExcelFile(io.BytesIO(data))
210
  best = choose_best_sheet_and_header(xl)
211
  return best["df"]
212
  if ext in [".csv", ".tsv"]:
213
  text = data.decode("utf-8", errors="ignore")
214
+ delim = guess_delimiter(text[: 4096])
215
+ return pd.read_csv(io. StringIO(text), sep=delim, engine="python")
216
  if ext == ".json":
217
  js = json.loads(data.decode("utf-8", errors="ignore"))
 
218
  if isinstance(js, list):
219
  return pd.DataFrame(js)
220
  if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
 
234
  [pd.NA]*len(src_df), index=src_df.index)
235
  return out, mapping
236
 
237
+ # ---------- OPTIMIZED: Molecule extraction with caching ----------
 
238
 
239
+ @lru_cache(maxsize=10000)
240
  def extract_molecule_base(s: str) -> str:
241
  """Extract core molecule name by removing dosages, units, and forms."""
242
  s_norm = norm_base(s)
243
+
244
  # Step 1: Remove dosage forms FIRST
245
+ s_norm = FORMS_PATTERN_COMPILED.sub(' ', s_norm)
246
+
247
+ # Step 2: Remove number+unit patterns
248
+ s_norm = UNIT_PATTERN_COMPILED.sub(' ', s_norm)
249
+
 
 
250
  # Step 3: Remove fractions and ratios
251
+ s_norm = FRACTION_PATTERN. sub(' ', s_norm)
252
+
253
  # Step 4: Remove standalone numbers
254
+ s_norm = STANDALONE_NUM_PATTERN.sub(' ', s_norm)
255
+
256
  # Step 5: Remove w/w, w/v, v/v
257
+ s_norm = WV_PATTERN.sub(' ', s_norm)
258
+
259
  # Step 6: Clean up spaces
260
+ s_norm = WHITESPACE_PATTERN.sub(' ', s_norm).strip()
261
+
262
  return s_norm
263
 
264
 
265
# ---------- OPTIMIZED: Pre-computed product master ----------


class PrecomputedProductMaster:
    """Pre-compute all expensive per-row work for the product master once.

    Normalized strings, molecule bases, token sets and number sets are
    built a single time, so each RFQ row only pays for comparisons.
    """

    def __init__(self, pm_df: pd.DataFrame, molecule_col: str,
                 brand_id_col: Optional[str], brand_name_col: Optional[str]):
        # Rows without a molecule name cannot be matched at all.
        subset = pm_df.dropna(subset=[molecule_col]).copy()

        # Raw columns. NOTE(review): NaN brand values become the string
        # "nan" via astype(str) — confirm whether callers rely on that.
        self.molecule_col = molecule_col
        self.mol_raw = subset[molecule_col].astype(str).tolist()
        if brand_id_col and brand_id_col in subset.columns:
            self.brand_ids = subset[brand_id_col].astype(str).tolist()
        else:
            self.brand_ids = [None] * len(subset)
        if brand_name_col and brand_name_col in subset.columns:
            self.brand_names = subset[brand_name_col].astype(str).tolist()
        else:
            self.brand_names = [None] * len(subset)
        self.idxs = subset.index.tolist()

        # Expensive normalizations, done exactly once per master row.
        print(f"Pre-computing {len(self.mol_raw)} product master entries...")
        self.mol_norm = [norm_base(m) for m in self.mol_raw]
        self.mol_base = [extract_molecule_base(m) for m in self.mol_raw]
        self.mol_tokens = [set(token_set(mb)) for mb in self.mol_base]
        self.mol_numbers = [set(extract_numbers(m)) for m in self.mol_raw]
        print("Pre-computation complete!")

    def __len__(self) -> int:
        return len(self.mol_raw)
293
+
294
+
295
# ---------- OPTIMIZED: Fast pre-filter ----------


def quick_filter(g_tokens: set, pm_tokens: set, threshold: float = 0.15) -> bool:
    """Cheap Jaccard-overlap gate used to skip obvious non-matches.

    Returns True when the token overlap ratio reaches ``threshold``;
    either side being empty is an automatic reject.
    """
    if not g_tokens or not pm_tokens:
        return False
    overlap = len(g_tokens & pm_tokens) / len(g_tokens | pm_tokens)
    return overlap >= threshold
303
+
304
+
305
# ---------- OPTIMIZED: Hybrid similarity with pre-computed data ----------


def hybrid_similarity_optimized(
    g_norm: str, g_base: str, g_tokens: set, g_numbers: set,
    pm_norm: str, pm_base: str, pm_tokens: set, pm_numbers: set
) -> Dict[str, float]:
    """Enhanced similarity using pre-computed normalized forms.

    Returns a dict with the component scores ("diff", "jacc", "num",
    "mol_base") and the weighted composite "score", all 0-100 and
    rounded to 2 decimals.
    """
    # Exact match = perfect score (fast path, no difflib work).
    if g_norm == pm_norm:
        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "mol_base": 100.0, "score": 100.0}

    # 1. Full-text difflib similarity on the normalized strings.
    diff = difflib.SequenceMatcher(None, g_norm, pm_norm).ratio() * 100.0

    # 2. Token Jaccard similarity (0 when either side has no tokens).
    jacc = (len(g_tokens & pm_tokens) / len(g_tokens | pm_tokens) * 100.0) if (g_tokens and pm_tokens) else 0.0

    # 3. Number matching: all-or-nothing bonus — the full number sets
    #    (dosages etc.) must be identical and non-empty.
    num_match = 100.0 if (g_numbers and pm_numbers and g_numbers == pm_numbers) else 0.0

    # 4. Molecule-base matching: blend of difflib ratio and token Jaccard
    #    on the dosage/form-stripped names.
    mol_base_score = 0.0
    if g_base and pm_base:
        if g_base == pm_base:
            mol_base_score = 100.0
        else:
            mol_base_diff = difflib.SequenceMatcher(None, g_base, pm_base).ratio() * 100.0
            base_tokens_g = set(g_base.split())
            base_tokens_pm = set(pm_base.split())
            if base_tokens_g and base_tokens_pm:
                base_jacc = len(base_tokens_g & base_tokens_pm) / len(base_tokens_g | base_tokens_pm) * 100.0
                mol_base_score = 0.40 * mol_base_diff + 0.60 * base_jacc
            else:
                mol_base_score = mol_base_diff

    # 5. Composite: weight the molecule base more heavily when it is a
    #    near-certain match (>= 95).
    if mol_base_score >= 95:
        score = (0.60 * mol_base_score + 0.20 * diff + 0.15 * jacc + 0.05 * num_match)
    else:
        score = (0.50 * mol_base_score + 0.25 * diff + 0.20 * jacc + 0.05 * num_match)

    # Key set mirrors the exact-match early return above.
    return {
        "diff": round(diff, 2),
        "jacc": round(jacc, 2),
        "num": round(num_match, 2),
        "mol_base": round(mol_base_score, 2),
        "score": round(score, 2),
    }
358
 
359
 
360
# ---------- OPTIMIZED: Batch matching ----------


def match_generic_to_product_master_optimized(
    generic_list: List[str],
    pm: PrecomputedProductMaster,
    min_score: float = 60.0,
    return_all: bool = False,
    batch_size: int = 100
) -> List[Dict[str, Any]]:
    """Match every RFQ generic name against the pre-computed product master.

    Returns one best-match record per RFQ value; rows that are empty or
    (when ``return_all`` is False) score below ``min_score`` are dropped.
    ``row_index`` is the position inside ``generic_list``.
    """
    results = []
    total = len(generic_list)

    for batch_start in range(0, total, batch_size):
        batch_end = min(batch_start + batch_size, total)
        batch = generic_list[batch_start:batch_end]

        # Progress log every 500 rows (stdout; no logger configured here).
        if batch_start % 500 == 0:
            print(f"Processing RFQ rows {batch_start}-{batch_end} of {total}...")

        for i_in_batch, g in enumerate(batch):
            i = batch_start + i_in_batch
            g_str = str(g or "").strip()
            if not g_str:
                continue

            # Pre-compute the RFQ side once per row.
            g_norm = norm_base(g_str)
            g_base = extract_molecule_base(g_str)
            g_tokens = set(token_set(g_base))
            g_numbers = set(extract_numbers(g_str))

            best_score, best_pos, best_parts = -1.0, None, None

            for pos in range(len(pm)):
                # Cheap token-overlap gate before the expensive scoring.
                if not quick_filter(g_tokens, pm.mol_tokens[pos]):
                    continue

                parts = hybrid_similarity_optimized(
                    g_norm, g_base, g_tokens, g_numbers,
                    pm.mol_norm[pos], pm.mol_base[pos], pm.mol_tokens[pos], pm.mol_numbers[pos]
                )

                if parts["score"] > best_score:
                    best_score, best_pos, best_parts = parts["score"], pos, parts

            # Every candidate was filtered out: nothing to report.
            if best_pos is None:
                continue

            # NOTE: matched_brand_name/brand_name are intentionally the
            # same value — kept for response-shape compatibility.
            item = {
                "row_index": i,
                "generic_name": g_str,
                "matched_name": pm.mol_raw[best_pos],
                "matched_brand_name": pm.brand_names[best_pos],
                "match_percent": round(best_score, 2),
                "brand_id": pm.brand_ids[best_pos],
                "brand_name": pm.brand_names[best_pos],
                "master_row_index": int(pm.idxs[best_pos]),
            }

            if return_all:
                item["_debug"] = best_parts
                results.append(item)
            else:
                if best_score >= min_score:
                    results.append(item)

    return results
431
 
432
 
433
# ---------- OPTIMIZED: Grouped matcher ----------


def match_generic_to_product_master_grouped_for_row_optimized(
    generic_value: str,
    pm: PrecomputedProductMaster,
    min_score: float = 60.0,
    top_n: int = 3
) -> List[Dict[str, Any]]:
    """Return up to ``top_n`` candidates (score >= ``min_score``) for one RFQ value,
    sorted by descending match_percent. Empty input yields an empty list."""
    g_str = str(generic_value or "").strip()
    if not g_str:
        return []

    # Pre-compute the RFQ side once.
    g_norm = norm_base(g_str)
    g_base = extract_molecule_base(g_str)
    g_tokens = set(token_set(g_base))
    g_numbers = set(extract_numbers(g_str))

    scored = []

    for idx in range(len(pm)):
        # Cheap token-overlap gate before the expensive scoring.
        if not quick_filter(g_tokens, pm.mol_tokens[idx]):
            continue

        parts = hybrid_similarity_optimized(
            g_norm, g_base, g_tokens, g_numbers,
            pm.mol_norm[idx], pm.mol_base[idx], pm.mol_tokens[idx], pm.mol_numbers[idx]
        )
        score = parts["score"]

        if score >= min_score:
            scored.append({
                "matched_name": pm.mol_raw[idx],
                "brand_name": pm.brand_names[idx],
                "brand_id": pm.brand_ids[idx],
                "match_percent": round(score, 2),
                "_debug": parts
            })

    scored.sort(key=lambda x: x["match_percent"], reverse=True)
    return scored[:top_n]
478
 
479
 
480
# ---------- OPTIMIZED Endpoints ----------


@app.post("/match-difflib")
async def match_with_difflib(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    min_score: float = Query(60.0, description="Minimum composite score (0-100)")
):
    """Match RFQ generic names against the product master; one best match per row."""
    try:
        # RFQ: parse upload and map its headers onto the template columns.
        rfq_bytes = await rfq_file.read()
        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
        mapped, mapping = build_mapped_rfq(rfq_df)

        if "generic_name" not in mapped.columns:
            raise HTTPException(
                status_code=400, detail="No 'generic_name' column found after mapping RFQ.")

        # Keep only rows with a real generic name ("<na>" is pandas' NA repr).
        gen_series = mapped["generic_name"]
        nonempty_mask = gen_series.notna() & gen_series.astype(
            str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
        generic_list = gen_series[nonempty_mask].astype(str).tolist()

        # Product master
        pm_bytes = await product_master_json.read()
        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")

        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        # Pre-compute the product master once, then run the batch matcher.
        pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)

        matches = match_generic_to_product_master_optimized(
            generic_list, pm,
            min_score=min_score,
            return_all=False
        )

        return JSONResponse({
            "rfq_rows": int(nonempty_mask.sum()),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "product_master_size": len(pm),
            "matches_returned": len(matches),
            "data": matches
        })
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
545
@app.get("/test-extract-base")
def test_extract_base(text: str = Query(..., description="Text to run through the normalizers")):
    """Test molecule base extraction"""
    # NOTE(review): the route decorator and signature were truncated in the
    # stored file; reconstructed from the endpoint listing in root() and the
    # body's use of `text` — confirm method (GET vs POST) against callers.
    normalized = norm_base(text)
    mol_base = extract_molecule_base(text)

    return {
        "original": text,
        "normalized": normalized,
        "molecule_base": mol_base,
        "numbers_extracted": list(extract_numbers(text)),
        "tokens": list(token_set(text))
    }
556
 
557
 
 
559
@app.post("/match-difflib-debug")
async def match_with_difflib_debug(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    sample: int = Query(5, ge=1, le=200),
    min_score: float = Query(60.0),
    sample_contains: str = Query("", description="Filter RFQ rows by substring (case-insensitive)")
):
    """
    Diagnostics: return BEST match (+%) for the first N RFQ rows, optionally filtered by text.
    """
    try:
        # RFQ: parse upload and map its headers onto the template columns.
        rfq_bytes = await rfq_file.read()
        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
        mapped, mapping = build_mapped_rfq(rfq_df)

        # Tolerate a missing generic_name column: fall back to empty series.
        gen_series = mapped.get("generic_name", pd.Series([], dtype=object))
        nonempty_mask = gen_series.notna() & gen_series.astype(
            str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
        generic_list_all = gen_series[nonempty_mask].astype(str)

        if sample_contains:
            flt = generic_list_all.str.contains(sample_contains, case=False, na=False)
            generic_list = generic_list_all[flt].tolist()[:sample]
        else:
            generic_list = generic_list_all.tolist()[:sample]

        # Product master
        pm_bytes = await product_master_json.read()
        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")

        # Fail fast like the other endpoints instead of crashing inside
        # PrecomputedProductMaster when column detection fails.
        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)

        demo_matches = match_generic_to_product_master_optimized(
            generic_list, pm,
            min_score=min_score,
            return_all=True
        )

        return JSONResponse({
            "rfq_detected_headers": list(map(str, rfq_df.columns)),
            "template_mapping": mapping,
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "product_master_size": len(pm),
            "filter": sample_contains or None,
            "examples": demo_matches
        })
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
621
 
 
 
622
 
623
@app.post("/match-difflib-grouped")
async def match_with_difflib_grouped(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    min_score: float = Query(60.0, description="Minimum composite score (0-100)"),
    top_n: int = Query(3, description="Max number of matches per RFQ row")
):
    """
    Return ALL extracted RFQ rows with matches array.
    OPTIMIZED version with pre-computation and batching.
    """
    try:
        # RFQ: parse upload and map its headers onto the template columns.
        rfq_bytes = await rfq_file.read()
        rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
        mapped, mapping = build_mapped_rfq(rfq_df)

        # Guarantee every template column exists so rfq_record is complete.
        for col in TEMPLATE_COLUMNS:
            if col not in mapped.columns:
                mapped[col] = pd.NA

        # Product master
        pm_bytes = await product_master_json.read()
        pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(pm_df, "__product_master_brand_name__")

        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        # Pre-compute the product master once for all rows.
        pm = PrecomputedProductMaster(pm_df, molecule_col, brand_id_col, brand_name_col)

        data_out = []
        match_rows_with_any = 0
        total = len(mapped)

        print(f"Processing {total} RFQ rows against {len(pm)} products...")

        # NOTE(review): `idx % 100` and `int(idx)` assume a RangeIndex on
        # `mapped` — confirm build_mapped_rfq never returns a labeled index.
        for idx, row in mapped.iterrows():
            if idx % 100 == 0:
                print(f"Processing RFQ row {idx}/{total}...")

            rfq_record = {col: (None if pd.isna(row.get(col)) else str(row.get(col)))
                          for col in TEMPLATE_COLUMNS}

            g_val = rfq_record.get("generic_name") or ""

            matches = match_generic_to_product_master_grouped_for_row_optimized(
                generic_value=g_val,
                pm=pm,
                min_score=min_score,
                top_n=top_n
            )

            if matches:
                match_rows_with_any += 1

            data_out.append({
                "row_index": int(idx),
                "rfq": rfq_record,
                "matches": matches
            })

        print(f"Completed! {match_rows_with_any}/{total} rows had matches.")

        return {
            "rfq_rows": int(len(mapped)),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "product_master_size": len(pm),
            "rows_with_matches": match_rows_with_any,
            "data": data_out
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
712
@app.get("/debug-score")
def debug_score(a: str, b: str):
    """Quick check for two strings: runs both through the full normalization
    pipeline and returns every intermediate plus the composite similarity."""
    # Pre-compute both sides exactly as the matchers do.
    a_norm = norm_base(a)
    a_base = extract_molecule_base(a)
    a_tokens = set(token_set(a_base))
    a_numbers = set(extract_numbers(a))

    b_norm = norm_base(b)
    b_base = extract_molecule_base(b)
    b_tokens = set(token_set(b_base))
    b_numbers = set(extract_numbers(b))

    result = hybrid_similarity_optimized(
        a_norm, a_base, a_tokens, a_numbers,
        b_norm, b_base, b_tokens, b_numbers
    )

    return {
        "a": a,
        "b": b,
        "a_normalized": a_norm,
        "b_normalized": b_norm,
        "a_base": a_base,
        "b_base": b_base,
        "a_tokens": list(a_tokens),
        "b_tokens": list(b_tokens),
        "quick_filter_pass": quick_filter(a_tokens, b_tokens),
        "similarity": result
    }
743
 
744
 
745
@app.get("/")
def root():
    """Service banner: reports status and lists the available endpoints."""
    return {
        "status": "ok",
        "message": "OPTIMIZED version with pre-computation and batching",
        "endpoints": {
            "/match-difflib": "Standard matching",
            "/match-difflib-grouped": "Grouped matching (recommended)",
            "/match-difflib-debug": "Debug mode",
            "/debug-score": "Test two strings",
            "/test-extract-base": "Test molecule extraction"
        }
    }
+
759
 
760
if __name__ == "__main__":
    import uvicorn
    # timeout_keep_alive is the idle keep-alive timeout for persistent
    # connections (raised to 10 minutes for long matching jobs); it is
    # NOT a per-request processing timeout.
    uvicorn.run(app, host="0.0.0.0", port=7860, timeout_keep_alive=600)