anujakkulkarni committed on
Commit
221a9b2
·
verified ·
1 Parent(s): 85d1af4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -51
app.py CHANGED
@@ -295,8 +295,8 @@ def match_generic_to_product_master(
295
  # ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
296
 
297
 
298
- def match_generic_to_product_master_grouped(
299
- generic_list: List[str],
300
  pm_df: pd.DataFrame,
301
  molecule_col: str,
302
  brand_id_col: Optional[str],
@@ -304,10 +304,7 @@ def match_generic_to_product_master_grouped(
304
  min_score: float = 60.0,
305
  top_n: int = 3
306
  ) -> List[Dict[str, Any]]:
307
- """
308
- For each RFQ generic name, return an array of up to top_n matches from product master
309
- with score >= min_score.
310
- """
311
  subset = pm_df.dropna(subset=[molecule_col]).copy()
312
  mol_raw = subset[molecule_col].astype(str).tolist()
313
  brand_ids = subset[brand_id_col].astype(str).tolist(
@@ -315,29 +312,23 @@ def match_generic_to_product_master_grouped(
315
  brand_names = subset[brand_name_col].astype(str).tolist(
316
  ) if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
317
 
318
- grouped_results = []
319
- for g in generic_list:
320
- g_str = str(g or "").strip()
321
- if not g_str:
322
- continue
323
- scored = []
324
- for idx, cand in enumerate(mol_raw):
325
- parts = hybrid_similarity(g_str, cand)
326
- score = parts["score"]
327
- if score >= min_score:
328
- scored.append({
329
- "matched_name": cand,
330
- "match_percent": round(score, 2),
331
- "brand_id": brand_ids[idx],
332
- "brand_name": brand_names[idx]
333
- })
334
- scored.sort(key=lambda x: x["match_percent"], reverse=True)
335
- if scored:
336
- grouped_results.append({
337
- "generic_name": g_str,
338
- "matches": scored[:top_n]
339
  })
340
- return grouped_results
 
341
 
342
  # ---------- Endpoints ----------
343
 
@@ -487,22 +478,19 @@ async def match_with_difflib_grouped(
487
  top_n: int = Query(3, description="Max number of matches per RFQ row")
488
  ):
489
  """
490
- Return grouped matches: one RFQ generic_name array of matched product master molecules.
 
 
491
  """
492
  try:
493
  # RFQ
494
  rfq_bytes = await rfq_file.read()
495
  rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
496
  mapped, mapping = build_mapped_rfq(rfq_df)
497
-
498
- if "generic_name" not in mapped.columns:
499
- raise HTTPException(
500
- status_code=400, detail="No 'generic_name' column found after mapping RFQ.")
501
-
502
- gen_series = mapped["generic_name"]
503
- nonempty_mask = gen_series.notna() & gen_series.astype(
504
- str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
505
- generic_list = gen_series[nonempty_mask].astype(str).tolist()
506
 
507
  # Product master
508
  pm_bytes = await product_master_json.read()
@@ -515,29 +503,50 @@ async def match_with_difflib_grouped(
515
  pm_df, "__product_master_brand_id__")
516
  brand_name_col = detect_single_column(
517
  pm_df, "__product_master_brand_name__")
518
-
519
  if not molecule_col:
520
  raise HTTPException(
521
  status_code=400, detail="Could not detect molecule column in product master JSON.")
522
 
523
- grouped_matches = match_generic_to_product_master_grouped(
524
- generic_list, pm_df,
525
- molecule_col=molecule_col,
526
- brand_id_col=brand_id_col,
527
- brand_name_col=brand_name_col,
528
- min_score=min_score,
529
- top_n=top_n
530
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
 
532
  return {
533
- "rfq_rows": int(nonempty_mask.sum()),
534
  "product_master_detected": {
535
  "molecule_col": molecule_col,
536
  "brand_id_col": brand_id_col,
537
  "brand_name_col": brand_name_col
538
  },
539
- "matches_returned": len(grouped_matches),
540
- "data": grouped_matches
541
  }
542
  except HTTPException:
543
  raise
@@ -553,4 +562,4 @@ def debug_score(a: str, b: str):
553
 
554
  @app.get("/")
555
  def root():
556
- return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-debug to inspect best matches."}
 
295
  # ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
296
 
297
 
298
+ def match_generic_to_product_master_grouped_for_row(
299
+ generic_value: str,
300
  pm_df: pd.DataFrame,
301
  molecule_col: str,
302
  brand_id_col: Optional[str],
 
304
  min_score: float = 60.0,
305
  top_n: int = 3
306
  ) -> List[Dict[str, Any]]:
307
+ """Compute matches for a *single* RFQ row's generic name."""
 
 
 
308
  subset = pm_df.dropna(subset=[molecule_col]).copy()
309
  mol_raw = subset[molecule_col].astype(str).tolist()
310
  brand_ids = subset[brand_id_col].astype(str).tolist(
 
312
  brand_names = subset[brand_name_col].astype(str).tolist(
313
  ) if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
314
 
315
+ g_str = str(generic_value or "").strip()
316
+ if not g_str:
317
+ return []
318
+
319
+ scored = []
320
+ for idx, cand in enumerate(mol_raw):
321
+ parts = hybrid_similarity(g_str, cand)
322
+ score = parts["score"]
323
+ if score >= min_score:
324
+ scored.append({
325
+ "matched_name": cand,
326
+ "match_percent": round(score, 2),
327
+ "brand_id": brand_ids[idx],
328
+ "brand_name": brand_names[idx]
 
 
 
 
 
 
 
329
  })
330
+ scored.sort(key=lambda x: x["match_percent"], reverse=True)
331
+ return scored[:top_n]
332
 
333
  # ---------- Endpoints ----------
334
 
 
478
  top_n: int = Query(3, description="Max number of matches per RFQ row")
479
  ):
480
  """
481
+ Return ALL extracted RFQ rows (template-aligned fields), each with a `matches` array of
482
+ product master molecules (matched_name, match_percent, brand_id, brand_name) scoring ≥ min_score.
483
+ Rows with no matches still appear with an empty `matches` list.
484
  """
485
  try:
486
  # RFQ
487
  rfq_bytes = await rfq_file.read()
488
  rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
489
  mapped, mapping = build_mapped_rfq(rfq_df)
490
+ # Ensure columns exist even if not mapped
491
+ for col in TEMPLATE_COLUMNS:
492
+ if col not in mapped.columns:
493
+ mapped[col] = pd.NA
 
 
 
 
 
494
 
495
  # Product master
496
  pm_bytes = await product_master_json.read()
 
503
  pm_df, "__product_master_brand_id__")
504
  brand_name_col = detect_single_column(
505
  pm_df, "__product_master_brand_name__")
 
506
  if not molecule_col:
507
  raise HTTPException(
508
  status_code=400, detail="Could not detect molecule column in product master JSON.")
509
 
510
+ # Build response data: include every RFQ row as extracted, plus matches
511
+ data_out = []
512
+ match_rows_with_any = 0
513
+
514
+ # Work only with the same index order; keep all rows
515
+ for idx, row in mapped.iterrows():
516
+ # serialize RFQ row (template-aligned)
517
+ rfq_record = {col: (None if pd.isna(row.get(col)) else str(
518
+ row.get(col))) for col in TEMPLATE_COLUMNS}
519
+
520
+ # compute matches based on this row's generic_name
521
+ g_val = rfq_record.get("generic_name") or ""
522
+ matches = match_generic_to_product_master_grouped_for_row(
523
+ generic_value=g_val,
524
+ pm_df=pm_df,
525
+ molecule_col=molecule_col,
526
+ brand_id_col=brand_id_col,
527
+ brand_name_col=brand_name_col,
528
+ min_score=min_score,
529
+ top_n=top_n
530
+ )
531
+ if matches:
532
+ match_rows_with_any += 1
533
+
534
+ data_out.append({
535
+ "row_index": int(idx),
536
+ # ALL extracted fields (id, generic_name, annual_volume_qty, etc.)
537
+ "rfq": rfq_record,
538
+ "matches": matches # zero or more matches
539
+ })
540
 
541
  return {
542
+ "rfq_rows": int(len(mapped)),
543
  "product_master_detected": {
544
  "molecule_col": molecule_col,
545
  "brand_id_col": brand_id_col,
546
  "brand_name_col": brand_name_col
547
  },
548
+ "rows_with_matches": match_rows_with_any,
549
+ "data": data_out
550
  }
551
  except HTTPException:
552
  raise
 
562
 
563
  @app.get("/")
564
  def root():
565
+ return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}