Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -295,8 +295,8 @@ def match_generic_to_product_master(
|
|
| 295 |
# ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
|
| 296 |
|
| 297 |
|
| 298 |
-
def
|
| 299 |
-
|
| 300 |
pm_df: pd.DataFrame,
|
| 301 |
molecule_col: str,
|
| 302 |
brand_id_col: Optional[str],
|
|
@@ -304,10 +304,7 @@ def match_generic_to_product_master_grouped(
|
|
| 304 |
min_score: float = 60.0,
|
| 305 |
top_n: int = 3
|
| 306 |
) -> List[Dict[str, Any]]:
|
| 307 |
-
"""
|
| 308 |
-
For each RFQ generic name, return an array of up to top_n matches from product master
|
| 309 |
-
with score >= min_score.
|
| 310 |
-
"""
|
| 311 |
subset = pm_df.dropna(subset=[molecule_col]).copy()
|
| 312 |
mol_raw = subset[molecule_col].astype(str).tolist()
|
| 313 |
brand_ids = subset[brand_id_col].astype(str).tolist(
|
|
@@ -315,29 +312,23 @@ def match_generic_to_product_master_grouped(
|
|
| 315 |
brand_names = subset[brand_name_col].astype(str).tolist(
|
| 316 |
) if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
|
| 317 |
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
"brand_name": brand_names[idx]
|
| 333 |
-
})
|
| 334 |
-
scored.sort(key=lambda x: x["match_percent"], reverse=True)
|
| 335 |
-
if scored:
|
| 336 |
-
grouped_results.append({
|
| 337 |
-
"generic_name": g_str,
|
| 338 |
-
"matches": scored[:top_n]
|
| 339 |
})
|
| 340 |
-
|
|
|
|
| 341 |
|
| 342 |
# ---------- Endpoints ----------
|
| 343 |
|
|
@@ -487,22 +478,19 @@ async def match_with_difflib_grouped(
|
|
| 487 |
top_n: int = Query(3, description="Max number of matches per RFQ row")
|
| 488 |
):
|
| 489 |
"""
|
| 490 |
-
Return
|
|
|
|
|
|
|
| 491 |
"""
|
| 492 |
try:
|
| 493 |
# RFQ
|
| 494 |
rfq_bytes = await rfq_file.read()
|
| 495 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 496 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
gen_series = mapped["generic_name"]
|
| 503 |
-
nonempty_mask = gen_series.notna() & gen_series.astype(
|
| 504 |
-
str).str.strip().ne("") & gen_series.astype(str).str.lower().ne("<na>")
|
| 505 |
-
generic_list = gen_series[nonempty_mask].astype(str).tolist()
|
| 506 |
|
| 507 |
# Product master
|
| 508 |
pm_bytes = await product_master_json.read()
|
|
@@ -515,29 +503,50 @@ async def match_with_difflib_grouped(
|
|
| 515 |
pm_df, "__product_master_brand_id__")
|
| 516 |
brand_name_col = detect_single_column(
|
| 517 |
pm_df, "__product_master_brand_name__")
|
| 518 |
-
|
| 519 |
if not molecule_col:
|
| 520 |
raise HTTPException(
|
| 521 |
status_code=400, detail="Could not detect molecule column in product master JSON.")
|
| 522 |
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
|
| 532 |
return {
|
| 533 |
-
"rfq_rows": int(
|
| 534 |
"product_master_detected": {
|
| 535 |
"molecule_col": molecule_col,
|
| 536 |
"brand_id_col": brand_id_col,
|
| 537 |
"brand_name_col": brand_name_col
|
| 538 |
},
|
| 539 |
-
"
|
| 540 |
-
"data":
|
| 541 |
}
|
| 542 |
except HTTPException:
|
| 543 |
raise
|
|
@@ -553,4 +562,4 @@ def debug_score(a: str, b: str):
|
|
| 553 |
|
| 554 |
@app.get("/")
|
| 555 |
def root():
|
| 556 |
-
return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-
|
|
|
|
| 295 |
# ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
|
| 296 |
|
| 297 |
|
| 298 |
+
def match_generic_to_product_master_grouped_for_row(
|
| 299 |
+
generic_value: str,
|
| 300 |
pm_df: pd.DataFrame,
|
| 301 |
molecule_col: str,
|
| 302 |
brand_id_col: Optional[str],
|
|
|
|
| 304 |
min_score: float = 60.0,
|
| 305 |
top_n: int = 3
|
| 306 |
) -> List[Dict[str, Any]]:
|
| 307 |
+
"""Compute matches for a *single* RFQ row's generic name."""
|
|
|
|
|
|
|
|
|
|
| 308 |
subset = pm_df.dropna(subset=[molecule_col]).copy()
|
| 309 |
mol_raw = subset[molecule_col].astype(str).tolist()
|
| 310 |
brand_ids = subset[brand_id_col].astype(str).tolist(
|
|
|
|
| 312 |
brand_names = subset[brand_name_col].astype(str).tolist(
|
| 313 |
) if brand_name_col and brand_name_col in subset.columns else [None]*len(subset)
|
| 314 |
|
| 315 |
+
g_str = str(generic_value or "").strip()
|
| 316 |
+
if not g_str:
|
| 317 |
+
return []
|
| 318 |
+
|
| 319 |
+
scored = []
|
| 320 |
+
for idx, cand in enumerate(mol_raw):
|
| 321 |
+
parts = hybrid_similarity(g_str, cand)
|
| 322 |
+
score = parts["score"]
|
| 323 |
+
if score >= min_score:
|
| 324 |
+
scored.append({
|
| 325 |
+
"matched_name": cand,
|
| 326 |
+
"match_percent": round(score, 2),
|
| 327 |
+
"brand_id": brand_ids[idx],
|
| 328 |
+
"brand_name": brand_names[idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
})
|
| 330 |
+
scored.sort(key=lambda x: x["match_percent"], reverse=True)
|
| 331 |
+
return scored[:top_n]
|
| 332 |
|
| 333 |
# ---------- Endpoints ----------
|
| 334 |
|
|
|
|
| 478 |
top_n: int = Query(3, description="Max number of matches per RFQ row")
|
| 479 |
):
|
| 480 |
"""
|
| 481 |
+
Return ALL extracted RFQ rows (template-aligned fields), each with a `matches` array of
|
| 482 |
+
product master molecules (matched_name, match_percent, brand_id, brand_name) scoring ≥ min_score.
|
| 483 |
+
Rows with no matches still appear with an empty `matches` list.
|
| 484 |
"""
|
| 485 |
try:
|
| 486 |
# RFQ
|
| 487 |
rfq_bytes = await rfq_file.read()
|
| 488 |
rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
|
| 489 |
mapped, mapping = build_mapped_rfq(rfq_df)
|
| 490 |
+
# Ensure columns exist even if not mapped
|
| 491 |
+
for col in TEMPLATE_COLUMNS:
|
| 492 |
+
if col not in mapped.columns:
|
| 493 |
+
mapped[col] = pd.NA
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
# Product master
|
| 496 |
pm_bytes = await product_master_json.read()
|
|
|
|
| 503 |
pm_df, "__product_master_brand_id__")
|
| 504 |
brand_name_col = detect_single_column(
|
| 505 |
pm_df, "__product_master_brand_name__")
|
|
|
|
| 506 |
if not molecule_col:
|
| 507 |
raise HTTPException(
|
| 508 |
status_code=400, detail="Could not detect molecule column in product master JSON.")
|
| 509 |
|
| 510 |
+
# Build response data: include every RFQ row as extracted, plus matches
|
| 511 |
+
data_out = []
|
| 512 |
+
match_rows_with_any = 0
|
| 513 |
+
|
| 514 |
+
# Work only with the same index order; keep all rows
|
| 515 |
+
for idx, row in mapped.iterrows():
|
| 516 |
+
# serialize RFQ row (template-aligned)
|
| 517 |
+
rfq_record = {col: (None if pd.isna(row.get(col)) else str(
|
| 518 |
+
row.get(col))) for col in TEMPLATE_COLUMNS}
|
| 519 |
+
|
| 520 |
+
# compute matches based on this row's generic_name
|
| 521 |
+
g_val = rfq_record.get("generic_name") or ""
|
| 522 |
+
matches = match_generic_to_product_master_grouped_for_row(
|
| 523 |
+
generic_value=g_val,
|
| 524 |
+
pm_df=pm_df,
|
| 525 |
+
molecule_col=molecule_col,
|
| 526 |
+
brand_id_col=brand_id_col,
|
| 527 |
+
brand_name_col=brand_name_col,
|
| 528 |
+
min_score=min_score,
|
| 529 |
+
top_n=top_n
|
| 530 |
+
)
|
| 531 |
+
if matches:
|
| 532 |
+
match_rows_with_any += 1
|
| 533 |
+
|
| 534 |
+
data_out.append({
|
| 535 |
+
"row_index": int(idx),
|
| 536 |
+
# ALL extracted fields (id, generic_name, annual_volume_qty, etc.)
|
| 537 |
+
"rfq": rfq_record,
|
| 538 |
+
"matches": matches # zero or more matches
|
| 539 |
+
})
|
| 540 |
|
| 541 |
return {
|
| 542 |
+
"rfq_rows": int(len(mapped)),
|
| 543 |
"product_master_detected": {
|
| 544 |
"molecule_col": molecule_col,
|
| 545 |
"brand_id_col": brand_id_col,
|
| 546 |
"brand_name_col": brand_name_col
|
| 547 |
},
|
| 548 |
+
"rows_with_matches": match_rows_with_any,
|
| 549 |
+
"data": data_out
|
| 550 |
}
|
| 551 |
except HTTPException:
|
| 552 |
raise
|
|
|
|
| 562 |
|
| 563 |
@app.get("/")
|
| 564 |
def root():
|
| 565 |
+
return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
|