aniket9909 committed on
Commit
f14504f
·
verified ·
1 Parent(s): 197636a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +566 -0
app.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import re
4
+ from pathlib import Path
5
+ from typing import List, Optional, Dict, Any, Tuple
6
+
7
+ import pandas as pd
8
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Query
9
+ from fastapi.responses import JSONResponse
10
+ import difflib
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+
13
# ASGI application object; every route in this file is registered on it.
app = FastAPI(title="RFQ ↔ Product Master Matcher (difflib hybrid)")

# CORS policy. A wildcard origin must not be combined with credentialed
# requests: with allow_credentials=True the middleware would echo back any
# caller's Origin and let arbitrary sites send cookies/authorization headers.
# The endpoints in this file read only uploaded files and query parameters,
# so credentials are switched off while the origin list is still open.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # lock this down in prod
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
22
+
23
+ # ---------- Fixed Tender Template ----------
24
# Column names of the fixed tender template, in output order. The strings are
# used verbatim as keys of SYNONYMS and of the header mapping, so spelling and
# case (e.g. "quotation Price") must not be normalised.
TEMPLATE_COLUMNS = [
    "id",
    "tender_id",
    "tender_code",
    "customer_id",
    "customer_name",
    "fy",
    "category",
    "code",
    "current_brand_description",
    "generic_name",
    "annual_volume_qty",
    "quotation Price",
    "dosage form",
]
28
+
29
+ # ---------- Normalization ----------
30
# Regex alternation of the dosage units looked for by extract_numbers().
# NOTE: the parentheses form a *capturing* group, which changes what
# re.findall() returns when this fragment is embedded in a larger pattern.
UNIT_PATTERN = r"(mg|mcg|g|iu|ml|%)"
31
+
32
+
33
def norm_base(s: str) -> str:
    """Return the canonical comparison form of *s*.

    The text is lower-cased, ``+`` and ``/`` are turned into spaces, every
    character that is not a word character, whitespace, ``.``, ``%`` or ``-``
    is blanked out, and whitespace runs are collapsed to a single space with
    the ends trimmed. Any falsy input (``None``, ``""``, ``0``) yields ``""``.
    """
    text = str(s or "").lower()
    # "+" and "/" act as token separators ("500mg/5ml" -> "500mg 5ml").
    text = text.translate(str.maketrans("+/", "  "))
    # "/" and "+" are still listed in the class but can no longer occur here.
    text = re.sub(r"[^\w\s.%/+-]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
41
+
42
+
43
def extract_numbers(s: str) -> List[str]:
    """Return the sorted, de-duplicated numeric tokens found in *s*.

    The text is normalised with ``norm_base`` first. Two kinds of token are
    collected:

    * strengths, i.e. a number followed by one of the units in
      ``UNIT_PATTERN``; they are emitted without inner whitespace so that
      ``"500 mg"`` and ``"500mg"`` both produce ``"500mg"``, and the bare
      number (``"500"``) is emitted alongside;
    * every other stand-alone integer or decimal number.
    """
    text = norm_base(s)
    found = set()

    # UNIT_PATTERN carries its own capturing group, so re.findall() on the
    # combined pattern would return only the unit ("mg"). Iterate matches and
    # read the whole match instead. (?!\w) replaces a trailing \b, which can
    # never succeed after "%" at the end of a token.
    strength_re = rf"\b(\d+(?:\.\d+)?)\s*(?:{UNIT_PATTERN})(?!\w)"
    for m in re.finditer(strength_re, text):
        found.add(re.sub(r"\s+", "", m.group(0)))
        found.add(m.group(1))

    found.update(re.findall(r"\b\d+(?:\.\d+)?\b", text))
    return sorted(found)
48
+
49
+
50
def token_set(s: str) -> List[str]:
    """Return the tokens of the normalised form of *s*.

    Despite the name the result is a list: order and duplicates are kept.
    ``norm_base`` already collapses whitespace to single spaces and trims the
    ends, so a plain whitespace split yields exactly the non-empty tokens.
    """
    return norm_base(s).split()
52
+
53
+
54
+ # ---------- Synonyms / detection ----------
55
+ SYNONYMS: Dict[str, List[str]] = {
56
+ # RFQ → template mapping
57
+ "generic_name": [
58
+ "generic name", "generic", "molecule", "molecule name", "molecule with strength",
59
+ "composition", "salt", "api", "active ingredient"
60
+ ],
61
+ "current_brand_description": ["brand name", "brand", "trade name", "product", "product name", "item", "item name", "drug name"],
62
+ "annual_volume_qty": ["potential annual volume", "annual volume qty", "annual qty", "annual volume", "qty", "quantity", "rfq qty", "order qty", "excepted annual consumption qty_total", "annual consumption"],
63
+ "quotation Price": ["offer price(unit wise) without taxes in rs", "offer price", "unit price", "quoted rate", "rate", "basic rate", "price per unit", "price"],
64
+ "code": ["item code", "product code", "sku", "catalogue no", "catalog no", "catalog number", "code"],
65
+ "customer_name": ["customer name", "hospital name", "hospital", "buyer", "consignee", "institution", "institute", "organisation", "organization"],
66
+ "fy": ["fy", "financial year", "f.y.", "year"],
67
+ "id": ["s no", "sr no", "serial", "s.no", "line id", "id"],
68
+ "tender_id": ["tender id", "rfq id", "enquiry id"],
69
+ "tender_code": ["tender code", "rfq code", "enquiry code", "tender no", "tender number", "rfq no", "rfq number"],
70
+ "category": ["category", "schedule", "section", "chapter", "dept"],
71
+ "dosage form": ["dosage form", "form", "drug form", "pharmaceutical form", "presentation", "type", "medicine type"],
72
+
73
+ # Product master detection (support your original schema)
74
+ "__product_master_molecule__": ["molecule", "molecule name", "generic", "generic name", "api", "active ingredient", "composition", "salt"],
75
+ "__product_master_brand_id__": ["brand id", "brand_id", "id", "bid", "brand code", "brand_code", "brandcode"],
76
+ "__product_master_brand_name__": ["brand name", "brand", "product", "trade name", "brand_name", "brandname", "product name"],
77
+ }
78
+
79
+ # ---------- Header mapping ----------
80
+
81
+
82
def score_header(tcol: str, scol: str) -> float:
    """Score how closely source header *scol* resembles target name *tcol*.

    Both names are normalised with ``norm_base``. The result is a weighted sum
    of token Jaccard overlap (0.60), a substring-containment flag (0.25) and
    the ``difflib`` character ratio (0.15), so it lies between 0 and 1.
    """
    target, source = norm_base(tcol), norm_base(scol)
    target_tokens = set(target.split())
    source_tokens = set(source.split())

    if target_tokens and source_tokens:
        jaccard = len(target_tokens & source_tokens) / len(target_tokens | source_tokens)
    else:
        jaccard = 0.0

    one_inside_other = target in source or source in target
    contains = 1.0 if one_inside_other else 0.0
    fuzzy = difflib.SequenceMatcher(None, target, source).ratio()
    return 0.60*jaccard + 0.25*contains + 0.15*fuzzy
89
+
90
+
91
def map_headers_auto(src_cols: List[str], target_cols: List[str]) -> Dict[str, Optional[str]]:
    """Map each target column to the best-fitting source header.

    For every name in *target_cols* three strategies are tried in order:

    1. a source header whose normalised form equals a normalised alias from
       ``SYNONYMS``;
    2. a source header that contains an alias, or is contained in one
       (aliases tried in list order, first hit wins);
    3. the source header with the highest ``score_header`` value, accepted
       only when that score is at least 0.35.

    Returns a dict with one entry per target column; the value is the original
    source header, or ``None`` when nothing qualified. Several targets may map
    to the same source header.
    """
    src_cols = [str(c) for c in src_cols]
    src_norm_map = {norm_base(c): c for c in src_cols}
    # A header that normalises to "" (e.g. "#") is a substring of every alias,
    # so it must not take part in the containment step.
    substring_pool = [(nn, orig) for nn, orig in src_norm_map.items() if nn]

    mapping: Dict[str, Optional[str]] = {}
    for tcol in target_cols:
        aliases = [norm_base(alias) for alias in SYNONYMS.get(tcol, [])]

        # 1) exact synonym
        exact = next((n for n in aliases if n in src_norm_map), None)
        if exact is not None:
            mapping[tcol] = src_norm_map[exact]
            continue

        # 2) contains any synonym
        hit = None
        for n in aliases:
            hit = next(
                (orig for nn, orig in substring_pool if n in nn or nn in n),
                None,
            )
            if hit is not None:
                break
        if hit is not None:
            mapping[tcol] = hit
            continue

        # 3) best score
        best_src, best_score = None, -1.0
        for scol in src_cols:
            sc = score_header(tcol, scol)
            if sc > best_score:
                best_score, best_src = sc, scol
        mapping[tcol] = best_src if best_score >= 0.35 else None
    return mapping
123
+
124
+
125
def detect_single_column(df: pd.DataFrame, logical_name: str) -> Optional[str]:
    """Find the column of *df* that plays the role *logical_name*.

    *logical_name* is a key of ``SYNONYMS``. Aliases are tried first for an
    exact match on the normalised header, then for substring containment in
    either direction; finally the header with the best ``score_header`` value
    is returned if that score reaches 0.35. Returns the column name as a
    string, or ``None`` when nothing qualifies.
    """
    cols = [str(c) for c in df.columns]
    norm_map = {norm_base(c): c for c in cols}
    aliases = [norm_base(alias) for alias in SYNONYMS.get(logical_name, [])]

    # exact first
    for n in aliases:
        if n in norm_map:
            return norm_map[n]

    # contains next; a header that normalises to "" is skipped because the
    # empty string is a substring of every alias and would always "match".
    for n in aliases:
        for nn, orig in norm_map.items():
            if nn and (n in nn or nn in n):
                return orig

    # fallback: score
    best_col, best_score = None, -1.0
    for c in cols:
        sc = score_header(logical_name, c)
        if sc > best_score:
            best_score, best_col = sc, c
    return best_col if best_score >= 0.35 else None
146
+
147
+ # ---------- File reading ----------
148
+
149
+
150
def guess_delimiter(sample: str) -> str:
    """Guess the field delimiter of delimited text from a leading *sample*.

    Candidates are tab, ``;``, ``|`` and ``,``. The candidate occurring most
    often on the first non-blank line (normally the header row) wins, so a
    stray ``;`` inside one cell of a comma-separated file no longer hijacks
    the result. If that line holds no candidate at all, the whole sample is
    counted instead. Ties go to the earlier candidate in the list above;
    ``","`` is returned when no candidate occurs anywhere.
    """
    candidates = ["\t", ";", "|", ","]
    first_line = next((ln for ln in sample.splitlines() if ln.strip()), "")

    for scope in (first_line, sample):
        best, best_count = None, 0
        for d in candidates:
            count = scope.count(d)
            if count > best_count:  # strict: earlier candidate keeps a tie
                best, best_count = d, count
        if best is not None:
            return best
    return ","
155
+
156
+
157
def drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without the columns whose label starts with ``"Unnamed"``.

    Selection is positional (boolean mask) rather than by label: selecting by
    a list of labels would return every same-named column once per duplicate
    label and thereby multiply the columns of frames with repeated headers.
    """
    keep_mask = [not str(c).startswith("Unnamed") for c in df.columns]
    return df.loc[:, keep_mask]
160
+
161
+
162
def ensure_str_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Convert every column label of *df* to ``str`` and return *df*.

    The frame is modified in place; the return value is the same object and
    exists only so that calls can be chained.
    """
    df.columns = list(map(str, df.columns))
    return df
165
+
166
+
167
def choose_best_sheet_and_header(xl: pd.ExcelFile, max_header_rows: int = 30):
    """Pick the sheet and header row that map best onto the tender template.

    Every sheet of *xl* is read once per candidate header row
    ``0..max_header_rows``. Each resulting table is scored by the number of
    ``TEMPLATE_COLUMNS`` that ``map_headers_auto`` could map; the first table
    with the highest score wins.

    Returns a dict with the keys ``score``, ``df``, ``sheet``, ``header`` and
    ``mapping``.

    Raises:
        ValueError: if no sheet/header combination yields a non-empty table.
    """
    best = {"score": -1, "df": None, "sheet": None,
            "header": None, "mapping": None}
    for sheet in xl.sheet_names:
        for header in range(max_header_rows + 1):
            try:
                df = pd.read_excel(xl, sheet_name=sheet, header=header)
                df = drop_unnamed_columns(df)
                if df.dropna(how="all").empty:
                    continue
                df = ensure_str_columns(df)
                m = map_headers_auto(df.columns.tolist(), TEMPLATE_COLUMNS)
                score = sum(1 for v in m.values() if v is not None)
                if score > best["score"]:
                    best = {"score": score, "df": df, "sheet": sheet,
                            "header": header, "mapping": m}
            except Exception:
                # Best effort: an unreadable sheet/header combination is just
                # skipped. A bare "except:" would additionally swallow
                # KeyboardInterrupt and SystemExit.
                continue
    if best["df"] is None:
        raise ValueError("No readable tables found in the Excel workbook.")
    return best
188
+
189
+
190
def dataframe_from_upload_bytes(filename: str, data: bytes) -> pd.DataFrame:
    """Parse uploaded file content into a DataFrame, chosen by file extension.

    * ``.xlsx`` / ``.xls`` / ``.xlsm`` / ``.ods``: the table picked by
      ``choose_best_sheet_and_header``;
    * ``.csv`` / ``.tsv``: delimiter guessed from the first 4096 characters;
    * ``.json``: either a list of objects, or an object whose ``"data"`` entry
      is such a list.

    Text is decoded as UTF-8 with undecodable bytes dropped. A leading byte
    order mark is stripped, because ``json.loads`` rejects it and it would
    otherwise end up inside the first CSV header.

    Raises:
        ValueError: for an unsupported (or missing) extension, malformed JSON,
            or a JSON document of a different shape.
    """
    # A missing filename is handled as "no extension" -> ValueError below.
    ext = Path(filename or "").suffix.lower()
    if ext in [".xlsx", ".xls", ".xlsm", ".ods"]:
        # Context manager releases the workbook handle once the table is read.
        with pd.ExcelFile(io.BytesIO(data)) as xl:
            best = choose_best_sheet_and_header(xl)
        return best["df"]
    if ext in [".csv", ".tsv"]:
        text = data.decode("utf-8-sig", errors="ignore")
        delim = guess_delimiter(text[:4096])
        return pd.read_csv(io.StringIO(text), sep=delim, engine="python")
    if ext == ".json":
        js = json.loads(data.decode("utf-8-sig", errors="ignore"))
        # Accept both raw list and your original object with "data"
        if isinstance(js, list):
            return pd.DataFrame(js)
        if isinstance(js, dict) and "data" in js and isinstance(js["data"], list):
            return pd.json_normalize(js["data"])
        raise ValueError(
            "Product master JSON must be a list of objects or an object with a 'data' array.")
    raise ValueError(f"Unsupported file type: {ext}")
210
+
211
+
212
def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Optional[str]]]:
    """Project an RFQ table onto the fixed tender template.

    "Unnamed" columns are dropped and labels stringified, then
    ``map_headers_auto`` decides which source header feeds each template
    column. Returns ``(out, mapping)``: ``out`` keeps the source row index and
    has exactly the ``TEMPLATE_COLUMNS``, filled with ``pd.NA`` wherever the
    mapping has no source header.
    """
    cleaned = ensure_str_columns(drop_unnamed_columns(src_df))
    mapping = map_headers_auto(cleaned.columns.tolist(), TEMPLATE_COLUMNS)

    out = pd.DataFrame(index=cleaned.index)
    for tcol in TEMPLATE_COLUMNS:
        src = mapping.get(tcol)
        if src:
            out[tcol] = cleaned[str(src)]
        else:
            # No source header for this template column: placeholder column.
            out[tcol] = pd.Series([pd.NA]*len(cleaned), index=cleaned.index)
    return out, mapping
221
+
222
+ # ---------- Hybrid difflib score ----------
223
+
224
+
225
def hybrid_similarity(a: str, b: str) -> Dict[str, float]:
    """Compare two names and return the score breakdown on a 0-100 scale.

    Keys of the result:

    * ``diff``  - ``difflib.SequenceMatcher`` ratio of the normalised strings;
    * ``jacc``  - Jaccard overlap of their token sets;
    * ``num``   - 100 when both contain numbers and the number sets are equal,
      otherwise 0;
    * ``score`` - ``0.60*diff + 0.30*jacc + 0.10*num``.

    Strings that are identical after ``norm_base`` score 100 on every key.
    If either string normalises to nothing, every key is 0: two blank (or
    punctuation-only) values carry no evidence of a match.
    """
    a_n, b_n = norm_base(a), norm_base(b)
    if not a_n or not b_n:
        return {"diff": 0.0, "jacc": 0.0, "num": 0.0, "score": 0.0}
    if a_n == b_n:
        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "score": 100.0}

    diff = difflib.SequenceMatcher(None, a_n, b_n).ratio() * 100.0

    # Both strings are non-empty here, so both token sets are non-empty too.
    aset, bset = set(a_n.split()), set(b_n.split())
    jacc = len(aset & bset) / len(aset | bset) * 100.0

    anums, bnums = extract_numbers(a), extract_numbers(b)
    num_bonus = 100.0 if (anums and bnums and set(anums) == set(bnums)) else 0.0

    score = 0.60*diff + 0.30*jacc + 0.10*num_bonus
    return {
        "diff": round(diff, 2),
        "jacc": round(jacc, 2),
        "num": num_bonus,
        "score": round(score, 2)
    }
243
+
244
+
245
def match_generic_to_product_master(
    generic_list: List[str],
    pm_df: pd.DataFrame,
    molecule_col: str,
    brand_id_col: Optional[str],
    brand_name_col: Optional[str],
    min_score: float = 80.0,
    return_all: bool = False
) -> List[Dict[str, Any]]:
    """Find the single best product-master row for each generic name.

    Args:
        generic_list: generic names to match; blank entries are skipped.
        pm_df: product master table.
        molecule_col: column of *pm_df* holding the molecule names; rows with
            a missing molecule are ignored.
        brand_id_col: optional brand-id column; ``None`` or an unknown name
            yields ``brand_id=None``.
        brand_name_col: optional brand-name column, handled the same way.
        min_score: minimum ``hybrid_similarity`` score for a match to be
            returned when *return_all* is false.
        return_all: when true, the best match is returned regardless of its
            score and the score breakdown is attached as ``_debug``.

    Returns:
        One dict per reported match with ``row_index`` (position within
        *generic_list*, not a row number of the uploaded file),
        ``generic_name``, ``matched_name``, ``match_percent``, ``brand_id``,
        ``brand_name`` and ``master_row_index`` (index label in *pm_df*).
    """
    subset = pm_df.dropna(subset=[molecule_col])
    mol_raw = subset[molecule_col].astype(str).tolist()

    def optional_column(col: Optional[str]) -> List[Optional[str]]:
        # Cell values as strings; missing cells become None instead of the
        # literal text "nan" that astype(str) would produce.
        if not col or col not in subset.columns:
            return [None] * len(subset)
        texts = subset[col].astype(str).tolist()
        missing = subset[col].isna().tolist()
        return [None if is_na else text for text, is_na in zip(texts, missing)]

    brand_ids = optional_column(brand_id_col)
    brand_names = optional_column(brand_name_col)
    idxs = subset.index.tolist()

    results: List[Dict[str, Any]] = []
    for i, g in enumerate(generic_list):
        g_str = str(g or "").strip()
        if not g_str:
            continue

        best_score, best_pos, best_parts = -1.0, None, None
        for pos, cand in enumerate(mol_raw):
            parts = hybrid_similarity(g_str, cand)
            if parts["score"] > best_score:
                best_score, best_pos, best_parts = parts["score"], pos, parts
                if best_score >= 100.0:
                    # 100 is the maximum and only a strictly higher score
                    # could replace the current best, so stop scanning.
                    break
        if best_pos is None:
            # Empty product master: nothing to report for this name.
            continue

        item = {
            "row_index": i,
            "generic_name": g_str,
            "matched_name": mol_raw[best_pos],
            "match_percent": round(best_score, 2),
            "brand_id": brand_ids[best_pos],
            "brand_name": brand_names[best_pos],
            "master_row_index": int(idxs[best_pos]),
        }
        if return_all:
            item["_debug"] = best_parts
            results.append(item)
        elif best_score >= min_score:
            results.append(item)

    return results
295
+
296
+ # ---------- NEW: Grouped matcher (generic_name -> array of matches) ----------
297
+
298
+
299
def match_generic_to_product_master_grouped_for_row(
    generic_value: str,
    pm_df: pd.DataFrame,
    molecule_col: str,
    brand_id_col: Optional[str],
    brand_name_col: Optional[str],
    min_score: float = 60.0,
    top_n: int = 3
) -> List[Dict[str, Any]]:
    """Compute matches for a *single* RFQ row's generic name.

    Every product-master row with a non-missing molecule is scored with
    ``hybrid_similarity``; rows scoring at least *min_score* are kept, sorted
    by score (highest first, ties in master order) and cut to *top_n* entries.
    A non-positive *top_n* yields an empty list.

    Returns a list of dicts with ``matched_name``, ``match_percent``,
    ``brand_id`` and ``brand_name``; the brand fields are ``None`` when the
    column is unknown or the cell is missing. A blank *generic_value* returns
    ``[]`` without touching the product master.
    """
    g_str = str(generic_value or "").strip()
    if not g_str:
        return []

    subset = pm_df.dropna(subset=[molecule_col])
    mol_raw = subset[molecule_col].astype(str).tolist()

    def optional_column(col: Optional[str]) -> List[Optional[str]]:
        # Cell values as strings; missing cells become None instead of the
        # literal text "nan" that astype(str) would produce.
        if not col or col not in subset.columns:
            return [None] * len(subset)
        texts = subset[col].astype(str).tolist()
        missing = subset[col].isna().tolist()
        return [None if is_na else text for text, is_na in zip(texts, missing)]

    brand_ids = optional_column(brand_id_col)
    brand_names = optional_column(brand_name_col)

    scored = []
    for idx, cand in enumerate(mol_raw):
        score = hybrid_similarity(g_str, cand)["score"]
        if score >= min_score:
            scored.append({
                "matched_name": cand,
                "match_percent": round(score, 2),
                "brand_id": brand_ids[idx],
                "brand_name": brand_names[idx]
            })
    scored.sort(key=lambda x: x["match_percent"], reverse=True)
    # Clamp: a negative top_n would otherwise slice entries off the tail.
    return scored[:max(top_n, 0)]
333
+
334
+ # ---------- Endpoints ----------
335
+
336
+
337
@app.post("/match-difflib")
async def match_with_difflib(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    min_score: float = Query(
        80.0, description="Minimum composite score (0-100)")
):
    """Return the best product-master match for each RFQ generic name.

    The RFQ upload is parsed according to its file extension and mapped onto
    the tender template; the product master upload is always parsed as JSON.
    Only matches scoring at least *min_score* are returned.

    Responds with 400 when an upload cannot be parsed, when no RFQ header maps
    to ``generic_name``, or when no molecule column is detected in the product
    master; any other failure is reported as 500.
    """
    try:
        rfq_bytes = await rfq_file.read()
        pm_bytes = await product_master_json.read()
        try:
            # RFQ
            rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
            # Product master (supports your original JSON shape)
            pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        except ValueError as e:
            # Unsupported extension, malformed JSON, unreadable workbook:
            # the request is at fault, not the server.
            raise HTTPException(status_code=400, detail=str(e)) from e

        mapped, mapping = build_mapped_rfq(rfq_df)

        # build_mapped_rfq creates every template column even when nothing
        # maps to it, so the presence test has to look at the mapping.
        if not mapping.get("generic_name"):
            raise HTTPException(
                status_code=400, detail="No 'generic_name' column found after mapping RFQ.")

        gen_series = mapped["generic_name"]
        gen_text = gen_series.astype(str)
        nonempty_mask = (
            gen_series.notna()
            & gen_text.str.strip().ne("")
            & gen_text.str.lower().ne("<na>")
        )
        generic_list = gen_text[nonempty_mask].tolist()

        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(
            pm_df, "__product_master_molecule__")
        # brand id: prefer brand_id, else id
        brand_id_col = detect_single_column(
            pm_df, "__product_master_brand_id__")
        # brand name: prefer brand_name, else brand, else product
        brand_name_col = detect_single_column(
            pm_df, "__product_master_brand_name__")

        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        matches = match_generic_to_product_master(
            generic_list, pm_df,
            molecule_col=molecule_col,
            brand_id_col=brand_id_col,
            brand_name_col=brand_name_col,
            min_score=min_score,
            return_all=False
        )

        return JSONResponse({
            "rfq_rows": int(nonempty_mask.sum()),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "matches_returned": len(matches),
            "data": matches
        })
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
400
+
401
+
402
@app.post("/match-difflib-debug")
async def match_with_difflib_debug(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    sample: int = Query(5, ge=1, le=200),
    min_score: float = Query(80.0),
    sample_contains: str = Query(
        "", description="Filter RFQ rows by substring (case-insensitive)")
):
    """
    Diagnostics: return BEST match (+%) for the first N RFQ rows, optionally filtered by text.
    Always returns best match, even if below min_score, so you can inspect behavior.

    Responds with 400 when an upload cannot be parsed or no molecule column is
    detected in the product master; any other failure is reported as 500.
    """
    try:
        rfq_bytes = await rfq_file.read()
        pm_bytes = await product_master_json.read()
        try:
            # RFQ
            rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
            # Product master
            pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        except ValueError as e:
            # Unsupported extension, malformed JSON, unreadable workbook:
            # the request is at fault, not the server.
            raise HTTPException(status_code=400, detail=str(e)) from e

        mapped, mapping = build_mapped_rfq(rfq_df)

        gen_series = mapped.get("generic_name", pd.Series([], dtype=object))
        gen_text = gen_series.astype(str)
        nonempty_mask = (
            gen_series.notna()
            & gen_text.str.strip().ne("")
            & gen_text.str.lower().ne("<na>")
        )
        generic_list_all = gen_text[nonempty_mask]

        if sample_contains:
            # regex=False: the parameter is documented as a plain substring,
            # and input such as "(" must not be compiled as a pattern.
            flt = generic_list_all.str.contains(
                sample_contains, case=False, na=False, regex=False)
            generic_list = generic_list_all[flt].tolist()[:sample]
        else:
            generic_list = generic_list_all.tolist()[:sample]

        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(
            pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(
            pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(
            pm_df, "__product_master_brand_name__")

        # Same guard as the other matching endpoints; without it the matcher
        # fails on a None column name and the caller gets an opaque 500.
        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        demo_matches = match_generic_to_product_master(
            generic_list, pm_df,
            molecule_col=molecule_col,
            brand_id_col=brand_id_col,
            brand_name_col=brand_name_col,
            min_score=min_score,
            return_all=True
        )

        return JSONResponse({
            "rfq_detected_headers": list(map(str, rfq_df.columns)),
            "template_mapping": mapping,
            "nonempty_generic_count": int(nonempty_mask.sum()),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "filter": sample_contains or None,
            "examples": demo_matches
        })
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
470
+
471
+ # ---------- NEW: Grouped endpoint ----------
472
+
473
+
474
@app.post("/match-difflib-grouped")
async def match_with_difflib_grouped(
    rfq_file: UploadFile = File(...),
    product_master_json: UploadFile = File(...),
    min_score: float = Query(60.0, description="Minimum score to include"),
    top_n: int = Query(
        3, ge=1, description="Max number of matches per RFQ row")
):
    """
    Return ALL extracted RFQ rows (template-aligned fields), each with a `matches` array of
    product master molecules (matched_name, match_percent, brand_id, brand_name) scoring ≥ min_score.
    Rows with no matches still appear with an empty `matches` list.

    Responds with 400 when an upload cannot be parsed or no molecule column is
    detected in the product master; any other failure is reported as 500.
    """
    try:
        rfq_bytes = await rfq_file.read()
        pm_bytes = await product_master_json.read()
        try:
            # RFQ
            rfq_df = dataframe_from_upload_bytes(rfq_file.filename, rfq_bytes)
            # Product master
            pm_df = dataframe_from_upload_bytes("product_master.json", pm_bytes)
        except ValueError as e:
            # Unsupported extension, malformed JSON, unreadable workbook:
            # the request is at fault, not the server.
            raise HTTPException(status_code=400, detail=str(e)) from e

        mapped, mapping = build_mapped_rfq(rfq_df)
        # Ensure columns exist even if not mapped
        for col in TEMPLATE_COLUMNS:
            if col not in mapped.columns:
                mapped[col] = pd.NA

        pm_df = ensure_str_columns(drop_unnamed_columns(pm_df))

        molecule_col = detect_single_column(
            pm_df, "__product_master_molecule__")
        brand_id_col = detect_single_column(
            pm_df, "__product_master_brand_id__")
        brand_name_col = detect_single_column(
            pm_df, "__product_master_brand_name__")
        if not molecule_col:
            raise HTTPException(
                status_code=400, detail="Could not detect molecule column in product master JSON.")

        # Build response data: include every RFQ row as extracted, plus matches
        data_out = []
        match_rows_with_any = 0
        # Matches depend only on the generic name, so rows repeating a name
        # reuse the first result instead of re-scoring the whole master.
        match_cache: Dict[str, List[Dict[str, Any]]] = {}

        # Work only with the same index order; keep all rows
        for idx, row in mapped.iterrows():
            # serialize RFQ row (template-aligned)
            rfq_record = {col: (None if pd.isna(row.get(col)) else str(
                row.get(col))) for col in TEMPLATE_COLUMNS}

            # compute matches based on this row's generic_name
            g_val = rfq_record.get("generic_name") or ""
            if g_val not in match_cache:
                match_cache[g_val] = match_generic_to_product_master_grouped_for_row(
                    generic_value=g_val,
                    pm_df=pm_df,
                    molecule_col=molecule_col,
                    brand_id_col=brand_id_col,
                    brand_name_col=brand_name_col,
                    min_score=min_score,
                    top_n=top_n
                )
            matches = match_cache[g_val]
            if matches:
                match_rows_with_any += 1

            data_out.append({
                "row_index": int(idx),
                # ALL extracted fields (id, generic_name, annual_volume_qty, etc.)
                "rfq": rfq_record,
                "matches": matches  # zero or more matches
            })

        return {
            "rfq_rows": int(len(mapped)),
            "product_master_detected": {
                "molecule_col": molecule_col,
                "brand_id_col": brand_id_col,
                "brand_name_col": brand_name_col
            },
            "rows_with_matches": match_rows_with_any,
            "data": data_out
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
556
+
557
+
558
@app.get("/debug-score")
def debug_score(a: str, b: str):
    """Return the ``hybrid_similarity`` breakdown for query strings *a* and *b*."""
    breakdown = hybrid_similarity(a, b)
    return breakdown
562
+
563
+
564
@app.get("/")
def root():
    """Liveness probe that also points callers at the matching endpoints."""
    usage = "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."
    return {"status": "ok", "message": usage}