aniket9909 committed on
Commit
cee2f03
·
verified ·
1 Parent(s): 93e4d69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -29
app.py CHANGED
@@ -40,11 +40,19 @@ def norm_base(s: str) -> str:
40
  return s
41
 
42
 
43
- def extract_numbers(s: str) -> List[str]:
44
  s2 = norm_base(s)
45
- num_unit = re.findall(rf"\b\d+(?:\.\d+)?\s*{UNIT_PATTERN}\b", s2)
46
- nums = re.findall(r"\b\d+(?:\.\d+)?\b", s2)
47
- return sorted(set([x.strip() for x in num_unit + nums]))
 
 
 
 
 
 
 
 
48
 
49
 
50
  def token_set(s: str) -> List[str]:
@@ -222,27 +230,28 @@ def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Opti
222
  # ---------- Hybrid difflib score ----------
223
 
224
 
225
- def extract_molecule_base(s: str) -> str:
226
  """Extract core molecule name by removing dosages, units, and forms."""
227
  s_norm = norm_base(s)
228
 
229
- # Remove common dosage forms (more comprehensive)
230
- forms = r'\b(tablet|capsule|cap|injection|inj|syrup|suspension|cream|ointment|gel|drops|spray|powder|inhaler|solution|ampule|amp|vial|via|bottle|bot|sachet|sac|suppository|sup|patch|pat|lotion|respules|res|pfs|kit|num|car|pac|tub|box)\b'
231
- s_norm = re.sub(forms, ' ', s_norm)
232
 
233
- # Remove dosage patterns (numbers + units) - MORE AGGRESSIVE
234
- s_norm = re. sub(rf'\d+(?:\.\d+)?\s*{UNIT_PATTERN}', ' ', s_norm)
 
235
 
236
- # Remove fractions and ratios like "30/70", "500/125"
237
- s_norm = re. sub(r'\d+/\d+', ' ', s_norm)
238
 
239
- # Remove standalone numbers
240
- s_norm = re. sub(r'\b\d+(?:\.\d+)?\b', ' ', s_norm)
241
 
242
- # Remove w/w, w/v, v/v
243
- s_norm = re.sub(r'\b[wv]/[wv]\b', ' ', s_norm)
244
 
245
- # Clean up multiple spaces
246
  s_norm = re.sub(r'\s+', ' ', s_norm).strip()
247
 
248
  return s_norm
@@ -359,15 +368,13 @@ def match_generic_to_product_master(
359
 
360
  best_score, best_pos, best_parts = -1.0, None, None
361
 
362
- for pos, mol in enumerate(mol_raw):
363
 
364
- # 🔥 COMBINED TEXT FOR MATCHING
365
- combined_text = f"{mol} {brand_names[pos] or ''}".strip()
366
 
367
- parts = hybrid_similarity(g_str, combined_text)
368
-
369
- if parts["score"] > best_score:
370
- best_score, best_pos, best_parts = parts["score"], pos, parts
371
 
372
  if best_pos is None:
373
  continue
@@ -424,10 +431,8 @@ def match_generic_to_product_master_grouped_for_row(
424
 
425
  for idx, mol in enumerate(mol_raw):
426
 
427
- # 🔥 Use molecule + brand/product name for matching
428
- combined_text = f"{mol} {brand_names[idx] or ''}".strip()
429
-
430
- parts = hybrid_similarity(g_str, combined_text)
431
  score = parts["score"]
432
 
433
  if score >= min_score:
@@ -512,6 +517,21 @@ async def match_with_difflib(
512
  raise HTTPException(status_code=500, detail=str(e))
513
 
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  @app.post("/match-difflib-debug")
516
  async def match_with_difflib_debug(
517
  rfq_file: UploadFile = File(...),
@@ -678,7 +698,6 @@ def debug_score(a: str, b: str):
678
  def root():
679
  return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
680
 
681
-
682
  if __name__ == "__main__":
683
  import uvicorn
684
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
40
  return s
41
 
42
 
43
def extract_numbers(s: str) -> List[str]:
    """Return the sorted, de-duplicated numeric tokens found in *s*.

    The input is first passed through ``norm_base``. Two kinds of token are
    then collected from the normalised text:

    * number + unit pairs such as ``"500mg"`` or ``"500 mg"`` (kept as they
      appear, including any whitespace between the number and the unit);
    * bare numbers such as ``"500"`` or ``"2.5"``. These are matched without
      word boundaries, so the numeric part of ``"500mg"`` is also returned
      on its own.

    Args:
        s: Free-text description to scan.

    Returns:
        Lexicographically sorted list of the unique tokens.
    """
    s2 = norm_base(s)

    number = r'\d+(?:\.\d+)?'
    units = r'mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl'

    # A letter unit only counts when it ends the word (an optional plural
    # "s" is tolerated but not captured). Without this guard the first
    # letter of the next word is swallowed as a unit: "500 gel" -> "500 g",
    # "10 lotion" -> "10 l". "%" needs no guard because it is not a letter.
    num_unit_pattern = rf'{number}\s*(?:%|(?:{units})(?=s?(?![a-z])))'

    # Number + unit combinations (e.g. "500mg", "500 mg").
    num_unit = re.findall(num_unit_pattern, s2, flags=re.IGNORECASE)

    # Standalone numbers (e.g. "500").
    nums = re.findall(number, s2)

    # Combine, trim and de-duplicate.
    return sorted({token.strip() for token in num_unit + nums})
56
 
57
 
58
  def token_set(s: str) -> List[str]:
 
230
  # ---------- Hybrid difflib score ----------
231
 
232
 
233
def extract_molecule_base(s: str) -> str:
    """Extract the core molecule name by removing dosages, units, and forms.

    The input is normalised with ``norm_base`` and then cleaned in a fixed
    order: dosage-form words, number + unit pairs, numeric ratios,
    standalone numbers, concentration markers (w/w, w/v, v/v) and finally
    redundant whitespace.

    Args:
        s: Free-text product or molecule description.

    Returns:
        The remaining text, single-spaced and stripped.
    """
    s_norm = norm_base(s)

    # Step 1: Remove dosage forms FIRST (whole words only).
    forms = (
        r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj'
        r'|syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments'
        r'|gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers'
        r'|solution|solutions|ampule|ampules|amp|amps|vial|vials|via'
        r'|bottle|bottles|bot|bots|sachet|sachets|sac|sacs'
        r'|suppository|suppositories|sup|sups|patch|patches|pat|pats'
        r'|lotion|lotions|respule|respules|res|pfs|kit|kits|num|nums'
        r'|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b'
    )
    s_norm = re.sub(forms, ' ', s_norm, flags=re.IGNORECASE)

    # Step 2: Remove number+unit patterns (handles "500mg" and "500 mg").
    # The unit must not be followed by a word character. A trailing ``\b``
    # cannot express that for "%" or "i.u.": after a non-word character,
    # ``\b`` only matches when a word character FOLLOWS, so "5%" and
    # "10 i.u." were never removed and left a stray "%" / "i.u." behind.
    # "%" is exempt from the guard so that "5%w/v" is still stripped.
    dosage = (
        r'\b\d+(?:\.\d+)?\s*'
        r'(?:%|(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl|w/w|w/v|v/v)(?!\w))'
    )
    s_norm = re.sub(dosage, ' ', s_norm, flags=re.IGNORECASE)

    # Step 3: Remove fractions and ratios such as "30/70" or "500 / 125".
    # NOTE(review): a ratio written with units ("500mg/125mg") has already
    # lost its numbers in step 2, so its "/" is not removed here — confirm
    # whether that matters for downstream matching.
    s_norm = re.sub(r'\d+\s*/\s*\d+', ' ', s_norm)

    # Step 4: Remove standalone numbers (digits glued to letters, e.g.
    # "b12", are kept because there is no word boundary inside them).
    s_norm = re.sub(r'\b\d+(?:\.\d+)?\b', ' ', s_norm)

    # Step 5: Remove w/w, w/v, v/v that were not attached to a number.
    s_norm = re.sub(r'\b[wv]\s*/\s*[wv]\b', ' ', s_norm, flags=re.IGNORECASE)

    # Step 6: Collapse whitespace runs left behind by the substitutions.
    s_norm = re.sub(r'\s+', ' ', s_norm).strip()

    return s_norm
 
368
 
369
  best_score, best_pos, best_parts = -1.0, None, None
370
 
371
+ for pos, mol in enumerate(mol_raw):
372
 
373
+ # 🔥 Match ONLY against molecule name (ignore brand)
374
+ parts = hybrid_similarity(g_str, mol)
375
 
376
+ if parts["score"] > best_score:
377
+ best_score, best_pos, best_parts = parts["score"], pos, parts
 
 
378
 
379
  if best_pos is None:
380
  continue
 
431
 
432
  for idx, mol in enumerate(mol_raw):
433
 
434
+ # 🔥 Match ONLY against molecule name (ignore brand)
435
+ parts = hybrid_similarity(g_str, mol)
 
 
436
  score = parts["score"]
437
 
438
  if score >= min_score:
 
517
  raise HTTPException(status_code=500, detail=str(e))
518
 
519
 
520
@app.get("/test-extract-base")
def test_extract_base(text: str):
    """Debug endpoint: show how a single string is broken down.

    Runs ``text`` through ``norm_base``, ``extract_molecule_base``,
    ``extract_numbers`` and ``token_set`` and returns each result next to
    the original input so the steps can be inspected side by side.
    """
    report = {"original": text}
    report["normalized"] = norm_base(text)
    report["molecule_base"] = extract_molecule_base(text)
    report["numbers_extracted"] = extract_numbers(text)
    report["tokens"] = token_set(text)
    return report
533
+
534
+
535
  @app.post("/match-difflib-debug")
536
  async def match_with_difflib_debug(
537
  rfq_file: UploadFile = File(...),
 
698
  def root():
699
  return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
700
 
 
701
  if __name__ == "__main__":
702
  import uvicorn
703
  uvicorn.run(app, host="0.0.0.0", port=7860)