Spaces:

aniket9909
/

tenderapi

Sleeping

App Files Files Community

aniket9909 committed on Dec 12, 2025

Commit

cee2f03

verified ·

1 Parent(s): 93e4d69

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -29

app.py CHANGED Viewed

@@ -40,11 +40,19 @@ def norm_base(s: str) -> str:
     return s
-def extract_numbers(s: str) -> List[str]:
     s2 = norm_base(s)
-    num_unit = re.findall(rf"\b\d+(?:\.\d+)?\s*{UNIT_PATTERN}\b", s2)
-    nums = re.findall(r"\b\d+(?:\.\d+)?\b", s2)
-    return sorted(set([x.strip() for x in num_unit + nums]))
 def token_set(s: str) -> List[str]:
@@ -222,27 +230,28 @@ def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Opti
 # ---------- Hybrid difflib score ----------
-def extract_molecule_base(s:  str) -> str:
     """Extract core molecule name by removing dosages, units, and forms."""
     s_norm = norm_base(s)
-    # Remove common dosage forms (more comprehensive)
-    forms = r'\b(tablet|capsule|cap|injection|inj|syrup|suspension|cream|ointment|gel|drops|spray|powder|inhaler|solution|ampule|amp|vial|via|bottle|bot|sachet|sac|suppository|sup|patch|pat|lotion|respules|res|pfs|kit|num|car|pac|tub|box)\b'
-    s_norm = re.sub(forms, ' ', s_norm)
-    # Remove dosage patterns (numbers + units) - MORE AGGRESSIVE
-    s_norm = re. sub(rf'\d+(?:\.\d+)?\s*{UNIT_PATTERN}', ' ', s_norm)
-    # Remove fractions and ratios like "30/70", "500/125"
-    s_norm = re. sub(r'\d+/\d+', ' ', s_norm)
-    # Remove standalone numbers
-    s_norm = re. sub(r'\b\d+(?:\.\d+)?\b', ' ', s_norm)
-    # Remove w/w, w/v, v/v
-    s_norm = re.sub(r'\b[wv]/[wv]\b', ' ', s_norm)
-    # Clean up multiple spaces
     s_norm = re.sub(r'\s+', ' ', s_norm).strip()
     return s_norm
@@ -359,15 +368,13 @@ def match_generic_to_product_master(
         best_score, best_pos, best_parts = -1.0, None, None
-        for pos, mol in enumerate(mol_raw):
-            # 🔥 COMBINED TEXT FOR MATCHING
-            combined_text = f"{mol} {brand_names[pos] or ''}".strip()
-            parts = hybrid_similarity(g_str, combined_text)
-            if parts["score"] > best_score:
-                best_score, best_pos, best_parts = parts["score"], pos, parts
         if best_pos is None:
             continue
@@ -424,10 +431,8 @@ def match_generic_to_product_master_grouped_for_row(
     for idx, mol in enumerate(mol_raw):
-        # 🔥 Use molecule + brand/product name for matching
-        combined_text = f"{mol} {brand_names[idx] or ''}".strip()
-        parts = hybrid_similarity(g_str, combined_text)
         score = parts["score"]
         if score >= min_score:
@@ -512,6 +517,21 @@ async def match_with_difflib(
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/match-difflib-debug")
 async def match_with_difflib_debug(
     rfq_file: UploadFile = File(...),
@@ -678,7 +698,6 @@ def debug_score(a: str, b: str):
 def root():
     return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

     return s
def extract_numbers(s: str) -> List[str]:
    """Return the sorted, de-duplicated numeric tokens found in ``s``.

    The text is first passed through ``norm_base``. Two kinds of token are
    then collected from the normalized text:

    * number + unit pairs such as ``"500mg"`` or ``"500 mg"`` (kept as
      written, apart from surrounding whitespace), and
    * every bare number such as ``"500"`` or ``"2.5"``, including the numeric
      part of a number + unit pair.

    Args:
        s: Raw text to scan.

    Returns:
        Lexicographically sorted list of the unique token strings.
    """
    s2 = norm_base(s)
    # The trailing negative lookahead keeps a unit from matching the first
    # letter(s) of a longer word: without it "10 lotion" yields "10 l" and
    # "5 gel" yields "5 g". A lookahead is used rather than \b because \b
    # never matches after "%" or "i.u." when a space or end of text follows.
    num_unit = re.findall(
        r'\d+(?:\.\d+)?\s*(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl|%)(?![a-z])',
        s2,
        flags=re.IGNORECASE,
    )
    # Deliberately no \b here: the number inside "500mg" must still be found.
    nums = re.findall(r'\d+(?:\.\d+)?', s2)
    # A set removes duplicates; sorting makes the output order deterministic.
    return sorted({token.strip() for token in num_unit + nums})
 def token_set(s: str) -> List[str]:
 # ---------- Hybrid difflib score ----------
def extract_molecule_base(s: str) -> str:
    """Extract core molecule name by removing dosages, units, and forms.

    The text is passed through ``norm_base`` and then stripped, in order, of:
    dosage-form / packaging words, number + unit doses, numeric ratios,
    standalone numbers, and ``w/w`` / ``w/v`` / ``v/v`` markers. Whitespace is
    collapsed at the end.

    Args:
        s: Raw product or molecule description.

    Returns:
        What is left of the normalized text after the removals above, with
        single spaces between words and no leading/trailing whitespace.
    """
    s_norm = norm_base(s)
    # Step 1: Remove dosage forms FIRST. The list also contains short
    # abbreviations and the filler word "for"; all are whole-word matches.
    forms = r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj|syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments|gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers|solution|solutions|ampule|ampules|amp|amps|vial|vials|via|bottle|bottles|bot|bots|sachet|sachets|sac|sacs|suppository|suppositories|sup|sups|patch|patches|pat|pats|lotion|lotions|respule|respules|res|pfs|kit|kits|num|nums|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b'
    s_norm = re.sub(forms, ' ', s_norm, flags=re.IGNORECASE)
    # Step 2: Remove number+unit patterns (handles "500mg" and "500 mg").
    # Only units that end in a letter get a trailing \b. "%" and "i.u." end
    # in a non-word character, so \b after them fails whenever a space or the
    # end of the text follows; with a blanket \b, "5%" lost its number in
    # step 4 but left a stray "%" in the result.
    s_norm = re.sub(
        r'\b\d+(?:\.\d+)?\s*'
        r'(?:(?:mg|mcg|μg|µg|gm?|kg|iu|kiu|miu|ml|l|dl|w/w|w/v|v/v)\b|i\.u\.|%)',
        ' ', s_norm, flags=re.IGNORECASE)
    # Step 3: Remove fractions and ratios such as "30/70" or "2.5/5". Decimals
    # are matched as a whole so that no lone "." is left behind.
    s_norm = re.sub(r'\d+(?:\.\d+)?\s*/\s*\d+(?:\.\d+)?', ' ', s_norm)
    # Step 4: Remove standalone numbers
    s_norm = re.sub(r'\b\d+(?:\.\d+)?\b', ' ', s_norm)
    # Step 5: Remove w/w, w/v, v/v that were not attached to a number
    s_norm = re.sub(r'\b[wv]\s*/\s*[wv]\b', ' ', s_norm, flags=re.IGNORECASE)
    # Step 6: Clean up spaces left by the substitutions above
    s_norm = re.sub(r'\s+', ' ', s_norm).strip()
    return s_norm
         best_score, best_pos, best_parts = -1.0, None, None
+    for pos, mol in enumerate(mol_raw):
+        # 🔥 Match ONLY against molecule name (ignore brand)
+        parts = hybrid_similarity(g_str, mol)
+        if parts["score"] > best_score:
+            best_score, best_pos, best_parts = parts["score"], pos, parts
         if best_pos is None:
             continue
     for idx, mol in enumerate(mol_raw):
+        # 🔥 Match ONLY against molecule name (ignore brand)
+        parts = hybrid_similarity(g_str, mol)
         score = parts["score"]
         if score >= min_score:
         raise HTTPException(status_code=500, detail=str(e))
@app.get("/test-extract-base")
def test_extract_base(text: str):
    """Debug endpoint showing how one input string is broken down.

    Runs ``text`` through ``norm_base``, ``extract_molecule_base``,
    ``extract_numbers`` and ``token_set`` and returns each result next to the
    original input.

    Args:
        text: String to analyse (presumably supplied as a query parameter;
            confirm against the framework's routing rules).

    Returns:
        Dict with the keys ``original``, ``normalized``, ``molecule_base``,
        ``numbers_extracted`` and ``tokens``.
    """
    # Values are evaluated top to bottom, so the helpers run in the listed order.
    return {
        "original": text,
        "normalized": norm_base(text),
        "molecule_base": extract_molecule_base(text),
        "numbers_extracted": extract_numbers(text),
        "tokens": token_set(text),
    }
 @app.post("/match-difflib-debug")
 async def match_with_difflib_debug(
     rfq_file: UploadFile = File(...),
 def root():
     return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)