Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -40,11 +40,19 @@ def norm_base(s: str) -> str:
|
|
| 40 |
return s
|
| 41 |
|
| 42 |
|
| 43 |
-
def extract_numbers(s:
|
| 44 |
s2 = norm_base(s)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
def token_set(s: str) -> List[str]:
|
|
@@ -222,27 +230,28 @@ def build_mapped_rfq(src_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Opti
|
|
| 222 |
# ---------- Hybrid difflib score ----------
|
| 223 |
|
| 224 |
|
| 225 |
-
def extract_molecule_base(s:
|
| 226 |
"""Extract core molecule name by removing dosages, units, and forms."""
|
| 227 |
s_norm = norm_base(s)
|
| 228 |
|
| 229 |
-
# Remove
|
| 230 |
-
forms = r'\b(tablet|capsule|cap|injection|inj|syrup|suspension|cream|ointment|gel|drops|spray|powder|inhaler|solution|ampule|amp|vial|via|bottle|bot|sachet|sac|suppository|sup|patch|pat|lotion|respules|res|pfs|kit|num|car|pac|tub|box)\b'
|
| 231 |
-
s_norm = re.sub(forms, ' ', s_norm)
|
| 232 |
|
| 233 |
-
# Remove
|
| 234 |
-
s_norm = re.
|
|
|
|
| 235 |
|
| 236 |
-
# Remove fractions and ratios
|
| 237 |
-
s_norm = re.
|
| 238 |
|
| 239 |
-
# Remove standalone numbers
|
| 240 |
-
s_norm = re.
|
| 241 |
|
| 242 |
-
# Remove w/w, w/v, v/v
|
| 243 |
-
s_norm = re.sub(r'\b[wv]
|
| 244 |
|
| 245 |
-
# Clean up
|
| 246 |
s_norm = re.sub(r'\s+', ' ', s_norm).strip()
|
| 247 |
|
| 248 |
return s_norm
|
|
@@ -359,15 +368,13 @@ def match_generic_to_product_master(
|
|
| 359 |
|
| 360 |
best_score, best_pos, best_parts = -1.0, None, None
|
| 361 |
|
| 362 |
-
|
| 363 |
|
| 364 |
-
|
| 365 |
-
|
| 366 |
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
if parts["score"] > best_score:
|
| 370 |
-
best_score, best_pos, best_parts = parts["score"], pos, parts
|
| 371 |
|
| 372 |
if best_pos is None:
|
| 373 |
continue
|
|
@@ -424,10 +431,8 @@ def match_generic_to_product_master_grouped_for_row(
|
|
| 424 |
|
| 425 |
for idx, mol in enumerate(mol_raw):
|
| 426 |
|
| 427 |
-
# 🔥
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
parts = hybrid_similarity(g_str, combined_text)
|
| 431 |
score = parts["score"]
|
| 432 |
|
| 433 |
if score >= min_score:
|
|
@@ -512,6 +517,21 @@ async def match_with_difflib(
|
|
| 512 |
raise HTTPException(status_code=500, detail=str(e))
|
| 513 |
|
| 514 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
@app.post("/match-difflib-debug")
|
| 516 |
async def match_with_difflib_debug(
|
| 517 |
rfq_file: UploadFile = File(...),
|
|
@@ -678,7 +698,6 @@ def debug_score(a: str, b: str):
|
|
| 678 |
def root():
|
| 679 |
return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
|
| 680 |
|
| 681 |
-
|
| 682 |
if __name__ == "__main__":
|
| 683 |
import uvicorn
|
| 684 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 40 |
return s
|
| 41 |
|
| 42 |
|
| 43 |
+
def extract_numbers(s: str) -> List[str]:
    """Return the sorted, de-duplicated numeric tokens found in ``s``.

    ``s`` is first passed through ``norm_base``. Two kinds of token are then
    collected from the normalised text:

    * number+unit strengths, emitted without inner whitespace so that
      "500mg" and "500 mg" both yield ``"500mg"``;
    * every bare number (integer or decimal), e.g. ``"500"``.

    Args:
        s: Raw text to scan.

    Returns:
        The unique tokens, sorted as strings.
    """
    s2 = norm_base(s)

    # Number followed by a unit. Alphabetic units may carry a plural "s"
    # ("500 gms") and must NOT be followed by another letter; without that
    # guard "500 lactose" would be read as "500 l" and "2 gel" as "2 g".
    # "%" needs no guard because it cannot be the prefix of a longer word.
    num_unit_pattern = (
        r'(\d+(?:\.\d+)?)\s*'
        r'(?:(mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl)s?(?![a-z])|(%))'
    )

    tokens = set()
    for match in re.finditer(num_unit_pattern, s2, flags=re.IGNORECASE):
        number, unit, percent = match.groups()
        # Join number and unit directly so spacing differences in the input
        # do not produce distinct tokens for the same strength.
        tokens.add(number + (unit or percent))

    # Standalone numbers (e.g. "500"), including those that are part of a
    # number+unit strength.
    tokens.update(re.findall(r'\d+(?:\.\d+)?', s2))

    return sorted(tokens)
|
| 56 |
|
| 57 |
|
| 58 |
def token_set(s: str) -> List[str]:
|
|
|
|
| 230 |
# ---------- Hybrid difflib score ----------
|
| 231 |
|
| 232 |
|
| 233 |
+
def extract_molecule_base(s: str) -> str:
    """Extract the core molecule name by removing dosages, units, and forms.

    The input is normalised with ``norm_base`` and then stripped, in order,
    of dosage-form words, number+unit strengths, fractions/ratios,
    standalone numbers and bare w/w-style markers. Runs of whitespace are
    collapsed at the end.

    Args:
        s: Raw product or generic description.

    Returns:
        The remaining text, single-spaced, with no leading or trailing
        whitespace. May be empty if everything was stripped.
    """
    s_norm = norm_base(s)

    # Step 1: Remove dosage forms FIRST
    forms = r'\b(tablet|tablets|capsule|capsules|cap|caps|injection|injections|inj|syrup|syrups|suspension|suspensions|cream|creams|ointment|ointments|gel|gels|drop|drops|spray|sprays|powder|powders|inhaler|inhalers|solution|solutions|ampule|ampules|amp|amps|vial|vials|via|bottle|bottles|bot|bots|sachet|sachets|sac|sacs|suppository|suppositories|sup|sups|patch|patches|pat|pats|lotion|lotions|respule|respules|res|pfs|kit|kits|num|nums|car|cars|pac|pacs|tub|tubs|box|boxes|for)\b'
    s_norm = re.sub(forms, ' ', s_norm, flags=re.IGNORECASE)

    # Step 2: Remove number+unit patterns (handles "500mg" and "500 mg").
    # The trailing guard is (?!\w) rather than \b: a \b can never match
    # between "%" (or the final "." of "i.u.") and a following space or the
    # end of the string, so "1%" and "500 i.u." used to survive this step and
    # leave a stray "%" / "i.u." behind. "%" may also carry its own w/v-style
    # qualifier ("5% w/v"), which is removed together with it.
    strength = (
        r'\b\d+(?:\.\d+)?\s*'
        r'(?:mg|mcg|μg|µg|gm?|kg|iu|i\.u\.|kiu|miu|ml|l|dl'
        r'|%(?:\s*[wv]\s*/\s*[wv])?|w/w|w/v|v/v)(?!\w)'
    )
    s_norm = re.sub(strength, ' ', s_norm, flags=re.IGNORECASE)

    # Step 3: Remove fractions and ratios
    s_norm = re.sub(r'\d+\s*/\s*\d+', ' ', s_norm)

    # Step 4: Remove standalone numbers (digits glued to letters, such as
    # "b12", have no word boundary before them and are kept)
    s_norm = re.sub(r'\b\d+(?:\.\d+)?\b', ' ', s_norm)

    # Step 5: Remove w/w, w/v, v/v that were not attached to a number
    s_norm = re.sub(r'\b[wv]\s*/\s*[wv]\b', ' ', s_norm, flags=re.IGNORECASE)

    # Step 6: Clean up spaces
    s_norm = re.sub(r'\s+', ' ', s_norm).strip()

    return s_norm
|
|
|
|
| 368 |
|
| 369 |
best_score, best_pos, best_parts = -1.0, None, None
|
| 370 |
|
| 371 |
+
for pos, mol in enumerate(mol_raw):
|
| 372 |
|
| 373 |
+
# 🔥 Match ONLY against molecule name (ignore brand)
|
| 374 |
+
parts = hybrid_similarity(g_str, mol)
|
| 375 |
|
| 376 |
+
if parts["score"] > best_score:
|
| 377 |
+
best_score, best_pos, best_parts = parts["score"], pos, parts
|
|
|
|
|
|
|
| 378 |
|
| 379 |
if best_pos is None:
|
| 380 |
continue
|
|
|
|
| 431 |
|
| 432 |
for idx, mol in enumerate(mol_raw):
|
| 433 |
|
| 434 |
+
# 🔥 Match ONLY against molecule name (ignore brand)
|
| 435 |
+
parts = hybrid_similarity(g_str, mol)
|
|
|
|
|
|
|
| 436 |
score = parts["score"]
|
| 437 |
|
| 438 |
if score >= min_score:
|
|
|
|
| 517 |
raise HTTPException(status_code=500, detail=str(e))
|
| 518 |
|
| 519 |
|
| 520 |
+
@app.get("/test-extract-base")
def test_extract_base(text: str):
    """GET handler that reports each text-processing stage for ``text``.

    The response maps the raw input to its ``norm_base`` form, its
    ``extract_molecule_base`` result, its ``extract_numbers`` list and its
    ``token_set`` output.
    """
    report = {"original": text}
    report["normalized"] = norm_base(text)
    report["molecule_base"] = extract_molecule_base(text)
    report["numbers_extracted"] = extract_numbers(text)
    report["tokens"] = token_set(text)
    return report
|
| 533 |
+
|
| 534 |
+
|
| 535 |
@app.post("/match-difflib-debug")
|
| 536 |
async def match_with_difflib_debug(
|
| 537 |
rfq_file: UploadFile = File(...),
|
|
|
|
| 698 |
def root():
|
| 699 |
return {"status": "ok", "message": "POST /match-difflib (rfq_file + product_master_json). Use /match-difflib-grouped to get full RFQ rows + matches."}
|
| 700 |
|
|
|
|
| 701 |
if __name__ == "__main__":
    # Script entry point: serve `app` with uvicorn on all interfaces,
    # port 7860. The import stays inside the guard so it only happens when
    # the file is executed directly.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
|