Spaces:
Sleeping
Sleeping
| import re | |
| import difflib | |
| from fastapi import FastAPI, Query | |
| from fastapi.middleware.cors import CORSMiddleware | |
app = FastAPI(title="String Similarity API (Hybrid Difflib + Jaccard + Numeric Bonus)")

# CORS: open to every origin for testing; lock down in prod.
# Credentials are disabled on purpose: the FastAPI CORS documentation states
# that allow_origins, allow_methods and allow_headers cannot be ["*"] when
# allow_credentials is True. If cookies or Authorization headers are ever
# needed cross-origin, list the allowed origins explicitly and re-enable it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Alternation of the unit suffixes recognised after a number.
# NOTE: the parentheses form a *capturing* group, which changes what
# re.findall returns for any pattern that embeds this constant.
UNIT_PATTERN = r"(mg|mcg|g|iu|ml|%)"
def norm_base(s: str) -> str:
    """Return a normalized form of ``s`` suitable for comparison.

    The value is converted to ``str`` (any falsy value becomes ``""``),
    lowercased, has ``+`` and ``/`` turned into spaces, has every character
    that is not a word character, whitespace, ``.``, ``%`` or ``-`` replaced
    by a space, and finally has runs of whitespace collapsed to a single
    space with leading/trailing whitespace removed.
    """
    text = str(s or "").lower()
    # "+" and "/" act as separators between terms, so treat them as spaces.
    for separator in ("+", "/"):
        text = text.replace(separator, " ")
    # Blank out remaining punctuation outside the allowed character class.
    text = re.sub(r"[^\w\s.%/+-]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
def extract_numbers(s: str) -> list:
    """Return the sorted, de-duplicated numeric tokens found in ``s``.

    ``s`` is normalized with ``norm_base`` first. Two kinds of tokens are
    collected:

    * number+unit pairs, emitted without inner whitespace so that
      ``"500 mg"`` and ``"500mg"`` both yield ``"500mg"``;
    * bare numbers such as ``"500"`` or ``"0.5"``.

    Returns:
        A sorted list of unique strings (empty when ``s`` has no numbers).
    """
    text = norm_base(s)

    # UNIT_PATTERN contains its own capturing group, so re.findall would
    # return only the unit ("mg") and drop the number. Named groups plus
    # finditer keep the quantity and its unit together.
    # (?!\w) replaces a trailing \b so that "%" (a non-word character) can
    # end a match; for alphabetic units it is equivalent to \b.
    number_with_unit = rf"\b(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>{UNIT_PATTERN})(?!\w)"
    found = {
        m.group("num") + m.group("unit")
        for m in re.finditer(number_with_unit, text)
    }

    # Bare numbers are still recorded, with or without a following unit.
    found.update(re.findall(r"\b\d+(?:\.\d+)?\b", text))
    return sorted(found)
def token_set(s: str):
    """Return the non-empty space-separated tokens of the normalized ``s``.

    Despite the name, the result is a list: order is preserved and
    duplicates are kept. Callers wrap it in ``set`` when they need one.
    """
    normalized = norm_base(s)
    # filter(None, ...) drops the empty string produced by splitting "".
    return list(filter(None, normalized.split(" ")))
def hybrid_similarity(a: str, b: str):
    """Score how similar ``a`` and ``b`` are on a 0-100 scale.

    Three components are computed and blended:

    * ``diff`` - ``difflib.SequenceMatcher`` ratio of the normalized strings;
    * ``jacc`` - Jaccard overlap of the normalized token sets
      (0 when either side has no tokens);
    * ``num``  - 100 when both strings contain numbers and the extracted
      number sets are equal, otherwise 0.

    ``score = 0.60 * diff + 0.30 * jacc + 0.10 * num``. Because ``num`` is 0
    when either string has no numbers, such pairs score at most 90 unless
    their normalized forms are identical, which short-circuits to 100.

    Returns:
        A dict with keys ``diff``, ``jacc``, ``num`` and ``score``; ``diff``,
        ``jacc`` and ``score`` are rounded to two decimals.
    """
    left, right = norm_base(a), norm_base(b)
    if left == right:
        # Identical after normalization: every component is a perfect match.
        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "score": 100.0}

    matcher = difflib.SequenceMatcher(None, left, right)
    diff = matcher.ratio() * 100.0

    left_tokens = set(token_set(a))
    right_tokens = set(token_set(b))
    if left_tokens and right_tokens:
        shared = left_tokens.intersection(right_tokens)
        combined = left_tokens.union(right_tokens)
        jacc = len(shared) / len(combined) * 100.0
    else:
        jacc = 0.0

    left_nums, right_nums = extract_numbers(a), extract_numbers(b)
    numbers_agree = (
        bool(left_nums) and bool(right_nums) and set(left_nums) == set(right_nums)
    )
    num_bonus = 100.0 if numbers_agree else 0.0

    score = 0.60 * diff + 0.30 * jacc + 0.10 * num_bonus
    return {
        "diff": round(diff, 2),
        "jacc": round(jacc, 2),
        "num": num_bonus,
        "score": round(score, 2),
    }
# NOTE(review): no route decorator is visible on this function in this view;
# confirm it is registered on `app` (the root message mentions /string-match).
def string_match(
    a: str = Query(..., description="First string to compare"),
    b: str = Query(..., description="Second string to compare"),
    threshold: float = Query(70.0, ge=0.0, le=100.0, description="Threshold for considering a match")
):
    """Compare ``a`` and ``b`` and report whether they match.

    The pair is scored with ``hybrid_similarity``; it counts as matched when
    the blended score is greater than or equal to ``threshold``.

    Returns:
        A dict echoing both inputs, the score as ``percent_match``, the
        boolean ``matched`` verdict and the per-component breakdown under
        ``components``.
    """
    components = hybrid_similarity(a, b)
    percent = components["score"]
    is_match = percent >= threshold
    return {
        "string_a": a,
        "string_b": b,
        "percent_match": round(percent, 2),
        "matched": bool(is_match),
        "components": components,
    }
# NOTE(review): no route decorator is visible on this function in this view;
# confirm it is registered on `app`.
def root():
    """Return a static status payload that points at the comparison endpoint."""
    usage_hint = "Use /string-match?a=...&b=... for comparisons. See /docs for Swagger UI."
    return dict(status="ok", message=usage_hint)