File size: 2,399 Bytes
57779e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
import difflib
from fastapi import FastAPI, Query
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI(title="String Similarity API (Hybrid Difflib + Jaccard + Numeric Bonus)")

# CORS: open to every origin for testing; replace the wildcard in
# `allow_origins` with an explicit list before going to production.
#
# Credentials are deliberately disabled. The CORS rules do not permit a
# wildcard origin on credentialed requests, and FastAPI's documentation states
# that `allow_origins`, `allow_methods` and `allow_headers` must all be
# explicit when `allow_credentials=True`. Combining the two would effectively
# let any website make cookie-bearing calls on behalf of a visitor. This API
# exposes only stateless GET endpoints, so nothing here needs cookies or
# Authorization headers from the browser.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Regex alternation of the unit suffixes looked for after a number:
# mg, mcg, g, iu, ml and %.
# NOTE: the parentheses form a *capturing* group, so re.findall() on a pattern
# that embeds this fragment returns only the captured unit text rather than
# the whole match.
UNIT_PATTERN = r"(mg|mcg|g|iu|ml|%)"

def norm_base(s: str) -> str:
    """Return a canonical, lower-cased form of *s* for comparison.

    Falsy input (``None``, ``""``, ``0``) becomes the empty string. ``+`` and
    ``/`` are treated as separators, every character that is not a word
    character, whitespace, ``.``, ``%`` or ``-`` is replaced by a space, and
    runs of whitespace are collapsed to a single space with the ends trimmed.
    """
    text = str(s or "").lower()

    # "+" and "/" join items (e.g. "a+b", "a/b"), so treat them as word breaks.
    for separator in "+/":
        text = text.replace(separator, " ")

    # Blank out remaining punctuation, then squeeze the whitespace it leaves.
    text = re.sub(r"[^\w\s.%/+-]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def extract_numbers(s: str):
    """Return the sorted, de-duplicated numeric tokens found in *s*.

    The input is normalised with ``norm_base`` first. Two kinds of token are
    collected:

    * a number immediately followed (optionally after whitespace) by one of
      the units in ``UNIT_PATTERN`` -- reported both as the bare number and as
      the compact ``<number><unit>`` form, so ``"500 mg"`` and ``"500mg"``
      produce the same tokens (``"500"`` and ``"500mg"``);
    * any other stand-alone integer or decimal number.

    Returns:
        A sorted list of unique strings.
    """
    text = norm_base(s)
    number = r"\d+(?:\.\d+)?"

    # Named groups are used so the result does not depend on how many
    # capturing groups UNIT_PATTERN itself contains (re.findall would return
    # only the captured unit and silently drop the number).
    # `(?!\w)` replaces a trailing `\b`: `\b` can never match after "%" at the
    # end of a token, which would make percentages undetectable.
    with_unit = re.compile(
        rf"\b(?P<num>{number})\s*(?P<unit>{UNIT_PATTERN})(?!\w)"
    )

    found = set()
    for match in with_unit.finditer(text):
        value = match.group("num")
        found.add(value)
        found.add(value + match.group("unit"))

    # Blank out the number+unit spans before scanning for bare numbers so
    # that fragments of them (e.g. "12" out of "12.5mg") are not reported.
    remainder = with_unit.sub(" ", text)
    found.update(re.findall(rf"\b{number}\b", remainder))

    return sorted(found)

def token_set(s: str):
    """Split the normalised form of *s* into its non-empty tokens.

    Despite the name, the result is a list: token order and duplicates are
    preserved, and callers convert it to a ``set`` when they need one.
    """
    pieces = norm_base(s).split(" ")
    # filter(None, ...) drops the empty strings produced by an empty input.
    return list(filter(None, pieces))

def hybrid_similarity(a: str, b: str):
    """Score how alike *a* and *b* are on a 0-100 scale.

    Three components are computed on the normalised strings:

    * ``diff`` -- ``difflib.SequenceMatcher`` ratio, as a percentage;
    * ``jacc`` -- Jaccard overlap of the token sets, as a percentage
      (0 when either side has no tokens);
    * ``num``  -- 100 when both sides contain numbers and the number sets are
      identical, otherwise 0.

    The final ``score`` is ``0.60 * diff + 0.30 * jacc + 0.10 * num``.
    Strings that normalise to the same text short-circuit to 100 everywhere.

    Returns:
        A dict with keys ``diff``, ``jacc``, ``num`` and ``score``; ``diff``,
        ``jacc`` and ``score`` are rounded to two decimals.
    """
    left, right = norm_base(a), norm_base(b)
    if left == right:
        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "score": 100.0}

    # Character-level similarity.
    matcher = difflib.SequenceMatcher(None, left, right)
    diff_pct = matcher.ratio() * 100.0

    # Token-level (bag of words) similarity.
    tokens_a = set(token_set(a))
    tokens_b = set(token_set(b))
    if tokens_a and tokens_b:
        jacc_pct = len(tokens_a & tokens_b) / len(tokens_a | tokens_b) * 100.0
    else:
        jacc_pct = 0.0

    # All-or-nothing bonus for carrying exactly the same numbers.
    numbers_a = set(extract_numbers(a))
    numbers_b = set(extract_numbers(b))
    same_numbers = bool(numbers_a) and bool(numbers_b) and numbers_a == numbers_b
    num_pct = 100.0 if same_numbers else 0.0

    combined = 0.60 * diff_pct + 0.30 * jacc_pct + 0.10 * num_pct

    result = {"diff": round(diff_pct, 2), "jacc": round(jacc_pct, 2)}
    result["num"] = num_pct
    result["score"] = round(combined, 2)
    return result

@app.get("/string-match")
def string_match(
    a: str = Query(..., description="First string to compare"),
    b: str = Query(..., description="Second string to compare"),
    threshold: float = Query(70.0, ge=0.0, le=100.0, description="Threshold for considering a match")
):
    # GET /string-match: compare query strings `a` and `b` and report the
    # hybrid similarity score, its components, and whether the score reaches
    # `threshold` (0-100, default 70).
    components = hybrid_similarity(a, b)
    overall = components["score"]

    response = {"string_a": a, "string_b": b}
    response["percent_match"] = round(overall, 2)
    response["matched"] = overall >= threshold
    response["components"] = components
    return response

@app.get("/")
def root():
    # GET /: health/landing endpoint that reports status and points callers
    # at the comparison route and the interactive docs.
    usage_hint = "Use /string-match?a=...&b=... for comparisons. See /docs for Swagger UI."
    return dict(status="ok", message=usage_hint)