Spaces:

anujakkulkarni
/

match-two-strings

Sleeping

App Files Community

match-two-strings / app.py

anujakkulkarni

Create app.py

57779e5 verified 4 months ago

raw

history blame contribute delete

2.4 kB

	import re
	import difflib
	from fastapi import FastAPI, Query
	from fastapi.middleware.cors import CORSMiddleware

	# Application object that the route decorators further down attach to.
	app = FastAPI(
	    title="String Similarity API (Hybrid Difflib + Jaccard + Numeric Bonus)",
	)

	# CORS is deliberately wide open so any front end can call the API while
	# testing; tighten it before a production deployment.
	# NOTE(review): wildcard origins combined with allow_credentials=True looks
	# risky -- confirm against the FastAPI CORS docs and list explicit origins
	# when locking this down.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_methods=["*"],
	    allow_headers=["*"],
	    allow_credentials=True,
	)

	# Regex fragment matching a unit suffix that may follow a number
	# (mg, mcg, g, iu, ml or a percent sign). The pipes must be unescaped:
	# inside a raw string "\|" is a literal "|" character, not alternation.
	UNIT_PATTERN = r"(mg|mcg|g|iu|ml|%)"

	def norm_base(s: str) -> str:
	    """Normalise free text so two strings can be compared fairly.

	    The value is stringified (falsy values such as ``None`` become ``""``),
	    lower-cased, ``+`` and ``/`` are turned into spaces, every character
	    that is not a word character, whitespace, ``.``, ``%`` or ``-`` is
	    replaced by a space, and runs of whitespace collapse to one space.

	    Args:
	        s: Text to normalise; falsy values are treated as an empty string.

	    Returns:
	        The normalised string with no leading or trailing whitespace.
	    """
	    s = str(s or "").lower()
	    # "+" and "/" act as token separators, e.g. "a+b/c" -> "a b c".
	    s = s.replace("+", " ").replace("/", " ")
	    # Drop remaining punctuation but keep ".", "%" and "-" so decimals,
	    # percentages and hyphenated names survive.
	    s = re.sub(r"[^\w\s.%/+-]", " ", s)
	    return re.sub(r"\s+", " ", s).strip()

	def extract_numbers(s: str):
	    """Collect the numeric tokens that appear in *s*.

	    The text is first normalised with ``norm_base``. Two kinds of token
	    are gathered: a number followed (optionally after whitespace) by a
	    unit from ``UNIT_PATTERN``, kept together as matched (e.g.
	    ``"500 mg"``), and every standalone integer or decimal number.

	    Args:
	        s: Raw text to scan.

	    Returns:
	        A sorted list of the unique tokens, as strings.
	    """
	    s2 = norm_base(s)
	    # Use finditer + group(0): re.findall would return only the capture
	    # group inside UNIT_PATTERN (the bare unit) and drop the number.
	    # The unit is closed with (?!\w) instead of \b because \b after "%"
	    # only matches when a word character follows, so "5%" at the end of a
	    # token was never recognised.
	    num_unit_re = rf"\b\d+(?:\.\d+)?\s*(?:{UNIT_PATTERN})(?!\w)"
	    num_unit = [m.group(0) for m in re.finditer(num_unit_re, s2)]
	    nums = re.findall(r"\b\d+(?:\.\d+)?\b", s2)
	    return sorted({x.strip() for x in num_unit + nums})

	def token_set(s: str):
	    """Split *s* into its normalised, non-empty tokens.

	    Despite the name, a list is returned (order and duplicates are kept);
	    callers wrap it in ``set`` when they need set semantics.

	    Args:
	        s: Raw text to tokenise.

	    Returns:
	        The whitespace-separated tokens of ``norm_base(s)``.
	    """
	    # norm_base leaves only single spaces between tokens, so a bare
	    # split() yields exactly the non-empty tokens.
	    return norm_base(s).split()

	def hybrid_similarity(a: str, b: str):
	    """Score how similar two strings are on a 0-100 scale.

	    Three components are blended:

	    * ``diff`` - ``difflib.SequenceMatcher`` ratio of the normalised
	      strings, as a percentage.
	    * ``jacc`` - Jaccard overlap of the two normalised token sets, as a
	      percentage (0 when either side has no tokens).
	    * ``num`` - all-or-nothing bonus: 100 when both strings contain
	      numbers and their extracted number sets are identical, else 0.

	    The final score is ``0.60 * diff + 0.30 * jacc + 0.10 * num``.
	    Strings that are equal after normalisation short-circuit to 100 for
	    every component.

	    Args:
	        a: First string.
	        b: Second string.

	    Returns:
	        A dict with float values under the keys ``"diff"``, ``"jacc"``,
	        ``"num"`` and ``"score"``; diff, jacc and score are rounded to
	        two decimal places.
	    """
	    a_n, b_n = norm_base(a), norm_base(b)
	    if a_n == b_n:
	        return {"diff": 100.0, "jacc": 100.0, "num": 100.0, "score": 100.0}

	    diff = difflib.SequenceMatcher(None, a_n, b_n).ratio() * 100.0

	    aset, bset = set(token_set(a)), set(token_set(b))
	    # Jaccard = |intersection| / |union|; guarded so an empty side gives 0.
	    jacc = (len(aset & bset) / len(aset | bset) * 100.0) if (aset and bset) else 0.0

	    anums, bnums = extract_numbers(a), extract_numbers(b)
	    num_bonus = 100.0 if (anums and bnums and set(anums) == set(bnums)) else 0.0

	    score = 0.60 * diff + 0.30 * jacc + 0.10 * num_bonus
	    return {
	        "diff": round(diff, 2),
	        "jacc": round(jacc, 2),
	        "num": num_bonus,
	        "score": round(score, 2),
	    }

	@app.get("/string-match")
	def string_match(
	    a: str = Query(..., description="First string to compare"),
	    b: str = Query(..., description="Second string to compare"),
	    threshold: float = Query(70.0, ge=0.0, le=100.0, description="Threshold for considering a match"),
	):
	    """Compare two query-string values and report whether they match.

	    Args:
	        a: First string to compare (required query parameter).
	        b: Second string to compare (required query parameter).
	        threshold: Minimum score, between 0 and 100, for the pair to count
	            as a match; defaults to 70.

	    Returns:
	        A dict echoing both inputs, the overall ``percent_match`` score,
	        the ``matched`` flag (score >= threshold) and the per-component
	        scores from ``hybrid_similarity`` under ``components``.
	    """
	    parts = hybrid_similarity(a, b)
	    score = parts["score"]
	    return {
	        "string_a": a,
	        "string_b": b,
	        "percent_match": round(score, 2),
	        # The comparison already yields a bool; no extra conversion needed.
	        "matched": score >= threshold,
	        "components": parts,
	    }

	@app.get("/")
	def root():
	    """Landing endpoint: report that the service is up and how to use it.

	    Returns:
	        A dict with a ``status`` of ``"ok"`` and a usage ``message``.
	    """
	    return {
	        "status": "ok",
	        "message": "Use /string-match?a=...&b=... for comparisons. See /docs for Swagger UI.",
	    }