# iRecite-MVP-API / step14_align_text_to_canonical.py
# Author: didodev — commit 4ca6263 ("Deploy iRecite MVP API (Docker + FastAPI)")
import json
import re
from difflib import SequenceMatcher
CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment.json"
# --- Normalization helpers ---
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]") # harakat etc.
TATWEEL = "\u0640"
def normalize_ar(s: str) -> str:
s = s.replace(TATWEEL, "")
s = re.sub(ARABIC_DIACRITICS, "", s)
# normalize common variants
s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
s = s.replace("ى", "ي")
s = s.replace("ة", "ه")
s = re.sub(r"\s+", " ", s).strip()
return s
def tokenize(s: str):
# keep Arabic letters and spaces only
s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s.split(" ") if s else []
def sim(a, b) -> float:
return SequenceMatcher(None, a, b).ratio()
def main():
canon = json.load(open(CANON_PATH, encoding="utf-8"))
# Load ASR raw text (we will create it in 14.2)
raw = open(ASR_TEXT_PATH, encoding="utf-8").read().strip()
raw_n = normalize_ar(raw)
asr_tokens = tokenize(raw_n)
# Canonical tokens (word-level) from JSON
canon_words = []
for ay in canon["ayahs"]:
for w in ay["words"]:
canon_words.append({
"ayah": ay["ayah"],
"word": w,
"norm": normalize_ar(w)
})
# Greedy alignment: for each canonical word, find best match in a moving window of ASR tokens
aligned = []
j = 0
WINDOW = 6
for i, cw in enumerate(canon_words):
best = None
best_j = None
for k in range(j, min(len(asr_tokens), j + WINDOW)):
score = sim(cw["norm"], asr_tokens[k])
if (best is None) or (score > best):
best = score
best_j = k
if best is None:
aligned.append({
"canon": cw,
"asr_token": None,
"score": 0.0,
"match": False
})
continue
token = asr_tokens[best_j]
match = best >= 0.75 # MVP threshold
aligned.append({
"canon": cw,
"asr_token": token,
"score": round(float(best), 3),
"match": bool(match)
})
# advance pointer to keep order
j = best_j + 1
# Summaries
total = len(aligned)
matches = sum(1 for a in aligned if a["match"])
mismatches = total - matches
out = {
"asr_raw": raw,
"asr_normalized": raw_n,
"stats": {
"canonical_words": total,
"matches": matches,
"mismatches": mismatches,
"match_rate": round(matches / total, 3) if total else 0.0
},
"alignment": aligned
}
json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
print("OK ✅ wrote", OUT_PATH)
print("Match rate:", out["stats"]["match_rate"])
print("First 5 alignments:")
for a in aligned[:5]:
print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
if __name__ == "__main__":
main()