# iRecite-MVP-API / step7_fallback_phonemes_and_madd.py
# Author: didodev
# Commit: 4ca6263 — "Deploy iRecite MVP API (Docker + FastAPI)"
import json
import re
from arabic_phonemizer import ArabicPhonemizer
# --- Helpers ---
# Very simple Madd detection from script (MVP-level):
# We mark likely long vowels caused by: ุง, ูˆ, ูŠ, ู‰, and madd sign "ู“"
# NOTE(review): "ู“" (U+0653, ARABIC MADDAH ABOVE) also falls inside the
# ARABIC_DIACRITICS pattern below — keep that overlap in mind anywhere madd
# detection operates on the diacritics-stripped form of a word.
MADD_CHARS = set(["ุง", "ูˆ", "ูŠ", "ู‰", "ู“"])
# Combining marks removed by strip_diacritics: tanwin + harakat
# (U+064B–U+0652), dagger alif (U+0670), and madda/hamza marks
# (U+0653–U+0655).
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]") # tanwin, harakat, etc.
def strip_diacritics(s: str) -> str:
    """Return *s* with Arabic combining diacritics removed.

    Deletes every character matched by ARABIC_DIACRITICS (tanwin,
    harakat, dagger alif, madda/hamza marks); all base letters are
    kept unchanged.
    """
    return ARABIC_DIACRITICS.sub("", s)
def detect_madd_positions(word: str) -> list[int]:
    """Return indices (into the diacritics-stripped word) of likely Madd letters.

    MVP heuristic; later replace with Quranic-Phonemizer (Tajweed-aware).

    Bug fixed vs. the original: the madd sign "ู“" (U+0653) is listed in
    MADD_CHARS but is also matched by ARABIC_DIACRITICS, so stripping first
    made it undetectable. We now walk the original word, tracking the index
    a character would have in the stripped word, and attribute a combining
    madda to the base letter it sits on. Output is unchanged for any word
    that contains no combining madda.
    """
    positions: list[int] = []
    base_idx = -1  # index of the most recent base (non-diacritic) character
    for ch in word:
        if ARABIC_DIACRITICS.match(ch):
            # U+0653 is a combining mark riding on the previous base letter;
            # record that letter's stripped-word index (once).
            if ch == "\u0653" and base_idx >= 0 and base_idx not in positions:
                positions.append(base_idx)
        else:
            base_idx += 1
            if ch in MADD_CHARS:
                positions.append(base_idx)
    return positions
def main():
    """Annotate the canonical Fatiha JSON with fallback phonemes and madd hints.

    Reads data/fatiha_canonical.json (expects {"ayahs": [{"words": [...]}]}),
    attaches a "word_info" list to each ayah — per word: the raw word, its
    diacritics-stripped base, ArabicPhonemizer output, and heuristic madd
    positions — then writes data/fatiha_canonical_fallback.json and prints a
    sample of the first ayah.
    """
    # Instantiate the phonemizer once and reuse it for every word.
    ph = ArabicPhonemizer()

    path_in = "data/fatiha_canonical.json"
    with open(path_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    for ay in data["ayahs"]:
        ay_word_info = []
        for w in ay["words"]:
            base = strip_diacritics(w)
            # ArabicPhonemizer API: use .phonemize(text).
            # If your version differs, we'll adapt after you run it.
            phonemes = ph.phonemize(w)
            ay_word_info.append({
                "word": w,
                "base": base,
                "phonemes_fallback": phonemes,
                "madd_positions_base_index": detect_madd_positions(w),
            })
        ay["word_info"] = ay_word_info

    path_out = "data/fatiha_canonical_fallback.json"
    with open(path_out, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Arabic text readable in the output file.
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Fixed mojibake: the original printed "โœ…" (UTF-8 "✅" mis-decoded as cp1252).
    print("OK ✅ wrote", path_out)
    print("Sample ayah 1 word_info:")
    for item in data["ayahs"][0]["word_info"]:
        print(" -", item["word"], "| base:", item["base"],
              "| madd idx:", item["madd_positions_base_index"],
              "| ph:", item["phonemes_fallback"])
# Script entry point: run the annotation pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()