import json
import re

# --- Helpers ---
# Very simple Madd detection from script (MVP-level):
# We mark likely long vowels caused by: ا, و, ي, ى, and the madd sign "ٓ" (U+0653).
MADD_CHARS = set(["ا", "و", "ي", "ى", "ٓ"])

# Tanwin, harakat, dagger alif, maddah, hamza-above/below marks.
# NOTE: U+0653 (maddah) is both a "madd" marker and a diacritic, so it is
# stripped by strip_diacritics(); madd detection must look at the raw word.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")


def strip_diacritics(s: str) -> str:
    """Return *s* with Arabic diacritics (harakat, tanwin, maddah, ...) removed."""
    return ARABIC_DIACRITICS.sub("", s)


def detect_madd_positions(word: str):
    """
    Return sorted indices, in the *diacritics-stripped* word, of likely Madd.

    A position is reported when the base letter itself is a long-vowel
    carrier (ا, و, ي, ى) or when it carries the maddah sign "ٓ" (U+0653)
    in the original (diacritized) word.

    MVP heuristic; later replace with Quranic-Phonemizer (Tajweed-aware).
    """
    positions = set()
    base_index = -1  # index of the most recent base (non-diacritic) letter
    for ch in word:
        if ARABIC_DIACRITICS.match(ch):
            # Diacritic: it attaches to the preceding base letter.
            # Fix: the maddah sign was previously stripped before the check,
            # so it could never be detected; attribute it to its carrier.
            if ch == "ٓ" and base_index >= 0:
                positions.add(base_index)
        else:
            base_index += 1
            if ch in MADD_CHARS:
                positions.add(base_index)
    return sorted(positions)


def main():
    """Phonemize the canonical Fatiha word list and write an annotated copy."""
    # Imported lazily so the pure-text helpers above can be used (and tested)
    # without the third-party arabic_phonemizer package installed.
    from arabic_phonemizer import ArabicPhonemizer

    # Instantiate phonemizer once
    ph = ArabicPhonemizer()

    path_in = "data/fatiha_canonical.json"
    with open(path_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    for ay in data["ayahs"]:
        ay_word_info = []
        for w in ay["words"]:
            base = strip_diacritics(w)
            # ArabicPhonemizer API: use .phonemize(text)
            # If your version differs, we’ll adapt after you run it.
            phonemes = ph.phonemize(w)
            ay_word_info.append({
                "word": w,
                "base": base,
                "phonemes_fallback": phonemes,
                "madd_positions_base_index": detect_madd_positions(w)
            })
        ay["word_info"] = ay_word_info

    path_out = "data/fatiha_canonical_fallback.json"
    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", path_out)
    print("Sample ayah 1 word_info:")
    for item in data["ayahs"][0]["word_info"]:
        print(" -", item["word"],
              "| base:", item["base"],
              "| madd idx:", item["madd_positions_base_index"],
              "| ph:", item["phonemes_fallback"])


if __name__ == "__main__":
    main()