Spaces:
Sleeping
Sleeping
import json
import re

from arabic_phonemizer import ArabicPhonemizer
# --- Helpers ---
# Very simple Madd detection from script (MVP-level):
# likely long vowels are marked by alef (U+0627), waw (U+0648), yeh (U+064A),
# alef maksura (U+0649), and the combining maddah sign (U+0653).
# The characters are written as escapes so the set survives re-encoding.
# NOTE(review): the letters were restored from a mis-encoded listing; confirm
# that these five are the intended set.
MADD_CHARS = {"\u0627", "\u0648", "\u064A", "\u0649", "\u0653"}

# Combining marks removed before indexing into a word: tanwin and harakat
# (U+064B-U+0652), superscript alef (U+0670), and U+0653-U+0655. This range
# includes the maddah sign that is also listed in MADD_CHARS.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
def strip_diacritics(s: str) -> str:
    """Return *s* with every character matched by ARABIC_DIACRITICS removed."""
    return ARABIC_DIACRITICS.sub("", s)
def detect_madd_positions(word: str) -> list:
    """
    Return sorted indices in the *diacritics-stripped* word where Madd-ish marks appear.

    An index is reported when the base character at that position is in
    MADD_CHARS, or when a maddah sign (U+0653) is attached to it in *word*.
    The maddah is itself matched by ARABIC_DIACRITICS, so it never occupies a
    slot in the stripped word; it is credited to the nearest preceding base
    character instead. A maddah with no preceding base character is ignored.

    MVP heuristic; later replace with Quranic-Phonemizer (Tajweed-aware).
    """
    maddah = "\u0653"
    positions = set()
    base_len = 0  # count of non-diacritic characters seen so far
    for ch in word:
        if ARABIC_DIACRITICS.fullmatch(ch):
            # Diacritics are dropped from the stripped word, so they do not
            # advance the index; only the maddah contributes a position.
            if ch == maddah and base_len:
                positions.add(base_len - 1)
            continue
        if ch in MADD_CHARS:
            positions.add(base_len)
        base_len += 1
    # A letter can qualify both by itself and by a maddah on it; the set
    # keeps each index once and sorting restores left-to-right order.
    return sorted(positions)
def main(path_in="data/fatiha_canonical.json",
         path_out="data/fatiha_canonical_fallback.json"):
    """
    Annotate every word of every ayah with fallback phoneme and Madd data.

    Reads the JSON document at *path_in*, adds a "word_info" list to each
    entry of its "ayahs" list (one dict per item of that ayah's "words"),
    writes the augmented document to *path_out*, and prints the word_info of
    the first ayah as a quick visual check.

    Args:
        path_in: JSON file to read; must contain an "ayahs" list whose items
            each have a "words" list.
        path_out: JSON file to write (UTF-8, non-ASCII kept as-is).
    """
    # Instantiate phonemizer once
    ph = ArabicPhonemizer()

    with open(path_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    for ay in data["ayahs"]:
        # NOTE(review): assumes ArabicPhonemizer exposes .phonemize(text);
        # confirm against the installed version.
        ay["word_info"] = [
            {
                "word": w,
                "base": strip_diacritics(w),
                "phonemes_fallback": ph.phonemize(w),
                "madd_positions_base_index": detect_madd_positions(w),
            }
            for w in ay["words"]
        ]

    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("OK - wrote", path_out)

    # Nothing to preview when the document has no ayahs; indexing [0] would
    # raise IndexError after the output file was already written.
    if not data["ayahs"]:
        return

    print("Sample ayah 1 word_info:")
    for item in data["ayahs"][0]["word_info"]:
        print(
            " -", item["word"],
            "| base:", item["base"],
            "| madd idx:", item["madd_positions_base_index"],
            "| ph:", item["phonemes_fallback"],
        )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()