# Fallback phonemization script (MVP) for Surah Al-Fatiha word annotations.
import json
import re
from arabic_phonemizer import ArabicPhonemizer
# --- Helpers ---
# Very simple Madd detection from script (MVP-level):
# We mark likely long vowels caused by: ا, و, ي, ى, and the madd sign "ٓ".
MADD_CHARS = set(["ا", "و", "ي", "ى", "ٓ"])  # madd letters + maddah sign (U+0653)

# The maddah sign is a combining diacritic, so strip_diacritics() removes it;
# it must therefore be detected on the *original* (unstripped) string.
_MADDAH_SIGN = "\u0653"  # ٓ

# Harakat, shadda/sukun, tanwin, superscript alef, maddah, hamza above/below.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")


def strip_diacritics(s: str) -> str:
    """Return *s* with Arabic diacritical marks (per ARABIC_DIACRITICS) removed."""
    return ARABIC_DIACRITICS.sub("", s)


def detect_madd_positions(word: str) -> list:
    """
    Return indices in the *diacritics-stripped* word where Madd-ish
    characters appear.

    MVP heuristic; later replace with Quranic-Phonemizer (Tajweed-aware).

    Bug fix vs. the original: the maddah sign (U+0653) is itself a
    diacritic, so testing MADD_CHARS against the stripped string could
    never match it. We now scan the original word, tracking the index
    each base letter will have after stripping, and record a position
    when either the base letter is a madd letter or it carries a
    maddah sign.
    """
    positions = []
    base_idx = -1  # index (in the stripped string) of the latest base char
    for ch in word:
        if ARABIC_DIACRITICS.match(ch):
            # A maddah sign lengthens the base letter it sits on; avoid
            # double-recording when that letter is already a madd letter.
            if ch == _MADDAH_SIGN and base_idx >= 0 and base_idx not in positions:
                positions.append(base_idx)
        else:
            base_idx += 1
            if ch in MADD_CHARS:
                positions.append(base_idx)
    return positions
def main():
    """Annotate the canonical Fatiha JSON with fallback phoneme data.

    Reads data/fatiha_canonical.json, enriches every word of every ayah
    with its diacritics-stripped base form, a fallback phonemization, and
    heuristic madd positions, then writes the result to
    data/fatiha_canonical_fallback.json and prints a sample.
    """
    # One phonemizer instance is enough for the whole run.
    phonemizer = ArabicPhonemizer()

    path_in = "data/fatiha_canonical.json"
    with open(path_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    for ayah in data["ayahs"]:
        # ArabicPhonemizer API: use .phonemize(text).
        # If the installed version differs, adapt after a first run.
        ayah["word_info"] = [
            {
                "word": word,
                "base": strip_diacritics(word),
                "phonemes_fallback": phonemizer.phonemize(word),
                "madd_positions_base_index": detect_madd_positions(word),
            }
            for word in ayah["words"]
        ]

    path_out = "data/fatiha_canonical_fallback.json"
    with open(path_out, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", path_out)
    print("Sample ayah 1 word_info:")
    for item in data["ayahs"][0]["word_info"]:
        print(" -", item["word"], "| base:", item["base"], "| madd idx:", item["madd_positions_base_index"], "| ph:", item["phonemes_fallback"])


if __name__ == "__main__":
    main()