Spaces:
Running
Running
Commit
·
7150d27
1
Parent(s):
2cc687a
Deploy application
Browse files- README.md +4 -5
- app.py +186 -0
- arabic_diacritizer_common/__init__.py +24 -0
- arabic_diacritizer_common/cleaners.py +141 -0
- arabic_diacritizer_common/constants.py +150 -0
- arabic_diacritizer_common/postprocessor.py +81 -0
- arabic_diacritizer_common/segmenter.py +94 -0
- arabic_diacritizer_common/tokenizer.py +169 -0
- diacritizer/__init__.py +11 -0
- diacritizer/diacritizer.py +132 -0
- diacritizer/exceptions.py +11 -0
- diacritizer/hub_manager.py +103 -0
- diacritizer/predictor.py +46 -0
- requirements.txt +12 -0
README.md
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
---
|
| 2 |
-
title: Arabic Diacritizer
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.47.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Arabic Diacritizer Demo
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.47.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import time
|
| 3 |
+
from diacritizer import Diacritizer, ModelNotFound
|
| 4 |
+
|
| 5 |
+
# Metadata for every architecture/size combination served by the demo.
# "size" is the checkpoint size string shown in the UI; "details" is a short
# human-readable description surfaced in the info panel.
MODEL_INFO = {
    "bilstm": {
        "display_name": "BiLSTM",
        "models": {
            "medium": {
                "size": "4 MB",
                "details": "Balanced speed and accuracy.",
            },
            "large": {
                "size": "15.5 MB",
                "details": "Highest accuracy model.",
            },
        },
    },
    "bigru": {
        "display_name": "BiGRU",
        "models": {
            "medium": {
                "size": "3.8 MB",
                "details": "Slightly faster than BiLSTM with comparable accuracy.",
            },
            "large": {
                "size": "14.9 MB",
                "details": "High accuracy alternative to the BiLSTM model.",
            },
        },
    },
}

# Cache of loaded Diacritizer instances keyed by "architecture/size".
MODEL_CACHE = {}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def get_model(architecture: str, size: str, progress=gr.Progress()):
    """
    Return a cached Diacritizer for the given architecture/size, loading it on demand.

    Reports loading status through gr.Progress and converts load failures
    into gr.Error so the UI can display them to the user.
    """
    cache_key = f"{architecture}/{size}"
    if cache_key in MODEL_CACHE:
        return MODEL_CACHE[cache_key]

    progress(0.5, desc=f"Loading {architecture}/{size} model...")
    try:
        model = Diacritizer(architecture=architecture, size=size)
    except ModelNotFound:
        raise gr.Error(
            f"The requested model ({cache_key}) was not found on the Hugging Face Hub."
        )
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred while loading the model: {e}")

    MODEL_CACHE[cache_key] = model
    return model
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def diacritize_text(text: str, architecture: str, size: str, progress=gr.Progress()):
    """
    Diacritize *text* with the selected model, reporting progress to the UI.

    Returns a (diacritized_text, inference_time, info_markdown) tuple matching
    the Gradio outputs [output_text, inference_time_output, info_display].
    """
    if not text or not text.strip():
        return "", "0.000s", "Please enter some text to diacritize."

    progress(0, desc="Loading model...")
    diacritizer = get_model(architecture, size, progress)

    progress(0.8, desc="Diacritizing text...")
    # perf_counter() is a monotonic clock, so the measurement cannot jump or go
    # negative if the system wall clock is adjusted mid-inference (time.time()
    # could); it is the recommended clock for elapsed-time measurement.
    start_time = time.perf_counter()
    diacritized_text = diacritizer.diacritize(text)
    elapsed = time.perf_counter() - start_time

    inference_time = f"{elapsed:.3f}s"

    # Update the info text with the final result details
    model_details = MODEL_INFO[architecture]["models"][size]["details"]
    final_info_text = (
        f"**Model:** {architecture}/{size} | "
        f"**Size:** {MODEL_INFO[architecture]['models'][size]['size']} | {model_details}"
    )

    return diacritized_text, inference_time, final_info_text
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def update_available_sizes(architecture: str):
    """Callback to update the size choices when the architecture changes."""
    sizes = [size for size in MODEL_INFO[architecture]["models"]]
    # Rebuild the Radio so its choices match the newly selected architecture,
    # defaulting to the first available size.
    return gr.Radio(
        choices=sizes,
        value=sizes[0],
        label="Model Size",
        info="Select the model size.",
    )
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# Soft theme with Latin + Arabic Google fonts so both scripts render cleanly.
theme = gr.themes.Soft(
    primary_hue="zinc",
    secondary_hue="blue",
    neutral_hue="slate",
    font=(gr.themes.GoogleFont("Noto Sans"), gr.themes.GoogleFont("Noto Sans Arabic")),
).set(
    body_background_fill_dark="#111827"  # A slightly off-black for dark mode
)

DESCRIPTION = """
# ⚡ End-to-End Arabic Diacritizer
A lightweight and efficient model for automatic Arabic diacritization.
Select an architecture and size, enter some text, and see it in action. For more details, visit the
[GitHub repository](https://github.com/muhammad-abdelsattar/arabic-diacritizer).
"""

# UI layout: selectors + info panel on top, input/output textboxes below,
# examples and wiring at the bottom. The custom CSS hides the Gradio footer.
with gr.Blocks(theme=theme, css=".footer {display: none !important}") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                # Choices are (label, value) pairs built from MODEL_INFO, so
                # the callback receives the internal key (e.g. "bilstm").
                arch_selector = gr.Radio(
                    choices=[
                        (info["display_name"], arch)
                        for arch, info in MODEL_INFO.items()
                    ],
                    label="Model Architecture",
                    value="bilstm",
                    info="Select the model architecture.",
                )
                model_selector = gr.Radio(
                    choices=["medium", "large"],
                    label="Model Size",
                    value="medium",
                    info="Select the model size.",
                )
            # Seeded with the default model's info; overwritten on each run.
            info_display = gr.Markdown(
                "**Model:** bilstm/medium | **Size:** 4 MB | Balanced speed and accuracy. (Formerly 'small')",
                elem_id="info-display",
            )

        with gr.Column(scale=1):
            inference_time_output = gr.Textbox(
                label="Inference Time", interactive=False, max_lines=1
            )

    with gr.Row(equal_height=True):
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text (Undiacritized)",
                placeholder="اكتب جملة عربية غير مشكولة هنا...",
                lines=8,
                rtl=True,
            )
        with gr.Column():
            output_text = gr.Textbox(
                label="Output Text (Diacritized)",
                lines=8,
                rtl=True,
                interactive=False,
            )

    submit_button = gr.Button("Diacritize ✨", variant="primary")

    gr.Examples(
        [
            ["أعلنت الشركة عن نتائجها المالية للربع الأول من العام."],
            ["إن مع العسر يسرا."],
            ["هل يمكن للذكاء الاصطناعي أن يكون مبدعا؟"],
            ["كان المتنبي شاعرا عظيما في العصر العباسي."],
        ],
        inputs=input_text,
        label="Examples",
    )

    submit_button.click(
        fn=diacritize_text,
        inputs=[input_text, arch_selector, model_selector],
        outputs=[output_text, inference_time_output, info_display],
    )

    # When architecture changes, update the available sizes
    arch_selector.change(
        fn=update_available_sizes, inputs=arch_selector, outputs=model_selector
    )


if __name__ == "__main__":
    # Pre-load the default model for a faster first-time user experience
    print("Pre-loading default 'bilstm/medium' model...")
    get_model(architecture="bilstm", size="medium")
    print("Default model loaded successfully.")

    demo.launch()
|
arabic_diacritizer_common/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .constants import (
|
| 2 |
+
ArabicDiacritics,
|
| 3 |
+
VALID_ARABIC_CHARS,
|
| 4 |
+
DIACRITIC_CHARS,
|
| 5 |
+
ARABIC_LETTERS,
|
| 6 |
+
ARABIC_LETTERS_REGEX,
|
| 7 |
+
)
|
| 8 |
+
from .cleaners import TextCleaner, DiacriticValidator
|
| 9 |
+
from .segmenter import TextSegmenter
|
| 10 |
+
from .tokenizer import CharTokenizer
|
| 11 |
+
from .postprocessor import Postprocessor
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"ArabicDiacritics",
|
| 15 |
+
"VALID_ARABIC_CHARS",
|
| 16 |
+
"DIACRITIC_CHARS",
|
| 17 |
+
"ARABIC_LETTERS",
|
| 18 |
+
"TextCleaner",
|
| 19 |
+
"Postprocessor",
|
| 20 |
+
"DiacriticValidator",
|
| 21 |
+
"TextSegmenter",
|
| 22 |
+
"CharTokenizer",
|
| 23 |
+
"ARABIC_LETTERS_REGEX",
|
| 24 |
+
]
|
arabic_diacritizer_common/cleaners.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from .constants import (
|
| 4 |
+
VALID_ARABIC_CHARS,
|
| 5 |
+
DIACRITIC_CHARS,
|
| 6 |
+
INVALID_SEQUENCES,
|
| 7 |
+
ALL_VALID_DIACRITICS,
|
| 8 |
+
ArabicDiacritics,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
# Whitespace regex
|
| 12 |
+
_whitespace_re = re.compile(r"\s+")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TextCleaner:
    """Modular text cleaning utilities.

    All methods are stateless @staticmethods operating on plain strings; the
    character sets come from constants.py.
    """

    @staticmethod
    def collapse_whitespace(text: str) -> str:
        """Collapse multiple whitespace characters into a single space."""
        # Call .sub on the precompiled pattern directly instead of re.sub.
        return _whitespace_re.sub(" ", text).strip()

    @staticmethod
    def filter_valid_arabic(text: str) -> str:
        """Keep only valid Arabic characters, punctuation, and diacritics."""
        return "".join(char for char in text if char in VALID_ARABIC_CHARS)

    @staticmethod
    def remove_diacritics(text: str) -> str:
        """Remove all diacritic characters from text."""
        return "".join(ch for ch in text if ch not in DIACRITIC_CHARS)

    @staticmethod
    def normalize_text(text: str) -> str:
        """Normalize common text irregularities & diacritic order."""
        # Canonicalize diacritic order (make sure Shadda always comes first)
        for invalid, correct in INVALID_SEQUENCES.items():
            text = text.replace(invalid, correct)

        # Normalize alef variants to bare alef (optional, safer for training)
        normalize_map = {"أ": "ا", "إ": "ا", "آ": "ا", "ٱ": "ا"}
        for k, v in normalize_map.items():
            text = text.replace(k, v)

        # Remove Tatweel (ـ) since it is purely decorative
        text = text.replace("ـ", "")
        return text

    @staticmethod
    def clean_text(
        text: str, keep_valid_only: bool = True, normalize: bool = False
    ) -> str:
        """Complete cleaning pipeline: normalize → optional filtering → collapse ws"""
        if normalize:
            text = TextCleaner.normalize_text(text)
        if keep_valid_only:
            text = TextCleaner.filter_valid_arabic(text)
        return TextCleaner.collapse_whitespace(text)

    @staticmethod
    def strip_diacritics(text: str) -> str:
        """Efficiently strips all diacritic marks from an Arabic string.

        Alias of remove_diacritics, kept for backward compatibility.
        """
        # Delegate instead of duplicating the comprehension so the two public
        # names can never drift apart.
        return TextCleaner.remove_diacritics(text)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class DiacriticValidator:
    """Handles diacritic validation and extraction"""

    @staticmethod
    def extract_diacritics(text: str) -> Tuple[str, List[str]]:
        """
        Extract base text and list of diacritics.
        Each base character gets an associated diacritic string (possibly multiple).

        Returns (base_text, diacritics) with len(base_text) == len(diacritics);
        diacritics[i] is the (possibly empty) diacritic string attached to
        base_text[i].

        Example:
            "بَّ" → ("ب", ["َّ"])
        """
        base_chars = []
        diacritics = []

        i = 0
        while i < len(text):
            char = text[i]
            if char in DIACRITIC_CHARS:
                # attach to previous base character if exists
                if base_chars:
                    # Append this diacritic to most recent slot
                    diacritics[-1] = diacritics[-1] + char
                else:
                    # Stray diacritic at beginning — skip or treat as invalid
                    pass
            else:
                # New base char: allocate diacritic slot
                base_chars.append(char)
                diacritics.append("")
            i += 1

        # Normalize combined diacritics to canonical representations
        normalized_diacritics = []
        for d in diacritics:
            if d in ALL_VALID_DIACRITICS:
                normalized_diacritics.append(d)
            else:
                # try to reorder if contains shadda + vowel
                if "ّ" in d:
                    # move shadda to front (canonical order per constants.py)
                    d = "ّ" + "".join(c for c in d if c != "ّ")
                # keep only known chars; unknown marks are silently dropped
                d = "".join(c for c in d if c in DIACRITIC_CHARS)
                normalized_diacritics.append(d)
        return "".join(base_chars), normalized_diacritics

    @staticmethod
    def validate_diacritics(
        text: str, require_any: bool = False, strict: bool = False
    ) -> str:
        """
        Validate that text diacritics are well-formed.
        - require_any: if True, reject sentences with no diacritics at all.
        - strict: if True, reject unknown/malformed diacritics, else sanitize them.
        Returns text if valid, otherwise "".
        """
        try:
            base_text, diacritics_list = DiacriticValidator.extract_diacritics(text)

            # Optionally require that at least one diacritic is present
            if require_any:
                if not any(
                    d
                    for d in diacritics_list
                    if d != ArabicDiacritics.NO_DIACRITIC.value
                ):
                    return ""

            # In strict mode, reject any diacritic not in valid set
            # (note: checks the *normalized* diacritics, so sequences fixable
            # by reordering still pass).
            if strict:
                for d in diacritics_list:
                    if d not in ALL_VALID_DIACRITICS:
                        return ""
            return text
        except Exception:
            # Validation is best-effort: any failure marks the text invalid.
            return ""
|
arabic_diacritizer_common/constants.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import enum
|
| 2 |
+
from typing import Set, FrozenSet
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ArabicDiacritics(enum.Enum):
    """All possible Arabic diacritics (standard + extended)."""

    NO_DIACRITIC = ""
    SUKOON = "ْ"
    SHADDA = "ّ"
    DAMMA = "ُ"
    FATHA = "َ"
    KASRA = "ِ"
    TANWEEN_DAMMA = "ٌ"
    TANWEEN_FATHA = "ً"
    TANWEEN_KASRA = "ٍ"
    # Combinations where order may vary in Unicode
    SHADDA_PLUS_DAMMA = "ُّ"  # normalize to 'shadda then vowel'
    SHADDA_PLUS_FATHA = "َّ"
    SHADDA_PLUS_KASRA = "ِّ"
    SHADDA_PLUS_TANWEEN_DAMMA = "ٌّ"
    SHADDA_PLUS_TANWEEN_FATHA = "ًّ"
    SHADDA_PLUS_TANWEEN_KASRA = "ٍّ"

    # Quranic / orthographic additions
    DAGGER_ALEF = "ٰ"  # superscript Alef (dagger)
    MADDA = "ٓ"  # Maddah
    WASLA = "ٱ"  # Hamzat Wasl (technically letter with mark)

    @classmethod
    def chars(cls) -> Set[str]:
        """Return set of atomic (single-character) diacritics."""
        # Shadda combos and WASLA are deliberately excluded: they are not
        # atomic combining marks.
        atomic = (
            cls.SUKOON,
            cls.SHADDA,
            cls.DAMMA,
            cls.FATHA,
            cls.KASRA,
            cls.TANWEEN_DAMMA,
            cls.TANWEEN_FATHA,
            cls.TANWEEN_KASRA,
            cls.DAGGER_ALEF,
            cls.MADDA,
        )
        return {member.value for member in atomic}

    @classmethod
    def valid_combinations(cls) -> Set[str]:
        """Return full set of valid diacritic combinations."""
        # Singles: every atomic mark that may stand alone, plus the empty
        # "no diacritic" label. Bare SHADDA is not a valid final label.
        singles = (
            cls.NO_DIACRITIC,
            cls.SUKOON,
            cls.DAMMA,
            cls.FATHA,
            cls.KASRA,
            cls.TANWEEN_DAMMA,
            cls.TANWEEN_FATHA,
            cls.TANWEEN_KASRA,
            cls.DAGGER_ALEF,
            cls.MADDA,
        )
        shadda_combos = (
            cls.SHADDA_PLUS_DAMMA,
            cls.SHADDA_PLUS_FATHA,
            cls.SHADDA_PLUS_KASRA,
            cls.SHADDA_PLUS_TANWEEN_DAMMA,
            cls.SHADDA_PLUS_TANWEEN_FATHA,
            cls.SHADDA_PLUS_TANWEEN_KASRA,
        )
        return {member.value for member in singles + shadda_combos}

    @classmethod
    def is_valid_diacritic(cls, diacritic: str) -> bool:
        """True when *diacritic* is one of the valid combinations."""
        return diacritic in cls.valid_combinations()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Character sets
WORD_SEPARATOR = " "


# Arabic letters base Unicode block (0600–06FF covers standard Arabic letters)
# Range 0x0621–0x064A: Hamza through Yaa (stops just before the harakat range).
ARABIC_LETTERS_BASE = [chr(c) for c in range(0x0621, 0x064B)]
# Extended Arabic letters (found in borrowed words, Persian/Urdu usage)
ARABIC_LETTERS_EXTENDED_BLOCK = [
    "ى",  # Alef Maqsura
    "ة",  # Taa Marbuta
    "پ",
    "چ",
    "ڤ",
    "گ",  # Persian/Urdu additions
]
ALEF_VARIANTS = {"ا", "أ", "إ", "آ"}

# Merge all letters
ARABIC_LETTERS = frozenset(
    ARABIC_LETTERS_BASE + ARABIC_LETTERS_EXTENDED_BLOCK + list(ALEF_VARIANTS)
)

# Punctuation
PUNCTUATIONS = frozenset(
    {
        ".",
        "،",
        ":",
        "؛",
        "-",
        "؟",
        "!",
        "(",
        ")",
        "[",
        "]",
        '"',
        "«",
        "»",
        "/",
        ";",
        ",",
        "…",
        "ـ",  # ellipsis + tatweel
    }
)
SENTENCE_DELIMITERS = {".", "؟", "!", "،", ":", "؛", "…"}
WORD_DELIMITERS = {WORD_SEPARATOR, *SENTENCE_DELIMITERS}

# Diacritics sets
DIACRITIC_CHARS = ArabicDiacritics.chars()
ALL_VALID_DIACRITICS = ArabicDiacritics.valid_combinations()

# All valid characters
VALID_ARABIC_CHARS = {WORD_SEPARATOR, *ARABIC_LETTERS, *PUNCTUATIONS, *DIACRITIC_CHARS}

# Text normalization (fixes diacritic ordering inconsistencies)
# NOTE(review): the key/value pairs below differ only in combining-mark
# byte order, which most editors render identically — verify with a hex
# dump before editing these literals.
INVALID_SEQUENCES = {
    # Normalize to canonical "SHADDA first, VOWEL after"
    "َّ": "َّ",  # fatha + shadda → shadda + fatha
    "ِّ": "ِّ",  # kasra + shadda → shadda + kasra
    "ُّ": "ُّ",  # damma + shadda → shadda + damma
    "ًّ": "ًّ",  # tanween fatha
    "ٍّ": "ٍّ",  # tanween kasra
    "ٌّ": "ٌّ",  # tanween damma
    # Punctuation spacing corrections
    " ،": "،",
    " .": ".",
    " ؟": "؟",
    " ؛": "؛",
    " …": "…",
}

# Regex for Arabic letters (character class built from the merged letter set)
ARABIC_LETTERS_REGEX = re.compile(f'[{"".join(ARABIC_LETTERS)}]+')
|
arabic_diacritizer_common/postprocessor.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .constants import DIACRITIC_CHARS, ArabicDiacritics
|
| 2 |
+
from .cleaners import DiacriticValidator, TextCleaner
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Postprocessor:
    """
    Provides rule-based post-processing to correct common linguistic errors
    in model-generated diacritized text.
    """

    @staticmethod
    def postprocess(text: str) -> str:
        """Apply all correction rules in order and return the fixed text."""
        text = Postprocessor._correct_tanween_fatha_placement(text)
        text = Postprocessor._remove_diacritics_from_alifs(text)
        return text

    @staticmethod
    def _correct_tanween_fatha_placement(text: str) -> str:
        """
        Corrects the placement of Tanween Fatha (ً) from a final Alif (ا)
        to the preceding character. This is a common model error.

        Example: "مَرْحَبَاً" (incorrect) -> "مَرْحَبًا" (correct)
        """
        words = text.split(" ")
        corrected_words = []
        tanween_fatha = ArabicDiacritics.TANWEEN_FATHA.value

        for word in words:

            # Ensure the word is long enough to have a character before a final Alif.
            if len(word) < 2:
                corrected_words.append(word)
                continue

            base_chars, diacritics = DiacriticValidator.extract_diacritics(word)

            if (
                len(base_chars) > 1
                and base_chars[-1] == "ا"
                and diacritics[-1] == tanween_fatha
            ):

                # Move the tanween onto the preceding character; this overwrites
                # any diacritic the model put there.
                diacritics[-2] = tanween_fatha

                # Ensure the final Alif is left with no diacritic.
                diacritics[-1] = ""

                # Re-interleave base characters with their diacritic slots.
                corrected_word = "".join(
                    [c + d for c, d in zip(base_chars, diacritics)]
                )
                corrected_words.append(corrected_word)
            else:
                corrected_words.append(word)

        return " ".join(corrected_words)

    @staticmethod
    def _remove_diacritics_from_alifs(text: str) -> str:
        """
        Removes any diacritics from plain Alif (ا) and Alif Maqsura (ى)
        anywhere within a word. These characters should not carry short vowels.

        Example:
            - "عَلَىَ" -> "عَلَى"
            - "اِسْم" -> "اِسْم" (This is correct; the model predicted a base letter, not a diacritic)
            - "كِتَابُ" -> "كِتَاب" (Incorrect model output gets corrected)
        """
        words = text.split(" ")
        corrected_words = []
        for word in words:
            base_chars, diacritics = DiacriticValidator.extract_diacritics(word)

            # Blank out the diacritic slot of every bare Alif / Alef Maqsura.
            for i in range(len(base_chars)):
                if base_chars[i] == "ا" or base_chars[i] == "ى":
                    diacritics[i] = ""

            corrected_word = "".join([c + d for c, d in zip(base_chars, diacritics)])
            corrected_words.append(corrected_word)

        return " ".join(corrected_words)
|
arabic_diacritizer_common/segmenter.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List
|
| 3 |
+
from .cleaners import TextCleaner
|
| 4 |
+
from .constants import DIACRITIC_CHARS
|
| 5 |
+
|
| 6 |
+
def grapheme_length(text: str) -> int:
    """Return logical length of text, counting base characters only (ignore diacritics)."""
    # Booleans sum as 0/1, so this counts the non-diacritic characters.
    return sum(ch not in DIACRITIC_CHARS for ch in text)
|
| 9 |
+
|
| 10 |
+
class TextSegmenter:
    """Handles text segmentation and sentence splitting"""

    # Regex for sentence boundaries (includes multi-char delimiters)
    # Treats sequences like "؟!" or "..." as a single delimiter
    SENTENCE_BOUNDARY_RE = re.compile(r"(؟!|!|\?|\.{2,}|…|،|؛)")

    @staticmethod
    def segment_sentences(max_chars: int, line: str) -> List[str]:
        """
        Segment long lines into shorter sentences with max length constraint.

        Length is measured in base characters (diacritics excluded) via
        grapheme_length. Returns a list of whitespace-normalized segments;
        empty input yields [].
        """
        line = line.strip()
        if not line:
            return []

        # Fast path: already short enough, just normalize whitespace.
        if grapheme_length(line) <= max_chars:
            return [TextCleaner.collapse_whitespace(line)]

        # Perform segmentation
        return TextSegmenter._do_segment_sentences(line, max_chars)

    @staticmethod
    def _do_segment_sentences(line: str, max_chars: int) -> List[str]:
        """
        Internal recursive sentence segmentation logic.
        """
        # Split based on boundary regex (keeps delimiters)
        parts = []
        last_idx = 0
        for match in TextSegmenter.SENTENCE_BOUNDARY_RE.finditer(line):
            start, end = match.span()
            segment = line[last_idx:start].strip()
            delimiter = match.group()
            # NOTE: a delimiter preceded only by whitespace is dropped here
            # (segment is empty, so nothing is appended).
            if segment:
                parts.append(segment + delimiter)
            last_idx = end

        # Trailing text after the last delimiter.
        if last_idx < len(line):
            remainder = line[last_idx:].strip()
            if remainder:
                parts.append(remainder)

        # Now filter by length
        results: List[str] = []
        for sent in parts:
            sent = TextCleaner.collapse_whitespace(sent)
            if not sent:
                continue
            if grapheme_length(sent) <= max_chars:
                results.append(sent)
            else:
                # Recursive split if still too long
                subsegments = TextSegmenter._recursive_split(sent, max_chars)
                results.extend(subsegments)

        return results

    @staticmethod
    def _recursive_split(text: str, max_chars: int) -> List[str]:
        """
        Splits oversized text recursively by words if necessary.

        A single word longer than max_chars is emitted as its own segment
        rather than being broken mid-word.
        """
        words = text.split()
        if not words:
            return []

        segments = []
        cur_segment = []
        cur_len = 0  # running grapheme length of cur_segment incl. separators

        for w in words:
            # +1 accounts for the space that would precede this word.
            if cur_len + grapheme_length(w) + 1 > max_chars:
                if cur_segment:
                    segments.append(" ".join(cur_segment))
                cur_segment = [w]
                cur_len = grapheme_length(w)
            else:
                cur_segment.append(w)
                cur_len += grapheme_length(w) + 1

        if cur_segment:
            segments.append(" ".join(cur_segment))

        return segments
|
arabic_diacritizer_common/tokenizer.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import List, Dict, Tuple, Optional
|
| 4 |
+
|
| 5 |
+
from .constants import ArabicDiacritics, ARABIC_LETTERS, VALID_ARABIC_CHARS
|
| 6 |
+
from .cleaners import DiacriticValidator, TextCleaner
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CharTokenizer:
    """
    Character-level tokenizer for Arabic diacritization.

    Input: bare characters (without diacritics)
    Output: per-character diacritic labels (including NO_DIACRITIC)
    """

    def __init__(
        self,
        char2id: Optional[Dict[str, int]] = None,
        diacritic2id: Optional[Dict[str, int]] = None,
        include_punct: bool = True,
        extra_chars: Optional[List[str]] = None,
    ):
        """
        Build the tokenizer.

        If no vocab mappings are provided, defaults are built from
        constants.py (Arabic letters, optionally other valid characters,
        plus any ``extra_chars``).

        Args:
            char2id: Optional precomputed character -> id mapping.
            diacritic2id: Optional precomputed diacritic -> id mapping.
            include_punct: Include non-letter valid characters in the vocab.
            extra_chars: Extra characters to append to the vocabulary.
        """
        if char2id is None or diacritic2id is None:
            # Base vocabulary from constants
            vocab_chars = list(ARABIC_LETTERS)
            if include_punct:
                vocab_chars += [
                    c for c in VALID_ARABIC_CHARS if c not in ARABIC_LETTERS
                ]
            if extra_chars:
                vocab_chars += extra_chars
            vocab_chars = sorted(set(vocab_chars))

            # Char vocab (+PAD, +UNK)
            char2id = {"<PAD>": 0, "<UNK>": 1}
            char2id.update({ch: idx + 2 for idx, ch in enumerate(vocab_chars)})

            # Diacritic vocab (includes NO_DIACRITIC "")
            diacritic2id = {
                d: i
                for i, d in enumerate(sorted(ArabicDiacritics.valid_combinations()))
            }

        self.char2id = char2id
        self.id2char = {i: c for c, i in char2id.items()}
        self.diacritic2id = diacritic2id
        self.id2diacritic = {i: d for d, i in diacritic2id.items()}

    def save(self, path: str) -> None:
        """Serialize both vocab mappings to a UTF-8 JSON file at *path*."""
        Path(path).write_text(
            json.dumps(
                {"char2id": self.char2id, "diacritic2id": self.diacritic2id},
                ensure_ascii=False,
                indent=2,
            ),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, path: str):
        """Load a tokenizer previously written by :meth:`save`."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(data["char2id"], data["diacritic2id"])

    def encode(self, text: str) -> Tuple[List[int], List[int]]:
        """
        Encode a diacritized string -> (input_ids, diacritic_labels).

        Diacritics are stripped from *text*; each remaining base character
        yields one input id and one diacritic label id. Unknown characters
        fall back to the <PAD> id (NOTE(review): <UNK> exists in the vocab
        but is not used here — presumably to match the training pipeline;
        confirm before changing).
        """
        base_text, diacritics = DiacriticValidator.extract_diacritics(text)

        input_ids = [self.char2id.get(ch, self.char2id["<PAD>"]) for ch in base_text]
        label_ids = [
            self.diacritic2id.get(
                d, self.diacritic2id[ArabicDiacritics.NO_DIACRITIC.value]
            )
            for d in diacritics
        ]
        return input_ids, label_ids

    def encode_for_inference(self, text: str) -> Tuple[List[int], List[int]]:
        """
        Encode a string for inference -> (input_ids, diacritic_labels).

        BUGFIX: the original body referenced ``base_text`` / ``diacritics``
        without ever defining them, so every call raised NameError. The
        extraction step is now performed exactly as in :meth:`encode`; for
        undiacritized inference input the extracted labels are simply all
        NO_DIACRITIC, so delegating is safe.
        """
        return self.encode(text)

    def decode(
        self, input_ids: List[int], label_ids: List[int], cleanup_mode: str = "clean"
    ) -> str:
        """
        Decode (input_ids, label_ids) -> string with diacritics.

        Args:
            input_ids: List of character IDs.
            label_ids: List of predicted diacritic IDs.
            cleanup_mode (str): Determines the post-processing strategy.
                - "clean": (Default) Removes diacritics from non-Arabic letters (e.g., punctuation, spaces).
                - "raw": Returns the raw model output without any cleanup.

        Returns:
            The reconstructed, diacritized string.

        Raises:
            ValueError: If *cleanup_mode* is not "clean" or "raw".
        """
        if cleanup_mode not in {"clean", "raw"}:
            raise ValueError("cleanup_mode must be either 'clean' or 'raw'.")

        chars = [self.id2char.get(i, "<UNK>") for i in input_ids]
        diacs = [self.id2diacritic.get(i, "") for i in label_ids]

        if cleanup_mode == "raw":
            return "".join(ch + d for ch, d in zip(chars, diacs))

        # "clean" mode: attach a predicted diacritic only to Arabic letters;
        # punctuation, digits and spaces are emitted bare.
        cleaned_output = []
        for char, diac in zip(chars, diacs):
            if char in ARABIC_LETTERS:
                cleaned_output.append(char + diac)
            else:
                cleaned_output.append(char)

        return "".join(cleaned_output)

    def decode_inference(
        self,
        text_list: list,
        label_ids: list,
        cleanup_mode: str = "clean",
    ) -> str:
        """
        Re-attach predicted diacritics to the original bare characters.

        Args:
            text_list: List of chars of the original text without diacritics.
            label_ids: List of predicted diacritic IDs.
            cleanup_mode (str): "clean" (default) or "raw". NOTE(review):
                both modes currently behave identically here (diacritics are
                attached only to Arabic letters); the parameter is validated
                for API symmetry with :meth:`decode`.

        Returns:
            The reconstructed, diacritized string.

        Raises:
            ValueError: If *cleanup_mode* is not "clean" or "raw".
        """
        if cleanup_mode not in {"clean", "raw"}:
            raise ValueError("cleanup_mode must be either 'clean' or 'raw'.")

        diacs = [self.id2diacritic.get(i, "") for i in label_ids]
        cleaned_output = []
        for char, diac in zip(text_list, diacs):
            # Only attach a diacritic if the character is a valid Arabic letter
            if char in ARABIC_LETTERS:
                cleaned_output.append(str(char) + str(diac))
            else:
                cleaned_output.append(char)
        return "".join(cleaned_output)
|
diacritizer/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .diacritizer import Diacritizer
|
| 2 |
+
from .exceptions import ModelNotFound, InvalidInputError, DiacritizerException
|
| 3 |
+
|
| 4 |
+
# Public names re-exported at package level (order preserved as published).
__all__ = [
    "Diacritizer",
    "ModelNotFound",
    "InvalidInputError",
    "DiacritizerException"
]

# Package version string; keep in sync with the distribution metadata.
__version__ = "0.1.0"
|
diacritizer/diacritizer.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Union, List, Optional
|
| 4 |
+
import numpy as np
|
| 5 |
+
from arabic_diacritizer_common import (
|
| 6 |
+
CharTokenizer,
|
| 7 |
+
TextSegmenter,
|
| 8 |
+
Postprocessor,
|
| 9 |
+
DiacriticValidator,
|
| 10 |
+
ARABIC_LETTERS_REGEX,
|
| 11 |
+
TextCleaner,
|
| 12 |
+
DIACRITIC_CHARS,
|
| 13 |
+
)
|
| 14 |
+
from .predictor import OnnxPredictor
|
| 15 |
+
from .hub_manager import resolve_model_path, DEFAULT_HUB_REPO_ID
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class Diacritizer:
    """High-level Arabic diacritization pipeline (ONNX model + tokenizer)."""

    def __init__(
        self,
        model_identifier: Optional[str] = None,
        architecture: str = "bilstm",
        size: str = "medium",
        revision: str = "main",
        force_sync: bool = False,
        use_gpu: bool = False,
    ):
        """
        Initializes the Diacritizer by loading the model and tokenizer.

        Args:
            model_identifier (str, optional): The identifier for the model. Can be a
                local path or a Hugging Face Hub repo ID. Defaults to the official repo.
            architecture (str): The model architecture ('bilstm', 'bigru', etc.).
                Defaults to "bilstm".
            size (str): The model size ('small', 'medium'). Defaults to "medium".
            revision (str): A specific model version from the Hub. Defaults to "main".
            force_sync (bool): If True, forces a re-download. Defaults to False.
            use_gpu (bool): If True, attempts to use CUDA. Defaults to False.
        """
        # -1 means "no explicit length limit"; kept for backward compatibility.
        self.max_length = -1

        repo_to_resolve = model_identifier or DEFAULT_HUB_REPO_ID

        # Resolve local-dir or Hub paths for the ONNX graph and vocab file.
        onnx_path, vocab_path = resolve_model_path(
            model_identifier=repo_to_resolve,
            architecture=architecture,
            size=size,
            revision=revision,
            force_sync=force_sync,
        )

        self.predictor = OnnxPredictor(onnx_path, use_gpu)

        vocab_data = json.loads(vocab_path.read_text(encoding="utf-8"))
        self.tokenizer = CharTokenizer(
            char2id=vocab_data["char2id"],
            diacritic2id=vocab_data["diacritic2id"],
        )
        self.segmenter = TextSegmenter()

    def _diacritize_sentence(self, text: str) -> str:
        """Diacritize one Arabic sentence; returns "" for blank input."""
        if not text.strip():
            return ""

        # Only the character ids are needed; the gold labels are discarded.
        input_ids, _ = self.tokenizer.encode(text)

        # Bare characters of the original text (diacritics stripped) — the
        # decode step re-attaches predicted diacritics onto these.
        text_list = list(TextCleaner.remove_diacritics(text))

        if not input_ids:
            return ""

        input_chars = np.array(input_ids, dtype=np.int64).reshape(1, -1)
        no_diacritic_id = self.tokenizer.diacritic2id.get("", 0)
        # All-"no diacritic" hints: the model receives no prior marks.
        input_hints = np.full_like(
            input_chars, fill_value=no_diacritic_id, dtype=np.int64
        )

        # inference
        logits = self.predictor.predict(input_ids=input_chars, hints=input_hints)
        predicted_diac_ids = np.argmax(logits, axis=-1)

        # Decode the predictions
        return self.tokenizer.decode_inference(
            text_list, predicted_diac_ids[0].tolist()
        )

    def diacritize(
        self, text: Union[str, List[str]], postprocess: bool = True
    ) -> Union[str, List[str]]:
        """
        Diacritizes text while preserving non-Arabic characters and structure.

        Any existing diacritics in the Arabic text are stripped before being
        processed by the model to ensure a consistent output.

        Args:
            text: A single string or a list of strings.
            postprocess: Apply :class:`Postprocessor` cleanup to each result.

        Returns:
            A diacritized string when *text* is a string, otherwise a list of
            diacritized strings (one per input).

        Raises:
            TypeError: If *text* is neither a string nor a list.
        """
        if isinstance(text, str):
            # Handle a single string input gracefully.
            if not text:
                return ""
            sentences = [text]
        elif isinstance(text, list):
            # BUGFIX: an empty list used to return "" (a str), breaking the
            # list-in/list-out contract; it now returns [].
            if not text:
                return []
            sentences = text
        else:
            raise TypeError("Input must be a string or a list of strings.")

        diacritized_list = [self._diacritize_sentence(s) for s in sentences]

        if postprocess:
            diacritized_list = [
                Postprocessor.postprocess(s) for s in diacritized_list
            ]

        # Mirror the input type on the way out.
        return diacritized_list[0] if isinstance(text, str) else diacritized_list
|
diacritizer/exceptions.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class DiacritizerException(Exception):
    """Root of the diacritizer package's exception hierarchy.

    Catching this single type catches every error the package raises.
    """
|
| 4 |
+
|
| 5 |
+
class ModelNotFound(DiacritizerException):
    """Signals that required model artifacts (ONNX graph, vocab, ...) are absent."""
|
| 8 |
+
|
| 9 |
+
class InvalidInputError(DiacritizerException):
    """Signals that the text handed to the diacritizer is not acceptable input."""
|
diacritizer/hub_manager.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Tuple
|
| 4 |
+
|
| 5 |
+
from huggingface_hub import hf_hub_download
|
| 6 |
+
from huggingface_hub.errors import EntryNotFoundError, LocalEntryNotFoundError
|
| 7 |
+
|
| 8 |
+
from .exceptions import ModelNotFound
|
| 9 |
+
|
| 10 |
+
DEFAULT_HUB_REPO_ID = "muhammad7777/arabic-diacritizer-models"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _download_from_hub(
    repo_id: str, architecture: str, size: str, revision: str, force_sync: bool
) -> Tuple[str, str]:
    """
    Internal helper to download model artifacts from the Hub, using a nested subfolder.

    Strategy: unless *force_sync* is set, the local cache is consulted first
    (offline-first); on a cache miss the artifacts are fetched online.

    Args:
        repo_id: Hugging Face Hub repository id.
        architecture: Model architecture folder (e.g. "bilstm").
        size: Model size folder (e.g. "medium").
        revision: Git revision (branch, tag, or commit) to fetch.
        force_sync: Skip the cache and force a fresh download.

    Returns:
        (onnx_path, vocab_path) as strings.

    Raises:
        ModelNotFound: If the artifacts do not exist for this
            architecture/size/revision, or the download fails.
    """
    # Construct the nested path (e.g., "bilstm/medium")
    model_subfolder = f"{architecture}/{size}"

    def _fetch(**download_kwargs) -> Tuple[str, str]:
        # Download both artifacts with identical settings; previously this
        # pair of calls was duplicated verbatim for the cache and online paths.
        onnx = hf_hub_download(
            repo_id=repo_id,
            filename="model.onnx",
            subfolder=model_subfolder,
            revision=revision,
            **download_kwargs,
        )
        vocab = hf_hub_download(
            repo_id=repo_id,
            filename="vocab.json",
            subfolder=model_subfolder,
            revision=revision,
            **download_kwargs,
        )
        return onnx, vocab

    if not force_sync:
        try:
            # Attempt to load from cache first (offline-first)
            return _fetch(local_files_only=True)
        except LocalEntryNotFoundError:
            pass  # Not found in cache, proceed to download.

    # Download from the Hub (online fallback)
    try:
        return _fetch(force_download=force_sync)
    except EntryNotFoundError as e:
        # Make the error message more informative
        raise ModelNotFound(
            f"Could not find model for architecture '{architecture}' and size '{size}' "
            f"at revision '{revision}' in repository '{repo_id}'. "
            f"Please check the Hub for available models."
        ) from e
    except Exception as e:
        raise ModelNotFound(
            f"Failed to download model from the Hub. Please check your internet connection. "
            f"Original error: {e}"
        ) from e
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def resolve_model_path(
    model_identifier: str, architecture: str, size: str, revision: str, force_sync: bool
) -> Tuple[Path, Path]:
    """
    Resolve the ONNX and vocab artifact paths for a model.

    A *model_identifier* naming an existing local directory is used as-is
    (it must already contain both artifacts); any other identifier is
    treated as a Hugging Face Hub repo ID and fetched via the Hub helper,
    honoring *architecture*, *size*, *revision* and *force_sync*.

    Raises:
        ModelNotFound: If a local directory is missing either artifact.
    """
    # Case 1: a local directory — both artifacts must already be present.
    if os.path.isdir(model_identifier):
        local_dir = Path(model_identifier)
        onnx_file = local_dir / "model.onnx"
        vocab_file = local_dir / "vocab.json"

        if onnx_file.exists() and vocab_file.exists():
            return onnx_file, vocab_file

        raise ModelNotFound(
            f"Local model directory must contain 'model.onnx' and 'vocab.json'. "
            f"Path: {local_dir}"
        )

    # Case 2: a Hub repository ID — delegate the download logic.
    hub_onnx, hub_vocab = _download_from_hub(
        repo_id=model_identifier,
        architecture=architecture,
        size=size,
        revision=revision,
        force_sync=force_sync,
    )
    return Path(hub_onnx), Path(hub_vocab)
|
diacritizer/predictor.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import numpy as np
|
| 3 |
+
import onnxruntime as ort
|
| 4 |
+
from .exceptions import ModelNotFound
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class OnnxPredictor:
    """Thin wrapper around an ONNX Runtime session for diacritic prediction."""

    def __init__(self, model_path: Path, use_gpu: bool = False):
        """
        Initializes the ONNX Runtime session.

        Args:
            model_path: Path to the .onnx model file.
            use_gpu: Prefer the CUDA execution provider when True.

        Raises:
            ModelNotFound: If the model file does not exist at the given path.
        """
        if not model_path.exists():
            raise ModelNotFound(f"ONNX model file not found at: {model_path}")

        # CPU is always kept as a fallback provider; CUDA is tried first
        # when requested. Customize this list for other target hardware.
        providers = (
            ["CUDAExecutionProvider", "CPUExecutionProvider"]
            if use_gpu
            else ["CPUExecutionProvider"]
        )

        self.session = ort.InferenceSession(str(model_path), providers=providers)

        model_inputs = self.session.get_inputs()
        self.input_name = model_inputs[0].name
        self.hints_name = model_inputs[1].name
        self.output_name = self.session.get_outputs()[0].name

    def predict(self, input_ids: np.ndarray, hints: np.ndarray) -> np.ndarray:
        """
        Runs inference on a batch of tokenized input IDs.

        Args:
            input_ids: A numpy array of shape (batch_size, sequence_length).
            hints: A numpy array of shape (batch_size, sequence_length).

        Returns:
            A numpy array of logits of shape (batch_size, sequence_length, num_classes).
        """
        feed = {self.input_name: input_ids, self.hints_name: hints}
        # run() returns a list of outputs; this model exposes a single one.
        return self.session.run([self.output_name], feed)[0]
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Gradio for the UI
|
| 2 |
+
gradio>=4.0.0
|
| 3 |
+
|
| 4 |
+
# Core dependencies
|
| 5 |
+
onnxruntime
|
| 6 |
+
numpy
|
| 7 |
+
huggingface-hub
|
| 8 |
+
|
| 9 |
+
# --- Custom Local Packages ---
|
| 10 |
+
# These will be installed from the .whl files in the repo
|
| 11 |
+
# arabic_diacritizer_common
|
| 12 |
+
# arabic_diacritizer
|