SamaaliWhisper / post_processing_notebook_content.txt
MuhammadHijazii's picture
Upload 17 files
dfdd9cb verified
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "iQh3AuBaJeJB",
"outputId": "38f2ae65-160d-4c2e-b94d-60a53d5169a4"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (4.0.0)",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from datasets) (3.18.0)",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.0.2)",
"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (18.1.0)",
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.3.8)",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)",
"Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.32.3)",
"Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.11/dist-packages (from datasets) (4.67.1)",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets) (3.5.0)",
"Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.70.16)",
"Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)",
"Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.34.1)",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (25.0)",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)",
"Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.12.14)",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.14.1)",
"Requirement already satisfied: hf-xet<2.0.0,>=1.1.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.1.5)",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.4.2)",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.10)",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2.5.0)",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2025.7.14)",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.9.0.post0)",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.2)",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.2)",
"Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)",
"Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.6.3)",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)",
"Collecting pyxDamerauLevenshtein",
" Downloading pyxdameraulevenshtein-1.8.0.tar.gz (62 kB)",
" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.6/62.6 kB 1.6 MB/s eta 0:00:00",
" Installing build dependencies ... done",
" Getting requirements to build wheel ... done",
" Preparing metadata (pyproject.toml) ... done",
"Building wheels for collected packages: pyxDamerauLevenshtein",
" Building wheel for pyxDamerauLevenshtein (pyproject.toml) ... done",
" Created wheel for pyxDamerauLevenshtein: filename=pyxdameraulevenshtein-1.8.0-cp311-cp311-linux_x86_64.whl size=118739 sha256=f1a7f59df5dd8d815600f6cfee77b16e715a82cce31ccc070f60041eee959c49",
" Stored in directory: /root/.cache/pip/wheels/84/82/46/a39875a3db5c4804edd541b19bf2b5eacead1949b0a396be70",
"Successfully built pyxDamerauLevenshtein",
"Installing collected packages: pyxDamerauLevenshtein",
"Successfully installed pyxDamerauLevenshtein-1.8.0"
]
}
],
"source": [
"!pip install datasets",
"!pip install pyxDamerauLevenshtein"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SNs-yKfosQAI",
"outputId": "33f9f10b-a59f-4d40-9fbd-ab28fef1ead0"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting textdistance",
" Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)",
"Downloading textdistance-4.6.3-py3-none-any.whl (31 kB)",
"Installing collected packages: textdistance",
"Successfully installed textdistance-4.6.3"
]
}
],
"source": [
"!pip install textdistance"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "HIIq7sZRIlmc",
"outputId": "a8e3e445-7529-4b83-ce4d-3d4c3a44228a"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...",
"[nltk_data] Unzipping tokenizers/punkt.zip.",
"[nltk_data] Downloading package punkt_tab to /root/nltk_data...",
"[nltk_data] Unzipping tokenizers/punkt_tab.zip."
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"import pandas as pd",
"import nltk",
"from collections import Counter",
"from nltk.tokenize import word_tokenize",
"import textdistance",
"import re",
"from datasets import load_dataset",
"nltk.download('punkt')",
"nltk.download('punkt_tab')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sXoucKYaamZu"
},
"source": [
"##Example 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "R2CT02BINo_L"
},
"outputs": [],
"source": [
"original_text = '''",
"",
"يهتز غشاء الطبل تنتقل عظيمات السمع الاهتزازات إلى النافذة البيضية. يهتز غشاء النافذة البيضية. يهتز اللمف الخارجي في القناة الدهليزية. يهتز غشاء رايسنر. تنتقل الاهتزازات إلى اللمف الداخلي في القناة القوقعية. اهتزاز الغشاء القاعدي بشكل موجي",
"'''",
"extracted_text = '''",
"يحتاز غشاء الطبل تنتقل عظيمات السماء الاحتزازات إلى النافذة البيضية يحتاز غشاء النافذة البيضية يحتاز الملف الخارجي في القناة الدهليزية يحتاز غشاء رايسنر تنتقل الاحتزازات إلى الملف الداخلي في القناة القوقعية احتزاز الغشاء القاعدة بشكل موجه",
"",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CGtISmHtNi5N"
},
"outputs": [],
"source": [
"original_words = word_tokenize(original_text)",
"extracted_words = word_tokenize(extracted_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "HjGEN9-4ON8H"
},
"outputs": [],
"source": [
"from pyxdameraulevenshtein import damerau_levenshtein_distance",
"",
"def dl_distance(word1, word2):",
" return damerau_levenshtein_distance(word1, word2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "eAni3AxuOQqc"
},
"outputs": [],
"source": [
"def find_closest_words(misspelled_word, dictionary, max_distance=2):",
" candidates = []",
" for word, freq in dictionary.items():",
" if word in ['.', '،', '(', ')', ':']: # نتجاهل علامات الترقيم",
" continue",
" distance = dl_distance(misspelled_word, word)",
" if distance <= max_distance:",
" candidates.append((word, freq, distance))",
" # ترتيب حسب: المسافة، ثم التكرار، ثم التقارب في الطول، ثم نفس الحرف الأول",
" candidates.sort(key=lambda x: (",
" x[2], # أقل مسافة أولًا",
" -x[1], # أكثر تكرارًا",
" abs(len(x[0]) - len(misspelled_word)), # تقارب الطول",
" x[0][0] != misspelled_word[0] # يبدأ بنفس الحرف",
" ))",
" return candidates"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tVGDYn-dRbyo",
"outputId": "93c6799e-594b-480f-e5e0-2357a0668d3e"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Counter({'.': 5, 'يهتز': 4, 'غشاء': 3, 'تنتقل': 2, 'الاهتزازات': 2, 'إلى': 2, 'النافذة': 2, 'البيضية': 2, 'اللمف': 2, 'في': 2, 'القناة': 2, 'الطبل': 1, 'عظيمات': 1, 'السمع': 1, 'الخارجي': 1, 'الدهليزية': 1, 'رايسنر': 1, 'الداخلي': 1, 'القوقعية': 1, 'اهتزاز': 1, 'الغشاء': 1, 'القاعدي': 1, 'بشكل': 1, 'موجي': 1})"
]
}
],
"source": [
"def create_dictionary(texts):",
" words = []",
" for sentence in texts:",
" words.extend(nltk.word_tokenize(sentence))",
" word_counts = Counter(words)",
" return word_counts",
"",
"texts = ['''",
"يهتز غشاء الطبل تنتقل عظيمات السمع الاهتزازات إلى النافذة البيضية. يهتز غشاء النافذة البيضية. يهتز اللمف الخارجي في القناة الدهليزية. يهتز غشاء رايسنر. تنتقل الاهتزازات إلى اللمف الداخلي في القناة القوقعية. اهتزاز الغشاء القاعدي بشكل موجي",
"",
"'''",
"]",
"dictionary = create_dictionary(texts)",
"print(dictionary)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "VRP1EJLi7Mf2"
},
"outputs": [],
"source": [
"# print(\"🔧 النص بعد التصحيح:\")",
"# print(corrected_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cOSSoJDPO_WV",
"outputId": "7bbb5b24-8824-44a0-ee0d-144698d9f6b6"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"تصحيح 'يحتاز' إلى 'يهتز' (تكرار: 4, مسافة: 2)",
"الاحتفاظ بالكلمة الأصلية: 'غشاء'",
"الاحتفاظ بالكلمة الأصلية: 'الطبل'",
"الاحتفاظ بالكلمة الأصلية: 'تنتقل'",
"الاحتفاظ بالكلمة الأصلية: 'عظيمات'",
"الاحتفاظ بالكلمة الأصلية: 'السماء'",
"تصحيح 'الاحتزازات' إلى 'الاهتزازات' (تكرار: 2, مسافة: 1)",
"الاحتفاظ بالكلمة الأصلية: 'إلى'",
"الاحتفاظ بالكلمة الأصلية: 'النافذة'",
"الاحتفاظ بالكلمة الأصلية: 'البيضية'",
"تصحيح 'يحتاز' إلى 'يهتز' (تكرار: 4, مسافة: 2)",
"الاحتفاظ بالكلمة الأصلية: 'غشاء'",
"الاحتفاظ بالكلمة الأصلية: 'النافذة'",
"الاحتفاظ بالكلمة الأصلية: 'البيضية'",
"تصحيح 'يحتاز' إلى 'يهتز' (تكرار: 4, مسافة: 2)",
"تصحيح 'الملف' إلى 'اللمف' (تكرار: 2, مسافة: 1)",
"الاحتفاظ بالكلمة الأصلية: 'الخارجي'",
"الاحتفاظ بالكلمة الأصلية: 'في'",
"الاحتفاظ بالكلمة الأصلية: 'القناة'",
"الاحتفاظ بالكلمة الأصلية: 'الدهليزية'",
"تصحيح 'يحتاز' إلى 'يهتز' (تكرار: 4, مسافة: 2)",
"الاحتفاظ بالكلمة الأصلية: 'غشاء'",
"الاحتفاظ بالكلمة الأصلية: 'رايسنر'",
"الاحتفاظ بالكلمة الأصلية: 'تنتقل'",
"تصحيح 'الاحتزازات' إلى 'الاهتزازات' (تكرار: 2, مسافة: 1)",
"الاحتفاظ بالكلمة الأصلية: 'إلى'",
"تصحيح 'الملف' إلى 'اللمف' (تكرار: 2, مسافة: 1)",
"الاحتفاظ بالكلمة الأصلية: 'الداخلي'",
"الاحتفاظ بالكلمة الأصلية: 'في'",
"الاحتفاظ بالكلمة الأصلية: 'القناة'",
"الاحتفاظ بالكلمة الأصلية: 'القوقعية'",
"الاحتفاظ بالكلمة الأصلية: 'احتزاز'",
"الاحتفاظ بالكلمة الأصلية: 'الغشاء'",
"الاحتفاظ بالكلمة الأصلية: 'القاعدة'",
"الاحتفاظ بالكلمة الأصلية: 'بشكل'",
"الاحتفاظ بالكلمة الأصلية: 'موجه'"
]
}
],
"source": [
"max_distance =3",
"threshold_freq = 1",
"for word in extracted_words:",
" suggestions = find_closest_words(word, dictionary, max_distance)",
" if suggestions:",
" best_suggestion, freq, dist = suggestions[0]",
" if best_suggestion != word and freq > threshold_freq:",
" print(f\"تصحيح '{word}' إلى '{best_suggestion}' (تكرار: {freq}, مسافة: {dist})\")",
" else:",
" print(f\"الاحتفاظ بالكلمة الأصلية: '{word}'\")",
" else:",
" print(f\"لا اقتراحات للكلمة: '{word}'\")"
]
},
{
"cell_type"
}
]
}