uslap-query / Code_files /archive /expand_known_forms.py
uslap's picture
Upload folder using huggingface_hub
7cc8e29 verified
Raw
History Blame Contribute Delete
13.8 kB
#!/usr/bin/env python3
"""
Phase 1C: Expand quran_known_forms from 255 to 3,000+
Auto-maps unrooted Qur'anic word forms to roots using:
1. Manual curated mappings for high-frequency complex forms
2. Algorithmic extraction with dictionary verification for the rest
"""
import sqlite3
try:
from uslap_db_connect import connect as _uslap_connect
_HAS_WRAPPER = True
except ImportError:
_HAS_WRAPPER = False
import re
import os
DB_PATH = os.path.join(os.path.dirname(__file__), "uslap_database_v3.db")

# All Arabic code points below are written as \uXXXX escapes so that the
# normalization rules survive any re-encoding of this source file.

# Marks deleted during normalization: U+064B-U+065F, U+0670 and the
# U+06D6-U+06ED annotation signs (minus the gaps left out of the class).
DIACRITICS = re.compile(r'[\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]')
# Alef with hamza below (U+0625), with hamza above (U+0623), with madda
# (U+0622) and alef wasla (U+0671); all collapse to a plain alef.
ALEF_VARIANTS = re.compile(r'[\u0625\u0623\u0622\u0671]')
# Waw with hamza (U+0624) and yaa with hamza (U+0626); both collapse to a
# stand-alone hamza.
HAMZA_CARRIERS = re.compile(r'[\u0624\u0626]')

_ALEF = '\u0627'
_HAMZA = '\u0621'
_YAA = '\u064A'


def strip_bare(word):
    """Strip to bare form for known_forms matching.

    The steps run in a fixed order because some of them overlap: hamza-above
    (U+0654) and alef-with-madda (U+0622) must be rewritten *before* the
    generic diacritic / alef-variant passes would otherwise delete or
    flatten them.

    Args:
        word: Arabic word form, possibly fully vocalized.

    Returns:
        The word with diacritics, tatweel and zero-width joiners removed,
        every alef variant folded to plain alef, hamza carriers folded to a
        stand-alone hamza, and alef maksura folded to yaa.
    """
    text = word.replace('\u0671', _ALEF)                      # alef wasla
    text = text.replace('\u06E5', '').replace('\u06E6', '')   # small waw / small yaa
    text = text.replace('\u0654', _HAMZA)                     # hamza above
    text = HAMZA_CARRIERS.sub(_HAMZA, text)
    text = text.replace('\u0622', _ALEF + _HAMZA)             # alef madda -> alef + hamza
    text = DIACRITICS.sub('', text)
    # Tatweel plus zero-width joiner / non-joiner carry no lexical content.
    for filler in ('\u0640', '\u200D', '\u200C'):
        text = text.replace(filler, '')
    text = ALEF_VARIANTS.sub(_ALEF, text)
    return text.replace('\u0649', _YAA)                       # alef maksura -> yaa
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# MANUAL CURATED MAPPINGS โ€” complex forms that need human judgment
# Format: bare_form โ†’ (root_unhyphenated, word_type, verb_form)
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
CURATED = {
# โ”€โ”€ ุก-ูŠ-ูŠ (ุฃูŠูŠ) family: signs, verses โ”€โ”€
'ุจุงูŠุชู†ุง': ('ุฃูŠูŠ', 'NOUN', None),
'ุจุงูŠุช': ('ุฃูŠูŠ', 'NOUN', None),
'ู„ุงูŠุช': ('ุฃูŠูŠ', 'NOUN', None),
'ุงู„ุงูŠุช': ('ุฃูŠูŠ', 'NOUN', None),
'ู„ุงูŠุฉ': ('ุฃูŠูŠ', 'NOUN', None),
'ุกุงูŠุชู‡': ('ุฃูŠูŠ', 'NOUN', None),
'ุจุงูŠุชู†ุง': ('ุฃูŠูŠ', 'NOUN', None),
'ุจุงูŠุฉ': ('ุฃูŠูŠ', 'NOUN', None),
'ุงูŠุชู‡': ('ุฃูŠูŠ', 'NOUN', None),
'ูˆุงูŠุชู‡': ('ุฃูŠูŠ', 'NOUN', None),
'ูˆุงูŠุชู†ุง': ('ุฃูŠูŠ', 'NOUN', None),
'ูุงูŠุชู†ุง': ('ุฃูŠูŠ', 'NOUN', None),
'ุงูŠุชู†ุง': ('ุฃูŠูŠ', 'NOUN', None),
'ุงูŠุฉ': ('ุฃูŠูŠ', 'NOUN', None),
'ูˆุงูŠุช': ('ุฃูŠูŠ', 'NOUN', None),
'ุกุงูŠุฉ': ('ุฃูŠูŠ', 'NOUN', None),
'ู„ุงูŠุชู†ุง': ('ุฃูŠูŠ', 'NOUN', None),
# โ”€โ”€ ู‚-ูˆ-ู… family: standing, straight โ”€โ”€
'ู…ุณุชู‚ูŠู…': ('ู‚ูˆู…', 'NOUN', 'X'),
'ู…ุณุชู‚ูŠู…ุง': ('ู‚ูˆู…', 'NOUN', 'X'),
'ูŠุณุชู‚ูŠู…': ('ู‚ูˆู…', 'VERB', 'X'),
'ุงุณุชู‚ู…': ('ู‚ูˆู…', 'VERB', 'X'),
'ุงุณุชู‚ุงู…ูˆุง': ('ู‚ูˆู…', 'VERB', 'X'),
'ู‚ูˆุฉ': ('ู‚ูˆูŠ', 'NOUN', None),
# โ”€โ”€ ูˆ-ู‚-ูŠ family: protect, be conscious โ”€โ”€
'ุชุชู‚ูˆู†': ('ูˆู‚ูŠ', 'VERB', 'VIII'),
'ุงุชู‚ูˆุง': ('ูˆู‚ูŠ', 'VERB', 'VIII'),
'ุงุชู‚ูŠ': ('ูˆู‚ูŠ', 'VERB', 'VIII'),
'ูŠุชู‚ูˆู†': ('ูˆู‚ูŠ', 'VERB', 'VIII'),
'ุงู„ู…ุชู‚ูŠู†': ('ูˆู‚ูŠ', 'NOUN', 'VIII'),
# โ”€โ”€ ูˆ-ู„-ูŠ family: close, govern, turn โ”€โ”€
'ุชูˆู„ูˆุง': ('ูˆู„ูŠ', 'VERB', 'V'),
'ุชูˆู„ูŠ': ('ูˆู„ูŠ', 'VERB', 'V'),
'ูŠุชูˆู„ู‰': ('ูˆู„ูŠ', 'VERB', 'V'),
'ุชูˆู„ูŠู‡ู…': ('ูˆู„ูŠ', 'VERB', 'V'),
'ุงูˆู„ูŠุงุก': ('ูˆู„ูŠ', 'NOUN', None),
'ุงูˆู„ูŠุงุฆู‡': ('ูˆู„ูŠ', 'NOUN', None),
'ุงูˆู„ูŠุงุฆู‡ู…': ('ูˆู„ูŠ', 'NOUN', None),
'ูˆุงูˆู„ูŠุงุฆู‡': ('ูˆู„ูŠ', 'NOUN', None),
# โ”€โ”€ ุฑ-ูˆ-ุฏ family: seek, intend โ”€โ”€
'ุงุฑุงุฏ': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ุงุฑุงุฏูˆุง': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ุงุฑุฏู†ุง': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ุงุฑุฏุชู…': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ูŠุฑูŠุฏ': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ูŠุฑูŠุฏูˆู†': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ุชุฑูŠุฏูˆู†': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ู†ุฑูŠุฏ': ('ุฑูˆุฏ', 'VERB', 'IV'),
'ู…ุฑูŠุฏ': ('ุฑูˆุฏ', 'NOUN', None),
# โ”€โ”€ ู†-ูˆ-ุฑ / ู†-ุง-ุฑ family: light, fire โ”€โ”€
'ู†ุงุฑุง': ('ู†ูˆุฑ', 'NOUN', None),
'ุงู„ู†ุงุฑ': ('ู†ูˆุฑ', 'NOUN', None),
'ู†ุงุฑ': ('ู†ูˆุฑ', 'NOUN', None),
# โ”€โ”€ ูŠ-ูˆ-ู… family: day โ”€โ”€
'ุงูŠุงู…': ('ูŠูˆู…', 'NOUN', None),
'ุงูŠุงู…ู‡': ('ูŠูˆู…', 'NOUN', None),
'ุงูŠุงู…ู‡ุง': ('ูŠูˆู…', 'NOUN', None),
# โ”€โ”€ ู…-ู„-ูƒ family: king, angel, possess โ”€โ”€
'ุงู„ู…ู„ุงุฆูƒุฉ': ('ู…ู„ูƒ', 'NOUN', None),
'ุงู„ู…ู„ุกูƒุฉ': ('ู…ู„ูƒ', 'NOUN', None),
'ู…ู„ุงุฆูƒุฉ': ('ู…ู„ูƒ', 'NOUN', None),
'ู…ู„ุกูƒุฉ': ('ู…ู„ูƒ', 'NOUN', None),
'ู…ู„ุงุฆูƒุชู‡': ('ู…ู„ูƒ', 'NOUN', None),
'ุงู„ู…ู„ูƒ': ('ู…ู„ูƒ', 'NOUN', None),
# โ”€โ”€ ุฎ-ูˆ-ู family: fear โ”€โ”€
'ุงุฎุงู': ('ุฎูˆู', 'VERB', None),
'ุฎุงููˆุง': ('ุฎูˆู', 'VERB', None),
'ูŠุฎุงููˆู†': ('ุฎูˆู', 'VERB', None),
'ุชุฎุงููˆู†': ('ุฎูˆู', 'VERB', None),
'ุชุฎุงู': ('ุฎูˆู', 'VERB', None),
# โ”€โ”€ ู†-ุจ-ุฃ family: prophet, news โ”€โ”€
'ุงู„ู†ุจูŠ': ('ู†ุจุฃ', 'NOUN', None),
'ุงู„ู†ุจูŠูŠู†': ('ู†ุจุฃ', 'NOUN', None),
'ู†ุจูŠุง': ('ู†ุจุฃ', 'NOUN', None),
'ู†ุจูŠ': ('ู†ุจุฃ', 'NOUN', None),
# โ”€โ”€ ุฃ-ุฎ-ุฐ family: take โ”€โ”€
'ุงุชุฎุฐูˆุง': ('ุฃุฎุฐ', 'VERB', 'VIII'),
'ุงุชุฎุฐ': ('ุฃุฎุฐ', 'VERB', 'VIII'),
'ูŠุชุฎุฐูˆุง': ('ุฃุฎุฐ', 'VERB', 'VIII'),
'ูŠุชุฎุฐ': ('ุฃุฎุฐ', 'VERB', 'VIII'),
'ุชุชุฎุฐูˆุง': ('ุฃุฎุฐ', 'VERB', 'VIII'),
'ุงุชุฎุฐุชู…': ('ุฃุฎุฐ', 'VERB', 'VIII'),
# โ”€โ”€ ุช-ู„-ูˆ family: recite โ”€โ”€
'ุชุชู„ู‰': ('ุชู„ูˆ', 'VERB', None),
'ูŠุชู„ูˆ': ('ุชู„ูˆ', 'VERB', None),
'ูŠุชู„ู‰': ('ุชู„ูˆ', 'VERB', None),
'ูŠุชู„ูˆู†': ('ุชู„ูˆ', 'VERB', None),
'ุชุชู„ูˆุง': ('ุชู„ูˆ', 'VERB', None),
'ู†ุชู„ูˆ': ('ุชู„ูˆ', 'VERB', None),
'ู†ุชู„ูˆู‡ุง': ('ุชู„ูˆ', 'VERB', None),
# โ”€โ”€ ู‚-ูˆ-ู„ family: say โ”€โ”€
'ู‚ู„ู†ุง': ('ู‚ูˆู„', 'VERB', None),
'ูˆู‚ู„': ('ู‚ูˆู„', 'VERB', None),
'ูู‚ู„': ('ู‚ูˆู„', 'VERB', None),
'ู‚ูŠู„': ('ู‚ูˆู„', 'VERB', None),
# โ”€โ”€ ุฃ-ุช-ูŠ family: come, give โ”€โ”€
'ุงูˆุชูŠ': ('ุฃุชูŠ', 'VERB', None),
'ุงูˆุชูˆุง': ('ุฃุชูŠ', 'VERB', None),
'ูŠูˆุชู‰': ('ุฃุชูŠ', 'VERB', None),
# โ”€โ”€ ุท-ูˆ-ุน family: obey โ”€โ”€
'ูˆุงุทูŠุนูˆุง': ('ุทูˆุน', 'VERB', None),
'ุงุทูŠุนูˆุง': ('ุทูˆุน', 'VERB', None),
'ุงุทุนู†ุง': ('ุทูˆุน', 'VERB', None),
'ูŠุทูŠุนูˆู†': ('ุทูˆุน', 'VERB', None),
# โ”€โ”€ ุฎ-ู„-ูˆ family: pass, empty โ”€โ”€
'ุฎู„ุช': ('ุฎู„ูˆ', 'VERB', None),
'ุฎู„ูˆุง': ('ุฎู„ูˆ', 'VERB', None),
# โ”€โ”€ ุจ-ู†-ูŠ family: son, build โ”€โ”€
'ุงุจู†': ('ุจู†ูŠ', 'NOUN', None),
'ุงุจู†ู‡': ('ุจู†ูŠ', 'NOUN', None),
# โ”€โ”€ ุน-ุฏ-ู† family: Eden, permanent residence โ”€โ”€
'ุนุฏู†': ('ุนุฏู†', 'NOUN', None),
# โ”€โ”€ ุณ-ูˆ-ุฃ family: evil, bad โ”€โ”€
'ุณุงุก': ('ุณูˆุฃ', 'VERB', None),
'ุณุงุกุช': ('ุณูˆุฃ', 'VERB', None),
# โ”€โ”€ ุฌ-ูŠ-ุฃ family: come โ”€โ”€
'ุฌุงุกุชู‡ู…': ('ุฌูŠุฃ', 'VERB', None),
'ุฌุงุกู‡ู…': ('ุฌูŠุฃ', 'VERB', None),
'ุฌุงุกูƒ': ('ุฌูŠุฃ', 'VERB', None),
'ุฌุงุก': ('ุฌูŠุฃ', 'VERB', None),
'ุฌุงุกูˆุง': ('ุฌูŠุฃ', 'VERB', None),
'ุฌุงุกู†ุง': ('ุฌูŠุฃ', 'VERB', None),
'ุฌุงุกูƒู…': ('ุฌูŠุฃ', 'VERB', None),
# โ”€โ”€ ู‡-ุฒ-ุก family: mock โ”€โ”€
'ู‡ุฒูˆุง': ('ู‡ุฒุฃ', 'NOUN', None),
'ูŠุณุชู‡ุฒุกูˆู†': ('ู‡ุฒุฃ', 'VERB', 'X'),
# โ”€โ”€ ุด-ูŠ-ุฃ family: will, want โ”€โ”€
'ุดุงุก': ('ุดูŠุฃ', 'VERB', None),
'ูŠุดุงุก': ('ุดูŠุฃ', 'VERB', None),
'ู†ุดุงุก': ('ุดูŠุฃ', 'VERB', None),
'ุชุดุงุกูˆู†': ('ุดูŠุฃ', 'VERB', None),
# โ”€โ”€ ู‚-ูˆ-ู… family: establish โ”€โ”€
'ูˆุงู‚ูŠู…ูˆุง': ('ู‚ูˆู…', 'VERB', 'IV'),
'ุงู‚ูŠู…ูˆุง': ('ู‚ูˆู…', 'VERB', 'IV'),
'ุงู‚ู…ุชู…': ('ู‚ูˆู…', 'VERB', 'IV'),
'ูŠู‚ูŠู…ูˆู†': ('ู‚ูˆู…', 'VERB', 'IV'),
# โ”€โ”€ ุฃ-ุช-ูŠ family: give (with ูˆูŽ prefix) โ”€โ”€
'ูˆุกุงุชูˆุง': ('ุฃุชูŠ', 'VERB', None),
'ูˆุกุงุชู‰': ('ุฃุชูŠ', 'VERB', None),
'ุกุงุชูˆุง': ('ุฃุชูŠ', 'VERB', None),
'ุกุงุชุงู‡ู…': ('ุฃุชูŠ', 'VERB', None),
'ุกุงุชูŠู†ุง': ('ุฃุชูŠ', 'VERB', None),
'ุกุงุชูŠู†ุงู‡': ('ุฃุชูŠ', 'VERB', None),
'ุกุงุชุงู‡': ('ุฃุชูŠ', 'VERB', None),
# โ”€โ”€ ุณ-ูˆ-ูŠ family: equal โ”€โ”€
'ูŠุณุชูˆูŠ': ('ุณูˆูŠ', 'VERB', 'VIII'),
'ุงุณุชูˆู‰': ('ุณูˆูŠ', 'VERB', 'VIII'),
# โ”€โ”€ Particles that were missed โ”€โ”€
'ุฐูˆ': (None, 'PARTICLE', None),
'ุฐูŠ': (None, 'PARTICLE', None),
'ุฐุง': (None, 'PARTICLE', None),
'ูˆู†ุญู†': (None, 'PARTICLE', None),
'ูˆู„ูƒู…': (None, 'PARTICLE', None),
'ูˆู‡ุฐุง': (None, 'PARTICLE', None),
'ูˆุงู†ุช': (None, 'PARTICLE', None),
'ุจู‡ุฐุง': (None, 'PARTICLE', None),
'ูƒู…': (None, 'PARTICLE', None),
'ุนู†ุง': (None, 'PARTICLE', None),
'ุนู„ูŠู‡ู†': (None, 'PARTICLE', None),
'ุงูู…ู†': (None, 'PARTICLE', None),
'ูˆุชู„ูƒ': (None, 'PARTICLE', None),
'ุงุกุฐุง': (None, 'PARTICLE', None),
'ูˆูŠู„': (None, 'PARTICLE', None),
# โ”€โ”€ ูˆ-ุฌ-ุฏ family: find โ”€โ”€
'ุชุฌุฏ': ('ูˆุฌุฏ', 'VERB', None),
'ูŠุฌุฏูˆู†': ('ูˆุฌุฏ', 'VERB', None),
'ูˆุฌุฏูˆุง': ('ูˆุฌุฏ', 'VERB', None),
# โ”€โ”€ ุก-ู„ family: people of โ”€โ”€
'ุกุงู„': ('ุฃูˆู„', 'NOUN', None),
# โ”€โ”€ ุจ-ุฐ-ุช family: wealth โ”€โ”€
'ุจุฐุงุช': (None, 'PARTICLE', None), # ุจู + ุฐุงุช
# โ”€โ”€ ู†-ุฒ-ู„ family: send down โ”€โ”€
'ุงู†ุฒู„': ('ู†ุฒู„', 'VERB', 'IV'),
'ุงู†ุฒู„ู†ุง': ('ู†ุฒู„', 'VERB', 'IV'),
'ุงู†ุฒู„ู‡': ('ู†ุฒู„', 'VERB', 'IV'),
'ู†ุฒู„': ('ู†ุฒู„', 'VERB', None),
# โ”€โ”€ ู‚-ูŠ-ู… family: establish prayer โ”€โ”€
'ุงู‚ู…': ('ู‚ูˆู…', 'VERB', 'IV'),
'ูˆุงู‚ู…': ('ู‚ูˆู…', 'VERB', 'IV'),
# โ”€โ”€ Misc high-frequency โ”€โ”€
'ูŠุณุชูˆูŠ': ('ุณูˆูŠ', 'VERB', 'VIII'),
'ุนู†ุง': (None, 'PARTICLE', None),
}
# Word forms in quran_word_roots that still lack a root, most frequent first.
_UNROOTED_FORMS_SQL = """
    SELECT DISTINCT arabic_word, COUNT(*) as cnt
    FROM quran_word_roots
    WHERE (root IS NULL OR root='') AND word_type <> 'PARTICLE'
    GROUP BY arabic_word
    ORDER BY cnt DESC
"""


def _is_known_form(conn, bare):
    """Return True when *bare* already has a row in quran_known_forms."""
    row = conn.execute(
        "SELECT rowid FROM quran_known_forms WHERE bare_form=?", (bare,)
    ).fetchone()
    return bool(row)


def _apply_curated(conn, curated):
    """Insert curated mappings for unrooted forms; return (inserted, particles).

    Curated particles are only counted: they are deliberately not written to
    quran_known_forms.
    """
    rows = conn.execute(_UNROOTED_FORMS_SQL).fetchall()
    print(f"Unrooted word forms: {len(rows)}")

    inserted = 0
    particles = 0
    for arabic_word, _freq in rows:
        bare = strip_bare(arabic_word)
        if _is_known_form(conn, bare):
            continue
        entry = curated.get(bare)
        if entry is None:
            continue
        root_un, wtype, vform = entry
        if wtype == 'PARTICLE':
            particles += 1
            continue
        conn.execute(
            "INSERT INTO quran_known_forms (arabic_form, bare_form, root_unhyphenated, word_type, verb_form) "
            "VALUES (?, ?, ?, ?, ?)",
            (arabic_word, bare, root_un, wtype, vform)
        )
        inserted += 1
    conn.commit()
    return inserted, particles


def _auto_map(conn, curated):
    """Map the remaining unrooted forms via uslap_compiler; return rows added."""
    # Imported lazily and reloaded, as in the original script, so the run
    # starts from a fresh compiler module and an empty particle cache.
    import importlib
    import uslap_compiler as c
    importlib.reload(c)
    if hasattr(c.classify_word, '_particle_cache'):
        delattr(c.classify_word, '_particle_cache')

    auto_mapped = 0
    for arabic_word, _freq in conn.execute(_UNROOTED_FORMS_SQL).fetchall():
        bare = strip_bare(arabic_word)
        if _is_known_form(conn, bare):
            continue
        entry = curated.get(bare)
        if entry is not None and entry[1] == 'PARTICLE':
            continue
        # NOTE(review): find_root is assumed to return a sequence whose items
        # 0/1/3/4 are hyphenated root, meaning, word type and verb form;
        # confirm against uslap_compiler.
        result = c.find_root(arabic_word, conn)
        if not (result[0] and result[1]):
            continue
        root_un = result[0].replace('-', '')
        wtype = result[3] or c.classify_word(arabic_word)
        vform = result[4] or c.detect_verb_form(arabic_word)
        cursor = conn.execute(
            "INSERT OR IGNORE INTO quran_known_forms (arabic_form, bare_form, root_unhyphenated, word_type, verb_form) "
            "VALUES (?, ?, ?, ?, ?)",
            (arabic_word, bare, root_un, wtype, vform)
        )
        # rowcount is 0 when OR IGNORE skipped the row; only real inserts count.
        if cursor.rowcount != 0:
            auto_mapped += 1
    conn.commit()
    return auto_mapped


def main():
    """Expand quran_known_forms with curated and auto-extracted root mappings.

    Pass 1 inserts the hand-curated mappings for every unrooted word form
    whose bare form is not yet known. Pass 2 asks uslap_compiler.find_root
    for the remaining forms and stores each hit. A summary is printed at the
    end. The connection is always closed, even when a pass raises.
    """
    conn = _uslap_connect(DB_PATH) if _HAS_WRAPPER else sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA foreign_keys = ON")

        # Lookups use strip_bare() output, so key the table the same way;
        # otherwise keys spelled with a hamza carrier or alef maksura are
        # unreachable.
        curated = {strip_bare(form): entry for form, entry in CURATED.items()}

        inserted, particles = _apply_curated(conn, curated)
        auto_mapped = _auto_map(conn, curated)

        print(f"\nResults:")
        print(f" Curated entries inserted: {inserted}")
        print(f" Particles identified: {particles}")
        print(f" Auto-mapped: {auto_mapped}")
        print(f" Total new known_forms: {inserted + auto_mapped}")
        # Count total known_forms now
        total_kf = conn.execute("SELECT COUNT(*) FROM quran_known_forms").fetchone()[0]
        print(f" Total known_forms: {total_kf}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()