Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
"""
Phase 1C: Expand quran_known_forms from 255 to 3,000+ entries.

Auto-maps unrooted Qur'anic word forms to roots using:
1. Manual curated mappings for high-frequency complex forms
2. Algorithmic extraction with dictionary verification for the rest
"""
| import sqlite3 | |
| try: | |
| from uslap_db_connect import connect as _uslap_connect | |
| _HAS_WRAPPER = True | |
| except ImportError: | |
| _HAS_WRAPPER = False | |
| import re | |
| import os | |
# The SQLite database is expected to sit next to this script.
DB_PATH = os.path.join(os.path.dirname(__file__), "uslap_database_v3.db")

# All character data below is written with explicit \u escapes so that the
# source survives copy/paste and re-encoding without silently changing which
# code points are matched.

# Combining marks removed when reducing a word to its bare form
# (U+064B-U+065F, U+0670, and the U+06D6-U+06ED annotation ranges listed).
DIACRITICS = re.compile(r'[\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]')
# Alef with hamza below / hamza above / madda above, and alef wasla.
ALEF_VARIANTS = re.compile(r'[\u0625\u0623\u0622\u0671]')
# Waw with hamza above and yaa with hamza above.
HAMZA_CARRIERS = re.compile(r'[\u0624\u0626]')

_ALEF = '\u0627'
_HAMZA = '\u0621'
_YAA = '\u064A'
_ALEF_MADDA = '\u0622'
_ALEF_WASLA = '\u0671'
_ALEF_MAKSURA = '\u0649'
_HAMZA_ABOVE = '\u0654'
_SMALL_WAW = '\u06E5'
_SMALL_YAA = '\u06E6'
_TATWEEL = '\u0640'
_ZWJ = '\u200D'
_ZWNJ = '\u200C'


def strip_bare(word):
    """Strip a word to its bare form for known_forms matching.

    Args:
        word: The (possibly fully vocalised) word form. ``None`` or an
            empty string yields ``''``.

    Returns:
        The word with diacritics, tatweel and zero-width joiners removed,
        every alef variant folded to plain alef, hamza carriers and the
        combining hamza folded to a stand-alone hamza, alef-madda expanded
        to alef + hamza, and alef maksura folded to yaa.
    """
    if not word:
        return ''

    text = word
    text = text.replace(_ALEF_WASLA, _ALEF)   # wasla
    text = text.replace(_SMALL_WAW, '')       # small waw
    text = text.replace(_SMALL_YAA, '')       # small yaa
    # Must run before DIACRITICS is applied: U+0654 lies inside the stripped
    # range and would otherwise vanish instead of becoming a hamza.
    text = text.replace(_HAMZA_ABOVE, _HAMZA)
    text = HAMZA_CARRIERS.sub(_HAMZA, text)
    # Must run before ALEF_VARIANTS is applied, which would otherwise fold
    # alef-madda to a plain alef and drop the hamza.
    text = text.replace(_ALEF_MADDA, _ALEF + _HAMZA)
    text = DIACRITICS.sub('', text)
    text = text.replace(_TATWEEL, '')
    text = text.replace(_ZWJ, '').replace(_ZWNJ, '')
    text = ALEF_VARIANTS.sub(_ALEF, text)
    text = text.replace(_ALEF_MAKSURA, _YAA)  # alef maksura -> yaa
    return text
| # --------------------------------------------------------------- | |
| # MANUAL CURATED MAPPINGS -- complex forms that need human judgment. | |
| # Format: bare_form -> (root_unhyphenated, word_type, verb_form) | |
| #   word_type is 'NOUN', 'VERB' or 'PARTICLE'; PARTICLE rows carry None as root. | |
| #   verb_form is None or a Roman-numeral label ('IV', 'V', 'VIII', 'X'). | |
| # main() looks keys up with the output of strip_bare(), so every key must already be in bare form. | |
| # NOTE(review): the Arabic literals in this copy look mis-encoded (UTF-8 shown through a legacy | |
| # code page, with some bytes lost) -- restore them from the original file before relying on them. | |
| # NOTE(review): as written here several keys are textually identical (e.g. the first and seventh | |
| # keys of the first section); a dict literal keeps only the last value for a repeated key. | |
| # Presumably these were distinct words before the encoding damage -- TODO confirm. | |
| # --------------------------------------------------------------- | |
| CURATED = { | |
| # โโ ุก-ู-ู (ุฃูู) family: signs, verses โโ | |
| 'ุจุงูุชูุง': ('ุฃูู', 'NOUN', None), | |
| 'ุจุงูุช': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุช': ('ุฃูู', 'NOUN', None), | |
| 'ุงูุงูุช': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุฉ': ('ุฃูู', 'NOUN', None), | |
| 'ุกุงูุชู': ('ุฃูู', 'NOUN', None), | |
| 'ุจุงูุชูุง': ('ุฃูู', 'NOUN', None), | |
| 'ุจุงูุฉ': ('ุฃูู', 'NOUN', None), | |
| 'ุงูุชู': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุชู': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุชูุง': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุชูุง': ('ุฃูู', 'NOUN', None), | |
| 'ุงูุชูุง': ('ุฃูู', 'NOUN', None), | |
| 'ุงูุฉ': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุช': ('ุฃูู', 'NOUN', None), | |
| 'ุกุงูุฉ': ('ุฃูู', 'NOUN', None), | |
| 'ูุงูุชูุง': ('ุฃูู', 'NOUN', None), | |
| # โโ ู-ู-ู family: standing, straight โโ | |
| 'ู ุณุชููู ': ('ููู ', 'NOUN', 'X'), | |
| 'ู ุณุชููู ุง': ('ููู ', 'NOUN', 'X'), | |
| 'ูุณุชููู ': ('ููู ', 'VERB', 'X'), | |
| 'ุงุณุชูู ': ('ููู ', 'VERB', 'X'), | |
| 'ุงุณุชูุงู ูุง': ('ููู ', 'VERB', 'X'), | |
| 'ููุฉ': ('ููู', 'NOUN', None), | |
| # โโ ู-ู-ู family: protect, be conscious โโ | |
| 'ุชุชููู': ('ููู', 'VERB', 'VIII'), | |
| 'ุงุชููุง': ('ููู', 'VERB', 'VIII'), | |
| 'ุงุชูู': ('ููู', 'VERB', 'VIII'), | |
| 'ูุชููู': ('ููู', 'VERB', 'VIII'), | |
| 'ุงูู ุชููู': ('ููู', 'NOUN', 'VIII'), | |
| # โโ ู-ู-ู family: close, govern, turn โโ | |
| 'ุชูููุง': ('ููู', 'VERB', 'V'), | |
| 'ุชููู': ('ููู', 'VERB', 'V'), | |
| 'ูุชููู': ('ููู', 'VERB', 'V'), | |
| 'ุชููููู ': ('ููู', 'VERB', 'V'), | |
| 'ุงูููุงุก': ('ููู', 'NOUN', None), | |
| 'ุงูููุงุฆู': ('ููู', 'NOUN', None), | |
| 'ุงูููุงุฆูู ': ('ููู', 'NOUN', None), | |
| 'ูุงูููุงุฆู': ('ููู', 'NOUN', None), | |
| # โโ ุฑ-ู-ุฏ family: seek, intend โโ | |
| 'ุงุฑุงุฏ': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ุงุฑุงุฏูุง': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ุงุฑุฏูุง': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ุงุฑุฏุชู ': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ูุฑูุฏ': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ูุฑูุฏูู': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ุชุฑูุฏูู': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ูุฑูุฏ': ('ุฑูุฏ', 'VERB', 'IV'), | |
| 'ู ุฑูุฏ': ('ุฑูุฏ', 'NOUN', None), | |
| # โโ ู-ู-ุฑ / ู-ุง-ุฑ family: light, fire โโ | |
| 'ูุงุฑุง': ('ููุฑ', 'NOUN', None), | |
| 'ุงููุงุฑ': ('ููุฑ', 'NOUN', None), | |
| 'ูุงุฑ': ('ููุฑ', 'NOUN', None), | |
| # โโ ู-ู-ู family: day โโ | |
| 'ุงูุงู ': ('ููู ', 'NOUN', None), | |
| 'ุงูุงู ู': ('ููู ', 'NOUN', None), | |
| 'ุงูุงู ูุง': ('ููู ', 'NOUN', None), | |
| # โโ ู -ู-ู family: king, angel, possess โโ | |
| 'ุงูู ูุงุฆูุฉ': ('ู ูู', 'NOUN', None), | |
| 'ุงูู ูุกูุฉ': ('ู ูู', 'NOUN', None), | |
| 'ู ูุงุฆูุฉ': ('ู ูู', 'NOUN', None), | |
| 'ู ูุกูุฉ': ('ู ูู', 'NOUN', None), | |
| 'ู ูุงุฆูุชู': ('ู ูู', 'NOUN', None), | |
| 'ุงูู ูู': ('ู ูู', 'NOUN', None), | |
| # โโ ุฎ-ู-ู family: fear โโ | |
| 'ุงุฎุงู': ('ุฎูู', 'VERB', None), | |
| 'ุฎุงููุง': ('ุฎูู', 'VERB', None), | |
| 'ูุฎุงููู': ('ุฎูู', 'VERB', None), | |
| 'ุชุฎุงููู': ('ุฎูู', 'VERB', None), | |
| 'ุชุฎุงู': ('ุฎูู', 'VERB', None), | |
| # โโ ู-ุจ-ุฃ family: prophet, news โโ | |
| 'ุงููุจู': ('ูุจุฃ', 'NOUN', None), | |
| 'ุงููุจููู': ('ูุจุฃ', 'NOUN', None), | |
| 'ูุจูุง': ('ูุจุฃ', 'NOUN', None), | |
| 'ูุจู': ('ูุจุฃ', 'NOUN', None), | |
| # โโ ุฃ-ุฎ-ุฐ family: take โโ | |
| 'ุงุชุฎุฐูุง': ('ุฃุฎุฐ', 'VERB', 'VIII'), | |
| 'ุงุชุฎุฐ': ('ุฃุฎุฐ', 'VERB', 'VIII'), | |
| 'ูุชุฎุฐูุง': ('ุฃุฎุฐ', 'VERB', 'VIII'), | |
| 'ูุชุฎุฐ': ('ุฃุฎุฐ', 'VERB', 'VIII'), | |
| 'ุชุชุฎุฐูุง': ('ุฃุฎุฐ', 'VERB', 'VIII'), | |
| 'ุงุชุฎุฐุชู ': ('ุฃุฎุฐ', 'VERB', 'VIII'), | |
| # โโ ุช-ู-ู family: recite โโ | |
| 'ุชุชูู': ('ุชูู', 'VERB', None), | |
| 'ูุชูู': ('ุชูู', 'VERB', None), | |
| 'ูุชูู': ('ุชูู', 'VERB', None), | |
| 'ูุชููู': ('ุชูู', 'VERB', None), | |
| 'ุชุชููุง': ('ุชูู', 'VERB', None), | |
| 'ูุชูู': ('ุชูู', 'VERB', None), | |
| 'ูุชูููุง': ('ุชูู', 'VERB', None), | |
| # โโ ู-ู-ู family: say โโ | |
| 'ูููุง': ('ููู', 'VERB', None), | |
| 'ููู': ('ููู', 'VERB', None), | |
| 'ููู': ('ููู', 'VERB', None), | |
| 'ููู': ('ููู', 'VERB', None), | |
| # โโ ุฃ-ุช-ู family: come, give โโ | |
| 'ุงูุชู': ('ุฃุชู', 'VERB', None), | |
| 'ุงูุชูุง': ('ุฃุชู', 'VERB', None), | |
| 'ููุชู': ('ุฃุชู', 'VERB', None), | |
| # โโ ุท-ู-ุน family: obey โโ | |
| 'ูุงุทูุนูุง': ('ุทูุน', 'VERB', None), | |
| 'ุงุทูุนูุง': ('ุทูุน', 'VERB', None), | |
| 'ุงุทุนูุง': ('ุทูุน', 'VERB', None), | |
| 'ูุทูุนูู': ('ุทูุน', 'VERB', None), | |
| # โโ ุฎ-ู-ู family: pass, empty โโ | |
| 'ุฎูุช': ('ุฎูู', 'VERB', None), | |
| 'ุฎููุง': ('ุฎูู', 'VERB', None), | |
| # โโ ุจ-ู-ู family: son, build โโ | |
| 'ุงุจู': ('ุจูู', 'NOUN', None), | |
| 'ุงุจูู': ('ุจูู', 'NOUN', None), | |
| # โโ ุน-ุฏ-ู family: Eden, permanent residence โโ | |
| 'ุนุฏู': ('ุนุฏู', 'NOUN', None), | |
| # โโ ุณ-ู-ุฃ family: evil, bad โโ | |
| 'ุณุงุก': ('ุณูุฃ', 'VERB', None), | |
| 'ุณุงุกุช': ('ุณูุฃ', 'VERB', None), | |
| # โโ ุฌ-ู-ุฃ family: come โโ | |
| 'ุฌุงุกุชูู ': ('ุฌูุฃ', 'VERB', None), | |
| 'ุฌุงุกูู ': ('ุฌูุฃ', 'VERB', None), | |
| 'ุฌุงุกู': ('ุฌูุฃ', 'VERB', None), | |
| 'ุฌุงุก': ('ุฌูุฃ', 'VERB', None), | |
| 'ุฌุงุกูุง': ('ุฌูุฃ', 'VERB', None), | |
| 'ุฌุงุกูุง': ('ุฌูุฃ', 'VERB', None), | |
| 'ุฌุงุกูู ': ('ุฌูุฃ', 'VERB', None), | |
| # โโ ู-ุฒ-ุก family: mock โโ | |
| 'ูุฒูุง': ('ูุฒุฃ', 'NOUN', None), | |
| 'ูุณุชูุฒุกูู': ('ูุฒุฃ', 'VERB', 'X'), | |
| # โโ ุด-ู-ุฃ family: will, want โโ | |
| 'ุดุงุก': ('ุดูุฃ', 'VERB', None), | |
| 'ูุดุงุก': ('ุดูุฃ', 'VERB', None), | |
| 'ูุดุงุก': ('ุดูุฃ', 'VERB', None), | |
| 'ุชุดุงุกูู': ('ุดูุฃ', 'VERB', None), | |
| # โโ ู-ู-ู family: establish โโ | |
| 'ูุงููู ูุง': ('ููู ', 'VERB', 'IV'), | |
| 'ุงููู ูุง': ('ููู ', 'VERB', 'IV'), | |
| 'ุงูู ุชู ': ('ููู ', 'VERB', 'IV'), | |
| 'ูููู ูู': ('ููู ', 'VERB', 'IV'), | |
| # โโ ุฃ-ุช-ู family: give (with ูู prefix) โโ | |
| 'ูุกุงุชูุง': ('ุฃุชู', 'VERB', None), | |
| 'ูุกุงุชู': ('ุฃุชู', 'VERB', None), | |
| 'ุกุงุชูุง': ('ุฃุชู', 'VERB', None), | |
| 'ุกุงุชุงูู ': ('ุฃุชู', 'VERB', None), | |
| 'ุกุงุชููุง': ('ุฃุชู', 'VERB', None), | |
| 'ุกุงุชููุงู': ('ุฃุชู', 'VERB', None), | |
| 'ุกุงุชุงู': ('ุฃุชู', 'VERB', None), | |
| # โโ ุณ-ู-ู family: equal โโ | |
| 'ูุณุชูู': ('ุณูู', 'VERB', 'VIII'), | |
| 'ุงุณุชูู': ('ุณูู', 'VERB', 'VIII'), | |
| # โโ Particles that were missed โโ | |
| 'ุฐู': (None, 'PARTICLE', None), | |
| 'ุฐู': (None, 'PARTICLE', None), | |
| 'ุฐุง': (None, 'PARTICLE', None), | |
| 'ููุญู': (None, 'PARTICLE', None), | |
| 'ูููู ': (None, 'PARTICLE', None), | |
| 'ููุฐุง': (None, 'PARTICLE', None), | |
| 'ูุงูุช': (None, 'PARTICLE', None), | |
| 'ุจูุฐุง': (None, 'PARTICLE', None), | |
| 'ูู ': (None, 'PARTICLE', None), | |
| 'ุนูุง': (None, 'PARTICLE', None), | |
| 'ุนูููู': (None, 'PARTICLE', None), | |
| 'ุงูู ู': (None, 'PARTICLE', None), | |
| 'ูุชูู': (None, 'PARTICLE', None), | |
| 'ุงุกุฐุง': (None, 'PARTICLE', None), | |
| 'ููู': (None, 'PARTICLE', None), | |
| # โโ ู-ุฌ-ุฏ family: find โโ | |
| 'ุชุฌุฏ': ('ูุฌุฏ', 'VERB', None), | |
| 'ูุฌุฏูู': ('ูุฌุฏ', 'VERB', None), | |
| 'ูุฌุฏูุง': ('ูุฌุฏ', 'VERB', None), | |
| # โโ ุก-ู family: people of โโ | |
| 'ุกุงู': ('ุฃูู', 'NOUN', None), | |
| # โโ ุจ-ุฐ-ุช family: wealth โโ | |
| 'ุจุฐุงุช': (None, 'PARTICLE', None), # ุจู + ุฐุงุช | |
| # โโ ู-ุฒ-ู family: send down โโ | |
| 'ุงูุฒู': ('ูุฒู', 'VERB', 'IV'), | |
| 'ุงูุฒููุง': ('ูุฒู', 'VERB', 'IV'), | |
| 'ุงูุฒูู': ('ูุฒู', 'VERB', 'IV'), | |
| 'ูุฒู': ('ูุฒู', 'VERB', None), | |
| # โโ ู-ู-ู family: establish prayer โโ | |
| 'ุงูู ': ('ููู ', 'VERB', 'IV'), | |
| 'ูุงูู ': ('ููู ', 'VERB', 'IV'), | |
| # โโ Misc high-frequency โโ | |
| 'ูุณุชูู': ('ุณูู', 'VERB', 'VIII'), | |
| 'ุนูุง': (None, 'PARTICLE', None), | |
| } | |
def _fetch_unrooted_forms(conn):
    """Return (arabic_word, occurrence_count) rows for non-particle words without a root, most frequent first."""
    return conn.execute("""
        SELECT DISTINCT arabic_word, COUNT(*) as cnt
        FROM quran_word_roots
        WHERE (root IS NULL OR root='') AND word_type <> 'PARTICLE'
        GROUP BY arabic_word
        ORDER BY cnt DESC
    """).fetchall()


def _is_known_form(conn, bare):
    """Return True when quran_known_forms already holds a row for this bare form."""
    row = conn.execute(
        "SELECT rowid FROM quran_known_forms WHERE bare_form=?", (bare,)
    ).fetchone()
    return row is not None


def _insert_curated_forms(conn, rows):
    """Insert CURATED matches into quran_known_forms; return (inserted, particle_hits)."""
    inserted = 0
    particle_hits = 0
    for arabic_word, _freq in rows:
        if not arabic_word:
            continue
        bare = strip_bare(arabic_word)
        # Cheap in-memory lookup first; only curated candidates cost a query.
        entry = CURATED.get(bare)
        if entry is None or _is_known_form(conn, bare):
            continue
        root_un, wtype, vform = entry
        if wtype == 'PARTICLE':
            # Particles are only counted here; nothing is written for them.
            particle_hits += 1
            continue
        conn.execute(
            "INSERT INTO quran_known_forms (arabic_form, bare_form, root_unhyphenated, word_type, verb_form) "
            "VALUES (?, ?, ?, ?, ?)",
            (arabic_word, bare, root_un, wtype, vform)
        )
        inserted += 1
    return inserted, particle_hits


def _auto_map_forms(conn, rows, compiler):
    """Insert forms whose root the compiler can resolve; return the number of rows actually added."""
    mapped = 0
    for arabic_word, _freq in rows:
        if not arabic_word:
            continue
        bare = strip_bare(arabic_word)
        entry = CURATED.get(bare)
        # Skip forms curated as particles and forms that are already known.
        if entry is not None and entry[1] == 'PARTICLE':
            continue
        if _is_known_form(conn, bare):
            continue
        # find_root's result is indexed positionally: [0] hyphenated root,
        # [1] presumably its dictionary meaning (TODO confirm), [3] word type,
        # [4] verb form.
        result = compiler.find_root(arabic_word, conn)
        if not result or not (result[0] and result[1]):
            continue
        root_un = result[0].replace('-', '')
        wtype = result[3] or compiler.classify_word(arabic_word)
        vform = result[4] or compiler.detect_verb_form(arabic_word)
        cursor = conn.execute(
            "INSERT OR IGNORE INTO quran_known_forms (arabic_form, bare_form, root_unhyphenated, word_type, verb_form) "
            "VALUES (?, ?, ?, ?, ?)",
            (arabic_word, bare, root_un, wtype, vform)
        )
        # OR IGNORE can silently drop the row; count only real insertions.
        if cursor.rowcount > 0:
            mapped += 1
    return mapped


def main():
    """Expand quran_known_forms from the curated table, then from automatic root extraction.

    Phase 1 inserts every unrooted word form whose bare form is a non-particle
    key of CURATED. Phase 2 asks uslap_compiler.find_root for the remaining
    forms and inserts those for which it returns a root. Each phase is
    committed separately and a summary is printed at the end. The connection
    is closed even when a phase raises.
    """
    conn = _uslap_connect(DB_PATH) if _HAS_WRAPPER else sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA foreign_keys = ON")

        # Phase 1: curated mappings.
        rows = _fetch_unrooted_forms(conn)
        print(f"Unrooted word forms: {len(rows)}")
        inserted, updated_particles = _insert_curated_forms(conn, rows)
        conn.commit()

        # Phase 2: automatic extraction. The compiler is imported late and
        # reloaded, and its particle cache (if present) is dropped, before use.
        import uslap_compiler as c
        import importlib
        importlib.reload(c)
        if hasattr(c.classify_word, '_particle_cache'):
            delattr(c.classify_word, '_particle_cache')

        # Re-read the work list after the phase-1 commit, as the original did.
        remaining_rows = _fetch_unrooted_forms(conn)
        auto_mapped = _auto_map_forms(conn, remaining_rows, c)
        conn.commit()

        print("\nResults:")
        print(f" Curated entries inserted: {inserted}")
        print(f" Particles identified: {updated_particles}")
        print(f" Auto-mapped: {auto_mapped}")
        print(f" Total new known_forms: {inserted + auto_mapped}")
        total_kf = conn.execute("SELECT COUNT(*) FROM quran_known_forms").fetchone()[0]
        print(f" Total known_forms: {total_kf}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()