Spaces:
Running
Running
wiktionary diagnostics, fix NoneType
Browse files
app.py
CHANGED
|
@@ -1270,6 +1270,55 @@ def wiktionary_download_db() -> bool:
|
|
| 1270 |
# traceback.print_exc() # Uncomment for deep debugging
|
| 1271 |
return False
|
| 1272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1273 |
def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
|
| 1274 |
""" Thread-safe function to get a single, read-only SQLite connection. """
|
| 1275 |
global WIKTIONARY_CONN, WIKTIONARY_AVAILABLE
|
|
@@ -1568,15 +1617,13 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1568 |
|
| 1569 |
spacy_pos_hint, spacy_lemma_hint = None, None
|
| 1570 |
try:
|
| 1571 |
-
# Quick heuristic to help sort results if multiple entries exist
|
| 1572 |
nlp_en = SPACY_MODELS.get("en")
|
| 1573 |
if nlp_en:
|
| 1574 |
doc = nlp_en(word)
|
| 1575 |
token = doc[0]
|
| 1576 |
spacy_pos_hint = token.pos_.lower()
|
| 1577 |
spacy_lemma_hint = token.lemma_
|
| 1578 |
-
except Exception
|
| 1579 |
-
log(f"[DEBUG] Wiktionary (EN) Hint: spaCy failed: {e}")
|
| 1580 |
|
| 1581 |
try:
|
| 1582 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
|
@@ -1598,11 +1645,11 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1598 |
|
| 1599 |
word_lower = word.lower()
|
| 1600 |
for wikt_report in wiktionary_reports:
|
| 1601 |
-
# --- FIX: Safe Extraction
|
| 1602 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1603 |
-
lemma = wikt_report.get("lemma") or word
|
| 1604 |
-
|
| 1605 |
-
|
| 1606 |
|
| 1607 |
inflections_wikt_block = {
|
| 1608 |
"base_form": lemma,
|
|
@@ -1613,9 +1660,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1613 |
pattern_block = {}
|
| 1614 |
if PATTERN_EN_AVAILABLE:
|
| 1615 |
try:
|
| 1616 |
-
# Safe check now that pos_title is definitely a string
|
| 1617 |
use_word = word if "form" in pos_title.lower() else lemma
|
| 1618 |
-
|
| 1619 |
if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
|
| 1620 |
elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
|
| 1621 |
elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(use_word)
|
|
@@ -1636,9 +1681,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1636 |
}
|
| 1637 |
}
|
| 1638 |
|
| 1639 |
-
# Validation Logic
|
| 1640 |
is_valid = False
|
| 1641 |
-
# Safe check now that pos_title is definitely a string
|
| 1642 |
is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
|
| 1643 |
|
| 1644 |
if lemma.lower() == word_lower: is_valid = True
|
|
@@ -1654,10 +1697,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1654 |
if pos_key not in final_result["analysis"]:
|
| 1655 |
final_result["analysis"][pos_key] = []
|
| 1656 |
final_result["analysis"][pos_key].append(pos_entry_report)
|
| 1657 |
-
else:
|
| 1658 |
-
log(f"[DEBUG] Wiktionary (EN): DROPPING entry '{lemma}' ({pos_key}) ...")
|
| 1659 |
|
| 1660 |
-
final_result["info"] = f"Analysis from Wiktionary
|
| 1661 |
return final_result
|
| 1662 |
|
| 1663 |
|
|
@@ -2250,11 +2291,16 @@ def _format_word_analysis_html(data: Dict[str, Any]) -> str:
|
|
| 2250 |
|
| 2251 |
# --- Header ---
|
| 2252 |
pos_class = f"pos-{pos_key}" if pos_key in ["noun", "verb", "adj", "adv"] else "pos-noun"
|
| 2253 |
-
|
| 2254 |
-
|
| 2255 |
-
|
| 2256 |
-
|
| 2257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2258 |
|
| 2259 |
html += f"""
|
| 2260 |
<div class="ling-card">
|
|
@@ -2268,70 +2314,73 @@ def _format_word_analysis_html(data: Dict[str, Any]) -> str:
|
|
| 2268 |
html += "<div class='ling-section'><div class='ling-subtitle'>Morphology & Inflections</div>"
|
| 2269 |
html += "<table class='inflection-table'>"
|
| 2270 |
|
| 2271 |
-
pat = entry.get("inflections_pattern", {})
|
| 2272 |
-
wikt = entry.get("inflections_wiktionary", {})
|
| 2273 |
-
|
| 2274 |
# Noun Logic
|
| 2275 |
if pos_key == 'noun':
|
| 2276 |
-
if
|
| 2277 |
-
html += f"<tr><td class='inflection-label'>Singular</td><td>{
|
| 2278 |
-
html += f"<tr><td class='inflection-label'>Plural</td><td>{
|
| 2279 |
-
html += f"<tr><td class='inflection-label'>Context</td><td>{
|
| 2280 |
|
| 2281 |
# Verb Logic
|
| 2282 |
elif pos_key == 'verb':
|
| 2283 |
-
cj =
|
| 2284 |
-
|
| 2285 |
-
|
| 2286 |
-
|
| 2287 |
-
|
| 2288 |
-
|
| 2289 |
-
|
| 2290 |
-
|
| 2291 |
-
|
| 2292 |
-
|
| 2293 |
|
| 2294 |
# Adjective Logic
|
| 2295 |
elif pos_key == 'adjective':
|
| 2296 |
-
gr =
|
| 2297 |
-
|
| 2298 |
-
|
| 2299 |
-
|
| 2300 |
-
html += f"<tr><td class='inflection-label'>Superlative</td><td>{gr.get('Superlative', '-')}</td></tr>"
|
| 2301 |
|
| 2302 |
# Wiktionary Forms Fallback
|
| 2303 |
-
|
| 2304 |
-
|
| 2305 |
-
|
|
|
|
|
|
|
|
|
|
| 2306 |
|
| 2307 |
html += "</table></div>"
|
| 2308 |
|
| 2309 |
# --- Semantics Section ---
|
| 2310 |
-
sem = entry.get("semantics_combined", {})
|
| 2311 |
html += "<div class='ling-section'><div class='ling-subtitle'>Definitions & Senses</div>"
|
| 2312 |
|
| 2313 |
# Wiktionary Senses
|
| 2314 |
-
wikt_senses =
|
| 2315 |
-
for s in wikt_senses[:3]:
|
| 2316 |
-
|
| 2317 |
-
|
|
|
|
|
|
|
|
|
|
| 2318 |
|
| 2319 |
# OEWN Senses
|
| 2320 |
-
oewn_senses =
|
| 2321 |
for s in oewn_senses[:3]:
|
| 2322 |
-
defi = s.get("definition"
|
| 2323 |
-
|
|
|
|
| 2324 |
|
| 2325 |
html += "</div>"
|
| 2326 |
|
| 2327 |
# --- Relations Section (ConceptNet) ---
|
| 2328 |
-
rels =
|
| 2329 |
if rels:
|
| 2330 |
html += "<div class='ling-section'><div class='ling-subtitle'>Knowledge Graph (Top 5)</div>"
|
| 2331 |
html += "<div>"
|
| 2332 |
for r in rels[:5]:
|
| 2333 |
rel_name = r.get("relation", "Related")
|
| 2334 |
-
|
|
|
|
| 2335 |
html += f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
|
| 2336 |
html += "</div></div>"
|
| 2337 |
|
|
@@ -2826,6 +2875,7 @@ if __name__ == "__main__":
|
|
| 2826 |
print("✗ WARNING: Failed to download English Wiktionary DB. Primary engine is disabled.")
|
| 2827 |
else:
|
| 2828 |
_ = wiktionary_get_connection() # Pre-warm
|
|
|
|
| 2829 |
except Exception as e:
|
| 2830 |
print(f"✗ FAILED to initialize Wiktionary: {e}")
|
| 2831 |
print("--- Wiktionary Done ---\n")
|
|
|
|
| 1270 |
# traceback.print_exc() # Uncomment for deep debugging
|
| 1271 |
return False
|
| 1272 |
|
| 1273 |
+
def wiktionary_run_startup_diagnostics():
    """Run sanity checks on the Wiktionary SQLite DB and print the findings.

    Three checks are performed, all reported via ``print``:

    1. List the tables and verify that an ``entries`` table exists.
    2. Show the five most frequent values of ``entries.lang``.
    3. Probe a few common English words, first with an exact match and,
       when that finds nothing, with a ``LIKE`` match to reveal case
       mismatches.

    Rows are read by position rather than by column name so the checks work
    whether or not the connection uses ``sqlite3.Row`` as its row factory.

    Returns:
        None. The function never raises for query failures; any exception
        raised by the checks is printed together with its traceback.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("RUNNING WIKTIONARY DB DIAGNOSTICS")
    print(banner)

    conn = wiktionary_get_connection()
    if not conn:
        print("✗ Diagnostics aborted: No DB connection.")
        return

    try:
        # 1. Check table structure
        print("[1] Checking Tables...")
        tables = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()
        # Positional access: valid for plain tuples and for sqlite3.Row alike.
        table_names = [t[0] for t in tables]
        print(f" Found tables: {table_names}")

        if 'entries' not in table_names:
            print("CRITICAL ERROR: 'entries' table missing!")
            return

        # 2. Check how languages are encoded (a mismatch here yields empty lookups)
        print("\n[2] Checking Language Format (Top 5)...")
        langs = conn.execute(
            "SELECT lang, COUNT(*) as c FROM entries GROUP BY lang ORDER BY c DESC LIMIT 5"
        ).fetchall()
        for row in langs:
            print(f" - '{row[0]}': {row[1]} entries")

        # 3. Probe specific words that are expected to exist
        test_words = ["ready", "runner", "run", "house"]
        print(f"\n[3] Probing missing words: {test_words}")
        for word in test_words:
            # Exact (case-sensitive) match first
            raw = conn.execute(
                "SELECT count(*) as c FROM entries WHERE word = ?", (word,)
            ).fetchone()
            exact_count = raw[0]
            print(f" - '{word}' (Raw check): Found {exact_count} rows")

            if exact_count == 0:
                # LIKE ignores ASCII case by default, exposing case mismatches
                nocase = conn.execute(
                    "SELECT word FROM entries WHERE word LIKE ? LIMIT 1", (word,)
                ).fetchone()
                if nocase:
                    print(f" ! WARNING: '{word}' not found exactly, but found '{nocase[0]}' (Case mismatch?)")
                else:
                    print(f" ! CRITICAL: '{word}' does not exist in DB at all.")

    except Exception as e:
        # Diagnostics are best-effort: report the failure, never abort startup.
        print(f"✗ Diagnostics crashed: {e}")
        traceback.print_exc()
    finally:
        # Always close the banner, including on the early return above.
        print(banner + "\n")
|
| 1321 |
+
|
| 1322 |
def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
|
| 1323 |
""" Thread-safe function to get a single, read-only SQLite connection. """
|
| 1324 |
global WIKTIONARY_CONN, WIKTIONARY_AVAILABLE
|
|
|
|
| 1617 |
|
| 1618 |
spacy_pos_hint, spacy_lemma_hint = None, None
|
| 1619 |
try:
|
|
|
|
| 1620 |
nlp_en = SPACY_MODELS.get("en")
|
| 1621 |
if nlp_en:
|
| 1622 |
doc = nlp_en(word)
|
| 1623 |
token = doc[0]
|
| 1624 |
spacy_pos_hint = token.pos_.lower()
|
| 1625 |
spacy_lemma_hint = token.lemma_
|
| 1626 |
+
except Exception: pass
|
|
|
|
| 1627 |
|
| 1628 |
try:
|
| 1629 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
|
|
|
| 1645 |
|
| 1646 |
word_lower = word.lower()
|
| 1647 |
for wikt_report in wiktionary_reports:
|
| 1648 |
+
# --- FIX START: Safe Extraction ---
|
| 1649 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1650 |
+
lemma = wikt_report.get("lemma") or word
|
| 1651 |
+
pos_title = wikt_report.get("pos_title") or "" # FORCE STRING
|
| 1652 |
+
# --- FIX END ---
|
| 1653 |
|
| 1654 |
inflections_wikt_block = {
|
| 1655 |
"base_form": lemma,
|
|
|
|
| 1660 |
pattern_block = {}
|
| 1661 |
if PATTERN_EN_AVAILABLE:
|
| 1662 |
try:
|
|
|
|
| 1663 |
use_word = word if "form" in pos_title.lower() else lemma
|
|
|
|
| 1664 |
if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
|
| 1665 |
elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
|
| 1666 |
elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(use_word)
|
|
|
|
| 1681 |
}
|
| 1682 |
}
|
| 1683 |
|
|
|
|
| 1684 |
is_valid = False
|
|
|
|
| 1685 |
is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
|
| 1686 |
|
| 1687 |
if lemma.lower() == word_lower: is_valid = True
|
|
|
|
| 1697 |
if pos_key not in final_result["analysis"]:
|
| 1698 |
final_result["analysis"][pos_key] = []
|
| 1699 |
final_result["analysis"][pos_key].append(pos_entry_report)
|
|
|
|
|
|
|
| 1700 |
|
| 1701 |
+
final_result["info"] = f"Analysis from Wiktionary. Found {len(wiktionary_reports)} raw entries."
|
| 1702 |
return final_result
|
| 1703 |
|
| 1704 |
|
|
|
|
| 2291 |
|
| 2292 |
# --- Header ---
|
| 2293 |
pos_class = f"pos-{pos_key}" if pos_key in ["noun", "verb", "adj", "adv"] else "pos-noun"
|
| 2294 |
+
|
| 2295 |
+
# Safe extraction of lemma
|
| 2296 |
+
inf_wikt = entry.get("inflections_wiktionary") or {}
|
| 2297 |
+
inf_pat = entry.get("inflections_pattern") or {}
|
| 2298 |
+
sem_comb = entry.get("semantics_combined") or {}
|
| 2299 |
+
|
| 2300 |
+
lemma = inf_wikt.get("base_form") or \
|
| 2301 |
+
inf_pat.get("base_form") or \
|
| 2302 |
+
sem_comb.get("lemma") or \
|
| 2303 |
+
data.get("input_word") or "?"
|
| 2304 |
|
| 2305 |
html += f"""
|
| 2306 |
<div class="ling-card">
|
|
|
|
| 2314 |
html += "<div class='ling-section'><div class='ling-subtitle'>Morphology & Inflections</div>"
|
| 2315 |
html += "<table class='inflection-table'>"
|
| 2316 |
|
|
|
|
|
|
|
|
|
|
| 2317 |
# Noun Logic
|
| 2318 |
if pos_key == 'noun':
|
| 2319 |
+
if inf_pat:
|
| 2320 |
+
html += f"<tr><td class='inflection-label'>Singular</td><td>{inf_pat.get('singular', '-')}</td></tr>"
|
| 2321 |
+
html += f"<tr><td class='inflection-label'>Plural</td><td>{inf_pat.get('plural', '-')}</td></tr>"
|
| 2322 |
+
html += f"<tr><td class='inflection-label'>Context</td><td>{inf_pat.get('article', '-')}</td></tr>"
|
| 2323 |
|
| 2324 |
# Verb Logic
|
| 2325 |
elif pos_key == 'verb':
|
| 2326 |
+
cj = inf_pat.get('conjugation') or {}
|
| 2327 |
+
pres = cj.get('Present') or {}
|
| 2328 |
+
past = cj.get('Past') or {}
|
| 2329 |
+
parts = inf_pat.get('participles') or {}
|
| 2330 |
+
|
| 2331 |
+
html += f"<tr><td class='inflection-label'>Infinitive</td><td>{inf_pat.get('infinitive', lemma)}</td></tr>"
|
| 2332 |
+
html += f"<tr><td class='inflection-label'>3rd Person (He/She)</td><td>{pres.get('he/she (3sg)', '-')}</td></tr>"
|
| 2333 |
+
html += f"<tr><td class='inflection-label'>Past Simple</td><td>{past.get('General', '-')}</td></tr>"
|
| 2334 |
+
html += f"<tr><td class='inflection-label'>Participle (Ing)</td><td>{parts.get('Present Participle (gerund)', '-')}</td></tr>"
|
| 2335 |
+
html += f"<tr><td class='inflection-label'>Participle (Past)</td><td>{parts.get('Past Participle', '-')}</td></tr>"
|
| 2336 |
|
| 2337 |
# Adjective Logic
|
| 2338 |
elif pos_key == 'adjective':
|
| 2339 |
+
gr = inf_pat.get('grading') or {}
|
| 2340 |
+
html += f"<tr><td class='inflection-label'>Positive</td><td>{gr.get('Positive', '-')}</td></tr>"
|
| 2341 |
+
html += f"<tr><td class='inflection-label'>Comparative</td><td>{gr.get('Comparative', '-')}</td></tr>"
|
| 2342 |
+
html += f"<tr><td class='inflection-label'>Superlative</td><td>{gr.get('Superlative', '-')}</td></tr>"
|
|
|
|
| 2343 |
|
| 2344 |
# Wiktionary Forms Fallback
|
| 2345 |
+
forms_list = inf_wikt.get("forms_list") or []
|
| 2346 |
+
if forms_list:
|
| 2347 |
+
# Safe extraction of form_text
|
| 2348 |
+
forms_str_list = [f.get('form_text', '') for f in forms_list[:5] if f.get('form_text')]
|
| 2349 |
+
if forms_str_list:
|
| 2350 |
+
html += f"<tr><td class='inflection-label'>Other Forms (Wikt)</td><td>{', '.join(forms_str_list)}...</td></tr>"
|
| 2351 |
|
| 2352 |
html += "</table></div>"
|
| 2353 |
|
| 2354 |
# --- Semantics Section ---
|
|
|
|
| 2355 |
html += "<div class='ling-section'><div class='ling-subtitle'>Definitions & Senses</div>"
|
| 2356 |
|
| 2357 |
# Wiktionary Senses
|
| 2358 |
+
wikt_senses = sem_comb.get("wiktionary_senses") or []
|
| 2359 |
+
for s in wikt_senses[:3]:
|
| 2360 |
+
# FIX: Ensure definition is a string before replacing
|
| 2361 |
+
gloss_raw = s.get("definition") or ""
|
| 2362 |
+
gloss = gloss_raw.replace(";", "<br>")
|
| 2363 |
+
if gloss:
|
| 2364 |
+
html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"
|
| 2365 |
|
| 2366 |
# OEWN Senses
|
| 2367 |
+
oewn_senses = sem_comb.get("odenet_senses") or []
|
| 2368 |
for s in oewn_senses[:3]:
|
| 2369 |
+
defi = s.get("definition") or ""
|
| 2370 |
+
if defi:
|
| 2371 |
+
html += f"<div class='sense-item'><span class='source-badge src-oewn'>OEWN</span> {defi}</div>"
|
| 2372 |
|
| 2373 |
html += "</div>"
|
| 2374 |
|
| 2375 |
# --- Relations Section (ConceptNet) ---
|
| 2376 |
+
rels = sem_comb.get("conceptnet_relations") or []
|
| 2377 |
if rels:
|
| 2378 |
html += "<div class='ling-section'><div class='ling-subtitle'>Knowledge Graph (Top 5)</div>"
|
| 2379 |
html += "<div>"
|
| 2380 |
for r in rels[:5]:
|
| 2381 |
rel_name = r.get("relation", "Related")
|
| 2382 |
+
# Safe extraction of target
|
| 2383 |
+
target = r.get("other_node") or r.get("surface") or "?"
|
| 2384 |
html += f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
|
| 2385 |
html += "</div></div>"
|
| 2386 |
|
|
|
|
| 2875 |
print("✗ WARNING: Failed to download English Wiktionary DB. Primary engine is disabled.")
|
| 2876 |
else:
|
| 2877 |
_ = wiktionary_get_connection() # Pre-warm
|
| 2878 |
+
wiktionary_run_startup_diagnostics()
|
| 2879 |
except Exception as e:
|
| 2880 |
print(f"✗ FAILED to initialize Wiktionary: {e}")
|
| 2881 |
print("--- Wiktionary Done ---\n")
|