cstr committed on
Commit
9d0a528
·
verified ·
1 Parent(s): 3beabc6

wiktionary diagnostics, fix NoneType

Browse files
Files changed (1) hide show
  1. app.py +104 -54
app.py CHANGED
@@ -1270,6 +1270,55 @@ def wiktionary_download_db() -> bool:
1270
  # traceback.print_exc() # Uncomment for deep debugging
1271
  return False
1272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1273
  def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
1274
  """ Thread-safe function to get a single, read-only SQLite connection. """
1275
  global WIKTIONARY_CONN, WIKTIONARY_AVAILABLE
@@ -1568,15 +1617,13 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1568
 
1569
  spacy_pos_hint, spacy_lemma_hint = None, None
1570
  try:
1571
- # Quick heuristic to help sort results if multiple entries exist
1572
  nlp_en = SPACY_MODELS.get("en")
1573
  if nlp_en:
1574
  doc = nlp_en(word)
1575
  token = doc[0]
1576
  spacy_pos_hint = token.pos_.lower()
1577
  spacy_lemma_hint = token.lemma_
1578
- except Exception as e:
1579
- log(f"[DEBUG] Wiktionary (EN) Hint: spaCy failed: {e}")
1580
 
1581
  try:
1582
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
@@ -1598,11 +1645,11 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1598
 
1599
  word_lower = word.lower()
1600
  for wikt_report in wiktionary_reports:
1601
- # --- FIX: Safe Extraction of DB Fields ---
1602
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1603
- lemma = wikt_report.get("lemma") or word # Fallback if None
1604
- # FORCE STRING: Use 'or ""' to handle DB NULLs preventing "NoneType is not iterable"
1605
- pos_title = wikt_report.get("pos_title") or ""
1606
 
1607
  inflections_wikt_block = {
1608
  "base_form": lemma,
@@ -1613,9 +1660,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1613
  pattern_block = {}
1614
  if PATTERN_EN_AVAILABLE:
1615
  try:
1616
- # Safe check now that pos_title is definitely a string
1617
  use_word = word if "form" in pos_title.lower() else lemma
1618
-
1619
  if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
1620
  elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
1621
  elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(use_word)
@@ -1636,9 +1681,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1636
  }
1637
  }
1638
 
1639
- # Validation Logic
1640
  is_valid = False
1641
- # Safe check now that pos_title is definitely a string
1642
  is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
1643
 
1644
  if lemma.lower() == word_lower: is_valid = True
@@ -1654,10 +1697,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1654
  if pos_key not in final_result["analysis"]:
1655
  final_result["analysis"][pos_key] = []
1656
  final_result["analysis"][pos_key].append(pos_entry_report)
1657
- else:
1658
- log(f"[DEBUG] Wiktionary (EN): DROPPING entry '{lemma}' ({pos_key}) ...")
1659
 
1660
- final_result["info"] = f"Analysis from Wiktionary (Primary Engine). Found {len(wiktionary_reports)} matching entries."
1661
  return final_result
1662
 
1663
 
@@ -2250,11 +2291,16 @@ def _format_word_analysis_html(data: Dict[str, Any]) -> str:
2250
 
2251
  # --- Header ---
2252
  pos_class = f"pos-{pos_key}" if pos_key in ["noun", "verb", "adj", "adv"] else "pos-noun"
2253
- # Try to find a lemma from one of the sub-blocks
2254
- lemma = entry.get("inflections_wiktionary", {}).get("base_form") or \
2255
- entry.get("inflections_pattern", {}).get("base_form") or \
2256
- entry.get("semantics_combined", {}).get("lemma") or \
2257
- data.get("input_word")
 
 
 
 
 
2258
 
2259
  html += f"""
2260
  <div class="ling-card">
@@ -2268,70 +2314,73 @@ def _format_word_analysis_html(data: Dict[str, Any]) -> str:
2268
  html += "<div class='ling-section'><div class='ling-subtitle'>Morphology & Inflections</div>"
2269
  html += "<table class='inflection-table'>"
2270
 
2271
- pat = entry.get("inflections_pattern", {})
2272
- wikt = entry.get("inflections_wiktionary", {})
2273
-
2274
  # Noun Logic
2275
  if pos_key == 'noun':
2276
- if pat:
2277
- html += f"<tr><td class='inflection-label'>Singular</td><td>{pat.get('singular', '-')}</td></tr>"
2278
- html += f"<tr><td class='inflection-label'>Plural</td><td>{pat.get('plural', '-')}</td></tr>"
2279
- html += f"<tr><td class='inflection-label'>Context</td><td>{pat.get('article', '-')}</td></tr>"
2280
 
2281
  # Verb Logic
2282
  elif pos_key == 'verb':
2283
- cj = pat.get('conjugation', {})
2284
- if cj:
2285
- pres = cj.get('Present', {})
2286
- past = cj.get('Past', {})
2287
- html += f"<tr><td class='inflection-label'>Infinitive</td><td>{pat.get('infinitive', lemma)}</td></tr>"
2288
- html += f"<tr><td class='inflection-label'>3rd Person (He/She)</td><td>{pres.get('he/she (3sg)', '-')}</td></tr>"
2289
- html += f"<tr><td class='inflection-label'>Past Simple</td><td>{past.get('General', '-')}</td></tr>"
2290
- parts = pat.get('participles', {})
2291
- html += f"<tr><td class='inflection-label'>Participle (Ing)</td><td>{parts.get('Present Participle (gerund)', '-')}</td></tr>"
2292
- html += f"<tr><td class='inflection-label'>Participle (Past)</td><td>{parts.get('Past Participle', '-')}</td></tr>"
2293
 
2294
  # Adjective Logic
2295
  elif pos_key == 'adjective':
2296
- gr = pat.get('grading', {})
2297
- if gr:
2298
- html += f"<tr><td class='inflection-label'>Positive</td><td>{gr.get('Positive', '-')}</td></tr>"
2299
- html += f"<tr><td class='inflection-label'>Comparative</td><td>{gr.get('Comparative', '-')}</td></tr>"
2300
- html += f"<tr><td class='inflection-label'>Superlative</td><td>{gr.get('Superlative', '-')}</td></tr>"
2301
 
2302
  # Wiktionary Forms Fallback
2303
- if wikt and wikt.get("forms_list"):
2304
- forms_str = ", ".join([f['form_text'] for f in wikt['forms_list'][:5]])
2305
- html += f"<tr><td class='inflection-label'>Other Forms (Wikt)</td><td>{forms_str}...</td></tr>"
 
 
 
2306
 
2307
  html += "</table></div>"
2308
 
2309
  # --- Semantics Section ---
2310
- sem = entry.get("semantics_combined", {})
2311
  html += "<div class='ling-section'><div class='ling-subtitle'>Definitions & Senses</div>"
2312
 
2313
  # Wiktionary Senses
2314
- wikt_senses = sem.get("wiktionary_senses", [])
2315
- for s in wikt_senses[:3]: # Limit to top 3 for UI
2316
- gloss = s.get("definition", "").replace(";", "<br>")
2317
- html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"
 
 
 
2318
 
2319
  # OEWN Senses
2320
- oewn_senses = sem.get("odenet_senses", [])
2321
  for s in oewn_senses[:3]:
2322
- defi = s.get("definition", "")
2323
- html += f"<div class='sense-item'><span class='source-badge src-oewn'>OEWN</span> {defi}</div>"
 
2324
 
2325
  html += "</div>"
2326
 
2327
  # --- Relations Section (ConceptNet) ---
2328
- rels = sem.get("conceptnet_relations", [])
2329
  if rels:
2330
  html += "<div class='ling-section'><div class='ling-subtitle'>Knowledge Graph (Top 5)</div>"
2331
  html += "<div>"
2332
  for r in rels[:5]:
2333
  rel_name = r.get("relation", "Related")
2334
- target = r.get("other_node") or r.get("surface")
 
2335
  html += f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
2336
  html += "</div></div>"
2337
 
@@ -2826,6 +2875,7 @@ if __name__ == "__main__":
2826
  print("✗ WARNING: Failed to download English Wiktionary DB. Primary engine is disabled.")
2827
  else:
2828
  _ = wiktionary_get_connection() # Pre-warm
 
2829
  except Exception as e:
2830
  print(f"✗ FAILED to initialize Wiktionary: {e}")
2831
  print("--- Wiktionary Done ---\n")
 
1270
  # traceback.print_exc() # Uncomment for deep debugging
1271
  return False
1272
 
1273
def wiktionary_run_startup_diagnostics():
    """Print a startup health report for the Wiktionary SQLite database.

    The report is written to stdout and covers three checks:

    1. Which tables exist (aborts if the ``entries`` table is missing).
    2. The five most frequent values of ``entries.lang`` with their counts.
    3. Whether a handful of common probe words exist in ``entries.word``,
       first by exact match and then, if absent, by a ``LIKE`` match.

    Returns:
        None. All results are printed; nothing is raised for query failures
        (they are printed together with a traceback instead).

    Note:
        Rows are indexed by column name, so this assumes the connection
        returned by ``wiktionary_get_connection`` uses a name-indexable
        row factory (e.g. ``sqlite3.Row``) -- TODO confirm.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("RUNNING WIKTIONARY DB DIAGNOSTICS")
    print(banner)

    # Obtained outside the try block on purpose: an exception raised while
    # connecting propagates to the caller exactly as before.
    conn = wiktionary_get_connection()
    if not conn:
        print("✗ Diagnostics aborted: No DB connection.")
        # Close the report even when aborting so the log section is terminated.
        print(banner + "\n")
        return

    try:
        # 1. Check table structure.
        print("[1] Checking Tables...")
        tables = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()
        table_names = [t['name'] for t in tables]
        print(f" Found tables: {table_names}")

        if 'entries' not in table_names:
            print("CRITICAL ERROR: 'entries' table missing!")
            # The remaining checks all query 'entries'; the finally clause
            # still prints the closing banner.
            return

        # 2. Check how languages are encoded in the 'lang' column
        #    (a mismatch here is a likely cause of empty lookup results).
        print("\n[2] Checking Language Format (Top 5)...")
        langs = conn.execute(
            "SELECT lang, COUNT(*) as c FROM entries "
            "GROUP BY lang ORDER BY c DESC LIMIT 5"
        ).fetchall()
        for row in langs:
            print(f" - '{row['lang']}': {row['c']} entries")

        # 3. Probe specific words that were reported as missing.
        test_words = ["ready", "runner", "run", "house"]
        print(f"\n[3] Probing missing words: {test_words}")
        for word in test_words:
            # Exact (case-sensitive) match first.
            raw = conn.execute(
                "SELECT count(*) as c FROM entries WHERE word = ?", (word,)
            ).fetchone()
            exact_count = raw['c']
            print(f" - '{word}' (Raw check): Found {exact_count} rows")

            if exact_count == 0:
                # LIKE without wildcards acts as a case-insensitive equality
                # test (ASCII only, with SQLite's default settings).
                nocase = conn.execute(
                    "SELECT word FROM entries WHERE word LIKE ? LIMIT 1", (word,)
                ).fetchone()
                if nocase:
                    print(f" ! WARNING: '{word}' not found exactly, but found '{nocase['word']}' (Case mismatch?)")
                else:
                    print(f" ! CRITICAL: '{word}' does not exist in DB at all.")

    except Exception as e:
        # Diagnostics are best-effort: report the failure but never block startup.
        print(f"✗ Diagnostics crashed: {e}")
        traceback.print_exc()
    finally:
        # Always terminate the report, including on the early return above.
        print(banner + "\n")
1321
+
1322
  def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
1323
  """ Thread-safe function to get a single, read-only SQLite connection. """
1324
  global WIKTIONARY_CONN, WIKTIONARY_AVAILABLE
 
1617
 
1618
  spacy_pos_hint, spacy_lemma_hint = None, None
1619
  try:
 
1620
  nlp_en = SPACY_MODELS.get("en")
1621
  if nlp_en:
1622
  doc = nlp_en(word)
1623
  token = doc[0]
1624
  spacy_pos_hint = token.pos_.lower()
1625
  spacy_lemma_hint = token.lemma_
1626
+ except Exception: pass
 
1627
 
1628
  try:
1629
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
 
1645
 
1646
  word_lower = word.lower()
1647
  for wikt_report in wiktionary_reports:
1648
+ # --- FIX START: Safe Extraction ---
1649
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1650
+ lemma = wikt_report.get("lemma") or word
1651
+ pos_title = wikt_report.get("pos_title") or "" # FORCE STRING
1652
+ # --- FIX END ---
1653
 
1654
  inflections_wikt_block = {
1655
  "base_form": lemma,
 
1660
  pattern_block = {}
1661
  if PATTERN_EN_AVAILABLE:
1662
  try:
 
1663
  use_word = word if "form" in pos_title.lower() else lemma
 
1664
  if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
1665
  elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
1666
  elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(use_word)
 
1681
  }
1682
  }
1683
 
 
1684
  is_valid = False
 
1685
  is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
1686
 
1687
  if lemma.lower() == word_lower: is_valid = True
 
1697
  if pos_key not in final_result["analysis"]:
1698
  final_result["analysis"][pos_key] = []
1699
  final_result["analysis"][pos_key].append(pos_entry_report)
 
 
1700
 
1701
+ final_result["info"] = f"Analysis from Wiktionary. Found {len(wiktionary_reports)} raw entries."
1702
  return final_result
1703
 
1704
 
 
2291
 
2292
  # --- Header ---
2293
  pos_class = f"pos-{pos_key}" if pos_key in ["noun", "verb", "adj", "adv"] else "pos-noun"
2294
+
2295
+ # Safe extraction of lemma
2296
+ inf_wikt = entry.get("inflections_wiktionary") or {}
2297
+ inf_pat = entry.get("inflections_pattern") or {}
2298
+ sem_comb = entry.get("semantics_combined") or {}
2299
+
2300
+ lemma = inf_wikt.get("base_form") or \
2301
+ inf_pat.get("base_form") or \
2302
+ sem_comb.get("lemma") or \
2303
+ data.get("input_word") or "?"
2304
 
2305
  html += f"""
2306
  <div class="ling-card">
 
2314
  html += "<div class='ling-section'><div class='ling-subtitle'>Morphology & Inflections</div>"
2315
  html += "<table class='inflection-table'>"
2316
 
 
 
 
2317
  # Noun Logic
2318
  if pos_key == 'noun':
2319
+ if inf_pat:
2320
+ html += f"<tr><td class='inflection-label'>Singular</td><td>{inf_pat.get('singular', '-')}</td></tr>"
2321
+ html += f"<tr><td class='inflection-label'>Plural</td><td>{inf_pat.get('plural', '-')}</td></tr>"
2322
+ html += f"<tr><td class='inflection-label'>Context</td><td>{inf_pat.get('article', '-')}</td></tr>"
2323
 
2324
  # Verb Logic
2325
  elif pos_key == 'verb':
2326
+ cj = inf_pat.get('conjugation') or {}
2327
+ pres = cj.get('Present') or {}
2328
+ past = cj.get('Past') or {}
2329
+ parts = inf_pat.get('participles') or {}
2330
+
2331
+ html += f"<tr><td class='inflection-label'>Infinitive</td><td>{inf_pat.get('infinitive', lemma)}</td></tr>"
2332
+ html += f"<tr><td class='inflection-label'>3rd Person (He/She)</td><td>{pres.get('he/she (3sg)', '-')}</td></tr>"
2333
+ html += f"<tr><td class='inflection-label'>Past Simple</td><td>{past.get('General', '-')}</td></tr>"
2334
+ html += f"<tr><td class='inflection-label'>Participle (Ing)</td><td>{parts.get('Present Participle (gerund)', '-')}</td></tr>"
2335
+ html += f"<tr><td class='inflection-label'>Participle (Past)</td><td>{parts.get('Past Participle', '-')}</td></tr>"
2336
 
2337
  # Adjective Logic
2338
  elif pos_key == 'adjective':
2339
+ gr = inf_pat.get('grading') or {}
2340
+ html += f"<tr><td class='inflection-label'>Positive</td><td>{gr.get('Positive', '-')}</td></tr>"
2341
+ html += f"<tr><td class='inflection-label'>Comparative</td><td>{gr.get('Comparative', '-')}</td></tr>"
2342
+ html += f"<tr><td class='inflection-label'>Superlative</td><td>{gr.get('Superlative', '-')}</td></tr>"
 
2343
 
2344
  # Wiktionary Forms Fallback
2345
+ forms_list = inf_wikt.get("forms_list") or []
2346
+ if forms_list:
2347
+ # Safe extraction of form_text
2348
+ forms_str_list = [f.get('form_text', '') for f in forms_list[:5] if f.get('form_text')]
2349
+ if forms_str_list:
2350
+ html += f"<tr><td class='inflection-label'>Other Forms (Wikt)</td><td>{', '.join(forms_str_list)}...</td></tr>"
2351
 
2352
  html += "</table></div>"
2353
 
2354
  # --- Semantics Section ---
 
2355
  html += "<div class='ling-section'><div class='ling-subtitle'>Definitions & Senses</div>"
2356
 
2357
  # Wiktionary Senses
2358
+ wikt_senses = sem_comb.get("wiktionary_senses") or []
2359
+ for s in wikt_senses[:3]:
2360
+ # FIX: Ensure definition is a string before replacing
2361
+ gloss_raw = s.get("definition") or ""
2362
+ gloss = gloss_raw.replace(";", "<br>")
2363
+ if gloss:
2364
+ html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"
2365
 
2366
  # OEWN Senses
2367
+ oewn_senses = sem_comb.get("odenet_senses") or []
2368
  for s in oewn_senses[:3]:
2369
+ defi = s.get("definition") or ""
2370
+ if defi:
2371
+ html += f"<div class='sense-item'><span class='source-badge src-oewn'>OEWN</span> {defi}</div>"
2372
 
2373
  html += "</div>"
2374
 
2375
  # --- Relations Section (ConceptNet) ---
2376
+ rels = sem_comb.get("conceptnet_relations") or []
2377
  if rels:
2378
  html += "<div class='ling-section'><div class='ling-subtitle'>Knowledge Graph (Top 5)</div>"
2379
  html += "<div>"
2380
  for r in rels[:5]:
2381
  rel_name = r.get("relation", "Related")
2382
+ # Safe extraction of target
2383
+ target = r.get("other_node") or r.get("surface") or "?"
2384
  html += f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
2385
  html += "</div></div>"
2386
 
 
2875
  print("✗ WARNING: Failed to download English Wiktionary DB. Primary engine is disabled.")
2876
  else:
2877
  _ = wiktionary_get_connection() # Pre-warm
2878
+ wiktionary_run_startup_diagnostics()
2879
  except Exception as e:
2880
  print(f"✗ FAILED to initialize Wiktionary: {e}")
2881
  print("--- Wiktionary Done ---\n")