cstr committed on
Commit
afcdc52
·
verified ·
1 Parent(s): 23d7efa

more wiktionary data

Browse files
Files changed (1) hide show
  1. app.py +111 -32
app.py CHANGED
@@ -1804,13 +1804,15 @@ def _wiktionary_map_pos_key(wikt_pos: Optional[str]) -> str:
1804
 
1805
  def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
1806
  """
1807
- Fetches all associated data for a single Wiktionary entry_id.
 
 
1808
  """
1809
  report = {}
1810
 
1811
  # 1. Get Base Entry Info
1812
  entry_data = conn.execute(
1813
- "SELECT word, pos, pos_title, lang FROM entries WHERE id = ?", (entry_id,)
1814
  ).fetchone()
1815
  if not entry_data:
1816
  return {"error": "Entry ID not found"}
@@ -1818,25 +1820,43 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1818
  report["entry_id"] = entry_id
1819
  report["lemma"] = entry_data["word"] # Alias for clarity
1820
 
1821
- # 2. Get Senses (Definitions)
1822
  senses_q = conn.execute(
1823
  """
1824
- SELECT s.id as sense_id, g.gloss_text
 
 
 
 
 
 
 
1825
  FROM senses s
1826
- JOIN glosses g ON s.id = g.sense_id
1827
  WHERE s.entry_id = ?
1828
- ORDER BY s.id, g.id
1829
  """, (entry_id,)
1830
  ).fetchall()
1831
- report["senses"] = [dict(s) for s in senses_q]
1832
-
1833
- # 3. Get Inflected Forms
 
 
 
 
 
 
 
 
 
 
1834
  forms_q = conn.execute(
1835
  """
1836
- SELECT f.form_text, GROUP_CONCAT(t.tag, ', ') as tags
 
 
 
 
1837
  FROM forms f
1838
- LEFT JOIN form_tags ft ON f.id = ft.form_id
1839
- LEFT JOIN tags t ON ft.tag_id = t.id
1840
  WHERE f.entry_id = ?
1841
  GROUP BY f.id
1842
  ORDER BY f.id
@@ -1844,35 +1864,86 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1844
  ).fetchall()
1845
  report["forms"] = [dict(f) for f in forms_q]
1846
 
1847
- # 4. Get Pronunciations
1848
  sounds_q = conn.execute(
1849
- "SELECT ipa, audio FROM sounds WHERE entry_id = ?", (entry_id,)
 
 
 
 
 
 
 
1850
  ).fetchall()
1851
  report["sounds"] = [dict(s) for s in sounds_q]
1852
 
1853
- # 5. Get Synonyms
1854
  syn_q = conn.execute(
1855
- "SELECT synonym_word FROM synonyms WHERE entry_id = ?", (entry_id,)
 
 
 
 
 
 
 
 
1856
  ).fetchall()
1857
- report["synonyms"] = [s["synonym_word"] for s in syn_q]
1858
 
1859
- # 6. Get Antonyms
1860
  ant_q = conn.execute(
1861
- "SELECT antonym_word FROM antonyms WHERE entry_id = ?", (entry_id,)
 
 
 
 
 
 
 
1862
  ).fetchall()
1863
- report["antonyms"] = [a["antonym_word"] for a in ant_q]
1864
-
1865
- # 7. Get Examples (Limit 5 for brevity)
1866
- ex_q = conn.execute(
1867
  """
1868
- SELECT ex.text
1869
- FROM examples ex
1870
- JOIN senses s ON ex.sense_id = s.id
1871
- WHERE s.entry_id = ?
1872
- LIMIT 5
 
1873
  """, (entry_id,)
1874
  ).fetchall()
1875
- report["examples"] = [ex["text"] for ex in ex_q]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1876
 
1877
  return report
1878
 
@@ -1997,7 +2068,11 @@ def _wiktionary_format_semantics_block(
1997
  "odenet_senses": odenet_senses,
1998
  "conceptnet_relations": conceptnet_relations,
1999
  "wiktionary_synonyms": wikt_report.get("synonyms", []),
2000
- "wiktionary_antonyms": wikt_report.get("antonyms", [])
 
 
 
 
2001
  }
2002
 
2003
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
@@ -2117,8 +2192,12 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2117
  "semantics_combined": semantics_block,
2118
  "wiktionary_metadata": {
2119
  "pos_title": pos_title,
 
2120
  "pronunciation": wikt_report.get("sounds"),
2121
- "examples": wikt_report.get("examples")
 
 
 
2122
  }
2123
  }
2124
 
@@ -3566,7 +3645,7 @@ if __name__ == "__main__":
3566
  hanta_get_tagger() # Call the function to load the model
3567
  except Exception as e:
3568
  print(f"✗ FAILED to start HanTa tagger: {e}")
3569
- print("  'Word Encyclopedia' tab will fail.")
3570
  else:
3571
  print("INFO: HanTa library not available, skipping tagger.")
3572
  print("--- HanTa Done ---\n")
 
1804
 
1805
  def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
1806
  """
1807
+ (REVISED)
1808
+ Fetches ALL associated data for a single Wiktionary entry_id,
1809
+ making full use of the normalized norm4.py schema.
1810
  """
1811
  report = {}
1812
 
1813
  # 1. Get Base Entry Info
1814
  entry_data = conn.execute(
1815
+ "SELECT word, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
1816
  ).fetchone()
1817
  if not entry_data:
1818
  return {"error": "Entry ID not found"}
 
1820
  report["entry_id"] = entry_id
1821
  report["lemma"] = entry_data["word"] # Alias for clarity
1822
 
1823
+ # 2. Get Senses (with Glosses, Tags, Topics, Categories, and Examples)
1824
  senses_q = conn.execute(
1825
  """
1826
+ SELECT
1827
+ s.id as sense_id,
1828
+ s.sense_index,
1829
+ (SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
1830
+ (SELECT GROUP_CONCAT(rg.raw_gloss, '; ') FROM raw_glosses rg WHERE rg.sense_id = s.id) as raw_glosses,
1831
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
1832
+ (SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics,
1833
+ (SELECT GROUP_CONCAT(c.category, ', ') FROM sense_categories sc JOIN categories c ON sc.category_id = c.id WHERE sc.sense_id = s.id) as categories
1834
  FROM senses s
 
1835
  WHERE s.entry_id = ?
1836
+ ORDER BY s.id
1837
  """, (entry_id,)
1838
  ).fetchall()
1839
+
1840
+ senses_list = []
1841
+ for sense_row in senses_q:
1842
+ sense_dict = dict(sense_row)
1843
+ # Get examples for this specific sense
1844
+ examples_q = conn.execute(
1845
+ "SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_dict["sense_id"],)
1846
+ ).fetchall()
1847
+ sense_dict["examples"] = [dict(ex) for ex in examples_q]
1848
+ senses_list.append(sense_dict)
1849
+ report["senses"] = senses_list
1850
+
1851
+ # 3. Get Inflected Forms (with Tags and Topics)
1852
  forms_q = conn.execute(
1853
  """
1854
+ SELECT
1855
+ f.form_text,
1856
+ f.sense_index,
1857
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM form_tags ft JOIN tags t ON ft.tag_id = t.id WHERE ft.form_id = f.id) as tags,
1858
+ (SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
1859
  FROM forms f
 
 
1860
  WHERE f.entry_id = ?
1861
  GROUP BY f.id
1862
  ORDER BY f.id
 
1864
  ).fetchall()
1865
  report["forms"] = [dict(f) for f in forms_q]
1866
 
1867
+ # 4. Get Pronunciations (with Tags)
1868
  sounds_q = conn.execute(
1869
+ """
1870
+ SELECT
1871
+ s.ipa, s.audio, s.mp3_url, s.ogg_url, s.rhymes,
1872
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM sound_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sound_id = s.id) as tags
1873
+ FROM sounds s
1874
+ WHERE s.entry_id = ?
1875
+ GROUP BY s.id
1876
+ """, (entry_id,)
1877
  ).fetchall()
1878
  report["sounds"] = [dict(s) for s in sounds_q]
1879
 
1880
+ # 5. Get Synonyms (with Tags and Topics)
1881
  syn_q = conn.execute(
1882
+ """
1883
+ SELECT
1884
+ s.synonym_word, s.sense_index,
1885
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM synonym_tags st JOIN tags t ON st.tag_id = t.id WHERE st.synonym_id = s.id) as tags,
1886
+ (SELECT GROUP_CONCAT(top.topic, ', ') FROM synonym_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.synonym_id = s.id) as topics
1887
+ FROM synonyms s
1888
+ WHERE s.entry_id = ?
1889
+ GROUP BY s.id
1890
+ """, (entry_id,)
1891
  ).fetchall()
1892
+ report["synonyms"] = [dict(s) for s in syn_q]
1893
 
1894
+ # 6. Get Antonyms (with Tags)
1895
  ant_q = conn.execute(
1896
+ """
1897
+ SELECT
1898
+ a.antonym_word, a.sense_index,
1899
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM antonym_tags at JOIN tags t ON at.tag_id = t.id WHERE at.antonym_id = a.id) as tags
1900
+ FROM antonyms a
1901
+ WHERE a.entry_id = ?
1902
+ GROUP BY a.id
1903
+ """, (entry_id,)
1904
  ).fetchall()
1905
+ report["antonyms"] = [dict(a) for a in ant_q]
1906
+
1907
+ # 7. Get Translations (with Tags)
1908
+ trans_q = conn.execute(
1909
  """
1910
+ SELECT
1911
+ tr.lang, tr.lang_code, tr.word, tr.sense_text, tr.roman,
1912
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM translation_tags tt JOIN tags t ON tt.tag_id = t.id WHERE tt.translation_id = tr.id) as tags
1913
+ FROM translations tr
1914
+ WHERE tr.entry_id = ?
1915
+ GROUP BY tr.id
1916
  """, (entry_id,)
1917
  ).fetchall()
1918
+ report["translations"] = [dict(tr) for tr in trans_q]
1919
+
1920
+ # 8. Get Hyphenations
1921
+ hyphen_q = conn.execute(
1922
+ "SELECT hyphenation FROM hyphenations WHERE entry_id = ?", (entry_id,)
1923
+ ).fetchall()
1924
+ report["hyphenations"] = [h["hyphenation"] for h in hyphen_q]
1925
+
1926
+ # 9. Get Derived and Related Terms
1927
+ derived_q = conn.execute(
1928
+ "SELECT derived_word FROM derived_terms WHERE entry_id = ?", (entry_id,)
1929
+ ).fetchall()
1930
+ report["derived_terms"] = [d["derived_word"] for d in derived_q]
1931
+
1932
+ related_q = conn.execute(
1933
+ "SELECT related_word FROM related_terms WHERE entry_id = ?", (entry_id,)
1934
+ ).fetchall()
1935
+ report["related_terms"] = [r["related_word"] for r in related_q]
1936
+
1937
+ # 10. Get Entry-level Tags and Categories
1938
+ entry_tags_q = conn.execute(
1939
+ "SELECT t.tag FROM entry_tags et JOIN tags t ON et.tag_id = t.id WHERE et.entry_id = ?", (entry_id,)
1940
+ ).fetchall()
1941
+ report["entry_tags"] = [t["tag"] for t in entry_tags_q]
1942
+
1943
+ entry_cats_q = conn.execute(
1944
+ "SELECT c.category FROM entry_categories ec JOIN categories c ON ec.category_id = c.id WHERE ec.entry_id = ?", (entry_id,)
1945
+ ).fetchall()
1946
+ report["entry_categories"] = [c["category"] for c in entry_cats_q]
1947
 
1948
  return report
1949
 
 
2068
  "odenet_senses": odenet_senses,
2069
  "conceptnet_relations": conceptnet_relations,
2070
  "wiktionary_synonyms": wikt_report.get("synonyms", []),
2071
+ "wiktionary_antonyms": wikt_report.get("antonyms", []),
2072
+ "wiktionary_translations": wikt_report.get("translations", []),
2073
+ "wiktionary_derived_terms": wikt_report.get("derived_terms", []),
2074
+ "wiktionary_related_terms": wikt_report.get("related_terms", [])
2075
+
2076
  }
2077
 
2078
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
 
2192
  "semantics_combined": semantics_block,
2193
  "wiktionary_metadata": {
2194
  "pos_title": pos_title,
2195
+ "etymology": wikt_report.get("etymology_text"),
2196
  "pronunciation": wikt_report.get("sounds"),
2197
+ "hyphenation": wikt_report.get("hyphenations"), # <-- THE FIX
2198
+ "examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
2199
+ "entry_tags": wikt_report.get("entry_tags"),
2200
+ "entry_categories": wikt_report.get("entry_categories")
2201
  }
2202
  }
2203
 
 
3645
  hanta_get_tagger() # Call the function to load the model
3646
  except Exception as e:
3647
  print(f"✗ FAILED to start HanTa tagger: {e}")
3648
+ print(" 'Word Encyclopedia' tab will fail.")
3649
  else:
3650
  print("INFO: HanTa library not available, skipping tagger.")
3651
  print("--- HanTa Done ---\n")