Spaces:
Running
Running
more wiktionary data
Browse files
app.py
CHANGED
|
@@ -1804,13 +1804,15 @@ def _wiktionary_map_pos_key(wikt_pos: Optional[str]) -> str:
|
|
| 1804 |
|
| 1805 |
def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
|
| 1806 |
"""
|
| 1807 |
-
|
|
|
|
|
|
|
| 1808 |
"""
|
| 1809 |
report = {}
|
| 1810 |
|
| 1811 |
# 1. Get Base Entry Info
|
| 1812 |
entry_data = conn.execute(
|
| 1813 |
-
"SELECT word, pos, pos_title, lang FROM entries WHERE id = ?", (entry_id,)
|
| 1814 |
).fetchone()
|
| 1815 |
if not entry_data:
|
| 1816 |
return {"error": "Entry ID not found"}
|
|
@@ -1818,25 +1820,43 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1818 |
report["entry_id"] = entry_id
|
| 1819 |
report["lemma"] = entry_data["word"] # Alias for clarity
|
| 1820 |
|
| 1821 |
-
# 2. Get Senses (
|
| 1822 |
senses_q = conn.execute(
|
| 1823 |
"""
|
| 1824 |
-
SELECT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1825 |
FROM senses s
|
| 1826 |
-
JOIN glosses g ON s.id = g.sense_id
|
| 1827 |
WHERE s.entry_id = ?
|
| 1828 |
-
ORDER BY s.id
|
| 1829 |
""", (entry_id,)
|
| 1830 |
).fetchall()
|
| 1831 |
-
|
| 1832 |
-
|
| 1833 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1834 |
forms_q = conn.execute(
|
| 1835 |
"""
|
| 1836 |
-
SELECT
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1837 |
FROM forms f
|
| 1838 |
-
LEFT JOIN form_tags ft ON f.id = ft.form_id
|
| 1839 |
-
LEFT JOIN tags t ON ft.tag_id = t.id
|
| 1840 |
WHERE f.entry_id = ?
|
| 1841 |
GROUP BY f.id
|
| 1842 |
ORDER BY f.id
|
|
@@ -1844,35 +1864,86 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1844 |
).fetchall()
|
| 1845 |
report["forms"] = [dict(f) for f in forms_q]
|
| 1846 |
|
| 1847 |
-
# 4. Get Pronunciations
|
| 1848 |
sounds_q = conn.execute(
|
| 1849 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1850 |
).fetchall()
|
| 1851 |
report["sounds"] = [dict(s) for s in sounds_q]
|
| 1852 |
|
| 1853 |
-
# 5. Get Synonyms
|
| 1854 |
syn_q = conn.execute(
|
| 1855 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1856 |
).fetchall()
|
| 1857 |
-
report["synonyms"] = [s
|
| 1858 |
|
| 1859 |
-
# 6. Get Antonyms
|
| 1860 |
ant_q = conn.execute(
|
| 1861 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1862 |
).fetchall()
|
| 1863 |
-
report["antonyms"] = [a
|
| 1864 |
-
|
| 1865 |
-
# 7. Get
|
| 1866 |
-
|
| 1867 |
"""
|
| 1868 |
-
SELECT
|
| 1869 |
-
|
| 1870 |
-
|
| 1871 |
-
|
| 1872 |
-
|
|
|
|
| 1873 |
""", (entry_id,)
|
| 1874 |
).fetchall()
|
| 1875 |
-
report["
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1876 |
|
| 1877 |
return report
|
| 1878 |
|
|
@@ -1997,7 +2068,11 @@ def _wiktionary_format_semantics_block(
|
|
| 1997 |
"odenet_senses": odenet_senses,
|
| 1998 |
"conceptnet_relations": conceptnet_relations,
|
| 1999 |
"wiktionary_synonyms": wikt_report.get("synonyms", []),
|
| 2000 |
-
"wiktionary_antonyms": wikt_report.get("antonyms", [])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2001 |
}
|
| 2002 |
|
| 2003 |
def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
@@ -2117,8 +2192,12 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2117 |
"semantics_combined": semantics_block,
|
| 2118 |
"wiktionary_metadata": {
|
| 2119 |
"pos_title": pos_title,
|
|
|
|
| 2120 |
"pronunciation": wikt_report.get("sounds"),
|
| 2121 |
-
"
|
|
|
|
|
|
|
|
|
|
| 2122 |
}
|
| 2123 |
}
|
| 2124 |
|
|
@@ -3566,7 +3645,7 @@ if __name__ == "__main__":
|
|
| 3566 |
hanta_get_tagger() # Call the function to load the model
|
| 3567 |
except Exception as e:
|
| 3568 |
print(f"✗ FAILED to start HanTa tagger: {e}")
|
| 3569 |
-
print("
|
| 3570 |
else:
|
| 3571 |
print("INFO: HanTa library not available, skipping tagger.")
|
| 3572 |
print("--- HanTa Done ---\n")
|
|
|
|
| 1804 |
|
| 1805 |
def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
|
| 1806 |
"""
|
| 1807 |
+
(REVISED)
|
| 1808 |
+
Fetches ALL associated data for a single Wiktionary entry_id,
|
| 1809 |
+
making full use of the normalized norm4.py schema.
|
| 1810 |
"""
|
| 1811 |
report = {}
|
| 1812 |
|
| 1813 |
# 1. Get Base Entry Info
|
| 1814 |
entry_data = conn.execute(
|
| 1815 |
+
"SELECT word, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
|
| 1816 |
).fetchone()
|
| 1817 |
if not entry_data:
|
| 1818 |
return {"error": "Entry ID not found"}
|
|
|
|
| 1820 |
report["entry_id"] = entry_id
|
| 1821 |
report["lemma"] = entry_data["word"] # Alias for clarity
|
| 1822 |
|
| 1823 |
+
# 2. Get Senses (with Glosses, Tags, Topics, Categories, and Examples)
|
| 1824 |
senses_q = conn.execute(
|
| 1825 |
"""
|
| 1826 |
+
SELECT
|
| 1827 |
+
s.id as sense_id,
|
| 1828 |
+
s.sense_index,
|
| 1829 |
+
(SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
|
| 1830 |
+
(SELECT GROUP_CONCAT(rg.raw_gloss, '; ') FROM raw_glosses rg WHERE rg.sense_id = s.id) as raw_glosses,
|
| 1831 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
|
| 1832 |
+
(SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics,
|
| 1833 |
+
(SELECT GROUP_CONCAT(c.category, ', ') FROM sense_categories sc JOIN categories c ON sc.category_id = c.id WHERE sc.sense_id = s.id) as categories
|
| 1834 |
FROM senses s
|
|
|
|
| 1835 |
WHERE s.entry_id = ?
|
| 1836 |
+
ORDER BY s.id
|
| 1837 |
""", (entry_id,)
|
| 1838 |
).fetchall()
|
| 1839 |
+
|
| 1840 |
+
senses_list = []
|
| 1841 |
+
for sense_row in senses_q:
|
| 1842 |
+
sense_dict = dict(sense_row)
|
| 1843 |
+
# Get examples for this specific sense
|
| 1844 |
+
examples_q = conn.execute(
|
| 1845 |
+
"SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_dict["sense_id"],)
|
| 1846 |
+
).fetchall()
|
| 1847 |
+
sense_dict["examples"] = [dict(ex) for ex in examples_q]
|
| 1848 |
+
senses_list.append(sense_dict)
|
| 1849 |
+
report["senses"] = senses_list
|
| 1850 |
+
|
| 1851 |
+
# 3. Get Inflected Forms (with Tags and Topics)
|
| 1852 |
forms_q = conn.execute(
|
| 1853 |
"""
|
| 1854 |
+
SELECT
|
| 1855 |
+
f.form_text,
|
| 1856 |
+
f.sense_index,
|
| 1857 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM form_tags ft JOIN tags t ON ft.tag_id = t.id WHERE ft.form_id = f.id) as tags,
|
| 1858 |
+
(SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
|
| 1859 |
FROM forms f
|
|
|
|
|
|
|
| 1860 |
WHERE f.entry_id = ?
|
| 1861 |
GROUP BY f.id
|
| 1862 |
ORDER BY f.id
|
|
|
|
| 1864 |
).fetchall()
|
| 1865 |
report["forms"] = [dict(f) for f in forms_q]
|
| 1866 |
|
| 1867 |
+
# 4. Get Pronunciations (with Tags)
|
| 1868 |
sounds_q = conn.execute(
|
| 1869 |
+
"""
|
| 1870 |
+
SELECT
|
| 1871 |
+
s.ipa, s.audio, s.mp3_url, s.ogg_url, s.rhymes,
|
| 1872 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM sound_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sound_id = s.id) as tags
|
| 1873 |
+
FROM sounds s
|
| 1874 |
+
WHERE s.entry_id = ?
|
| 1875 |
+
GROUP BY s.id
|
| 1876 |
+
""", (entry_id,)
|
| 1877 |
).fetchall()
|
| 1878 |
report["sounds"] = [dict(s) for s in sounds_q]
|
| 1879 |
|
| 1880 |
+
# 5. Get Synonyms (with Tags and Topics)
|
| 1881 |
syn_q = conn.execute(
|
| 1882 |
+
"""
|
| 1883 |
+
SELECT
|
| 1884 |
+
s.synonym_word, s.sense_index,
|
| 1885 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM synonym_tags st JOIN tags t ON st.tag_id = t.id WHERE st.synonym_id = s.id) as tags,
|
| 1886 |
+
(SELECT GROUP_CONCAT(top.topic, ', ') FROM synonym_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.synonym_id = s.id) as topics
|
| 1887 |
+
FROM synonyms s
|
| 1888 |
+
WHERE s.entry_id = ?
|
| 1889 |
+
GROUP BY s.id
|
| 1890 |
+
""", (entry_id,)
|
| 1891 |
).fetchall()
|
| 1892 |
+
report["synonyms"] = [dict(s) for s in syn_q]
|
| 1893 |
|
| 1894 |
+
# 6. Get Antonyms (with Tags)
|
| 1895 |
ant_q = conn.execute(
|
| 1896 |
+
"""
|
| 1897 |
+
SELECT
|
| 1898 |
+
a.antonym_word, a.sense_index,
|
| 1899 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM antonym_tags at JOIN tags t ON at.tag_id = t.id WHERE at.antonym_id = a.id) as tags
|
| 1900 |
+
FROM antonyms a
|
| 1901 |
+
WHERE a.entry_id = ?
|
| 1902 |
+
GROUP BY a.id
|
| 1903 |
+
""", (entry_id,)
|
| 1904 |
).fetchall()
|
| 1905 |
+
report["antonyms"] = [dict(a) for a in ant_q]
|
| 1906 |
+
|
| 1907 |
+
# 7. Get Translations (with Tags)
|
| 1908 |
+
trans_q = conn.execute(
|
| 1909 |
"""
|
| 1910 |
+
SELECT
|
| 1911 |
+
tr.lang, tr.lang_code, tr.word, tr.sense_text, tr.roman,
|
| 1912 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM translation_tags tt JOIN tags t ON tt.tag_id = t.id WHERE tt.translation_id = tr.id) as tags
|
| 1913 |
+
FROM translations tr
|
| 1914 |
+
WHERE tr.entry_id = ?
|
| 1915 |
+
GROUP BY tr.id
|
| 1916 |
""", (entry_id,)
|
| 1917 |
).fetchall()
|
| 1918 |
+
report["translations"] = [dict(tr) for tr in trans_q]
|
| 1919 |
+
|
| 1920 |
+
# 8. Get Hyphenations
|
| 1921 |
+
hyphen_q = conn.execute(
|
| 1922 |
+
"SELECT hyphenation FROM hyphenations WHERE entry_id = ?", (entry_id,)
|
| 1923 |
+
).fetchall()
|
| 1924 |
+
report["hyphenations"] = [h["hyphenation"] for h in hyphen_q]
|
| 1925 |
+
|
| 1926 |
+
# 9. Get Derived and Related Terms
|
| 1927 |
+
derived_q = conn.execute(
|
| 1928 |
+
"SELECT derived_word FROM derived_terms WHERE entry_id = ?", (entry_id,)
|
| 1929 |
+
).fetchall()
|
| 1930 |
+
report["derived_terms"] = [d["derived_word"] for d in derived_q]
|
| 1931 |
+
|
| 1932 |
+
related_q = conn.execute(
|
| 1933 |
+
"SELECT related_word FROM related_terms WHERE entry_id = ?", (entry_id,)
|
| 1934 |
+
).fetchall()
|
| 1935 |
+
report["related_terms"] = [r["related_word"] for r in related_q]
|
| 1936 |
+
|
| 1937 |
+
# 10. Get Entry-level Tags and Categories
|
| 1938 |
+
entry_tags_q = conn.execute(
|
| 1939 |
+
"SELECT t.tag FROM entry_tags et JOIN tags t ON et.tag_id = t.id WHERE et.entry_id = ?", (entry_id,)
|
| 1940 |
+
).fetchall()
|
| 1941 |
+
report["entry_tags"] = [t["tag"] for t in entry_tags_q]
|
| 1942 |
+
|
| 1943 |
+
entry_cats_q = conn.execute(
|
| 1944 |
+
"SELECT c.category FROM entry_categories ec JOIN categories c ON ec.category_id = c.id WHERE ec.entry_id = ?", (entry_id,)
|
| 1945 |
+
).fetchall()
|
| 1946 |
+
report["entry_categories"] = [c["category"] for c in entry_cats_q]
|
| 1947 |
|
| 1948 |
return report
|
| 1949 |
|
|
|
|
| 2068 |
"odenet_senses": odenet_senses,
|
| 2069 |
"conceptnet_relations": conceptnet_relations,
|
| 2070 |
"wiktionary_synonyms": wikt_report.get("synonyms", []),
|
| 2071 |
+
"wiktionary_antonyms": wikt_report.get("antonyms", []),
|
| 2072 |
+
"wiktionary_translations": wikt_report.get("translations", []),
|
| 2073 |
+
"wiktionary_derived_terms": wikt_report.get("derived_terms", []),
|
| 2074 |
+
"wiktionary_related_terms": wikt_report.get("related_terms", [])
|
| 2075 |
+
|
| 2076 |
}
|
| 2077 |
|
| 2078 |
def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
|
|
| 2192 |
"semantics_combined": semantics_block,
|
| 2193 |
"wiktionary_metadata": {
|
| 2194 |
"pos_title": pos_title,
|
| 2195 |
+
"etymology": wikt_report.get("etymology_text"),
|
| 2196 |
"pronunciation": wikt_report.get("sounds"),
|
| 2197 |
+
"hyphenation": wikt_report.get("hyphenations"), # <-- THE FIX
|
| 2198 |
+
"examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
|
| 2199 |
+
"entry_tags": wikt_report.get("entry_tags"),
|
| 2200 |
+
"entry_categories": wikt_report.get("entry_categories")
|
| 2201 |
}
|
| 2202 |
}
|
| 2203 |
|
|
|
|
| 3645 |
hanta_get_tagger() # Call the function to load the model
|
| 3646 |
except Exception as e:
|
| 3647 |
print(f"✗ FAILED to start HanTa tagger: {e}")
|
| 3648 |
+
print(" 'Word Encyclopedia' tab will fail.")
|
| 3649 |
else:
|
| 3650 |
print("INFO: HanTa library not available, skipping tagger.")
|
| 3651 |
print("--- HanTa Done ---\n")
|