Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -215,7 +215,7 @@ def log(msg):
|
|
| 215 |
print(f"[DEBUG] {msg}")
|
| 216 |
|
| 217 |
# --- Wiktionary Cache & Lock ---
|
| 218 |
-
WIKTIONARY_DB_PATH = "
|
| 219 |
WIKTIONARY_REPO_ID = "cstr/de-wiktionary-sqlite-full"
|
| 220 |
WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
|
| 221 |
WIKTIONARY_CONN_LOCK = threading.Lock()
|
|
@@ -1804,33 +1804,31 @@ def _wiktionary_map_pos_key(wikt_pos: Optional[str]) -> str:
|
|
| 1804 |
|
| 1805 |
def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
|
| 1806 |
"""
|
| 1807 |
-
(REVISED)
|
| 1808 |
Fetches ALL associated data for a single Wiktionary entry_id,
|
| 1809 |
-
making full use of the normalized schema.
|
| 1810 |
"""
|
| 1811 |
report = {}
|
| 1812 |
|
| 1813 |
-
# 1. Get Base Entry Info
|
| 1814 |
entry_data = conn.execute(
|
| 1815 |
-
"SELECT word, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
|
| 1816 |
).fetchone()
|
| 1817 |
if not entry_data:
|
| 1818 |
return {"error": "Entry ID not found"}
|
| 1819 |
report.update(dict(entry_data))
|
| 1820 |
report["entry_id"] = entry_id
|
| 1821 |
-
report["lemma"] = entry_data["word"]
|
| 1822 |
|
| 1823 |
-
# 2. Get Senses (with Glosses, Tags, Topics, Categories, and Examples)
|
| 1824 |
senses_q = conn.execute(
|
| 1825 |
"""
|
| 1826 |
SELECT
|
| 1827 |
s.id as sense_id,
|
| 1828 |
s.sense_index,
|
| 1829 |
(SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
|
| 1830 |
-
(SELECT GROUP_CONCAT(rg.raw_gloss, '; ') FROM raw_glosses rg WHERE rg.sense_id = s.id) as raw_glosses,
|
| 1831 |
(SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
|
| 1832 |
-
(SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics,
|
| 1833 |
-
(SELECT GROUP_CONCAT(c.category, ', ') FROM sense_categories sc JOIN categories c ON sc.category_id = c.id WHERE sc.sense_id = s.id) as categories
|
| 1834 |
FROM senses s
|
| 1835 |
WHERE s.entry_id = ?
|
| 1836 |
ORDER BY s.id
|
|
@@ -1840,13 +1838,30 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1840 |
senses_list = []
|
| 1841 |
for sense_row in senses_q:
|
| 1842 |
sense_dict = dict(sense_row)
|
| 1843 |
-
|
|
|
|
|
|
|
| 1844 |
examples_q = conn.execute(
|
| 1845 |
-
"SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_dict["sense_id"],)
|
| 1846 |
).fetchall()
|
| 1847 |
sense_dict["examples"] = [dict(ex) for ex in examples_q]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1848 |
senses_list.append(sense_dict)
|
| 1849 |
report["senses"] = senses_list
|
|
|
|
|
|
|
| 1850 |
|
| 1851 |
# 3. Get Inflected Forms (with Tags and Topics)
|
| 1852 |
forms_q = conn.execute(
|
|
@@ -1858,8 +1873,7 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1858 |
(SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
|
| 1859 |
FROM forms f
|
| 1860 |
WHERE f.entry_id = ?
|
| 1861 |
-
GROUP BY f.id
|
| 1862 |
-
ORDER BY f.id
|
| 1863 |
""", (entry_id,)
|
| 1864 |
).fetchall()
|
| 1865 |
report["forms"] = [dict(f) for f in forms_q]
|
|
@@ -1925,14 +1939,15 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1925 |
|
| 1926 |
# 9. Get Derived and Related Terms
|
| 1927 |
derived_q = conn.execute(
|
| 1928 |
-
"SELECT derived_word FROM derived_terms WHERE entry_id = ?", (entry_id,)
|
| 1929 |
).fetchall()
|
| 1930 |
-
report["derived_terms"] = [d["derived_word"] for d in derived_q]
|
| 1931 |
|
|
|
|
| 1932 |
related_q = conn.execute(
|
| 1933 |
-
"SELECT related_word FROM related_terms WHERE entry_id = ?", (entry_id,)
|
| 1934 |
).fetchall()
|
| 1935 |
-
report["related_terms"] = [r["related_word"] for r in related_q]
|
| 1936 |
|
| 1937 |
# 10. Get Entry-level Tags and Categories
|
| 1938 |
entry_tags_q = conn.execute(
|
|
@@ -1945,6 +1960,50 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1945 |
).fetchall()
|
| 1946 |
report["entry_categories"] = [c["category"] for c in entry_cats_q]
|
| 1947 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1948 |
return report
|
| 1949 |
|
| 1950 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
|
@@ -2191,13 +2250,27 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2191 |
"inflections_pattern": pattern_block,
|
| 2192 |
"semantics_combined": semantics_block,
|
| 2193 |
"wiktionary_metadata": {
|
|
|
|
| 2194 |
"pos_title": pos_title,
|
| 2195 |
"etymology": wikt_report.get("etymology_text"),
|
| 2196 |
"pronunciation": wikt_report.get("sounds"),
|
| 2197 |
-
"hyphenation": wikt_report.get("hyphenations"),
|
| 2198 |
"examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
|
| 2199 |
"entry_tags": wikt_report.get("entry_tags"),
|
| 2200 |
-
"entry_categories": wikt_report.get("entry_categories")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2201 |
}
|
| 2202 |
}
|
| 2203 |
|
|
|
|
| 215 |
print(f"[DEBUG] {msg}")
|
| 216 |
|
| 217 |
# --- Wiktionary Cache & Lock ---
|
| 218 |
+
WIKTIONARY_DB_PATH = "de_wiktionary_normalized_full.db"
|
| 219 |
WIKTIONARY_REPO_ID = "cstr/de-wiktionary-sqlite-full"
|
| 220 |
WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
|
| 221 |
WIKTIONARY_CONN_LOCK = threading.Lock()
|
|
|
|
| 1804 |
|
| 1805 |
def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
|
| 1806 |
"""
|
| 1807 |
+
(REVISED FOR FULL DB)
|
| 1808 |
Fetches ALL associated data for a single Wiktionary entry_id,
|
| 1809 |
+
making full use of the new '_full' normalized schema.
|
| 1810 |
"""
|
| 1811 |
report = {}
|
| 1812 |
|
| 1813 |
+
# 1. Get Base Entry Info (with title and redirect)
|
| 1814 |
entry_data = conn.execute(
|
| 1815 |
+
"SELECT word, title, redirect, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
|
| 1816 |
).fetchone()
|
| 1817 |
if not entry_data:
|
| 1818 |
return {"error": "Entry ID not found"}
|
| 1819 |
report.update(dict(entry_data))
|
| 1820 |
report["entry_id"] = entry_id
|
| 1821 |
+
report["lemma"] = entry_data["word"]
|
| 1822 |
|
| 1823 |
+
# 2. Get Senses (with Glosses, Tags, Topics, Categories, Examples, Expressions, and Proverbs)
|
| 1824 |
senses_q = conn.execute(
|
| 1825 |
"""
|
| 1826 |
SELECT
|
| 1827 |
s.id as sense_id,
|
| 1828 |
s.sense_index,
|
| 1829 |
(SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
|
|
|
|
| 1830 |
(SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
|
| 1831 |
+
(SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics
|
|
|
|
| 1832 |
FROM senses s
|
| 1833 |
WHERE s.entry_id = ?
|
| 1834 |
ORDER BY s.id
|
|
|
|
| 1838 |
senses_list = []
|
| 1839 |
for sense_row in senses_q:
|
| 1840 |
sense_dict = dict(sense_row)
|
| 1841 |
+
sense_id = sense_dict["sense_id"]
|
| 1842 |
+
|
| 1843 |
+
# Get examples
|
| 1844 |
examples_q = conn.execute(
|
| 1845 |
+
"SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_id,)
|
| 1846 |
).fetchall()
|
| 1847 |
sense_dict["examples"] = [dict(ex) for ex in examples_q]
|
| 1848 |
+
|
| 1849 |
+
# --- NEW: Get expressions ---
|
| 1850 |
+
expr_q = conn.execute(
|
| 1851 |
+
"SELECT expression, sense_index FROM expressions WHERE sense_id = ?", (sense_id,)
|
| 1852 |
+
).fetchall()
|
| 1853 |
+
sense_dict["expressions"] = [dict(ex) for ex in expr_q]
|
| 1854 |
+
|
| 1855 |
+
# --- NEW: Get proverbs ---
|
| 1856 |
+
prov_q = conn.execute(
|
| 1857 |
+
"SELECT proverb, sense_index FROM proverbs WHERE sense_id = ?", (sense_id,)
|
| 1858 |
+
).fetchall()
|
| 1859 |
+
sense_dict["proverbs"] = [dict(p) for p in prov_q]
|
| 1860 |
+
|
| 1861 |
senses_list.append(sense_dict)
|
| 1862 |
report["senses"] = senses_list
|
| 1863 |
+
report["expressions_by_sense"] = [s["expressions"] for s in senses_list if s["expressions"]]
|
| 1864 |
+
report["proverbs_by_sense"] = [s["proverbs"] for s in senses_list if s["proverbs"]]
|
| 1865 |
|
| 1866 |
# 3. Get Inflected Forms (with Tags and Topics)
|
| 1867 |
forms_q = conn.execute(
|
|
|
|
| 1873 |
(SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
|
| 1874 |
FROM forms f
|
| 1875 |
WHERE f.entry_id = ?
|
| 1876 |
+
GROUP BY f.id ORDER BY f.id
|
|
|
|
| 1877 |
""", (entry_id,)
|
| 1878 |
).fetchall()
|
| 1879 |
report["forms"] = [dict(f) for f in forms_q]
|
|
|
|
| 1939 |
|
| 1940 |
# 9. Get Derived and Related Terms
|
| 1941 |
derived_q = conn.execute(
|
| 1942 |
+
"SELECT derived_word, sense_index FROM derived_terms WHERE entry_id = ?", (entry_id,)
|
| 1943 |
).fetchall()
|
| 1944 |
+
report["derived_terms"] = [dict(d) for d in derived_q]
|
| 1945 |
|
| 1946 |
+
# --- FIXED: Query related_terms with new columns ---
|
| 1947 |
related_q = conn.execute(
|
| 1948 |
+
"SELECT related_word, sense_index, raw_tags_json FROM related_terms WHERE entry_id = ?", (entry_id,)
|
| 1949 |
).fetchall()
|
| 1950 |
+
report["related_terms"] = [dict(r) for r in related_q]
|
| 1951 |
|
| 1952 |
# 10. Get Entry-level Tags and Categories
|
| 1953 |
entry_tags_q = conn.execute(
|
|
|
|
| 1960 |
).fetchall()
|
| 1961 |
report["entry_categories"] = [c["category"] for c in entry_cats_q]
|
| 1962 |
|
| 1963 |
+
# --- 11. GET ALL NEW OMITTED FIELDS ---
|
| 1964 |
+
|
| 1965 |
+
# entry_notes
|
| 1966 |
+
notes_q = conn.execute("SELECT note FROM entry_notes WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1967 |
+
report["entry_notes"] = [n["note"] for n in notes_q]
|
| 1968 |
+
|
| 1969 |
+
# other_pos
|
| 1970 |
+
other_pos_q = conn.execute("SELECT pos_value FROM other_pos WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1971 |
+
report["other_pos"] = [p["pos_value"] for p in other_pos_q]
|
| 1972 |
+
|
| 1973 |
+
# entry_raw_tags
|
| 1974 |
+
raw_tags_q = conn.execute("SELECT raw_tag FROM entry_raw_tags WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1975 |
+
report["raw_tags"] = [t["raw_tag"] for t in raw_tags_q]
|
| 1976 |
+
|
| 1977 |
+
# descendants
|
| 1978 |
+
desc_q = conn.execute("SELECT lang, word, roman FROM descendants WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1979 |
+
report["descendants"] = [dict(d) for d in desc_q]
|
| 1980 |
+
|
| 1981 |
+
# Semantic relations
|
| 1982 |
+
hyper_q = conn.execute("SELECT hypernym_word, sense_index FROM hypernyms WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1983 |
+
report["hypernyms"] = [dict(h) for h in hyper_q]
|
| 1984 |
+
|
| 1985 |
+
hypo_q = conn.execute("SELECT hyponym_word, sense_index FROM hyponyms WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1986 |
+
report["hyponyms"] = [dict(h) for h in hypo_q]
|
| 1987 |
+
|
| 1988 |
+
holo_q = conn.execute("SELECT holonym_word, sense_index FROM holonyms WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1989 |
+
report["holonyms"] = [dict(h) for h in holo_q]
|
| 1990 |
+
|
| 1991 |
+
mero_q = conn.execute("SELECT meronym_word, sense_index FROM meronyms WHERE entry_id = ?", (entry_id,)).fetchall()
|
| 1992 |
+
report["meronyms"] = [dict(m) for m in mero_q]
|
| 1993 |
+
|
| 1994 |
+
# Coordinate terms (with their tags)
|
| 1995 |
+
coord_q = conn.execute(
|
| 1996 |
+
"""
|
| 1997 |
+
SELECT
|
| 1998 |
+
ct.id, ct.coordinate_word, ct.sense_index,
|
| 1999 |
+
(SELECT GROUP_CONCAT(t.tag, ', ') FROM coordinate_term_tags ctt JOIN tags t ON ctt.tag_id = t.id WHERE ctt.coordinate_term_id = ct.id) as tags
|
| 2000 |
+
FROM coordinate_terms ct
|
| 2001 |
+
WHERE ct.entry_id = ?
|
| 2002 |
+
GROUP BY ct.id
|
| 2003 |
+
""", (entry_id,)
|
| 2004 |
+
).fetchall()
|
| 2005 |
+
report["coordinate_terms"] = [dict(c) for c in coord_q]
|
| 2006 |
+
|
| 2007 |
return report
|
| 2008 |
|
| 2009 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
|
|
|
| 2250 |
"inflections_pattern": pattern_block,
|
| 2251 |
"semantics_combined": semantics_block,
|
| 2252 |
"wiktionary_metadata": {
|
| 2253 |
+
# --- Original Fields ---
|
| 2254 |
"pos_title": pos_title,
|
| 2255 |
"etymology": wikt_report.get("etymology_text"),
|
| 2256 |
"pronunciation": wikt_report.get("sounds"),
|
| 2257 |
+
"hyphenation": wikt_report.get("hyphenations"),
|
| 2258 |
"examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
|
| 2259 |
"entry_tags": wikt_report.get("entry_tags"),
|
| 2260 |
+
"entry_categories": wikt_report.get("entry_categories"),
|
| 2261 |
+
|
| 2262 |
+
# --- NEW: Pass through all new fields from the full DB ---
|
| 2263 |
+
"entry_notes": wikt_report.get("entry_notes"),
|
| 2264 |
+
"other_pos": wikt_report.get("other_pos"),
|
| 2265 |
+
"raw_tags": wikt_report.get("raw_tags"),
|
| 2266 |
+
"descendants": wikt_report.get("descendants"),
|
| 2267 |
+
"hypernyms": wikt_report.get("hypernyms"),
|
| 2268 |
+
"hyponyms": wikt_report.get("hyponyms"),
|
| 2269 |
+
"holonyms": wikt_report.get("holonyms"),
|
| 2270 |
+
"meronyms": wikt_report.get("meronyms"),
|
| 2271 |
+
"coordinate_terms": wikt_report.get("coordinate_terms"),
|
| 2272 |
+
"expressions_by_sense": wikt_report.get("expressions_by_sense"),
|
| 2273 |
+
"proverbs_by_sense": wikt_report.get("proverbs_by_sense")
|
| 2274 |
}
|
| 2275 |
}
|
| 2276 |
|