cstr committed on
Commit
40abda5
·
verified ·
1 Parent(s): b65a6e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -20
app.py CHANGED
@@ -215,7 +215,7 @@ def log(msg):
215
  print(f"[DEBUG] {msg}")
216
 
217
  # --- Wiktionary Cache & Lock ---
218
- WIKTIONARY_DB_PATH = "de_wiktionary_normalized.db"
219
  WIKTIONARY_REPO_ID = "cstr/de-wiktionary-sqlite-full"
220
  WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
221
  WIKTIONARY_CONN_LOCK = threading.Lock()
@@ -1804,33 +1804,31 @@ def _wiktionary_map_pos_key(wikt_pos: Optional[str]) -> str:
1804
 
1805
  def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
1806
  """
1807
- (REVISED)
1808
  Fetches ALL associated data for a single Wiktionary entry_id,
1809
- making full use of the normalized norm4.py schema.
1810
  """
1811
  report = {}
1812
 
1813
- # 1. Get Base Entry Info
1814
  entry_data = conn.execute(
1815
- "SELECT word, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
1816
  ).fetchone()
1817
  if not entry_data:
1818
  return {"error": "Entry ID not found"}
1819
  report.update(dict(entry_data))
1820
  report["entry_id"] = entry_id
1821
- report["lemma"] = entry_data["word"] # Alias for clarity
1822
 
1823
- # 2. Get Senses (with Glosses, Tags, Topics, Categories, and Examples)
1824
  senses_q = conn.execute(
1825
  """
1826
  SELECT
1827
  s.id as sense_id,
1828
  s.sense_index,
1829
  (SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
1830
- (SELECT GROUP_CONCAT(rg.raw_gloss, '; ') FROM raw_glosses rg WHERE rg.sense_id = s.id) as raw_glosses,
1831
  (SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
1832
- (SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics,
1833
- (SELECT GROUP_CONCAT(c.category, ', ') FROM sense_categories sc JOIN categories c ON sc.category_id = c.id WHERE sc.sense_id = s.id) as categories
1834
  FROM senses s
1835
  WHERE s.entry_id = ?
1836
  ORDER BY s.id
@@ -1840,13 +1838,30 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1840
  senses_list = []
1841
  for sense_row in senses_q:
1842
  sense_dict = dict(sense_row)
1843
- # Get examples for this specific sense
 
 
1844
  examples_q = conn.execute(
1845
- "SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_dict["sense_id"],)
1846
  ).fetchall()
1847
  sense_dict["examples"] = [dict(ex) for ex in examples_q]
 
 
 
 
 
 
 
 
 
 
 
 
 
1848
  senses_list.append(sense_dict)
1849
  report["senses"] = senses_list
 
 
1850
 
1851
  # 3. Get Inflected Forms (with Tags and Topics)
1852
  forms_q = conn.execute(
@@ -1858,8 +1873,7 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1858
  (SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
1859
  FROM forms f
1860
  WHERE f.entry_id = ?
1861
- GROUP BY f.id
1862
- ORDER BY f.id
1863
  """, (entry_id,)
1864
  ).fetchall()
1865
  report["forms"] = [dict(f) for f in forms_q]
@@ -1925,14 +1939,15 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1925
 
1926
  # 9. Get Derived and Related Terms
1927
  derived_q = conn.execute(
1928
- "SELECT derived_word FROM derived_terms WHERE entry_id = ?", (entry_id,)
1929
  ).fetchall()
1930
- report["derived_terms"] = [d["derived_word"] for d in derived_q]
1931
 
 
1932
  related_q = conn.execute(
1933
- "SELECT related_word FROM related_terms WHERE entry_id = ?", (entry_id,)
1934
  ).fetchall()
1935
- report["related_terms"] = [r["related_word"] for r in related_q]
1936
 
1937
  # 10. Get Entry-level Tags and Categories
1938
  entry_tags_q = conn.execute(
@@ -1945,6 +1960,50 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1945
  ).fetchall()
1946
  report["entry_categories"] = [c["category"] for c in entry_cats_q]
1947
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1948
  return report
1949
 
1950
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
@@ -2191,13 +2250,27 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2191
  "inflections_pattern": pattern_block,
2192
  "semantics_combined": semantics_block,
2193
  "wiktionary_metadata": {
 
2194
  "pos_title": pos_title,
2195
  "etymology": wikt_report.get("etymology_text"),
2196
  "pronunciation": wikt_report.get("sounds"),
2197
- "hyphenation": wikt_report.get("hyphenations"), # <-- THE FIX
2198
  "examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
2199
  "entry_tags": wikt_report.get("entry_tags"),
2200
- "entry_categories": wikt_report.get("entry_categories")
 
 
 
 
 
 
 
 
 
 
 
 
 
2201
  }
2202
  }
2203
 
 
215
  print(f"[DEBUG] {msg}")
216
 
217
  # --- Wiktionary Cache & Lock ---
218
+ WIKTIONARY_DB_PATH = "de_wiktionary_normalized_full.db"
219
  WIKTIONARY_REPO_ID = "cstr/de-wiktionary-sqlite-full"
220
  WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
221
  WIKTIONARY_CONN_LOCK = threading.Lock()
 
1804
 
1805
  def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
1806
  """
1807
+ (REVISED FOR FULL DB)
1808
  Fetches ALL associated data for a single Wiktionary entry_id,
1809
+ making full use of the new '_full' normalized schema.
1810
  """
1811
  report = {}
1812
 
1813
+ # 1. Get Base Entry Info (with title and redirect)
1814
  entry_data = conn.execute(
1815
+ "SELECT word, title, redirect, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
1816
  ).fetchone()
1817
  if not entry_data:
1818
  return {"error": "Entry ID not found"}
1819
  report.update(dict(entry_data))
1820
  report["entry_id"] = entry_id
1821
+ report["lemma"] = entry_data["word"]
1822
 
1823
+ # 2. Get Senses (with Glosses, Tags, Topics, Categories, Examples, Expressions, and Proverbs)
1824
  senses_q = conn.execute(
1825
  """
1826
  SELECT
1827
  s.id as sense_id,
1828
  s.sense_index,
1829
  (SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
 
1830
  (SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
1831
+ (SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics
 
1832
  FROM senses s
1833
  WHERE s.entry_id = ?
1834
  ORDER BY s.id
 
1838
  senses_list = []
1839
  for sense_row in senses_q:
1840
  sense_dict = dict(sense_row)
1841
+ sense_id = sense_dict["sense_id"]
1842
+
1843
+ # Get examples
1844
  examples_q = conn.execute(
1845
+ "SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_id,)
1846
  ).fetchall()
1847
  sense_dict["examples"] = [dict(ex) for ex in examples_q]
1848
+
1849
+ # --- NEW: Get expressions ---
1850
+ expr_q = conn.execute(
1851
+ "SELECT expression, sense_index FROM expressions WHERE sense_id = ?", (sense_id,)
1852
+ ).fetchall()
1853
+ sense_dict["expressions"] = [dict(ex) for ex in expr_q]
1854
+
1855
+ # --- NEW: Get proverbs ---
1856
+ prov_q = conn.execute(
1857
+ "SELECT proverb, sense_index FROM proverbs WHERE sense_id = ?", (sense_id,)
1858
+ ).fetchall()
1859
+ sense_dict["proverbs"] = [dict(p) for p in prov_q]
1860
+
1861
  senses_list.append(sense_dict)
1862
  report["senses"] = senses_list
1863
+ report["expressions_by_sense"] = [s["expressions"] for s in senses_list if s["expressions"]]
1864
+ report["proverbs_by_sense"] = [s["proverbs"] for s in senses_list if s["proverbs"]]
1865
 
1866
  # 3. Get Inflected Forms (with Tags and Topics)
1867
  forms_q = conn.execute(
 
1873
  (SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
1874
  FROM forms f
1875
  WHERE f.entry_id = ?
1876
+ GROUP BY f.id ORDER BY f.id
 
1877
  """, (entry_id,)
1878
  ).fetchall()
1879
  report["forms"] = [dict(f) for f in forms_q]
 
1939
 
1940
  # 9. Get Derived and Related Terms
1941
  derived_q = conn.execute(
1942
+ "SELECT derived_word, sense_index FROM derived_terms WHERE entry_id = ?", (entry_id,)
1943
  ).fetchall()
1944
+ report["derived_terms"] = [dict(d) for d in derived_q]
1945
 
1946
+ # --- FIXED: Query related_terms with new columns ---
1947
  related_q = conn.execute(
1948
+ "SELECT related_word, sense_index, raw_tags_json FROM related_terms WHERE entry_id = ?", (entry_id,)
1949
  ).fetchall()
1950
+ report["related_terms"] = [dict(r) for r in related_q]
1951
 
1952
  # 10. Get Entry-level Tags and Categories
1953
  entry_tags_q = conn.execute(
 
1960
  ).fetchall()
1961
  report["entry_categories"] = [c["category"] for c in entry_cats_q]
1962
 
1963
+ # --- 11. GET ALL NEW OMITTED FIELDS ---
1964
+
1965
+ # entry_notes
1966
+ notes_q = conn.execute("SELECT note FROM entry_notes WHERE entry_id = ?", (entry_id,)).fetchall()
1967
+ report["entry_notes"] = [n["note"] for n in notes_q]
1968
+
1969
+ # other_pos
1970
+ other_pos_q = conn.execute("SELECT pos_value FROM other_pos WHERE entry_id = ?", (entry_id,)).fetchall()
1971
+ report["other_pos"] = [p["pos_value"] for p in other_pos_q]
1972
+
1973
+ # entry_raw_tags
1974
+ raw_tags_q = conn.execute("SELECT raw_tag FROM entry_raw_tags WHERE entry_id = ?", (entry_id,)).fetchall()
1975
+ report["raw_tags"] = [t["raw_tag"] for t in raw_tags_q]
1976
+
1977
+ # descendants
1978
+ desc_q = conn.execute("SELECT lang, word, roman FROM descendants WHERE entry_id = ?", (entry_id,)).fetchall()
1979
+ report["descendants"] = [dict(d) for d in desc_q]
1980
+
1981
+ # Semantic relations
1982
+ hyper_q = conn.execute("SELECT hypernym_word, sense_index FROM hypernyms WHERE entry_id = ?", (entry_id,)).fetchall()
1983
+ report["hypernyms"] = [dict(h) for h in hyper_q]
1984
+
1985
+ hypo_q = conn.execute("SELECT hyponym_word, sense_index FROM hyponyms WHERE entry_id = ?", (entry_id,)).fetchall()
1986
+ report["hyponyms"] = [dict(h) for h in hypo_q]
1987
+
1988
+ holo_q = conn.execute("SELECT holonym_word, sense_index FROM holonyms WHERE entry_id = ?", (entry_id,)).fetchall()
1989
+ report["holonyms"] = [dict(h) for h in holo_q]
1990
+
1991
+ mero_q = conn.execute("SELECT meronym_word, sense_index FROM meronyms WHERE entry_id = ?", (entry_id,)).fetchall()
1992
+ report["meronyms"] = [dict(m) for m in mero_q]
1993
+
1994
+ # Coordinate terms (with their tags)
1995
+ coord_q = conn.execute(
1996
+ """
1997
+ SELECT
1998
+ ct.id, ct.coordinate_word, ct.sense_index,
1999
+ (SELECT GROUP_CONCAT(t.tag, ', ') FROM coordinate_term_tags ctt JOIN tags t ON ctt.tag_id = t.id WHERE ctt.coordinate_term_id = ct.id) as tags
2000
+ FROM coordinate_terms ct
2001
+ WHERE ct.entry_id = ?
2002
+ GROUP BY ct.id
2003
+ """, (entry_id,)
2004
+ ).fetchall()
2005
+ report["coordinate_terms"] = [dict(c) for c in coord_q]
2006
+
2007
  return report
2008
 
2009
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
 
2250
  "inflections_pattern": pattern_block,
2251
  "semantics_combined": semantics_block,
2252
  "wiktionary_metadata": {
2253
+ # --- Original Fields ---
2254
  "pos_title": pos_title,
2255
  "etymology": wikt_report.get("etymology_text"),
2256
  "pronunciation": wikt_report.get("sounds"),
2257
+ "hyphenation": wikt_report.get("hyphenations"),
2258
  "examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
2259
  "entry_tags": wikt_report.get("entry_tags"),
2260
+ "entry_categories": wikt_report.get("entry_categories"),
2261
+
2262
+ # --- NEW: Pass through all new fields from the full DB ---
2263
+ "entry_notes": wikt_report.get("entry_notes"),
2264
+ "other_pos": wikt_report.get("other_pos"),
2265
+ "raw_tags": wikt_report.get("raw_tags"),
2266
+ "descendants": wikt_report.get("descendants"),
2267
+ "hypernyms": wikt_report.get("hypernyms"),
2268
+ "hyponyms": wikt_report.get("hyponyms"),
2269
+ "holonyms": wikt_report.get("holonyms"),
2270
+ "meronyms": wikt_report.get("meronyms"),
2271
+ "coordinate_terms": wikt_report.get("coordinate_terms"),
2272
+ "expressions_by_sense": wikt_report.get("expressions_by_sense"),
2273
+ "proverbs_by_sense": wikt_report.get("proverbs_by_sense")
2274
  }
2275
  }
2276