WiktionaryEN

Running

App Files Files Community

cstr committed 29 days ago

Commit

afcdc52

verified ·

1 Parent(s): 23d7efa

more wiktionary data

Browse files

Files changed (1) hide show

app.py +111 -32

app.py CHANGED Viewed

@@ -1804,13 +1804,15 @@ def _wiktionary_map_pos_key(wikt_pos: Optional[str]) -> str:
 def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
     """
-    Fetches all associated data for a single Wiktionary entry_id.
     """
     report = {}
     # 1. Get Base Entry Info
     entry_data = conn.execute(
-        "SELECT word, pos, pos_title, lang FROM entries WHERE id = ?", (entry_id,)
     ).fetchone()
     if not entry_data:
         return {"error": "Entry ID not found"}
@@ -1818,25 +1820,43 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
     report["entry_id"] = entry_id
     report["lemma"] = entry_data["word"] # Alias for clarity
-    # 2. Get Senses (Definitions)
     senses_q = conn.execute(
         """
-        SELECT s.id as sense_id, g.gloss_text
         FROM senses s
-        JOIN glosses g ON s.id = g.sense_id
         WHERE s.entry_id = ?
-        ORDER BY s.id, g.id
         """, (entry_id,)
     ).fetchall()
-    report["senses"] = [dict(s) for s in senses_q]
-    # 3. Get Inflected Forms
     forms_q = conn.execute(
         """
-        SELECT f.form_text, GROUP_CONCAT(t.tag, ', ') as tags
         FROM forms f
-        LEFT JOIN form_tags ft ON f.id = ft.form_id
-        LEFT JOIN tags t ON ft.tag_id = t.id
         WHERE f.entry_id = ?
         GROUP BY f.id
         ORDER BY f.id
@@ -1844,35 +1864,86 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
     ).fetchall()
     report["forms"] = [dict(f) for f in forms_q]
-    # 4. Get Pronunciations
     sounds_q = conn.execute(
-        "SELECT ipa, audio FROM sounds WHERE entry_id = ?", (entry_id,)
     ).fetchall()
     report["sounds"] = [dict(s) for s in sounds_q]
-    # 5. Get Synonyms
     syn_q = conn.execute(
-        "SELECT synonym_word FROM synonyms WHERE entry_id = ?", (entry_id,)
     ).fetchall()
-    report["synonyms"] = [s["synonym_word"] for s in syn_q]
-    # 6. Get Antonyms
     ant_q = conn.execute(
-        "SELECT antonym_word FROM antonyms WHERE entry_id = ?", (entry_id,)
     ).fetchall()
-    report["antonyms"] = [a["antonym_word"] for a in ant_q]
-    # 7. Get Examples (Limit 5 for brevity)
-    ex_q = conn.execute(
         """
-        SELECT ex.text
-        FROM examples ex
-        JOIN senses s ON ex.sense_id = s.id
-        WHERE s.entry_id = ?
-        LIMIT 5
         """, (entry_id,)
     ).fetchall()
-    report["examples"] = [ex["text"] for ex in ex_q]
     return report
@@ -1997,7 +2068,11 @@ def _wiktionary_format_semantics_block(
         "odenet_senses": odenet_senses,
         "conceptnet_relations": conceptnet_relations,
         "wiktionary_synonyms": wikt_report.get("synonyms", []),
-        "wiktionary_antonyms": wikt_report.get("antonyms", [])
     }
 def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
@@ -2117,8 +2192,12 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
             "semantics_combined": semantics_block,
             "wiktionary_metadata": {
                 "pos_title": pos_title,
                 "pronunciation": wikt_report.get("sounds"),
-                "examples": wikt_report.get("examples")
             }
         }
@@ -3566,7 +3645,7 @@ if __name__ == "__main__":
             hanta_get_tagger() # Call the function to load the model
         except Exception as e:
             print(f"✗ FAILED to start HanTa tagger: {e}")
-            print("  'Word Encyclopedia' tab will fail.")
     else:
         print("INFO: HanTa library not available, skipping tagger.")
     print("--- HanTa Done ---\n")

 def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection) -> Dict[str, Any]:
     """
+    (REVISED)
+    Fetches ALL associated data for a single Wiktionary entry_id,
+    making full use of the normalized norm4.py schema.
     """
     report = {}
     # 1. Get Base Entry Info
     entry_data = conn.execute(
+        "SELECT word, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
     ).fetchone()
     if not entry_data:
         return {"error": "Entry ID not found"}
     report["entry_id"] = entry_id
     report["lemma"] = entry_data["word"] # Alias for clarity
+    # 2. Get Senses (with Glosses, Tags, Topics, Categories, and Examples)
     senses_q = conn.execute(
         """
+        SELECT
+            s.id as sense_id,
+            s.sense_index,
+            (SELECT GROUP_CONCAT(g.gloss_text, '; ') FROM glosses g WHERE g.sense_id = s.id) as glosses,
+            (SELECT GROUP_CONCAT(rg.raw_gloss, '; ') FROM raw_glosses rg WHERE rg.sense_id = s.id) as raw_glosses,
+            (SELECT GROUP_CONCAT(t.tag, ', ') FROM sense_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sense_id = s.id) as tags,
+            (SELECT GROUP_CONCAT(top.topic, ', ') FROM sense_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.sense_id = s.id) as topics,
+            (SELECT GROUP_CONCAT(c.category, ', ') FROM sense_categories sc JOIN categories c ON sc.category_id = c.id WHERE sc.sense_id = s.id) as categories
         FROM senses s
         WHERE s.entry_id = ?
+        ORDER BY s.id
         """, (entry_id,)
     ).fetchall()
+    senses_list = []
+    for sense_row in senses_q:
+        sense_dict = dict(sense_row)
+        # Get examples for this specific sense
+        examples_q = conn.execute(
+            "SELECT text, ref, author, title, year, url FROM examples WHERE sense_id = ?", (sense_dict["sense_id"],)
+        ).fetchall()
+        sense_dict["examples"] = [dict(ex) for ex in examples_q]
+        senses_list.append(sense_dict)
+    report["senses"] = senses_list
+    # 3. Get Inflected Forms (with Tags and Topics)
     forms_q = conn.execute(
         """
+        SELECT
+            f.form_text,
+            f.sense_index,
+            (SELECT GROUP_CONCAT(t.tag, ', ') FROM form_tags ft JOIN tags t ON ft.tag_id = t.id WHERE ft.form_id = f.id) as tags,
+            (SELECT GROUP_CONCAT(top.topic, ', ') FROM form_topics ftop JOIN topics top ON ftop.topic_id = top.id WHERE ftop.form_id = f.id) as topics
         FROM forms f
         WHERE f.entry_id = ?
         GROUP BY f.id
         ORDER BY f.id
     ).fetchall()
     report["forms"] = [dict(f) for f in forms_q]
+    # 4. Get Pronunciations (with Tags)
     sounds_q = conn.execute(
+        """
+        SELECT
+            s.ipa, s.audio, s.mp3_url, s.ogg_url, s.rhymes,
+            (SELECT GROUP_CONCAT(t.tag, ', ') FROM sound_tags st JOIN tags t ON st.tag_id = t.id WHERE st.sound_id = s.id) as tags
+        FROM sounds s
+        WHERE s.entry_id = ?
+        GROUP BY s.id
+        """, (entry_id,)
     ).fetchall()
     report["sounds"] = [dict(s) for s in sounds_q]
+    # 5. Get Synonyms (with Tags and Topics)
     syn_q = conn.execute(
+        """
+        SELECT
+            s.synonym_word, s.sense_index,
+            (SELECT GROUP_CONCAT(t.tag, ', ') FROM synonym_tags st JOIN tags t ON st.tag_id = t.id WHERE st.synonym_id = s.id) as tags,
+            (SELECT GROUP_CONCAT(top.topic, ', ') FROM synonym_topics stop JOIN topics top ON stop.topic_id = top.id WHERE stop.synonym_id = s.id) as topics
+        FROM synonyms s
+        WHERE s.entry_id = ?
+        GROUP BY s.id
+        """, (entry_id,)
     ).fetchall()
+    report["synonyms"] = [dict(s) for s in syn_q]
+    # 6. Get Antonyms (with Tags)
     ant_q = conn.execute(
+        """
+        SELECT
+            a.antonym_word, a.sense_index,
+            (SELECT GROUP_CONCAT(t.tag, ', ') FROM antonym_tags at JOIN tags t ON at.tag_id = t.id WHERE at.antonym_id = a.id) as tags
+        FROM antonyms a
+        WHERE a.entry_id = ?
+        GROUP BY a.id
+        """, (entry_id,)
     ).fetchall()
+    report["antonyms"] = [dict(a) for a in ant_q]
+    # 7. Get Translations (with Tags)
+    trans_q = conn.execute(
         """
+        SELECT
+            tr.lang, tr.lang_code, tr.word, tr.sense_text, tr.roman,
+            (SELECT GROUP_CONCAT(t.tag, ', ') FROM translation_tags tt JOIN tags t ON tt.tag_id = t.id WHERE tt.translation_id = tr.id) as tags
+        FROM translations tr
+        WHERE tr.entry_id = ?
+        GROUP BY tr.id
         """, (entry_id,)
     ).fetchall()
+    report["translations"] = [dict(tr) for tr in trans_q]
+    # 8. Get Hyphenations
+    hyphen_q = conn.execute(
+        "SELECT hyphenation FROM hyphenations WHERE entry_id = ?", (entry_id,)
+    ).fetchall()
+    report["hyphenations"] = [h["hyphenation"] for h in hyphen_q]
+    # 9. Get Derived and Related Terms
+    derived_q = conn.execute(
+        "SELECT derived_word FROM derived_terms WHERE entry_id = ?", (entry_id,)
+    ).fetchall()
+    report["derived_terms"] = [d["derived_word"] for d in derived_q]
+    related_q = conn.execute(
+        "SELECT related_word FROM related_terms WHERE entry_id = ?", (entry_id,)
+    ).fetchall()
+    report["related_terms"] = [r["related_word"] for r in related_q]
+    # 10. Get Entry-level Tags and Categories
+    entry_tags_q = conn.execute(
+        "SELECT t.tag FROM entry_tags et JOIN tags t ON et.tag_id = t.id WHERE et.entry_id = ?", (entry_id,)
+    ).fetchall()
+    report["entry_tags"] = [t["tag"] for t in entry_tags_q]
+    entry_cats_q = conn.execute(
+        "SELECT c.category FROM entry_categories ec JOIN categories c ON ec.category_id = c.id WHERE ec.entry_id = ?", (entry_id,)
+    ).fetchall()
+    report["entry_categories"] = [c["category"] for c in entry_cats_q]
     return report
         "odenet_senses": odenet_senses,
         "conceptnet_relations": conceptnet_relations,
         "wiktionary_synonyms": wikt_report.get("synonyms", []),
+        "wiktionary_antonyms": wikt_report.get("antonyms", []),
+        "wiktionary_translations": wikt_report.get("translations", []),
+        "wiktionary_derived_terms": wikt_report.get("derived_terms", []),
+        "wiktionary_related_terms": wikt_report.get("related_terms", [])
     }
 def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
             "semantics_combined": semantics_block,
             "wiktionary_metadata": {
                 "pos_title": pos_title,
+                "etymology": wikt_report.get("etymology_text"),
                 "pronunciation": wikt_report.get("sounds"),
+                "hyphenation": wikt_report.get("hyphenations"), # <-- THE FIX
+                "examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
+                "entry_tags": wikt_report.get("entry_tags"),
+                "entry_categories": wikt_report.get("entry_categories")
             }
         }
             hanta_get_tagger() # Call the function to load the model
         except Exception as e:
             print(f"✗ FAILED to start HanTa tagger: {e}")
+            print("  'Word Encyclopedia' tab will fail.")
     else:
         print("INFO: HanTa library not available, skipping tagger.")
     print("--- HanTa Done ---\n")