Spaces:
Running
Running
fix base lemma query
Browse files
app.py
CHANGED
|
@@ -1998,8 +1998,12 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1998 |
|
| 1999 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
| 2000 |
"""
|
| 2001 |
-
|
| 2002 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2003 |
Returns a list of full entry reports.
|
| 2004 |
"""
|
| 2005 |
log(f"Wiktionary: Querying for '{word}'...")
|
|
@@ -2007,12 +2011,39 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
|
|
| 2007 |
|
| 2008 |
# 1. Check if the word is a lemma (base form)
|
| 2009 |
lemma_q = conn.execute(
|
| 2010 |
-
"SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
|
| 2011 |
).fetchall()
|
|
|
|
|
|
|
|
|
|
| 2012 |
for row in lemma_q:
|
| 2013 |
-
|
|
|
|
|
|
|
| 2014 |
|
| 2015 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2016 |
form_q = conn.execute(
|
| 2017 |
"""
|
| 2018 |
SELECT DISTINCT e.id
|
|
@@ -2031,6 +2062,16 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
|
|
| 2031 |
for row in form_q:
|
| 2032 |
found_entry_ids.add(row["id"])
|
| 2033 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2034 |
log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
|
| 2035 |
|
| 2036 |
# 3. Build a full report for each unique entry
|
|
@@ -2050,26 +2091,33 @@ def _wiktionary_format_semantics_block(
|
|
| 2050 |
top_n: int
|
| 2051 |
) -> Dict[str, Any]:
|
| 2052 |
"""
|
|
|
|
| 2053 |
Combines Wiktionary senses with OdeNet/ConceptNet senses,
|
| 2054 |
-
using the
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2055 |
"""
|
| 2056 |
|
| 2057 |
-
# --- THIS IS THE FIX ---
|
| 2058 |
-
# Determine the true lemma from the pattern.de block, as it's more reliable
|
| 2059 |
-
# for semantic lookup than the wiktionary lemma (which could be an inflected form).
|
| 2060 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 2061 |
|
| 2062 |
-
|
| 2063 |
-
|
| 2064 |
-
|
| 2065 |
-
elif pos_key == "noun":
|
| 2066 |
-
semantic_lemma = pattern_block.get("base_form")
|
| 2067 |
-
elif pos_key == "adjective":
|
| 2068 |
-
semantic_lemma = pattern_block.get("predicative")
|
| 2069 |
|
| 2070 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2071 |
if not semantic_lemma:
|
| 2072 |
-
semantic_lemma = wikt_report.get("
|
| 2073 |
|
| 2074 |
log(f"[DEBUG] Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
|
| 2075 |
# --- END OF FIX ---
|
|
@@ -2078,7 +2126,7 @@ def _wiktionary_format_semantics_block(
|
|
| 2078 |
wiktionary_senses = []
|
| 2079 |
for sense in wikt_report.get("senses", []):
|
| 2080 |
wiktionary_senses.append({
|
| 2081 |
-
"definition": sense.get("gloss_text"),
|
| 2082 |
"source": "wiktionary"
|
| 2083 |
})
|
| 2084 |
|
|
@@ -2121,7 +2169,6 @@ def _wiktionary_format_semantics_block(
|
|
| 2121 |
"wiktionary_translations": wikt_report.get("translations", []),
|
| 2122 |
"wiktionary_derived_terms": wikt_report.get("derived_terms", []),
|
| 2123 |
"wiktionary_related_terms": wikt_report.get("related_terms", [])
|
| 2124 |
-
|
| 2125 |
}
|
| 2126 |
|
| 2127 |
def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
|
|
| 1998 |
|
| 1999 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
| 2000 |
"""
|
| 2001 |
+
(FIXED V24)
|
| 2002 |
+
Finds all entries related to a word.
|
| 2003 |
+
1. Finds direct lemma matches (e.g., input "Vertrag" -> finds "Vertrag" entry)
|
| 2004 |
+
2. Finds inflection matches (e.g., input "Häuser" -> finds "Haus" entry via `forms` table)
|
| 2005 |
+
3. Finds declined form matches (e.g., input "Verträge" -> finds "Verträge" entry,
|
| 2006 |
+
then finds "Vertrag" entry via the `form_of` column of the `senses` table)
|
| 2007 |
Returns a list of full entry reports.
|
| 2008 |
"""
|
| 2009 |
log(f"Wiktionary: Querying for '{word}'...")
|
|
|
|
| 2011 |
|
| 2012 |
# 1. Check if the word is a lemma (base form)
|
| 2013 |
lemma_q = conn.execute(
|
| 2014 |
+
"SELECT id, pos_title FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
|
| 2015 |
).fetchall()
|
| 2016 |
+
|
| 2017 |
+
parent_lemmas_to_find: Set[str] = set()
|
| 2018 |
+
|
| 2019 |
for row in lemma_q:
|
| 2020 |
+
entry_id = row["id"]
|
| 2021 |
+
pos_title = row["pos_title"]
|
| 2022 |
+
found_entry_ids.add(entry_id)
|
| 2023 |
|
| 2024 |
+
# --- THIS IS THE NEW LOGIC (STEP 3) ---
|
| 2025 |
+
if pos_title in ("Deklinierte Form", "Konjugierte Form", "Komparativ", "Superlativ"):
|
| 2026 |
+
log(f"Wiktionary: Word '{word}' is an inflected entry (ID {entry_id}). Looking for its parent lemma...")
|
| 2027 |
+
form_of_q = conn.execute(
|
| 2028 |
+
"SELECT form_of FROM senses WHERE entry_id = ?", (entry_id,)
|
| 2029 |
+
).fetchall()
|
| 2030 |
+
|
| 2031 |
+
for form_row in form_of_q:
|
| 2032 |
+
form_of_json = form_row["form_of"]
|
| 2033 |
+
if not form_of_json:
|
| 2034 |
+
continue
|
| 2035 |
+
try:
|
| 2036 |
+
# Parse the JSON string (e.g., '[{"word": "Vertrag"}]')
|
| 2037 |
+
form_of_data = json.loads(form_of_json)
|
| 2038 |
+
if isinstance(form_of_data, list) and form_of_data:
|
| 2039 |
+
parent_lemma_word = form_of_data[0].get("word")
|
| 2040 |
+
if parent_lemma_word:
|
| 2041 |
+
parent_lemmas_to_find.add(parent_lemma_word)
|
| 2042 |
+
except json.JSONDecodeError:
|
| 2043 |
+
log(f"Wiktionary: Failed to parse form_of JSON: {form_of_json}")
|
| 2044 |
+
# --- END OF NEW LOGIC ---
|
| 2045 |
+
|
| 2046 |
+
# 2. Check if the word is an inflected form (in the `forms` table)
|
| 2047 |
form_q = conn.execute(
|
| 2048 |
"""
|
| 2049 |
SELECT DISTINCT e.id
|
|
|
|
| 2062 |
for row in form_q:
|
| 2063 |
found_entry_ids.add(row["id"])
|
| 2064 |
|
| 2065 |
+
# --- NEW: Add parent lemmas found in step 3 ---
|
| 2066 |
+
if parent_lemmas_to_find:
|
| 2067 |
+
log(f"Wiktionary: Found parent lemmas to add: {parent_lemmas_to_find}")
|
| 2068 |
+
for lemma_word in parent_lemmas_to_find:
|
| 2069 |
+
parent_id_q = conn.execute(
|
| 2070 |
+
"SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (lemma_word,)
|
| 2071 |
+
).fetchall()
|
| 2072 |
+
for row in parent_id_q:
|
| 2073 |
+
found_entry_ids.add(row["id"])
|
| 2074 |
+
|
| 2075 |
log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
|
| 2076 |
|
| 2077 |
# 3. Build a full report for each unique entry
|
|
|
|
| 2091 |
top_n: int
|
| 2092 |
) -> Dict[str, Any]:
|
| 2093 |
"""
|
| 2094 |
+
(FIXED V24)
|
| 2095 |
Combines Wiktionary senses with OdeNet/ConceptNet senses,
|
| 2096 |
+
using the *correct* lemma.
|
| 2097 |
+
|
| 2098 |
+
Priority:
|
| 2099 |
+
1. Wiktionary's lemma (from `wikt_report`)
|
| 2100 |
+
2. Pattern.de's lemma (from `pattern_block`)
|
| 2101 |
"""
|
| 2102 |
|
|
|
|
|
|
|
|
|
|
| 2103 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 2104 |
|
| 2105 |
+
# --- THIS IS THE FIX ---
|
| 2106 |
+
# Prioritize Wiktionary's lemma first, as it's more reliable.
|
| 2107 |
+
semantic_lemma = wikt_report.get("lemma")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2108 |
|
| 2109 |
+
# If Wiktionary's lemma is missing or bad, try pattern.de's
|
| 2110 |
+
if not semantic_lemma:
|
| 2111 |
+
if pos_key == "verb":
|
| 2112 |
+
semantic_lemma = pattern_block.get("infinitive")
|
| 2113 |
+
elif pos_key == "noun":
|
| 2114 |
+
semantic_lemma = pattern_block.get("base_form")
|
| 2115 |
+
elif pos_key == "adjective":
|
| 2116 |
+
semantic_lemma = pattern_block.get("predicative")
|
| 2117 |
+
|
| 2118 |
+
# Final fallback
|
| 2119 |
if not semantic_lemma:
|
| 2120 |
+
semantic_lemma = wikt_report.get("word", "") # Use the original word as last resort
|
| 2121 |
|
| 2122 |
log(f"[DEBUG] Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
|
| 2123 |
# --- END OF FIX ---
|
|
|
|
| 2126 |
wiktionary_senses = []
|
| 2127 |
for sense in wikt_report.get("senses", []):
|
| 2128 |
wiktionary_senses.append({
|
| 2129 |
+
"definition": sense.get("glosses"), # <-- Corrected from gloss_text
|
| 2130 |
"source": "wiktionary"
|
| 2131 |
})
|
| 2132 |
|
|
|
|
| 2169 |
"wiktionary_translations": wikt_report.get("translations", []),
|
| 2170 |
"wiktionary_derived_terms": wikt_report.get("derived_terms", []),
|
| 2171 |
"wiktionary_related_terms": wikt_report.get("related_terms", [])
|
|
|
|
| 2172 |
}
|
| 2173 |
|
| 2174 |
def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|