cstr committed on
Commit
5b17922
·
verified ·
1 Parent(s): ea16568

fix base lemma query

Browse files
Files changed (1) hide show
  1. app.py +67 -20
app.py CHANGED
@@ -1998,8 +1998,12 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1998
 
1999
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
2000
  """
2001
- Finds all entries related to a word, checking both lemmas and
2002
- NON-VARIANT, NON-AUXILIARY inflected forms.
 
 
 
 
2003
  Returns a list of full entry reports.
2004
  """
2005
  log(f"Wiktionary: Querying for '{word}'...")
@@ -2007,12 +2011,39 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
2007
 
2008
  # 1. Check if the word is a lemma (base form)
2009
  lemma_q = conn.execute(
2010
- "SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
2011
  ).fetchall()
 
 
 
2012
  for row in lemma_q:
2013
- found_entry_ids.add(row["id"])
 
 
2014
 
2015
- # 2. Check if the word is a true inflected form, but NOT a "variant" or "auxiliary"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2016
  form_q = conn.execute(
2017
  """
2018
  SELECT DISTINCT e.id
@@ -2031,6 +2062,16 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
2031
  for row in form_q:
2032
  found_entry_ids.add(row["id"])
2033
 
 
 
 
 
 
 
 
 
 
 
2034
  log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
2035
 
2036
  # 3. Build a full report for each unique entry
@@ -2050,26 +2091,33 @@ def _wiktionary_format_semantics_block(
2050
  top_n: int
2051
  ) -> Dict[str, Any]:
2052
  """
 
2053
  Combines Wiktionary senses with OdeNet/ConceptNet senses,
2054
- using the CORRECT lemma from the pattern.de analysis block.
 
 
 
 
2055
  """
2056
 
2057
- # --- THIS IS THE FIX ---
2058
- # Determine the true lemma from the pattern.de block, as it's more reliable
2059
- # for semantic lookup than the wiktionary lemma (which could be an inflected form).
2060
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
2061
 
2062
- semantic_lemma = ""
2063
- if pos_key == "verb":
2064
- semantic_lemma = pattern_block.get("infinitive")
2065
- elif pos_key == "noun":
2066
- semantic_lemma = pattern_block.get("base_form")
2067
- elif pos_key == "adjective":
2068
- semantic_lemma = pattern_block.get("predicative")
2069
 
2070
- # Fallback if pattern.de fails or it's a non-inflecting POS
 
 
 
 
 
 
 
 
 
2071
  if not semantic_lemma:
2072
- semantic_lemma = wikt_report.get("lemma", "")
2073
 
2074
  log(f"[DEBUG] Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
2075
  # --- END OF FIX ---
@@ -2078,7 +2126,7 @@ def _wiktionary_format_semantics_block(
2078
  wiktionary_senses = []
2079
  for sense in wikt_report.get("senses", []):
2080
  wiktionary_senses.append({
2081
- "definition": sense.get("gloss_text"),
2082
  "source": "wiktionary"
2083
  })
2084
 
@@ -2121,7 +2169,6 @@ def _wiktionary_format_semantics_block(
2121
  "wiktionary_translations": wikt_report.get("translations", []),
2122
  "wiktionary_derived_terms": wikt_report.get("derived_terms", []),
2123
  "wiktionary_related_terms": wikt_report.get("related_terms", [])
2124
-
2125
  }
2126
 
2127
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
 
1998
 
1999
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
2000
  """
2001
+ (FIXED V24)
2002
+ Finds all entries related to a word.
2003
+ 1. Finds direct lemma matches (e.g., input "Vertrag" -> finds "Vertrag" entry)
2004
+ 2. Finds inflection matches (e.g., input "Häuser" -> finds "Haus" entry via `forms` table)
2005
+ 3. Finds declined form matches (e.g., input "Verträge" -> finds "Verträge" entry,
2006
+ then finds "Vertrag" entry via `senses.form_of` table)
2007
  Returns a list of full entry reports.
2008
  """
2009
  log(f"Wiktionary: Querying for '{word}'...")
 
2011
 
2012
  # 1. Check if the word is a lemma (base form)
2013
  lemma_q = conn.execute(
2014
+ "SELECT id, pos_title FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
2015
  ).fetchall()
2016
+
2017
+ parent_lemmas_to_find: Set[str] = set()
2018
+
2019
  for row in lemma_q:
2020
+ entry_id = row["id"]
2021
+ pos_title = row["pos_title"]
2022
+ found_entry_ids.add(entry_id)
2023
 
2024
+ # --- THIS IS THE NEW LOGIC (STEP 3) ---
2025
+ if pos_title in ("Deklinierte Form", "Konjugierte Form", "Komparativ", "Superlativ"):
2026
+ log(f"Wiktionary: Word '{word}' is an inflected entry (ID {entry_id}). Looking for its parent lemma...")
2027
+ form_of_q = conn.execute(
2028
+ "SELECT form_of FROM senses WHERE entry_id = ?", (entry_id,)
2029
+ ).fetchall()
2030
+
2031
+ for form_row in form_of_q:
2032
+ form_of_json = form_row["form_of"]
2033
+ if not form_of_json:
2034
+ continue
2035
+ try:
2036
+ # Parse the JSON string (e.g., '[{"word": "Vertrag"}]')
2037
+ form_of_data = json.loads(form_of_json)
2038
+ if isinstance(form_of_data, list) and form_of_data:
2039
+ parent_lemma_word = form_of_data[0].get("word")
2040
+ if parent_lemma_word:
2041
+ parent_lemmas_to_find.add(parent_lemma_word)
2042
+ except json.JSONDecodeError:
2043
+ log(f"Wiktionary: Failed to parse form_of JSON: {form_of_json}")
2044
+ # --- END OF NEW LOGIC ---
2045
+
2046
+ # 2. Check if the word is an inflected form (in the `forms` table)
2047
  form_q = conn.execute(
2048
  """
2049
  SELECT DISTINCT e.id
 
2062
  for row in form_q:
2063
  found_entry_ids.add(row["id"])
2064
 
2065
+ # --- NEW: Add parent lemmas found in step 3 ---
2066
+ if parent_lemmas_to_find:
2067
+ log(f"Wiktionary: Found parent lemmas to add: {parent_lemmas_to_find}")
2068
+ for lemma_word in parent_lemmas_to_find:
2069
+ parent_id_q = conn.execute(
2070
+ "SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (lemma_word,)
2071
+ ).fetchall()
2072
+ for row in parent_id_q:
2073
+ found_entry_ids.add(row["id"])
2074
+
2075
  log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
2076
 
2077
  # 3. Build a full report for each unique entry
 
2091
  top_n: int
2092
  ) -> Dict[str, Any]:
2093
  """
2094
+ (FIXED V24)
2095
  Combines Wiktionary senses with OdeNet/ConceptNet senses,
2096
+ using the *correct* lemma.
2097
+
2098
+ Priority:
2099
+ 1. Wiktionary's lemma (from `wikt_report`)
2100
+ 2. Pattern.de's lemma (from `pattern_block`)
2101
  """
2102
 
 
 
 
2103
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
2104
 
2105
+ # --- THIS IS THE FIX ---
2106
+ # Prioritize Wiktionary's lemma first, as it's more reliable.
2107
+ semantic_lemma = wikt_report.get("lemma")
 
 
 
 
2108
 
2109
+ # If Wiktionary's lemma is missing or bad, try pattern.de's
2110
+ if not semantic_lemma:
2111
+ if pos_key == "verb":
2112
+ semantic_lemma = pattern_block.get("infinitive")
2113
+ elif pos_key == "noun":
2114
+ semantic_lemma = pattern_block.get("base_form")
2115
+ elif pos_key == "adjective":
2116
+ semantic_lemma = pattern_block.get("predicative")
2117
+
2118
+ # Final fallback
2119
  if not semantic_lemma:
2120
+ semantic_lemma = wikt_report.get("word", "") # Use the original word as last resort
2121
 
2122
  log(f"[DEBUG] Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
2123
  # --- END OF FIX ---
 
2126
  wiktionary_senses = []
2127
  for sense in wikt_report.get("senses", []):
2128
  wiktionary_senses.append({
2129
+ "definition": sense.get("glosses"), # <-- Corrected from gloss_text
2130
  "source": "wiktionary"
2131
  })
2132
 
 
2169
  "wiktionary_translations": wikt_report.get("translations", []),
2170
  "wiktionary_derived_terms": wikt_report.get("derived_terms", []),
2171
  "wiktionary_related_terms": wikt_report.get("related_terms", [])
 
2172
  }
2173
 
2174
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]: