cstr committed on
Commit
23d7efa
·
verified ·
1 Parent(s): d91d571
Files changed (1) hide show
  1. app.py +24 -15
app.py CHANGED
@@ -1020,11 +1020,21 @@ def pattern_analyze_as_noun(word: str, hint_lemma: str = None) -> Dict[str, Any]
1020
  analysis["gender"] = list(analysis["declension_by_gender"].keys())[0]
1021
 
1022
  return analysis
 
1023
  def pattern_analyze_as_verb(word: str, hint_lemma: str = None) -> Dict[str, Any]:
1024
  """Comprehensive verb conjugation analysis."""
1025
  log(f" Analyzing as verb (hint_lemma={hint_lemma})")
1026
  verb_lemma = lemma(word)
1027
  log(f" lemma({word}) = {verb_lemma}")
 
 
 
 
 
 
 
 
 
1028
  if not verb_lemma or verb_lemma == word:
1029
  if hint_lemma and hint_lemma != word:
1030
  verb_lemma = hint_lemma
@@ -1111,6 +1121,7 @@ def pattern_analyze_as_verb(word: str, hint_lemma: str = None) -> Dict[str, Any]
1111
  if form: analysis["conjugation"]["Konjunktiv II"][name] = form
1112
  except: pass
1113
  return analysis
 
1114
  def pattern_analyze_as_adjective(word: str, hint_lemma: str = None) -> Dict[str, Any]:
1115
  """Comprehensive adjective inflection analysis."""
1116
  log(f" Analyzing as adjective (hint_lemma={hint_lemma})")
@@ -1868,25 +1879,20 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1868
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
1869
  """
1870
  Finds all entries related to a word, checking both lemmas and
1871
- NON-VARIANT inflected forms.
1872
  Returns a list of full entry reports.
1873
  """
1874
  log(f"Wiktionary: Querying for '{word}'...")
1875
  found_entry_ids: Set[int] = set()
1876
 
1877
  # 1. Check if the word is a lemma (base form)
1878
- # e.g., input "Haus" finds "Haus (Substantiv)"
1879
- # e.g., input "gehe" finds "gehe (Konjugierte Form)"
1880
  lemma_q = conn.execute(
1881
  "SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
1882
  ).fetchall()
1883
  for row in lemma_q:
1884
  found_entry_ids.add(row["id"])
1885
 
1886
- # 2. Check if the word is a true inflected form, but NOT a "variant"
1887
- # e.g., input "gehe" finds "gehen (Verb)"
1888
- # e.g., input "Haus" finds "Hau (Substantiv)"
1889
- # This WILL NOT find "Häusle" from "Haus" anymore.
1890
  form_q = conn.execute(
1891
  """
1892
  SELECT DISTINCT e.id
@@ -1894,11 +1900,11 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
1894
  JOIN entries e ON f.entry_id = e.id
1895
  WHERE f.form_text = ? AND e.lang = 'Deutsch'
1896
  AND f.id NOT IN (
1897
- -- Exclude all form_ids that are tagged as 'variant'
1898
  SELECT ft.form_id
1899
  FROM form_tags ft
1900
  JOIN tags t ON ft.tag_id = t.id
1901
- WHERE t.tag = 'variant'
1902
  )
1903
  """, (word,)
1904
  ).fetchall()
@@ -2116,18 +2122,21 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2116
  }
2117
  }
2118
 
2119
- # --- E. VALIDATION FILTER ---
2120
  is_valid = False
2121
  is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
2122
 
2123
- # Check 1: Is the input word the lemma OF A BASE FORM entry?
2124
- if not is_inflected_entry and lemma.lower() == word_lower:
 
 
2125
  is_valid = True
2126
- log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches lemma of a base entry.")
2127
 
2128
  # Check 2: Is the input word in the *bare* forms list?
2129
- # (This is the only check that should apply to inflected entries)
2130
- if not is_valid:
 
2131
  for form_entry in inflections_wikt_block.get("forms_list", []):
2132
  form_text = form_entry.get("form_text", "")
2133
  bare_form = re.sub(r"\(.*\)", "", form_text).strip()
 
1020
  analysis["gender"] = list(analysis["declension_by_gender"].keys())[0]
1021
 
1022
  return analysis
1023
+
1024
  def pattern_analyze_as_verb(word: str, hint_lemma: str = None) -> Dict[str, Any]:
1025
  """Comprehensive verb conjugation analysis."""
1026
  log(f" Analyzing as verb (hint_lemma={hint_lemma})")
1027
  verb_lemma = lemma(word)
1028
  log(f" lemma({word}) = {verb_lemma}")
1029
+
1030
+ # If the input word already looks like an infinitive (ends in 'en', 'ln', 'rn'),
1031
+ # and pattern.de gives a weird lemma, trust the input word.
1032
+ # This fixes lemma('gießen') -> 'gaßen'
1033
+ is_infinitive_form = word.endswith("en") or word.endswith("ln") or word.endswith("rn")
1034
+ if is_infinitive_form and verb_lemma != word.lower():
1035
+ log(f" Pattern.de lemma '{verb_lemma}' is suspicious for infinitive '{word}'. Trusting input word.")
1036
+ verb_lemma = word
1037
+
1038
  if not verb_lemma or verb_lemma == word:
1039
  if hint_lemma and hint_lemma != word:
1040
  verb_lemma = hint_lemma
 
1121
  if form: analysis["conjugation"]["Konjunktiv II"][name] = form
1122
  except: pass
1123
  return analysis
1124
+
1125
  def pattern_analyze_as_adjective(word: str, hint_lemma: str = None) -> Dict[str, Any]:
1126
  """Comprehensive adjective inflection analysis."""
1127
  log(f" Analyzing as adjective (hint_lemma={hint_lemma})")
 
1879
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
1880
  """
1881
  Finds all entries related to a word, checking both lemmas and
1882
+ NON-VARIANT, NON-AUXILIARY inflected forms.
1883
  Returns a list of full entry reports.
1884
  """
1885
  log(f"Wiktionary: Querying for '{word}'...")
1886
  found_entry_ids: Set[int] = set()
1887
 
1888
  # 1. Check if the word is a lemma (base form)
 
 
1889
  lemma_q = conn.execute(
1890
  "SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
1891
  ).fetchall()
1892
  for row in lemma_q:
1893
  found_entry_ids.add(row["id"])
1894
 
1895
+ # 2. Check if the word is a true inflected form, but NOT a "variant" or "auxiliary"
 
 
 
1896
  form_q = conn.execute(
1897
  """
1898
  SELECT DISTINCT e.id
 
1900
  JOIN entries e ON f.entry_id = e.id
1901
  WHERE f.form_text = ? AND e.lang = 'Deutsch'
1902
  AND f.id NOT IN (
1903
+ -- Exclude all form_ids that are tagged as 'variant' or 'auxiliary'
1904
  SELECT ft.form_id
1905
  FROM form_tags ft
1906
  JOIN tags t ON ft.tag_id = t.id
1907
+ WHERE t.tag IN ('variant', 'auxiliary')
1908
  )
1909
  """, (word,)
1910
  ).fetchall()
 
2122
  }
2123
  }
2124
 
2125
+ # --- E. VALIDATION FILTER (REVISED LOGIC) ---
2126
  is_valid = False
2127
  is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
2128
 
2129
+ # Check 1: Is the input word the lemma?
2130
+ # This is true for base form entries (e.g., "Haus" -> "Haus (Substantiv)")
2131
+ # AND for inflected form entries (e.g., "gießt" -> "gießt (Konjugierte Form)")
2132
+ if lemma.lower() == word_lower:
2133
  is_valid = True
2134
+ log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches entry lemma.")
2135
 
2136
  # Check 2: Is the input word in the *bare* forms list?
2137
+ # (This applies to base entries where the input is an inflection, e.g., "gießt" -> "gießen (Verb)")
2138
+ # We only run this if Check 1 failed AND this is not an inflected entry (inflected entries have no forms)
2139
+ if not is_valid and not is_inflected_entry:
2140
  for form_entry in inflections_wikt_block.get("forms_list", []):
2141
  form_text = form_entry.get("form_text", "")
2142
  bare_form = re.sub(r"\(.*\)", "", form_text).strip()