Spaces:
Running
Running
fixes
Browse files
app.py
CHANGED
|
@@ -1020,11 +1020,21 @@ def pattern_analyze_as_noun(word: str, hint_lemma: str = None) -> Dict[str, Any]
|
|
| 1020 |
analysis["gender"] = list(analysis["declension_by_gender"].keys())[0]
|
| 1021 |
|
| 1022 |
return analysis
|
|
|
|
| 1023 |
def pattern_analyze_as_verb(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 1024 |
"""Comprehensive verb conjugation analysis."""
|
| 1025 |
log(f" Analyzing as verb (hint_lemma={hint_lemma})")
|
| 1026 |
verb_lemma = lemma(word)
|
| 1027 |
log(f" lemma({word}) = {verb_lemma}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1028 |
if not verb_lemma or verb_lemma == word:
|
| 1029 |
if hint_lemma and hint_lemma != word:
|
| 1030 |
verb_lemma = hint_lemma
|
|
@@ -1111,6 +1121,7 @@ def pattern_analyze_as_verb(word: str, hint_lemma: str = None) -> Dict[str, Any]
|
|
| 1111 |
if form: analysis["conjugation"]["Konjunktiv II"][name] = form
|
| 1112 |
except: pass
|
| 1113 |
return analysis
|
|
|
|
| 1114 |
def pattern_analyze_as_adjective(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 1115 |
"""Comprehensive adjective inflection analysis."""
|
| 1116 |
log(f" Analyzing as adjective (hint_lemma={hint_lemma})")
|
|
@@ -1868,25 +1879,20 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
|
|
| 1868 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
| 1869 |
"""
|
| 1870 |
Finds all entries related to a word, checking both lemmas and
|
| 1871 |
-
NON-VARIANT inflected forms.
|
| 1872 |
Returns a list of full entry reports.
|
| 1873 |
"""
|
| 1874 |
log(f"Wiktionary: Querying for '{word}'...")
|
| 1875 |
found_entry_ids: Set[int] = set()
|
| 1876 |
|
| 1877 |
# 1. Check if the word is a lemma (base form)
|
| 1878 |
-
# e.g., input "Haus" finds "Haus (Substantiv)"
|
| 1879 |
-
# e.g., input "gehe" finds "gehe (Konjugierte Form)"
|
| 1880 |
lemma_q = conn.execute(
|
| 1881 |
"SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
|
| 1882 |
).fetchall()
|
| 1883 |
for row in lemma_q:
|
| 1884 |
found_entry_ids.add(row["id"])
|
| 1885 |
|
| 1886 |
-
# 2. Check if the word is a true inflected form, but NOT a "variant"
|
| 1887 |
-
# e.g., input "gehe" finds "gehen (Verb)"
|
| 1888 |
-
# e.g., input "Haus" finds "Hau (Substantiv)"
|
| 1889 |
-
# This WILL NOT find "Häusle" from "Haus" anymore.
|
| 1890 |
form_q = conn.execute(
|
| 1891 |
"""
|
| 1892 |
SELECT DISTINCT e.id
|
|
@@ -1894,11 +1900,11 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
|
|
| 1894 |
JOIN entries e ON f.entry_id = e.id
|
| 1895 |
WHERE f.form_text = ? AND e.lang = 'Deutsch'
|
| 1896 |
AND f.id NOT IN (
|
| 1897 |
-
-- Exclude all form_ids that are tagged as 'variant'
|
| 1898 |
SELECT ft.form_id
|
| 1899 |
FROM form_tags ft
|
| 1900 |
JOIN tags t ON ft.tag_id = t.id
|
| 1901 |
-
WHERE t.tag
|
| 1902 |
)
|
| 1903 |
""", (word,)
|
| 1904 |
).fetchall()
|
|
@@ -2116,18 +2122,21 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2116 |
}
|
| 2117 |
}
|
| 2118 |
|
| 2119 |
-
# --- E. VALIDATION FILTER ---
|
| 2120 |
is_valid = False
|
| 2121 |
is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
|
| 2122 |
|
| 2123 |
-
# Check 1: Is the input word the lemma
|
| 2124 |
-
|
|
|
|
|
|
|
| 2125 |
is_valid = True
|
| 2126 |
-
log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches lemma
|
| 2127 |
|
| 2128 |
# Check 2: Is the input word in the *bare* forms list?
|
| 2129 |
-
# (This
|
| 2130 |
-
if not
|
|
|
|
| 2131 |
for form_entry in inflections_wikt_block.get("forms_list", []):
|
| 2132 |
form_text = form_entry.get("form_text", "")
|
| 2133 |
bare_form = re.sub(r"\(.*\)", "", form_text).strip()
|
|
|
|
| 1020 |
analysis["gender"] = list(analysis["declension_by_gender"].keys())[0]
|
| 1021 |
|
| 1022 |
return analysis
|
| 1023 |
+
|
| 1024 |
def pattern_analyze_as_verb(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 1025 |
"""Comprehensive verb conjugation analysis."""
|
| 1026 |
log(f" Analyzing as verb (hint_lemma={hint_lemma})")
|
| 1027 |
verb_lemma = lemma(word)
|
| 1028 |
log(f" lemma({word}) = {verb_lemma}")
|
| 1029 |
+
|
| 1030 |
+
# If the input word is already an infinitive (ends in 'en', 'ln', or 'rn'),
|
| 1031 |
+
# and pattern.de gives a weird lemma, trust the input word.
|
| 1032 |
+
# This fixes lemma('gießen') -> 'gaßen'
|
| 1033 |
+
is_infinitive_form = word.endswith("en") or word.endswith("ln") or word.endswith("rn")
|
| 1034 |
+
if is_infinitive_form and verb_lemma != word.lower():
|
| 1035 |
+
log(f" Pattern.de lemma '{verb_lemma}' is suspicious for infinitive '{word}'. Trusting input word.")
|
| 1036 |
+
verb_lemma = word
|
| 1037 |
+
|
| 1038 |
if not verb_lemma or verb_lemma == word:
|
| 1039 |
if hint_lemma and hint_lemma != word:
|
| 1040 |
verb_lemma = hint_lemma
|
|
|
|
| 1121 |
if form: analysis["conjugation"]["Konjunktiv II"][name] = form
|
| 1122 |
except: pass
|
| 1123 |
return analysis
|
| 1124 |
+
|
| 1125 |
def pattern_analyze_as_adjective(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 1126 |
"""Comprehensive adjective inflection analysis."""
|
| 1127 |
log(f" Analyzing as adjective (hint_lemma={hint_lemma})")
|
|
|
|
| 1879 |
def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
|
| 1880 |
"""
|
| 1881 |
Finds all entries related to a word, checking both lemmas and
|
| 1882 |
+
NON-VARIANT, NON-AUXILIARY inflected forms.
|
| 1883 |
Returns a list of full entry reports.
|
| 1884 |
"""
|
| 1885 |
log(f"Wiktionary: Querying for '{word}'...")
|
| 1886 |
found_entry_ids: Set[int] = set()
|
| 1887 |
|
| 1888 |
# 1. Check if the word is a lemma (base form)
|
|
|
|
|
|
|
| 1889 |
lemma_q = conn.execute(
|
| 1890 |
"SELECT id FROM entries WHERE word = ? AND lang = 'Deutsch'", (word,)
|
| 1891 |
).fetchall()
|
| 1892 |
for row in lemma_q:
|
| 1893 |
found_entry_ids.add(row["id"])
|
| 1894 |
|
| 1895 |
+
# 2. Check if the word is a true inflected form, but NOT a "variant" or "auxiliary"
|
|
|
|
|
|
|
|
|
|
| 1896 |
form_q = conn.execute(
|
| 1897 |
"""
|
| 1898 |
SELECT DISTINCT e.id
|
|
|
|
| 1900 |
JOIN entries e ON f.entry_id = e.id
|
| 1901 |
WHERE f.form_text = ? AND e.lang = 'Deutsch'
|
| 1902 |
AND f.id NOT IN (
|
| 1903 |
+
-- Exclude all form_ids that are tagged as 'variant' or 'auxiliary'
|
| 1904 |
SELECT ft.form_id
|
| 1905 |
FROM form_tags ft
|
| 1906 |
JOIN tags t ON ft.tag_id = t.id
|
| 1907 |
+
WHERE t.tag IN ('variant', 'auxiliary')
|
| 1908 |
)
|
| 1909 |
""", (word,)
|
| 1910 |
).fetchall()
|
|
|
|
| 2122 |
}
|
| 2123 |
}
|
| 2124 |
|
| 2125 |
+
# --- E. VALIDATION FILTER (REVISED LOGIC) ---
|
| 2126 |
is_valid = False
|
| 2127 |
is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
|
| 2128 |
|
| 2129 |
+
# Check 1: Is the input word the lemma?
|
| 2130 |
+
# This is true for base form entries (e.g., "Haus" -> "Haus (Substantiv)")
|
| 2131 |
+
# AND for inflected form entries (e.g., "gießt" -> "gießt (Konjugierte Form)")
|
| 2132 |
+
if lemma.lower() == word_lower:
|
| 2133 |
is_valid = True
|
| 2134 |
+
log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches entry lemma.")
|
| 2135 |
|
| 2136 |
# Check 2: Is the input word in the *bare* forms list?
|
| 2137 |
+
# (This applies to base entries where the input is an inflection, e.g., "gießt" -> "gießen (Verb)")
|
| 2138 |
+
# We only run this if Check 1 failed AND this is not an inflected-form entry (such entries have no forms list)
|
| 2139 |
+
if not is_valid and not is_inflected_entry:
|
| 2140 |
for form_entry in inflections_wikt_block.get("forms_list", []):
|
| 2141 |
form_text = form_entry.get("form_text", "")
|
| 2142 |
bare_form = re.sub(r"\(.*\)", "", form_text).strip()
|