Spaces:
Sleeping
Sleeping
fix wiktionary ground truth overrides pattern guesses
Browse files
app.py
CHANGED
|
@@ -951,74 +951,91 @@ def pattern_is_good_analysis(analysis, analysis_type):
|
|
| 951 |
|
| 952 |
|
| 953 |
# --- Inflection Generators ---
|
| 954 |
-
def pattern_analyze_as_noun(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 955 |
-
"""
|
| 956 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
analysis = {}
|
|
|
|
|
|
|
| 958 |
singular = singularize(word)
|
| 959 |
plural = pluralize(word)
|
| 960 |
-
|
| 961 |
-
log(f" pluralize({word}) = {plural}")
|
| 962 |
if plural != word and singular != word:
|
| 963 |
base = word
|
| 964 |
-
log(f" Word changes when pluralized => base = {base}")
|
| 965 |
elif singular != word:
|
| 966 |
base = singular
|
| 967 |
-
log(f" Word changes when singularized => base = {base}")
|
| 968 |
elif hint_lemma and hint_lemma != word:
|
| 969 |
base = hint_lemma
|
| 970 |
-
log(f" Using hint lemma => base = {base}")
|
| 971 |
else:
|
| 972 |
-
# This is a valid case, e.g. "Lauf" (singular)
|
| 973 |
base = word
|
| 974 |
-
log(f" Word is already base form => base = {base}")
|
| 975 |
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
genders = list(g)
|
| 982 |
-
log(f" Detected ambiguous gender: {genders}")
|
| 983 |
-
elif g is None:
|
| 984 |
-
genders = [MALE] # Default
|
| 985 |
-
log(f" Gender unknown, defaulting to MALE")
|
| 986 |
else:
|
| 987 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 988 |
|
| 989 |
analysis["base_form"] = base
|
| 990 |
analysis["plural"] = pluralize(base)
|
| 991 |
analysis["singular"] = base
|
| 992 |
analysis["declension_by_gender"] = {}
|
| 993 |
|
|
|
|
| 994 |
for gen in genders:
|
| 995 |
gender_str = {MALE: "Masculine", FEMALE: "Feminine", NEUTRAL: "Neuter"}.get(gen, "Unknown")
|
| 996 |
gen_declension = {}
|
|
|
|
| 997 |
for number, number_name in [(SINGULAR, "Singular"), (PLURAL, "Plural")]:
|
| 998 |
word_form = base if number == SINGULAR else pluralize(base)
|
| 999 |
word_form_cap = word_form.capitalize()
|
| 1000 |
gender_for_article = gen if number == SINGULAR else PLURAL
|
| 1001 |
-
|
|
|
|
| 1002 |
(DATIVE, "Dativ"), (GENITIVE, "Genitiv")]:
|
| 1003 |
try:
|
| 1004 |
def_art = article(word_form, DEFINITE, gender_for_article, case)
|
| 1005 |
indef_art = article(word_form, INDEFINITE, gender_for_article, case)
|
|
|
|
| 1006 |
indef_form = f"{indef_art} {word_form_cap}" if indef_art else word_form_cap
|
| 1007 |
-
if number == PLURAL:
|
| 1008 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1009 |
gen_declension[f"{case_name} {number_name}"] = {
|
| 1010 |
-
"definite": f"{def_art} {
|
| 1011 |
"indefinite": indef_form,
|
| 1012 |
-
"bare":
|
| 1013 |
}
|
| 1014 |
except Exception as e:
|
| 1015 |
log(f" Failed to get article for {gender_str}/{case_name} {number_name}: {e}")
|
|
|
|
| 1016 |
analysis["declension_by_gender"][gender_str] = gen_declension
|
| 1017 |
|
| 1018 |
-
|
| 1019 |
if len(genders) == 1:
|
| 1020 |
-
|
| 1021 |
-
analysis["
|
|
|
|
| 1022 |
|
| 1023 |
return analysis
|
| 1024 |
|
|
@@ -2174,9 +2191,11 @@ def _wiktionary_format_semantics_block(
|
|
| 2174 |
|
| 2175 |
def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
| 2176 |
"""
|
| 2177 |
-
(PRIMARY ENGINE) Analyzes a word using the Wiktionary DB
|
| 2178 |
-
|
| 2179 |
"""
|
|
|
|
|
|
|
| 2180 |
final_result: Dict[str, Any] = {
|
| 2181 |
"input_word": word,
|
| 2182 |
"analysis": {}
|
|
@@ -2184,7 +2203,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2184 |
|
| 2185 |
conn = wiktionary_get_connection()
|
| 2186 |
if not conn:
|
| 2187 |
-
|
|
|
|
| 2188 |
|
| 2189 |
# --- 1. GET SPACY/IWNLP HINT FOR PRIORITIZATION ---
|
| 2190 |
spacy_pos_hint = None
|
|
@@ -2204,44 +2224,37 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2204 |
else: spacy_pos_hint = spacy_pos_raw
|
| 2205 |
|
| 2206 |
spacy_lemma_hint = token.lemma_
|
| 2207 |
-
log(f"[DEBUG]
|
| 2208 |
except Exception as e:
|
| 2209 |
-
log(f"[DEBUG]
|
| 2210 |
|
| 2211 |
# --- 2. FIND ALL WIKTIONARY ENTRIES ---
|
| 2212 |
try:
|
| 2213 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
| 2214 |
except Exception as e:
|
| 2215 |
log(f"[DEBUG] Wiktionary query failed: {e}")
|
| 2216 |
-
return {}
|
| 2217 |
|
| 2218 |
if not wiktionary_reports:
|
| 2219 |
-
|
|
|
|
| 2220 |
|
| 2221 |
# --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
|
| 2222 |
def get_priority_score(report):
|
| 2223 |
wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
|
| 2224 |
wikt_lemma = report.get("lemma")
|
| 2225 |
-
|
| 2226 |
# Priority 1: Exact POS match with spaCy hint
|
| 2227 |
if spacy_pos_hint and wikt_pos == spacy_pos_hint:
|
| 2228 |
-
|
| 2229 |
-
if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint:
|
| 2230 |
-
return 1
|
| 2231 |
return 2
|
| 2232 |
-
|
| 2233 |
-
|
| 2234 |
-
if wikt_lemma.lower() == word.lower():
|
| 2235 |
-
return 3
|
| 2236 |
-
|
| 2237 |
-
# Priority 3: Other inflected forms (e.g. "gehe" -> "gehen")
|
| 2238 |
return 4
|
| 2239 |
|
| 2240 |
wiktionary_reports.sort(key=get_priority_score)
|
| 2241 |
-
log(f"[DEBUG]
|
| 2242 |
|
| 2243 |
-
|
| 2244 |
-
# --- 4. BUILD AND *VALIDATE* THE FINAL REPORT (PATH-PURE) ---
|
| 2245 |
word_lower = word.lower()
|
| 2246 |
|
| 2247 |
for wikt_report in wiktionary_reports:
|
|
@@ -2249,46 +2262,133 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2249 |
lemma = wikt_report.get("lemma", word)
|
| 2250 |
pos_title = wikt_report.get("pos_title", "")
|
| 2251 |
|
| 2252 |
-
|
|
|
|
|
|
|
|
|
|
| 2253 |
inflections_wikt_block = {
|
| 2254 |
"base_form": lemma,
|
| 2255 |
-
"forms_list":
|
| 2256 |
"source": "wiktionary"
|
| 2257 |
}
|
| 2258 |
|
| 2259 |
-
# --- B.
|
|
|
|
| 2260 |
pattern_block = {}
|
|
|
|
| 2261 |
if PATTERN_DE_AVAILABLE:
|
| 2262 |
try:
|
|
|
|
| 2263 |
if pos_key == "noun" or "Substantiv" in pos_title:
|
| 2264 |
-
|
| 2265 |
-
|
| 2266 |
-
|
| 2267 |
-
if
|
| 2268 |
-
|
| 2269 |
else:
|
| 2270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2271 |
elif pos_key == "adjective" or "Adjektiv" in pos_title or "Deklinierte Form" in pos_title:
|
| 2272 |
-
|
| 2273 |
-
|
| 2274 |
-
|
| 2275 |
-
else:
|
| 2276 |
-
pattern_block = pattern_analyze_as_adjective(lemma)
|
| 2277 |
elif pos_key == "adverb":
|
| 2278 |
pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 2279 |
except Exception as e:
|
| 2280 |
-
|
|
|
|
| 2281 |
|
| 2282 |
-
# --- C.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2283 |
semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
|
| 2284 |
|
| 2285 |
-
# ---
|
| 2286 |
pos_entry_report = {
|
| 2287 |
"inflections_wiktionary": inflections_wikt_block,
|
| 2288 |
-
"inflections_pattern": pattern_block,
|
| 2289 |
"semantics_combined": semantics_block,
|
| 2290 |
"wiktionary_metadata": {
|
| 2291 |
-
# --- Original Fields ---
|
| 2292 |
"pos_title": pos_title,
|
| 2293 |
"etymology": wikt_report.get("etymology_text"),
|
| 2294 |
"pronunciation": wikt_report.get("sounds"),
|
|
@@ -2296,8 +2396,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2296 |
"examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
|
| 2297 |
"entry_tags": wikt_report.get("entry_tags"),
|
| 2298 |
"entry_categories": wikt_report.get("entry_categories"),
|
| 2299 |
-
|
| 2300 |
-
# Pass through all new fields from the full DB ---
|
| 2301 |
"entry_notes": wikt_report.get("entry_notes"),
|
| 2302 |
"other_pos": wikt_report.get("other_pos"),
|
| 2303 |
"raw_tags": wikt_report.get("raw_tags"),
|
|
@@ -2307,50 +2406,46 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 2307 |
"holonyms": wikt_report.get("holonyms"),
|
| 2308 |
"meronyms": wikt_report.get("meronyms"),
|
| 2309 |
"coordinate_terms": wikt_report.get("coordinate_terms"),
|
| 2310 |
-
# We are now correctly getting the data we queried earlier.
|
| 2311 |
"expressions": wikt_report.get("expressions"),
|
| 2312 |
"proverbs": wikt_report.get("proverbs")
|
| 2313 |
-
|
| 2314 |
}
|
| 2315 |
}
|
| 2316 |
|
| 2317 |
-
# ---
|
| 2318 |
is_valid = False
|
| 2319 |
is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
|
| 2320 |
|
| 2321 |
-
# Check 1:
|
| 2322 |
-
# This is true for base form entries (e.g., "Haus" -> "Haus (Substantiv)")
|
| 2323 |
-
# AND for inflected form entries (e.g., "gießt" -> "gießt (Konjugierte Form)")
|
| 2324 |
if lemma.lower() == word_lower:
|
| 2325 |
is_valid = True
|
| 2326 |
-
log(f"[DEBUG]
|
| 2327 |
|
| 2328 |
-
# Check 2:
|
| 2329 |
-
# (This applies to base entries where the input is an inflection, e.g., "gießt" -> "gehen (Verb)")
|
| 2330 |
-
# We only run this if Check 1 failed AND this is not an inflected entry (which have no forms)
|
| 2331 |
if not is_valid and not is_inflected_entry:
|
| 2332 |
-
|
|
|
|
| 2333 |
form_text = form_entry.get("form_text", "")
|
| 2334 |
-
|
| 2335 |
-
|
| 2336 |
-
|
| 2337 |
-
|
| 2338 |
-
if bare_form.lower() == word_lower:
|
| 2339 |
is_valid = True
|
| 2340 |
-
log(f"[DEBUG]
|
| 2341 |
break
|
| 2342 |
-
|
| 2343 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2344 |
if is_valid:
|
| 2345 |
if pos_key not in final_result["analysis"]:
|
| 2346 |
final_result["analysis"][pos_key] = []
|
| 2347 |
final_result["analysis"][pos_key].append(pos_entry_report)
|
| 2348 |
else:
|
| 2349 |
-
log(f"[DEBUG]
|
| 2350 |
-
|
| 2351 |
-
# --- END OF VALIDATION ---
|
| 2352 |
|
| 2353 |
-
final_result["info"] = f"Analysis from Wiktionary (
|
| 2354 |
return final_result
|
| 2355 |
|
| 2356 |
# ============================================================================
|
|
@@ -3383,172 +3478,302 @@ HTML_CSS = """
|
|
| 3383 |
"""
|
| 3384 |
|
| 3385 |
def _format_word_analysis_html(data: Dict[str, Any]) -> str:
|
| 3386 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3387 |
if not data or "analysis" not in data:
|
| 3388 |
return f"{HTML_CSS}<div class='ling-card'>Keine Daten verfügbar. {data.get('info', '')}</div>"
|
| 3389 |
|
| 3390 |
html = HTML_CSS
|
| 3391 |
analysis = data["analysis"]
|
| 3392 |
|
| 3393 |
-
# Iterate over POS
|
| 3394 |
for pos_key, entries in analysis.items():
|
| 3395 |
if not entries: continue
|
| 3396 |
-
entry = entries[0] # Take best candidate
|
| 3397 |
-
|
| 3398 |
-
# --- POS Display Logic ---
|
| 3399 |
-
display_pos = pos_key.upper()
|
| 3400 |
-
css_class = "pos-other"
|
| 3401 |
-
|
| 3402 |
-
if pos_key == 'noun':
|
| 3403 |
-
css_class = "pos-noun"
|
| 3404 |
-
display_pos = "SUBSTANTIV"
|
| 3405 |
-
elif pos_key == 'verb':
|
| 3406 |
-
css_class = "pos-verb"
|
| 3407 |
-
display_pos = "VERB"
|
| 3408 |
-
elif pos_key == 'adj' or pos_key == 'adjective':
|
| 3409 |
-
css_class = "pos-adj"
|
| 3410 |
-
display_pos = "ADJEKTIV"
|
| 3411 |
-
elif pos_key == 'adv' or pos_key == 'adverb':
|
| 3412 |
-
css_class = "pos-adv"
|
| 3413 |
-
display_pos = "ADVERB"
|
| 3414 |
|
| 3415 |
-
#
|
| 3416 |
-
|
| 3417 |
-
|
| 3418 |
-
|
| 3419 |
-
|
| 3420 |
-
|
| 3421 |
-
|
| 3422 |
-
|
| 3423 |
-
|
| 3424 |
-
|
| 3425 |
-
|
| 3426 |
-
html += f"""
|
| 3427 |
-
<div class="ling-card">
|
| 3428 |
-
<div class="ling-header">
|
| 3429 |
-
<span class="ling-lemma">{lemma}</span>
|
| 3430 |
-
<span class="ling-pos {css_class}">{display_pos}</span>
|
| 3431 |
-
</div>
|
| 3432 |
-
"""
|
| 3433 |
-
|
| 3434 |
-
# --- Inflections Section (Pattern.de logic) ---
|
| 3435 |
-
html += "<div class='ling-section'><div class='ling-subtitle'>Morphologie & Flexion</div>"
|
| 3436 |
-
html += "<table class='inflection-table'>"
|
| 3437 |
-
|
| 3438 |
-
has_pattern_data = bool(inf_pat) and "error" not in inf_pat
|
| 3439 |
-
|
| 3440 |
-
if pos_key == 'noun':
|
| 3441 |
-
# Pattern.de returns 'declension' or 'declension_by_gender'
|
| 3442 |
-
decl = inf_pat.get('declension')
|
| 3443 |
-
# Fallback if declension is inside gender key
|
| 3444 |
-
if not decl and inf_pat.get('declension_by_gender'):
|
| 3445 |
-
first_gender = list(inf_pat['declension_by_gender'].keys())[0]
|
| 3446 |
-
decl = inf_pat['declension_by_gender'][first_gender]
|
| 3447 |
|
| 3448 |
-
|
| 3449 |
-
|
| 3450 |
-
|
| 3451 |
-
|
| 3452 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3453 |
|
| 3454 |
-
|
| 3455 |
-
|
| 3456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3457 |
|
| 3458 |
-
|
| 3459 |
-
html += f"<tr><td class='inflection-label'>
|
| 3460 |
-
|
| 3461 |
-
html += f"<tr><td
|
| 3462 |
-
|
| 3463 |
-
|
| 3464 |
-
|
| 3465 |
-
|
| 3466 |
-
|
| 3467 |
-
|
| 3468 |
|
| 3469 |
-
|
| 3470 |
-
|
| 3471 |
-
|
| 3472 |
-
html +=
|
| 3473 |
-
|
| 3474 |
-
|
| 3475 |
-
|
| 3476 |
-
|
| 3477 |
-
|
| 3478 |
-
|
| 3479 |
-
|
| 3480 |
-
|
| 3481 |
-
|
| 3482 |
-
|
| 3483 |
-
|
| 3484 |
-
|
| 3485 |
-
|
| 3486 |
|
| 3487 |
-
|
| 3488 |
-
html += f"<tr><td class='inflection-label'>Weitere Formen (DB)</td><td>{', '.join(forms_str_list)}</td></tr>"
|
| 3489 |
-
|
| 3490 |
-
html += "</table></div>"
|
| 3491 |
-
|
| 3492 |
-
# --- Semantics Section ---
|
| 3493 |
-
html += "<div class='ling-section'><div class='ling-subtitle'>Bedeutungen & Definitionen</div>"
|
| 3494 |
-
|
| 3495 |
-
wikt_senses = sem_comb.get("wiktionary_senses") or []
|
| 3496 |
-
ode_senses = sem_comb.get("odenet_senses") or []
|
| 3497 |
-
|
| 3498 |
-
if not wikt_senses and not ode_senses:
|
| 3499 |
-
html += "<div class='sense-item'><i>Keine Definitionen gefunden.</i></div>"
|
| 3500 |
-
|
| 3501 |
-
for s in wikt_senses[:3]:
|
| 3502 |
-
gloss_raw = s.get("definition") or ""
|
| 3503 |
-
gloss = str(gloss_raw).replace(";", "<br>")
|
| 3504 |
-
if gloss:
|
| 3505 |
-
html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"
|
| 3506 |
|
| 3507 |
-
|
| 3508 |
-
|
| 3509 |
-
if defi:
|
| 3510 |
-
html += f"<div class='sense-item'><span class='source-badge src-oewn'>OdeNet</span> {defi}</div>"
|
| 3511 |
|
| 3512 |
-
|
| 3513 |
-
|
| 3514 |
-
# --- Relations Section ---
|
| 3515 |
-
rels = sem_comb.get("conceptnet_relations") or []
|
| 3516 |
-
if rels:
|
| 3517 |
-
html += "<div class='ling-section'><div class='ling-subtitle'>Wissensgraph (Kontext)</div>"
|
| 3518 |
|
| 3519 |
-
|
| 3520 |
-
|
| 3521 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3522 |
|
| 3523 |
-
|
| 3524 |
-
|
| 3525 |
-
|
| 3526 |
-
if
|
| 3527 |
-
|
| 3528 |
-
|
| 3529 |
-
return f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
|
| 3530 |
-
|
| 3531 |
-
html += "<div>"
|
| 3532 |
-
for r in visible_rels:
|
| 3533 |
-
html += render_rel(r)
|
| 3534 |
html += "</div>"
|
| 3535 |
-
|
| 3536 |
-
if hidden_rels:
|
| 3537 |
-
html += f"""
|
| 3538 |
-
<details class='kg-details'>
|
| 3539 |
-
<summary>Zeige {len(hidden_rels)} weitere Relationen</summary>
|
| 3540 |
-
<div class='kg-content'>
|
| 3541 |
-
"""
|
| 3542 |
-
for r in hidden_rels:
|
| 3543 |
-
html += render_rel(r)
|
| 3544 |
-
html += "</div></details>"
|
| 3545 |
|
| 3546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3547 |
|
| 3548 |
-
|
| 3549 |
|
| 3550 |
return html
|
| 3551 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3552 |
def _format_comprehensive_html(data: Dict[str, Any]) -> str:
|
| 3553 |
""" Generates HTML for the comprehensive sentence analysis. """
|
| 3554 |
if "error" in data:
|
|
|
|
| 951 |
|
| 952 |
|
| 953 |
# --- Inflection Generators ---
|
| 954 |
+
def pattern_analyze_as_noun(word: str, hint_lemma: str = None, fixed_gender: int = None) -> Dict[str, Any]:
|
| 955 |
+
"""
|
| 956 |
+
Comprehensive noun inflection analysis.
|
| 957 |
+
Args:
|
| 958 |
+
hint_lemma: A lemma suggestion to help Pattern.
|
| 959 |
+
fixed_gender: A pattern.de constant (MALE, FEMALE, NEUTRAL) to FORCE a specific gender.
|
| 960 |
+
"""
|
| 961 |
+
log(f" Analyzing as noun (hint_lemma={hint_lemma}, fixed_gender={fixed_gender})")
|
| 962 |
analysis = {}
|
| 963 |
+
|
| 964 |
+
# 1. Determine Base Form
|
| 965 |
singular = singularize(word)
|
| 966 |
plural = pluralize(word)
|
| 967 |
+
|
|
|
|
| 968 |
if plural != word and singular != word:
|
| 969 |
base = word
|
|
|
|
| 970 |
elif singular != word:
|
| 971 |
base = singular
|
|
|
|
| 972 |
elif hint_lemma and hint_lemma != word:
|
| 973 |
base = hint_lemma
|
|
|
|
| 974 |
else:
|
|
|
|
| 975 |
base = word
|
|
|
|
| 976 |
|
| 977 |
+
# 2. Determine Gender
|
| 978 |
+
# If Wiktionary gave us a gender, USE IT. Ignore Pattern's internal dictionary.
|
| 979 |
+
if fixed_gender is not None:
|
| 980 |
+
genders = [fixed_gender]
|
| 981 |
+
log(f" [Pattern] Enforcing gender from DB: {fixed_gender}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
else:
|
| 983 |
+
# Fallback to auto-detection
|
| 984 |
+
g = gender(base, pos=NOUN)
|
| 985 |
+
if isinstance(g, tuple):
|
| 986 |
+
genders = list(g)
|
| 987 |
+
elif g is None:
|
| 988 |
+
genders = [MALE]
|
| 989 |
+
else:
|
| 990 |
+
genders = [g]
|
| 991 |
|
| 992 |
analysis["base_form"] = base
|
| 993 |
analysis["plural"] = pluralize(base)
|
| 994 |
analysis["singular"] = base
|
| 995 |
analysis["declension_by_gender"] = {}
|
| 996 |
|
| 997 |
+
# 3. Generate Declensions
|
| 998 |
for gen in genders:
|
| 999 |
gender_str = {MALE: "Masculine", FEMALE: "Feminine", NEUTRAL: "Neuter"}.get(gen, "Unknown")
|
| 1000 |
gen_declension = {}
|
| 1001 |
+
|
| 1002 |
for number, number_name in [(SINGULAR, "Singular"), (PLURAL, "Plural")]:
|
| 1003 |
word_form = base if number == SINGULAR else pluralize(base)
|
| 1004 |
word_form_cap = word_form.capitalize()
|
| 1005 |
gender_for_article = gen if number == SINGULAR else PLURAL
|
| 1006 |
+
|
| 1007 |
+
for case, case_name in [(NOMINATIVE, "Nominativ"), (ACCUSATIVE, "Akkusativ"),
|
| 1008 |
(DATIVE, "Dativ"), (GENITIVE, "Genitiv")]:
|
| 1009 |
try:
|
| 1010 |
def_art = article(word_form, DEFINITE, gender_for_article, case)
|
| 1011 |
indef_art = article(word_form, INDEFINITE, gender_for_article, case)
|
| 1012 |
+
|
| 1013 |
indef_form = f"{indef_art} {word_form_cap}" if indef_art else word_form_cap
|
| 1014 |
+
if number == PLURAL: indef_form = "—"
|
| 1015 |
+
|
| 1016 |
+
# Fix for Pattern sometimes missing Genitive 's' suffix on Masculine/Neuter
|
| 1017 |
+
noun_text = word_form_cap
|
| 1018 |
+
if number == SINGULAR and case == GENITIVE and gen in [MALE, NEUTRAL] and not noun_text.endswith("s") and not noun_text.endswith("x") and not noun_text.endswith("z"):
|
| 1019 |
+
# Simple heuristic fix: German Genitive usually adds 's' or 'es'
|
| 1020 |
+
# Pattern handles this usually, but if we force gender on a word Pattern doesn't know, it might miss it.
|
| 1021 |
+
# For safety, we trust Pattern's output, but if you find Pattern fails here, you inject logic here.
|
| 1022 |
+
pass
|
| 1023 |
+
|
| 1024 |
gen_declension[f"{case_name} {number_name}"] = {
|
| 1025 |
+
"definite": f"{def_art} {noun_text}" if def_art else noun_text,
|
| 1026 |
"indefinite": indef_form,
|
| 1027 |
+
"bare": noun_text
|
| 1028 |
}
|
| 1029 |
except Exception as e:
|
| 1030 |
log(f" Failed to get article for {gender_str}/{case_name} {number_name}: {e}")
|
| 1031 |
+
|
| 1032 |
analysis["declension_by_gender"][gender_str] = gen_declension
|
| 1033 |
|
| 1034 |
+
# Flatten for the main keys if only one gender exists
|
| 1035 |
if len(genders) == 1:
|
| 1036 |
+
first_gen_key = list(analysis["declension_by_gender"].keys())[0]
|
| 1037 |
+
analysis["declension"] = analysis["declension_by_gender"][first_gen_key]
|
| 1038 |
+
analysis["gender"] = first_gen_key
|
| 1039 |
|
| 1040 |
return analysis
|
| 1041 |
|
|
|
|
| 2191 |
|
| 2192 |
def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
| 2193 |
"""
|
| 2194 |
+
(PRIMARY ENGINE) Analyzes a word using the Wiktionary DB as Ground Truth,
|
| 2195 |
+
filling in missing gaps with Pattern.de generation.
|
| 2196 |
"""
|
| 2197 |
+
print(f"\n[Wiktionary Engine] Starting analysis for: {word}")
|
| 2198 |
+
|
| 2199 |
final_result: Dict[str, Any] = {
|
| 2200 |
"input_word": word,
|
| 2201 |
"analysis": {}
|
|
|
|
| 2203 |
|
| 2204 |
conn = wiktionary_get_connection()
|
| 2205 |
if not conn:
|
| 2206 |
+
log("[Wiktionary Engine] No DB connection available.")
|
| 2207 |
+
return {}
|
| 2208 |
|
| 2209 |
# --- 1. GET SPACY/IWNLP HINT FOR PRIORITIZATION ---
|
| 2210 |
spacy_pos_hint = None
|
|
|
|
| 2224 |
else: spacy_pos_hint = spacy_pos_raw
|
| 2225 |
|
| 2226 |
spacy_lemma_hint = token.lemma_
|
| 2227 |
+
log(f"[DEBUG] Priority Hint: spaCy POS='{spacy_pos_hint}', Lemma='{spacy_lemma_hint}'")
|
| 2228 |
except Exception as e:
|
| 2229 |
+
log(f"[DEBUG] Priority Hint failed: {e}")
|
| 2230 |
|
| 2231 |
# --- 2. FIND ALL WIKTIONARY ENTRIES ---
|
| 2232 |
try:
|
| 2233 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
| 2234 |
except Exception as e:
|
| 2235 |
log(f"[DEBUG] Wiktionary query failed: {e}")
|
| 2236 |
+
return {}
|
| 2237 |
|
| 2238 |
if not wiktionary_reports:
|
| 2239 |
+
log(f"[DEBUG] No Wiktionary entries found for '{word}'.")
|
| 2240 |
+
return {}
|
| 2241 |
|
| 2242 |
# --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
|
| 2243 |
def get_priority_score(report):
|
| 2244 |
wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
|
| 2245 |
wikt_lemma = report.get("lemma")
|
|
|
|
| 2246 |
# Priority 1: Exact POS match with spaCy hint
|
| 2247 |
if spacy_pos_hint and wikt_pos == spacy_pos_hint:
|
| 2248 |
+
if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint: return 1
|
|
|
|
|
|
|
| 2249 |
return 2
|
| 2250 |
+
# Priority 2: Input word is the lemma
|
| 2251 |
+
if wikt_lemma and wikt_lemma.lower() == word.lower(): return 3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2252 |
return 4
|
| 2253 |
|
| 2254 |
wiktionary_reports.sort(key=get_priority_score)
|
| 2255 |
+
log(f"[DEBUG] Sorted {len(wiktionary_reports)} entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
|
| 2256 |
|
| 2257 |
+
# --- 4. PROCESS ENTRIES (HYBRID STRATEGY) ---
|
|
|
|
| 2258 |
word_lower = word.lower()
|
| 2259 |
|
| 2260 |
for wikt_report in wiktionary_reports:
|
|
|
|
| 2262 |
lemma = wikt_report.get("lemma", word)
|
| 2263 |
pos_title = wikt_report.get("pos_title", "")
|
| 2264 |
|
| 2265 |
+
log(f"\n--- Processing Entry: {lemma} ({pos_key}) ---")
|
| 2266 |
+
|
| 2267 |
+
# --- A. Raw Wiktionary Forms (Ground Truth) ---
|
| 2268 |
+
wikt_forms_list = wikt_report.get("forms", [])
|
| 2269 |
inflections_wikt_block = {
|
| 2270 |
"base_form": lemma,
|
| 2271 |
+
"forms_list": wikt_forms_list,
|
| 2272 |
"source": "wiktionary"
|
| 2273 |
}
|
| 2274 |
|
| 2275 |
+
# --- B. Generate Base Pattern Template (The Scaffold) ---
|
| 2276 |
+
# We ALWAYS generate this if Pattern is available, to provide the table structure.
|
| 2277 |
pattern_block = {}
|
| 2278 |
+
|
| 2279 |
if PATTERN_DE_AVAILABLE:
|
| 2280 |
try:
|
| 2281 |
+
log(f"[DEBUG] Generating Pattern.de base template for '{lemma}' ({pos_key})...")
|
| 2282 |
if pos_key == "noun" or "Substantiv" in pos_title:
|
| 2283 |
+
# Gender-Aware Generation
|
| 2284 |
+
wikt_tags = wikt_report.get("entry_tags", [])
|
| 2285 |
+
forced_gender = _map_wikt_gender_to_pattern(wikt_tags)
|
| 2286 |
+
if forced_gender:
|
| 2287 |
+
log(f"[DEBUG] Context: Forcing Pattern gender to {forced_gender} based on Wiktionary tags.")
|
| 2288 |
else:
|
| 2289 |
+
log(f"[DEBUG] Context: No gender tags in Wiktionary. Letting Pattern auto-detect.")
|
| 2290 |
+
|
| 2291 |
+
pattern_block = pattern_analyze_as_noun(lemma, fixed_gender=forced_gender)
|
| 2292 |
+
|
| 2293 |
+
elif pos_key == "verb" or "Verb" in pos_title or "Konjugierte Form" in pos_title:
|
| 2294 |
+
use_word = word if "Konjugierte Form" in pos_title else lemma
|
| 2295 |
+
pattern_block = pattern_analyze_as_verb(use_word)
|
| 2296 |
+
|
| 2297 |
elif pos_key == "adjective" or "Adjektiv" in pos_title or "Deklinierte Form" in pos_title:
|
| 2298 |
+
use_word = word if "Deklinierte Form" in pos_title else lemma
|
| 2299 |
+
pattern_block = pattern_analyze_as_adjective(use_word)
|
| 2300 |
+
|
|
|
|
|
|
|
| 2301 |
elif pos_key == "adverb":
|
| 2302 |
pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 2303 |
except Exception as e:
|
| 2304 |
+
log(f"[ERROR] Pattern.de generation failed: {e}")
|
| 2305 |
+
pattern_block = {"error": f"Pattern.de failed: {e}"}
|
| 2306 |
|
| 2307 |
+
# --- C. THE HYBRID MERGE: Overwrite Pattern data with Wiktionary Truth ---
|
| 2308 |
+
# logic: If Wiktionary has a form for a specific slot, use it.
|
| 2309 |
+
# If not, keep the Pattern generated form (thereby filling the gap).
|
| 2310 |
+
|
| 2311 |
+
if pattern_block and "error" not in pattern_block and wikt_forms_list:
|
| 2312 |
+
log(f"[DEBUG] Starting Hybrid Merge (Wiktionary forms: {len(wikt_forms_list)})...")
|
| 2313 |
+
|
| 2314 |
+
overwrites_count = 0
|
| 2315 |
+
|
| 2316 |
+
for wikt_form in wikt_forms_list:
|
| 2317 |
+
text = wikt_form.get("form_text")
|
| 2318 |
+
tags = wikt_form.get("tags")
|
| 2319 |
+
if not text or not tags: continue
|
| 2320 |
+
|
| 2321 |
+
# Map Wikt tags to the address inside pattern_block
|
| 2322 |
+
path_keys = _map_wikt_form_to_pattern_keys(pos_key, tags)
|
| 2323 |
+
|
| 2324 |
+
if path_keys:
|
| 2325 |
+
# Navigate to the slot in pattern_block
|
| 2326 |
+
target = pattern_block
|
| 2327 |
+
|
| 2328 |
+
# Special handling for Noun structure (declension_by_gender)
|
| 2329 |
+
if pos_key == "noun" and "declension_by_gender" in pattern_block:
|
| 2330 |
+
# We apply the overwrite to ALL genders present in the pattern block
|
| 2331 |
+
# (Usually only 1 if we forced it, but maybe more if ambiguous)
|
| 2332 |
+
for gender_key in pattern_block["declension_by_gender"]:
|
| 2333 |
+
# path_keys[0] is e.g. "Nominativ Singular"
|
| 2334 |
+
slot_key = path_keys[0]
|
| 2335 |
+
target_dict = pattern_block["declension_by_gender"][gender_key]
|
| 2336 |
+
|
| 2337 |
+
if slot_key in target_dict:
|
| 2338 |
+
# Noun slots have subkeys: 'bare', 'definite', 'indefinite'
|
| 2339 |
+
# Wiktionary usually gives the form with article "der See" or without "Seen"
|
| 2340 |
+
# We try to be smart about updating 'bare' vs 'definite'
|
| 2341 |
+
|
| 2342 |
+
current_bare = target_dict[slot_key].get('bare', '')
|
| 2343 |
+
|
| 2344 |
+
# Simple clean: remove articles to get bare
|
| 2345 |
+
clean_text = re.sub(r"^(der|die|das|den|dem|des|ein|eine|einen|einem|einer|eines)\s+", "", text, flags=re.IGNORECASE).strip()
|
| 2346 |
+
|
| 2347 |
+
if clean_text != current_bare:
|
| 2348 |
+
log(f"[DEBUG] Merge: Overwriting {gender_key} -> {slot_key} | Old: '{current_bare}' -> New: '{clean_text}' (Source: Wiktionary)")
|
| 2349 |
+
target_dict[slot_key]['bare'] = clean_text
|
| 2350 |
+
# Also update full forms if possible
|
| 2351 |
+
if "definite" in target_dict[slot_key]:
|
| 2352 |
+
# We can reconstruct definite if we know the article, but let's just trust the bare text update
|
| 2353 |
+
# because the HTML renderer often rebuilds the article.
|
| 2354 |
+
# However, let's update 'definite' if the wikt text looks like it has an article
|
| 2355 |
+
if " " in text:
|
| 2356 |
+
target_dict[slot_key]['definite'] = text
|
| 2357 |
+
overwrites_count += 1
|
| 2358 |
+
|
| 2359 |
+
# Handling for Verbs/Adjectives (Nested Dicts)
|
| 2360 |
+
else:
|
| 2361 |
+
# Navigate deep
|
| 2362 |
+
valid_path = True
|
| 2363 |
+
for key in path_keys[:-1]:
|
| 2364 |
+
if key in target:
|
| 2365 |
+
target = target[key]
|
| 2366 |
+
else:
|
| 2367 |
+
valid_path = False
|
| 2368 |
+
break
|
| 2369 |
+
|
| 2370 |
+
if valid_path:
|
| 2371 |
+
last_key = path_keys[-1]
|
| 2372 |
+
if last_key in target and target[last_key] != text:
|
| 2373 |
+
log(f"[DEBUG] Merge: Overwriting {path_keys} | Old: '{target[last_key]}' -> New: '{text}' (Source: Wiktionary)")
|
| 2374 |
+
target[last_key] = text
|
| 2375 |
+
overwrites_count += 1
|
| 2376 |
+
|
| 2377 |
+
log(f"[DEBUG] Merge complete. {overwrites_count} slots updated with Ground Truth.")
|
| 2378 |
+
# Mark the block as hybrid so UI can verify validity
|
| 2379 |
+
pattern_block["is_hybrid"] = True
|
| 2380 |
+
|
| 2381 |
+
# --- D. Build Semantics Block ---
|
| 2382 |
+
# Use lemma from Wiktionary (Ground Truth)
|
| 2383 |
+
semantics_lemma = lemma
|
| 2384 |
semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
|
| 2385 |
|
| 2386 |
+
# --- E. Assemble Final Report ---
|
| 2387 |
pos_entry_report = {
|
| 2388 |
"inflections_wiktionary": inflections_wikt_block,
|
| 2389 |
+
"inflections_pattern": pattern_block, # This is now the Hybrid Block
|
| 2390 |
"semantics_combined": semantics_block,
|
| 2391 |
"wiktionary_metadata": {
|
|
|
|
| 2392 |
"pos_title": pos_title,
|
| 2393 |
"etymology": wikt_report.get("etymology_text"),
|
| 2394 |
"pronunciation": wikt_report.get("sounds"),
|
|
|
|
| 2396 |
"examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
|
| 2397 |
"entry_tags": wikt_report.get("entry_tags"),
|
| 2398 |
"entry_categories": wikt_report.get("entry_categories"),
|
| 2399 |
+
# New fields
|
|
|
|
| 2400 |
"entry_notes": wikt_report.get("entry_notes"),
|
| 2401 |
"other_pos": wikt_report.get("other_pos"),
|
| 2402 |
"raw_tags": wikt_report.get("raw_tags"),
|
|
|
|
| 2406 |
"holonyms": wikt_report.get("holonyms"),
|
| 2407 |
"meronyms": wikt_report.get("meronyms"),
|
| 2408 |
"coordinate_terms": wikt_report.get("coordinate_terms"),
|
|
|
|
| 2409 |
"expressions": wikt_report.get("expressions"),
|
| 2410 |
"proverbs": wikt_report.get("proverbs")
|
|
|
|
| 2411 |
}
|
| 2412 |
}
|
| 2413 |
|
| 2414 |
+
# --- F. Validation Filter ---
|
| 2415 |
is_valid = False
|
| 2416 |
is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
|
| 2417 |
|
| 2418 |
+
# Check 1: Lemma Match
|
|
|
|
|
|
|
| 2419 |
if lemma.lower() == word_lower:
|
| 2420 |
is_valid = True
|
| 2421 |
+
log(f"[DEBUG] Validate: Accepted '{lemma}' (Lemma Match)")
|
| 2422 |
|
| 2423 |
+
# Check 2: Form Match
|
|
|
|
|
|
|
| 2424 |
if not is_valid and not is_inflected_entry:
|
| 2425 |
+
# Look in Ground Truth (Wiktionary)
|
| 2426 |
+
for form_entry in wikt_forms_list:
|
| 2427 |
form_text = form_entry.get("form_text", "")
|
| 2428 |
+
clean_form = re.sub(r"\(.*\)", "", form_text).strip() # Remove parens
|
| 2429 |
+
clean_form = re.sub(r"^(der|die|das|ein|eine|...)\s+", "", clean_form, flags=re.IGNORECASE).strip() # Remove articles
|
| 2430 |
+
if word_lower in clean_form.lower():
|
|
|
|
|
|
|
| 2431 |
is_valid = True
|
| 2432 |
+
log(f"[DEBUG] Validate: Accepted '{lemma}' (Found in Wiktionary forms)")
|
| 2433 |
break
|
| 2434 |
+
|
| 2435 |
+
# Look in Pattern Generation (if Wikt failed)
|
| 2436 |
+
if not is_valid and pattern_block:
|
| 2437 |
+
if word_appears_in_inflections(word, pattern_block, pos_key):
|
| 2438 |
+
is_valid = True
|
| 2439 |
+
log(f"[DEBUG] Validate: Accepted '{lemma}' (Found in Pattern forms)")
|
| 2440 |
+
|
| 2441 |
if is_valid:
|
| 2442 |
if pos_key not in final_result["analysis"]:
|
| 2443 |
final_result["analysis"][pos_key] = []
|
| 2444 |
final_result["analysis"][pos_key].append(pos_entry_report)
|
| 2445 |
else:
|
| 2446 |
+
log(f"[DEBUG] Validate: Dropped '{lemma}' ({pos_key}) - No match found.")
|
|
|
|
|
|
|
| 2447 |
|
| 2448 |
+
final_result["info"] = f"Analysis from Wiktionary (Hybrid Engine). Found {len(wiktionary_reports)} entries."
|
| 2449 |
return final_result
|
| 2450 |
|
| 2451 |
# ============================================================================
|
|
|
|
| 3478 |
"""
|
| 3479 |
|
| 3480 |
def _format_word_analysis_html(data: Dict[str, Any]) -> str:
    """
    Generates HTML for a single word analysis (German version).

    Renders the 'inflections_pattern' block, which contains the
    Hybrid (Wiktionary-verified) data from the backend.

    Args:
        data: Backend analysis result. Reads ``data["analysis"]`` (a mapping
            of POS key -> list of entry dicts), plus ``data["input_word"]``
            and ``data["info"]`` as fallbacks.
            NOTE(review): the per-entry schema (keys like
            "inflections_wiktionary", "semantics_combined") is assumed from
            usage below — confirm against the backend that builds it.

    Returns:
        A single HTML string (CSS prefix + one "ling-card" per entry).
    """
    # Guard: no data at all, or backend returned no "analysis" mapping.
    if not data or "analysis" not in data:
        return f"{HTML_CSS}<div class='ling-card'>Keine Daten verfügbar. {data.get('info', '')}</div>"

    html = HTML_CSS
    analysis = data["analysis"]

    # Iterate over POS categories (noun, verb, etc.)
    for pos_key, entries in analysis.items():
        if not entries: continue

        # We usually display the best candidate, but if there are multiple distinct entries
        # (like "der See" vs "die See"), the backend groups them in the list.
        # We should ideally render ALL entries in the list to show the homonyms.
        # This loop handles that.
        for entry in entries:

            # Data Extraction — each sub-dict is optional; fall back to {}
            # so every .get() below is safe.
            inf_wikt = entry.get("inflections_wiktionary") or {}
            inf_pat = entry.get("inflections_pattern") or {}
            sem_comb = entry.get("semantics_combined") or {}
            meta = entry.get("wiktionary_metadata") or {}

            # Lemma preference order: Wiktionary > pattern > semantics > raw input.
            lemma = inf_wikt.get("base_form") or \
                    inf_pat.get("base_form") or \
                    sem_comb.get("lemma") or \
                    data.get("input_word") or "?"

            # --- POS Display Logic ---
            display_pos = pos_key.upper()
            css_class = "pos-other"

            if pos_key == 'noun':
                css_class = "pos-noun"
                display_pos = "SUBSTANTIV"
                # Append Gender to POS badge if available
                if "gender" in inf_pat:
                    gender_map = {"Masculine": "M", "Feminine": "F", "Neuter": "N"}
                    g_short = gender_map.get(inf_pat['gender'], "?")
                    display_pos += f" ({g_short})"

            elif pos_key == 'verb':
                css_class = "pos-verb"
                display_pos = "VERB"
            elif pos_key == 'adj' or pos_key == 'adjective':
                css_class = "pos-adj"
                display_pos = "ADJEKTIV"
            elif pos_key == 'adv' or pos_key == 'adverb':
                css_class = "pos-adv"
                display_pos = "ADVERB"

            # --- CARD START ---
            html += f"""
            <div class="ling-card">
                <div class="ling-header">
                    <span class="ling-lemma">{lemma}</span>
                    <span class="ling-pos {css_class}">{display_pos}</span>
            """

            # Add small title if available (e.g., "Konjugierte Form")
            if meta.get("pos_title"):
                html += f"<span style='margin-left:10px; color:#6b7280; font-size:0.85em;'>{meta['pos_title']}</span>"

            html += "</div>" # End Header

            # --- SOURCE BADGE LOGIC ---
            # Determine credibility of the data: hybrid (pattern verified
            # against Wiktionary) > raw Wiktionary DB forms > pure pattern.
            is_hybrid = inf_pat.get("is_hybrid", False)
            wikt_forms_count = len(inf_wikt.get("forms_list", []))

            badge_style = "float:right; font-weight:bold; font-size:0.75em; padding:2px 6px; border-radius:4px;"

            if is_hybrid:
                source_html = f"<span style='{badge_style} background:#ecfdf5; color:#065f46; border:1px solid #a7f3d0;'>Quelle: Wiktionary (Verifiziert)</span>"
            elif wikt_forms_count > 0:
                source_html = f"<span style='{badge_style} background:#ecfdf5; color:#065f46; border:1px solid #a7f3d0;'>Quelle: Wiktionary (DB)</span>"
            elif inf_pat and "error" not in inf_pat:
                source_html = f"<span style='{badge_style} background:#fffbeb; color:#92400e; border:1px solid #fcd34d;'>Quelle: Pattern (Generiert)</span>"
            else:
                source_html = ""

            # --- INFLECTIONS SECTION ---
            html += f"<div class='ling-section'>{source_html}<div class='ling-subtitle'>Morphologie & Flexion</div>"
            html += "<table class='inflection-table'>"

            # We render the table based on 'inf_pat' because the backend has already merged
            # the Wiktionary truths into this structure.

            if pos_key == 'noun':
                decl = inf_pat.get('declension')
                # Fallback if declension is nested in gender key
                if not decl and inf_pat.get('declension_by_gender'):
                    # If we have a specific gender from the analysis, try to grab that specific table
                    target_gender = inf_pat.get("gender")
                    if target_gender and target_gender in inf_pat['declension_by_gender']:
                        decl = inf_pat['declension_by_gender'][target_gender]
                    else:
                        # Fallback: take the first available
                        first_gender = list(inf_pat['declension_by_gender'].keys())[0]
                        decl = inf_pat['declension_by_gender'][first_gender]

                if decl:
                    # Noun Table Rows — each cell carries the article-marked
                    # ("definite") form; '-' when a case/number slot is missing.
                    nom_sg = decl.get('Nominativ Singular', {}).get('definite', '-')
                    nom_pl = decl.get('Nominativ Plural', {}).get('definite', '-')
                    gen_sg = decl.get('Genitiv Singular', {}).get('definite', '-')
                    dat_pl = decl.get('Dativ Plural', {}).get('definite', '-')

                    html += f"<tr><td class='inflection-label'>Nom. Singular</td><td>{nom_sg}</td></tr>"
                    html += f"<tr><td class='inflection-label'>Nom. Plural</td><td>{nom_pl}</td></tr>"
                    html += f"<tr><td class='inflection-label'>Gen. Singular</td><td>{gen_sg}</td></tr>"
                    html += f"<tr><td class='inflection-label'>Dat. Plural</td><td>{dat_pl}</td></tr>"
                else:
                    html += f"<tr><td colspan='2'><i>Keine Flexionsdaten verfügbar.</i></td></tr>"

            elif pos_key == 'verb':
                cj = inf_pat.get('conjugation') or {}
                pres = cj.get('Präsens') or {}
                past = cj.get('Präteritum') or {}
                parts = inf_pat.get('participles') or {}

                html += f"<tr><td class='inflection-label'>Infinitiv</td><td>{inf_pat.get('infinitive', lemma)}</td></tr>"
                html += f"<tr><td class='inflection-label'>3. Pers. Sg. (er/sie)</td><td>{pres.get('er/sie/es', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Präteritum (ich)</td><td>{past.get('ich', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Partizip II</td><td>{parts.get('Partizip Perfekt', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Konjunktiv II (ich)</td><td>{cj.get('Konjunktiv II', {}).get('ich', '-')}</td></tr>"

            elif pos_key in ['adjective', 'adj']:
                html += f"<tr><td class='inflection-label'>Positiv</td><td>{inf_pat.get('predicative', lemma)}</td></tr>"
                html += f"<tr><td class='inflection-label'>Komparativ</td><td>{inf_pat.get('comparative', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Superlativ</td><td>{inf_pat.get('superlative', '-')}</td></tr>"

            elif pos_key in ['adverb', 'adv']:
                html += f"<tr><td class='inflection-label'>Form</td><td>{lemma} (unveränderlich)</td></tr>"

            html += "</table>"

            # --- RAW FORMS FOOTER (The "Evidence") ---
            # Display the raw forms list from DB if available, as this proves the ground truth
            forms_list = inf_wikt.get("forms_list") or []
            if forms_list:
                # Deduplicate and flatten
                unique_forms = sorted(list(set([f.get('form_text') for f in forms_list if f.get('form_text')])))
                # Limit display to avoid wall of text
                display_forms = ", ".join(unique_forms[:12])
                if len(unique_forms) > 12: display_forms += f", ... ({len(unique_forms)-12} weitere)"

                html += f"<div style='font-size:0.8em; color:#6b7280; margin-top:5px;'>"
                html += f"<strong>Beobachtete Formen (DB):</strong> {display_forms}</div>"

            html += "</div>"

            # --- SEMANTICS SECTION ---
            html += "<div class='ling-section'><div class='ling-subtitle'>Bedeutungen & Definitionen</div>"

            wikt_senses = sem_comb.get("wiktionary_senses") or []
            ode_senses = sem_comb.get("odenet_senses") or []

            if not wikt_senses and not ode_senses:
                html += "<div class='sense-item'><i>Keine Definitionen gefunden.</i></div>"

            # Render Wiktionary Senses (capped at 3 to keep the card compact)
            for s in wikt_senses[:3]:
                gloss_raw = s.get("definition") or ""
                gloss = str(gloss_raw).replace(";", "<br>")
                if gloss:
                    html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"

            # Render OdeNet Senses (also capped at 3)
            for s in ode_senses[:3]:
                defi = s.get("definition") or ""
                if defi:
                    html += f"<div class='sense-item'><span class='source-badge src-oewn'>OdeNet</span> {defi}</div>"

            html += "</div>"

            # --- RELATIONS SECTION ---
            rels = sem_comb.get("conceptnet_relations") or []
            if rels:
                html += "<div class='ling-section'><div class='ling-subtitle'>Wissensgraph (Kontext)</div>"

                # First 6 relations are shown directly; the rest go into a
                # collapsible <details> element.
                top_n_rels = 6
                visible_rels = rels[:top_n_rels]
                hidden_rels = rels[top_n_rels:]

                def render_rel(r: Dict[str, Any]) -> str:
                    # Render one relation as a chip; when 'other_node' is
                    # missing, fall back to the last token of the surface text.
                    rel_name = r.get("relation", "Rel")
                    target = r.get("other_node") or "?"
                    if target == "?" and "surface" in r:
                        parts = str(r["surface"]).split()
                        if len(parts) > 2: target = parts[-1]
                    return f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"

                html += "<div>"
                for r in visible_rels:
                    html += render_rel(r)
                html += "</div>"

                if hidden_rels:
                    html += f"""
                    <details class='kg-details'>
                        <summary>Zeige {len(hidden_rels)} weitere Relationen</summary>
                        <div class='kg-content'>
                    """
                    for r in hidden_rels:
                        html += render_rel(r)
                    html += "</div></details>"

                html += "</div>"

            html += "</div>" # End Card (div.ling-card)

    return html
|
| 3698 |
|
| 3699 |
+
def _map_wikt_form_to_pattern_keys(pos_key: str, tags_str: str) -> Optional[List[str]]:
|
| 3700 |
+
"""
|
| 3701 |
+
Parses a Wiktionary tag string and returns the corresponding path keys
|
| 3702 |
+
for the Pattern.de dictionary structure.
|
| 3703 |
+
"""
|
| 3704 |
+
if not tags_str: return None
|
| 3705 |
+
t = tags_str.lower()
|
| 3706 |
+
|
| 3707 |
+
if pos_key == "noun":
|
| 3708 |
+
# Pattern Structure: [Gender] -> "Nominativ Singular" -> "bare"/"definite"
|
| 3709 |
+
case = ""
|
| 3710 |
+
if "nominative" in t: case = "Nominativ"
|
| 3711 |
+
elif "genitive" in t: case = "Genitiv"
|
| 3712 |
+
elif "dative" in t: case = "Dativ"
|
| 3713 |
+
elif "accusative" in t: case = "Akkusativ"
|
| 3714 |
+
|
| 3715 |
+
number = ""
|
| 3716 |
+
if "singular" in t: number = "Singular"
|
| 3717 |
+
elif "plural" in t: number = "Plural"
|
| 3718 |
+
|
| 3719 |
+
if case and number:
|
| 3720 |
+
return [f"{case} {number}"]
|
| 3721 |
+
|
| 3722 |
+
elif pos_key == "verb":
|
| 3723 |
+
# Pattern Structure: "conjugation" -> "Präsens" -> "ich"
|
| 3724 |
+
tense = ""
|
| 3725 |
+
if "present" in t: tense = "Präsens"
|
| 3726 |
+
elif "past" in t or "preterite" in t: tense = "Präteritum"
|
| 3727 |
+
elif "subjunctive i" in t: tense = "Konjunktiv I"
|
| 3728 |
+
elif "subjunctive ii" in t: tense = "Konjunktiv II"
|
| 3729 |
+
elif "imperative" in t: tense = "Imperativ"
|
| 3730 |
+
|
| 3731 |
+
person_key = ""
|
| 3732 |
+
if "participle" in t:
|
| 3733 |
+
if "past" in t or "perfect" in t: return ["participles", "Partizip Perfekt"]
|
| 3734 |
+
if "present" in t: return ["participles", "Partizip Präsens"]
|
| 3735 |
+
|
| 3736 |
+
if "singular" in t:
|
| 3737 |
+
if "1" in t: person_key = "ich" if tense != "Imperativ" else "du" # 1sg usually not imp, but handling safety
|
| 3738 |
+
elif "2" in t: person_key = "du"
|
| 3739 |
+
elif "3" in t: person_key = "er/sie/es"
|
| 3740 |
+
elif "plural" in t:
|
| 3741 |
+
if "1" in t: person_key = "wir"
|
| 3742 |
+
elif "2" in t: person_key = "ihr"
|
| 3743 |
+
elif "3" in t: person_key = "sie/Sie"
|
| 3744 |
+
|
| 3745 |
+
if tense and person_key:
|
| 3746 |
+
return ["conjugation", tense, person_key]
|
| 3747 |
+
|
| 3748 |
+
elif pos_key == "adjective":
|
| 3749 |
+
# Pattern Structure: "comparative", "superlative"
|
| 3750 |
+
if "comparative" in t and "predicative" in t: return ["comparative"]
|
| 3751 |
+
if "superlative" in t and "predicative" in t: return ["superlative"]
|
| 3752 |
+
if "positive" in t and "predicative" in t: return ["predicative"]
|
| 3753 |
+
|
| 3754 |
+
return None
|
| 3755 |
+
|
| 3756 |
+
def _map_wikt_gender_to_pattern(tags_list: List[str]) -> Optional[int]:
|
| 3757 |
+
"""
|
| 3758 |
+
Maps Wiktionary tag strings (e.g., 'masculine') to pattern.de constants.
|
| 3759 |
+
Returns None if no specific gender is found.
|
| 3760 |
+
"""
|
| 3761 |
+
if not tags_list:
|
| 3762 |
+
return None
|
| 3763 |
+
|
| 3764 |
+
# Flatten and normalize tags
|
| 3765 |
+
# Wiktionary often provides tags like "masculine", "feminine", "neuter"
|
| 3766 |
+
tags_lower = [str(t).lower() for t in tags_list]
|
| 3767 |
+
|
| 3768 |
+
if "masculine" in tags_lower or "m" in tags_lower:
|
| 3769 |
+
return MALE
|
| 3770 |
+
if "feminine" in tags_lower or "f" in tags_lower:
|
| 3771 |
+
return FEMALE
|
| 3772 |
+
if "neuter" in tags_lower or "n" in tags_lower:
|
| 3773 |
+
return NEUTRAL
|
| 3774 |
+
|
| 3775 |
+
return None
|
| 3776 |
+
|
| 3777 |
def _format_comprehensive_html(data: Dict[str, Any]) -> str:
|
| 3778 |
""" Generates HTML for the comprehensive sentence analysis. """
|
| 3779 |
if "error" in data:
|