cstr committed on
Commit
95d2426
·
verified ·
1 Parent(s): 55bd9f8

fix wiktionary ground truth overrides pattern guesses

Browse files
Files changed (1) hide show
  1. app.py +460 -235
app.py CHANGED
@@ -951,74 +951,91 @@ def pattern_is_good_analysis(analysis, analysis_type):
951
 
952
 
953
  # --- Inflection Generators ---
954
- def pattern_analyze_as_noun(word: str, hint_lemma: str = None) -> Dict[str, Any]:
955
- """Comprehensive noun inflection analysis."""
956
- log(f" Analyzing as noun (hint_lemma={hint_lemma})")
 
 
 
 
 
957
  analysis = {}
 
 
958
  singular = singularize(word)
959
  plural = pluralize(word)
960
- log(f" singularize({word}) = {singular}")
961
- log(f" pluralize({word}) = {plural}")
962
  if plural != word and singular != word:
963
  base = word
964
- log(f" Word changes when pluralized => base = {base}")
965
  elif singular != word:
966
  base = singular
967
- log(f" Word changes when singularized => base = {base}")
968
  elif hint_lemma and hint_lemma != word:
969
  base = hint_lemma
970
- log(f" Using hint lemma => base = {base}")
971
  else:
972
- # This is a valid case, e.g. "Lauf" (singular)
973
  base = word
974
- log(f" Word is already base form => base = {base}")
975
 
976
- g = gender(base, pos=NOUN)
977
- log(f" gender({base}) = {g}")
978
-
979
- # --- AMBIGUITY HANDLING for Nouns (e.g., der/das See) ---
980
- if isinstance(g, tuple):
981
- genders = list(g)
982
- log(f" Detected ambiguous gender: {genders}")
983
- elif g is None:
984
- genders = [MALE] # Default
985
- log(f" Gender unknown, defaulting to MALE")
986
  else:
987
- genders = [g]
 
 
 
 
 
 
 
988
 
989
  analysis["base_form"] = base
990
  analysis["plural"] = pluralize(base)
991
  analysis["singular"] = base
992
  analysis["declension_by_gender"] = {}
993
 
 
994
  for gen in genders:
995
  gender_str = {MALE: "Masculine", FEMALE: "Feminine", NEUTRAL: "Neuter"}.get(gen, "Unknown")
996
  gen_declension = {}
 
997
  for number, number_name in [(SINGULAR, "Singular"), (PLURAL, "Plural")]:
998
  word_form = base if number == SINGULAR else pluralize(base)
999
  word_form_cap = word_form.capitalize()
1000
  gender_for_article = gen if number == SINGULAR else PLURAL
1001
- for case, case_name in [(NOMINATIVE, "Nominativ"), (ACCUSATIVE, "Akkusativ"),
 
1002
  (DATIVE, "Dativ"), (GENITIVE, "Genitiv")]:
1003
  try:
1004
  def_art = article(word_form, DEFINITE, gender_for_article, case)
1005
  indef_art = article(word_form, INDEFINITE, gender_for_article, case)
 
1006
  indef_form = f"{indef_art} {word_form_cap}" if indef_art else word_form_cap
1007
- if number == PLURAL:
1008
- indef_form = "—"
 
 
 
 
 
 
 
 
1009
  gen_declension[f"{case_name} {number_name}"] = {
1010
- "definite": f"{def_art} {word_form_cap}" if def_art else word_form_cap,
1011
  "indefinite": indef_form,
1012
- "bare": word_form_cap
1013
  }
1014
  except Exception as e:
1015
  log(f" Failed to get article for {gender_str}/{case_name} {number_name}: {e}")
 
1016
  analysis["declension_by_gender"][gender_str] = gen_declension
1017
 
1018
- log(f" Generated declensions for {len(genders)} gender(s)")
1019
  if len(genders) == 1:
1020
- analysis["declension"] = analysis["declension_by_gender"][list(analysis["declension_by_gender"].keys())[0]]
1021
- analysis["gender"] = list(analysis["declension_by_gender"].keys())[0]
 
1022
 
1023
  return analysis
1024
 
@@ -2174,9 +2191,11 @@ def _wiktionary_format_semantics_block(
2174
 
2175
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2176
  """
2177
- (PRIMARY ENGINE) Analyzes a word using the Wiktionary DB.
2178
- Returns {} on failure to signal dispatcher to fall back.
2179
  """
 
 
2180
  final_result: Dict[str, Any] = {
2181
  "input_word": word,
2182
  "analysis": {}
@@ -2184,7 +2203,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2184
 
2185
  conn = wiktionary_get_connection()
2186
  if not conn:
2187
- return {} # Return empty dict to signal failure
 
2188
 
2189
  # --- 1. GET SPACY/IWNLP HINT FOR PRIORITIZATION ---
2190
  spacy_pos_hint = None
@@ -2204,44 +2224,37 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2204
  else: spacy_pos_hint = spacy_pos_raw
2205
 
2206
  spacy_lemma_hint = token.lemma_
2207
- log(f"[DEBUG] Wiktionary Priority Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
2208
  except Exception as e:
2209
- log(f"[DEBUG] Wiktionary Priority Hint: spaCy/IWNLP failed: {e}")
2210
 
2211
  # --- 2. FIND ALL WIKTIONARY ENTRIES ---
2212
  try:
2213
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
2214
  except Exception as e:
2215
  log(f"[DEBUG] Wiktionary query failed: {e}")
2216
- return {} # Signal failure
2217
 
2218
  if not wiktionary_reports:
2219
- return {} # No results, signal to fallback
 
2220
 
2221
  # --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
2222
  def get_priority_score(report):
2223
  wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
2224
  wikt_lemma = report.get("lemma")
2225
-
2226
  # Priority 1: Exact POS match with spaCy hint
2227
  if spacy_pos_hint and wikt_pos == spacy_pos_hint:
2228
- # Bonus if lemma also matches
2229
- if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint:
2230
- return 1
2231
  return 2
2232
-
2233
- # Priority 2: Input word is the lemma (e.g., "Haus" -> "Haus")
2234
- if wikt_lemma.lower() == word.lower():
2235
- return 3
2236
-
2237
- # Priority 3: Other inflected forms (e.g. "gehe" -> "gehen")
2238
  return 4
2239
 
2240
  wiktionary_reports.sort(key=get_priority_score)
2241
- log(f"[DEBUG] Wiktionary: Sorted entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
2242
 
2243
-
2244
- # --- 4. BUILD AND *VALIDATE* THE FINAL REPORT (PATH-PURE) ---
2245
  word_lower = word.lower()
2246
 
2247
  for wikt_report in wiktionary_reports:
@@ -2249,46 +2262,133 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2249
  lemma = wikt_report.get("lemma", word)
2250
  pos_title = wikt_report.get("pos_title", "")
2251
 
2252
- # --- A. Build Wiktionary Inflection Block ---
 
 
 
2253
  inflections_wikt_block = {
2254
  "base_form": lemma,
2255
- "forms_list": wikt_report.get("forms", []),
2256
  "source": "wiktionary"
2257
  }
2258
 
2259
- # --- B. Build Pattern Inflection Block (CRITICAL for finding true lemma) ---
 
2260
  pattern_block = {}
 
2261
  if PATTERN_DE_AVAILABLE:
2262
  try:
 
2263
  if pos_key == "noun" or "Substantiv" in pos_title:
2264
- pattern_block = pattern_analyze_as_noun(lemma)
2265
- elif pos_key == "verb" or "Verb" in pos_title or "Konjugierte Form" in pos_title:
2266
- # Use the *input word* for inflected forms to find the right lemma
2267
- if "Konjugierte Form" in pos_title:
2268
- pattern_block = pattern_analyze_as_verb(word)
2269
  else:
2270
- pattern_block = pattern_analyze_as_verb(lemma)
 
 
 
 
 
 
 
2271
  elif pos_key == "adjective" or "Adjektiv" in pos_title or "Deklinierte Form" in pos_title:
2272
- # Use the *input word* for inflected forms
2273
- if "Deklinierte Form" in pos_title:
2274
- pattern_block = pattern_analyze_as_adjective(word)
2275
- else:
2276
- pattern_block = pattern_analyze_as_adjective(lemma)
2277
  elif pos_key == "adverb":
2278
  pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
2279
  except Exception as e:
2280
- pattern_block = {"error": f"Pattern.de analysis for {pos_key}('{lemma}') failed: {e}"}
 
2281
 
2282
- # --- C. Build Semantics Block (using correct lemma from pattern_block) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2283
  semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
2284
 
2285
- # --- D. Assemble the report (pre-validation) ---
2286
  pos_entry_report = {
2287
  "inflections_wiktionary": inflections_wikt_block,
2288
- "inflections_pattern": pattern_block,
2289
  "semantics_combined": semantics_block,
2290
  "wiktionary_metadata": {
2291
- # --- Original Fields ---
2292
  "pos_title": pos_title,
2293
  "etymology": wikt_report.get("etymology_text"),
2294
  "pronunciation": wikt_report.get("sounds"),
@@ -2296,8 +2396,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2296
  "examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
2297
  "entry_tags": wikt_report.get("entry_tags"),
2298
  "entry_categories": wikt_report.get("entry_categories"),
2299
-
2300
- # Pass through all new fields from the full DB ---
2301
  "entry_notes": wikt_report.get("entry_notes"),
2302
  "other_pos": wikt_report.get("other_pos"),
2303
  "raw_tags": wikt_report.get("raw_tags"),
@@ -2307,50 +2406,46 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2307
  "holonyms": wikt_report.get("holonyms"),
2308
  "meronyms": wikt_report.get("meronyms"),
2309
  "coordinate_terms": wikt_report.get("coordinate_terms"),
2310
- # We are now correctly getting the data we queried earlier.
2311
  "expressions": wikt_report.get("expressions"),
2312
  "proverbs": wikt_report.get("proverbs")
2313
-
2314
  }
2315
  }
2316
 
2317
- # --- E. VALIDATION FILTER (REVISED LOGIC) ---
2318
  is_valid = False
2319
  is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
2320
 
2321
- # Check 1: Is the input word the lemma?
2322
- # This is true for base form entries (e.g., "Haus" -> "Haus (Substantiv)")
2323
- # AND for inflected form entries (e.g., "gießt" -> "gießt (Konjugierte Form)")
2324
  if lemma.lower() == word_lower:
2325
  is_valid = True
2326
- log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches entry lemma.")
2327
 
2328
- # Check 2: Is the input word in the *bare* forms list?
2329
- # (This applies to base entries where the input is an inflection, e.g., "gießt" -> "gehen (Verb)")
2330
- # We only run this if Check 1 failed AND this is not an inflected entry (which have no forms)
2331
  if not is_valid and not is_inflected_entry:
2332
- for form_entry in inflections_wikt_block.get("forms_list", []):
 
2333
  form_text = form_entry.get("form_text", "")
2334
- bare_form = re.sub(r"\(.*\)", "", form_text).strip()
2335
- bare_form = re.sub(r"^(der|die|das|ein|eine|am)\s+", "", bare_form, flags=re.IGNORECASE).strip()
2336
- bare_form = bare_form.rstrip("!.")
2337
-
2338
- if bare_form.lower() == word_lower:
2339
  is_valid = True
2340
- log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word found in form: '{form_text}'")
2341
  break
2342
-
2343
- # --- F. Add to final result if valid ---
 
 
 
 
 
2344
  if is_valid:
2345
  if pos_key not in final_result["analysis"]:
2346
  final_result["analysis"][pos_key] = []
2347
  final_result["analysis"][pos_key].append(pos_entry_report)
2348
  else:
2349
- log(f"[DEBUG] Wiktionary: DROPPING entry '{lemma}' ({pos_key}, {pos_title}) because input word '{word}' was not found in its valid forms.")
2350
-
2351
- # --- END OF VALIDATION ---
2352
 
2353
- final_result["info"] = f"Analysis from Wiktionary (Primary Engine). Found {len(wiktionary_reports)} matching entries, kept {sum(len(v) for v in final_result.get('analysis', {}).values())}."
2354
  return final_result
2355
 
2356
  # ============================================================================
@@ -3383,172 +3478,302 @@ HTML_CSS = """
3383
  """
3384
 
3385
  def _format_word_analysis_html(data: Dict[str, Any]) -> str:
3386
- """ Generates HTML for a single word analysis result (German version). """
 
 
 
 
3387
  if not data or "analysis" not in data:
3388
  return f"{HTML_CSS}<div class='ling-card'>Keine Daten verfügbar. {data.get('info', '')}</div>"
3389
 
3390
  html = HTML_CSS
3391
  analysis = data["analysis"]
3392
 
3393
- # Iterate over POS
3394
  for pos_key, entries in analysis.items():
3395
  if not entries: continue
3396
- entry = entries[0] # Take best candidate
3397
-
3398
- # --- POS Display Logic ---
3399
- display_pos = pos_key.upper()
3400
- css_class = "pos-other"
3401
-
3402
- if pos_key == 'noun':
3403
- css_class = "pos-noun"
3404
- display_pos = "SUBSTANTIV"
3405
- elif pos_key == 'verb':
3406
- css_class = "pos-verb"
3407
- display_pos = "VERB"
3408
- elif pos_key == 'adj' or pos_key == 'adjective':
3409
- css_class = "pos-adj"
3410
- display_pos = "ADJEKTIV"
3411
- elif pos_key == 'adv' or pos_key == 'adverb':
3412
- css_class = "pos-adv"
3413
- display_pos = "ADVERB"
3414
 
3415
- # Data Extraction
3416
- inf_wikt = entry.get("inflections_wiktionary") or {}
3417
- inf_pat = entry.get("inflections_pattern") or {}
3418
- sem_comb = entry.get("semantics_combined") or {}
3419
-
3420
- lemma = inf_wikt.get("base_form") or \
3421
- inf_pat.get("base_form") or \
3422
- sem_comb.get("lemma") or \
3423
- data.get("input_word") or "?"
3424
-
3425
- # --- CARD START ---
3426
- html += f"""
3427
- <div class="ling-card">
3428
- <div class="ling-header">
3429
- <span class="ling-lemma">{lemma}</span>
3430
- <span class="ling-pos {css_class}">{display_pos}</span>
3431
- </div>
3432
- """
3433
-
3434
- # --- Inflections Section (Pattern.de logic) ---
3435
- html += "<div class='ling-section'><div class='ling-subtitle'>Morphologie & Flexion</div>"
3436
- html += "<table class='inflection-table'>"
3437
-
3438
- has_pattern_data = bool(inf_pat) and "error" not in inf_pat
3439
-
3440
- if pos_key == 'noun':
3441
- # Pattern.de returns 'declension' or 'declension_by_gender'
3442
- decl = inf_pat.get('declension')
3443
- # Fallback if declension is inside gender key
3444
- if not decl and inf_pat.get('declension_by_gender'):
3445
- first_gender = list(inf_pat['declension_by_gender'].keys())[0]
3446
- decl = inf_pat['declension_by_gender'][first_gender]
3447
 
3448
- if decl:
3449
- # Extract singular/plural nominative for concise display
3450
- nom_sg = decl.get('Nominativ Singular', {}).get('bare', '-')
3451
- nom_pl = decl.get('Nominativ Plural', {}).get('bare', '-')
3452
- gen_sg = decl.get('Genitiv Singular', {}).get('bare', '-')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3453
 
3454
- html += f"<tr><td class='inflection-label'>Singular (Nom)</td><td>{nom_sg}</td></tr>"
3455
- html += f"<tr><td class='inflection-label'>Plural (Nom)</td><td>{nom_pl}</td></tr>"
3456
- html += f"<tr><td class='inflection-label'>Genitiv (Sg)</td><td>{gen_sg}</td></tr>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3457
 
3458
- gender = inf_pat.get('gender', 'Unknown')
3459
- html += f"<tr><td class='inflection-label'>Genus</td><td>{gender}</td></tr>"
3460
- else:
3461
- html += f"<tr><td colspan='2'><i>Keine Flexionsdaten gefunden.</i></td></tr>"
3462
-
3463
- elif pos_key == 'verb':
3464
- cj = inf_pat.get('conjugation') or {}
3465
- pres = cj.get('Präsens') or {}
3466
- past = cj.get('Präteritum') or {}
3467
- parts = inf_pat.get('participles') or {}
3468
 
3469
- html += f"<tr><td class='inflection-label'>Infinitiv</td><td>{inf_pat.get('infinitive', lemma)}</td></tr>"
3470
- html += f"<tr><td class='inflection-label'>3. Pers. Sg. (er/sie)</td><td>{pres.get('er/sie/es', '-')}</td></tr>"
3471
- html += f"<tr><td class='inflection-label'>Präteritum (ich)</td><td>{past.get('ich', '-')}</td></tr>"
3472
- html += f"<tr><td class='inflection-label'>Partizip II</td><td>{parts.get('Partizip Perfekt', '-')}</td></tr>"
3473
-
3474
- elif pos_key in ['adjective', 'adj']:
3475
- html += f"<tr><td class='inflection-label'>Positiv</td><td>{inf_pat.get('predicative', lemma)}</td></tr>"
3476
- html += f"<tr><td class='inflection-label'>Komparativ</td><td>{inf_pat.get('comparative', '-')}</td></tr>"
3477
- html += f"<tr><td class='inflection-label'>Superlativ</td><td>{inf_pat.get('superlative', '-')}</td></tr>"
3478
-
3479
- # Wiktionary Forms (The "Other Forms" box)
3480
- forms_list = inf_wikt.get("forms_list") or []
3481
- if forms_list:
3482
- forms_str_list = []
3483
- for f in forms_list[:8]: # Show up to 8 forms
3484
- txt = f.get('form_text')
3485
- if txt: forms_str_list.append(txt)
3486
 
3487
- if forms_str_list:
3488
- html += f"<tr><td class='inflection-label'>Weitere Formen (DB)</td><td>{', '.join(forms_str_list)}</td></tr>"
3489
-
3490
- html += "</table></div>"
3491
-
3492
- # --- Semantics Section ---
3493
- html += "<div class='ling-section'><div class='ling-subtitle'>Bedeutungen & Definitionen</div>"
3494
-
3495
- wikt_senses = sem_comb.get("wiktionary_senses") or []
3496
- ode_senses = sem_comb.get("odenet_senses") or []
3497
-
3498
- if not wikt_senses and not ode_senses:
3499
- html += "<div class='sense-item'><i>Keine Definitionen gefunden.</i></div>"
3500
-
3501
- for s in wikt_senses[:3]:
3502
- gloss_raw = s.get("definition") or ""
3503
- gloss = str(gloss_raw).replace(";", "<br>")
3504
- if gloss:
3505
- html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"
3506
 
3507
- for s in ode_senses[:3]:
3508
- defi = s.get("definition") or ""
3509
- if defi:
3510
- html += f"<div class='sense-item'><span class='source-badge src-oewn'>OdeNet</span> {defi}</div>"
3511
 
3512
- html += "</div>"
3513
-
3514
- # --- Relations Section ---
3515
- rels = sem_comb.get("conceptnet_relations") or []
3516
- if rels:
3517
- html += "<div class='ling-section'><div class='ling-subtitle'>Wissensgraph (Kontext)</div>"
3518
 
3519
- top_n_rels = 6
3520
- visible_rels = rels[:top_n_rels]
3521
- hidden_rels = rels[top_n_rels:]
 
 
 
 
 
 
3522
 
3523
- def render_rel(r):
3524
- rel_name = r.get("relation", "Rel")
3525
- target = r.get("other_node") or "?"
3526
- if target == "?" and "surface" in r:
3527
- parts = str(r["surface"]).split()
3528
- if len(parts) > 2: target = parts[-1]
3529
- return f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"
3530
-
3531
- html += "<div>"
3532
- for r in visible_rels:
3533
- html += render_rel(r)
3534
  html += "</div>"
3535
-
3536
- if hidden_rels:
3537
- html += f"""
3538
- <details class='kg-details'>
3539
- <summary>Zeige {len(hidden_rels)} weitere Relationen</summary>
3540
- <div class='kg-content'>
3541
- """
3542
- for r in hidden_rels:
3543
- html += render_rel(r)
3544
- html += "</div></details>"
3545
 
3546
- html += "</div>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3547
 
3548
- html += "</div>" # End Card
3549
 
3550
  return html
3551
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3552
  def _format_comprehensive_html(data: Dict[str, Any]) -> str:
3553
  """ Generates HTML for the comprehensive sentence analysis. """
3554
  if "error" in data:
 
951
 
952
 
953
# --- Inflection Generators ---
def pattern_analyze_as_noun(word: str, hint_lemma: str = None, fixed_gender: int = None) -> Dict[str, Any]:
    """
    Comprehensive noun inflection analysis via pattern.de.

    Args:
        word: The (possibly inflected) noun to analyze.
        hint_lemma: A lemma suggestion to help Pattern when the word does not
            change under singularize/pluralize.
        fixed_gender: A pattern.de constant (MALE, FEMALE, NEUTRAL) to FORCE a
            specific gender (e.g. ground truth from Wiktionary). When set,
            Pattern's own gender lookup is bypassed entirely.

    Returns:
        Dict with keys "base_form", "singular", "plural",
        "declension_by_gender" (gender name -> {"<Case> <Number>": {definite,
        indefinite, bare}}), and — when exactly one gender applies — the
        flattened convenience keys "declension" and "gender".
    """
    log(f" Analyzing as noun (hint_lemma={hint_lemma}, fixed_gender={fixed_gender})")
    analysis = {}

    # 1. Determine base form: a word that changes under BOTH singularize and
    # pluralize is treated as already being the base; otherwise prefer the
    # singularized form, then the caller's hint, then the word itself.
    singular = singularize(word)
    plural = pluralize(word)

    if plural != word and singular != word:
        base = word
    elif singular != word:
        base = singular
    elif hint_lemma and hint_lemma != word:
        base = hint_lemma
    else:
        base = word

    # 2. Determine gender. If the caller supplied a gender (e.g. from the
    # Wiktionary DB), USE IT and ignore Pattern's internal dictionary.
    if fixed_gender is not None:
        genders = [fixed_gender]
        log(f" [Pattern] Enforcing gender from DB: {fixed_gender}")
    else:
        # Fallback to auto-detection. gender() may return a tuple for
        # ambiguous nouns (e.g. der/das See) or None for unknown words.
        g = gender(base, pos=NOUN)
        if isinstance(g, tuple):
            genders = list(g)
        elif g is None:
            genders = [MALE]  # default when Pattern has no entry
        else:
            genders = [g]

    analysis["base_form"] = base
    analysis["plural"] = pluralize(base)
    analysis["singular"] = base
    analysis["declension_by_gender"] = {}

    # 3. Generate the full declension table per gender.
    for gen in genders:
        gender_str = {MALE: "Masculine", FEMALE: "Feminine", NEUTRAL: "Neuter"}.get(gen, "Unknown")
        gen_declension = {}

        for number, number_name in [(SINGULAR, "Singular"), (PLURAL, "Plural")]:
            word_form = base if number == SINGULAR else pluralize(base)
            word_form_cap = word_form.capitalize()
            # Pattern's article() takes PLURAL in the gender slot for plural forms.
            gender_for_article = gen if number == SINGULAR else PLURAL

            for case, case_name in [(NOMINATIVE, "Nominativ"), (ACCUSATIVE, "Akkusativ"),
                                    (DATIVE, "Dativ"), (GENITIVE, "Genitiv")]:
                try:
                    def_art = article(word_form, DEFINITE, gender_for_article, case)
                    indef_art = article(word_form, INDEFINITE, gender_for_article, case)

                    indef_form = f"{indef_art} {word_form_cap}" if indef_art else word_form_cap
                    if number == PLURAL:
                        indef_form = "—"  # German has no indefinite plural article

                    # NOTE(review): Pattern can miss the Genitive 's'/'es'
                    # suffix on Masculine/Neuter nouns it does not know when a
                    # gender is forced. We currently trust Pattern's output;
                    # the Wiktionary merge downstream is expected to correct
                    # such slots — confirm if that assumption holds.
                    gen_declension[f"{case_name} {number_name}"] = {
                        "definite": f"{def_art} {word_form_cap}" if def_art else word_form_cap,
                        "indefinite": indef_form,
                        "bare": word_form_cap
                    }
                except Exception as e:
                    log(f" Failed to get article for {gender_str}/{case_name} {number_name}: {e}")

        analysis["declension_by_gender"][gender_str] = gen_declension

    # Flatten into the main keys when the gender is unambiguous.
    if len(genders) == 1:
        first_gen_key = next(iter(analysis["declension_by_gender"]))
        analysis["declension"] = analysis["declension_by_gender"][first_gen_key]
        analysis["gender"] = first_gen_key

    return analysis
1041
 
 
2191
 
2192
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2193
  """
2194
+ (PRIMARY ENGINE) Analyzes a word using the Wiktionary DB as Ground Truth,
2195
+ filling in missing gaps with Pattern.de generation.
2196
  """
2197
+ print(f"\n[Wiktionary Engine] Starting analysis for: {word}")
2198
+
2199
  final_result: Dict[str, Any] = {
2200
  "input_word": word,
2201
  "analysis": {}
 
2203
 
2204
  conn = wiktionary_get_connection()
2205
  if not conn:
2206
+ log("[Wiktionary Engine] No DB connection available.")
2207
+ return {}
2208
 
2209
  # --- 1. GET SPACY/IWNLP HINT FOR PRIORITIZATION ---
2210
  spacy_pos_hint = None
 
2224
  else: spacy_pos_hint = spacy_pos_raw
2225
 
2226
  spacy_lemma_hint = token.lemma_
2227
+ log(f"[DEBUG] Priority Hint: spaCy POS='{spacy_pos_hint}', Lemma='{spacy_lemma_hint}'")
2228
  except Exception as e:
2229
+ log(f"[DEBUG] Priority Hint failed: {e}")
2230
 
2231
  # --- 2. FIND ALL WIKTIONARY ENTRIES ---
2232
  try:
2233
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
2234
  except Exception as e:
2235
  log(f"[DEBUG] Wiktionary query failed: {e}")
2236
+ return {}
2237
 
2238
  if not wiktionary_reports:
2239
+ log(f"[DEBUG] No Wiktionary entries found for '{word}'.")
2240
+ return {}
2241
 
2242
  # --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
2243
  def get_priority_score(report):
2244
  wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
2245
  wikt_lemma = report.get("lemma")
 
2246
  # Priority 1: Exact POS match with spaCy hint
2247
  if spacy_pos_hint and wikt_pos == spacy_pos_hint:
2248
+ if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint: return 1
 
 
2249
  return 2
2250
+ # Priority 2: Input word is the lemma
2251
+ if wikt_lemma and wikt_lemma.lower() == word.lower(): return 3
 
 
 
 
2252
  return 4
2253
 
2254
  wiktionary_reports.sort(key=get_priority_score)
2255
+ log(f"[DEBUG] Sorted {len(wiktionary_reports)} entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
2256
 
2257
+ # --- 4. PROCESS ENTRIES (HYBRID STRATEGY) ---
 
2258
  word_lower = word.lower()
2259
 
2260
  for wikt_report in wiktionary_reports:
 
2262
  lemma = wikt_report.get("lemma", word)
2263
  pos_title = wikt_report.get("pos_title", "")
2264
 
2265
+ log(f"\n--- Processing Entry: {lemma} ({pos_key}) ---")
2266
+
2267
+ # --- A. Raw Wiktionary Forms (Ground Truth) ---
2268
+ wikt_forms_list = wikt_report.get("forms", [])
2269
  inflections_wikt_block = {
2270
  "base_form": lemma,
2271
+ "forms_list": wikt_forms_list,
2272
  "source": "wiktionary"
2273
  }
2274
 
2275
+ # --- B. Generate Base Pattern Template (The Scaffold) ---
2276
+ # We ALWAYS generate this if Pattern is available, to provide the table structure.
2277
  pattern_block = {}
2278
+
2279
  if PATTERN_DE_AVAILABLE:
2280
  try:
2281
+ log(f"[DEBUG] Generating Pattern.de base template for '{lemma}' ({pos_key})...")
2282
  if pos_key == "noun" or "Substantiv" in pos_title:
2283
+ # Gender-Aware Generation
2284
+ wikt_tags = wikt_report.get("entry_tags", [])
2285
+ forced_gender = _map_wikt_gender_to_pattern(wikt_tags)
2286
+ if forced_gender:
2287
+ log(f"[DEBUG] Context: Forcing Pattern gender to {forced_gender} based on Wiktionary tags.")
2288
  else:
2289
+ log(f"[DEBUG] Context: No gender tags in Wiktionary. Letting Pattern auto-detect.")
2290
+
2291
+ pattern_block = pattern_analyze_as_noun(lemma, fixed_gender=forced_gender)
2292
+
2293
+ elif pos_key == "verb" or "Verb" in pos_title or "Konjugierte Form" in pos_title:
2294
+ use_word = word if "Konjugierte Form" in pos_title else lemma
2295
+ pattern_block = pattern_analyze_as_verb(use_word)
2296
+
2297
  elif pos_key == "adjective" or "Adjektiv" in pos_title or "Deklinierte Form" in pos_title:
2298
+ use_word = word if "Deklinierte Form" in pos_title else lemma
2299
+ pattern_block = pattern_analyze_as_adjective(use_word)
2300
+
 
 
2301
  elif pos_key == "adverb":
2302
  pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
2303
  except Exception as e:
2304
+ log(f"[ERROR] Pattern.de generation failed: {e}")
2305
+ pattern_block = {"error": f"Pattern.de failed: {e}"}
2306
 
2307
+ # --- C. THE HYBRID MERGE: Overwrite Pattern data with Wiktionary Truth ---
2308
+ # logic: If Wiktionary has a form for a specific slot, use it.
2309
+ # If not, keep the Pattern generated form (thereby filling the gap).
2310
+
2311
+ if pattern_block and "error" not in pattern_block and wikt_forms_list:
2312
+ log(f"[DEBUG] Starting Hybrid Merge (Wiktionary forms: {len(wikt_forms_list)})...")
2313
+
2314
+ overwrites_count = 0
2315
+
2316
+ for wikt_form in wikt_forms_list:
2317
+ text = wikt_form.get("form_text")
2318
+ tags = wikt_form.get("tags")
2319
+ if not text or not tags: continue
2320
+
2321
+ # Map Wikt tags to the address inside pattern_block
2322
+ path_keys = _map_wikt_form_to_pattern_keys(pos_key, tags)
2323
+
2324
+ if path_keys:
2325
+ # Navigate to the slot in pattern_block
2326
+ target = pattern_block
2327
+
2328
+ # Special handling for Noun structure (declension_by_gender)
2329
+ if pos_key == "noun" and "declension_by_gender" in pattern_block:
2330
+ # We apply the overwrite to ALL genders present in the pattern block
2331
+ # (Usually only 1 if we forced it, but maybe more if ambiguous)
2332
+ for gender_key in pattern_block["declension_by_gender"]:
2333
+ # path_keys[0] is e.g. "Nominativ Singular"
2334
+ slot_key = path_keys[0]
2335
+ target_dict = pattern_block["declension_by_gender"][gender_key]
2336
+
2337
+ if slot_key in target_dict:
2338
+ # Noun slots have subkeys: 'bare', 'definite', 'indefinite'
2339
+ # Wiktionary usually gives the form with article "der See" or without "Seen"
2340
+ # We try to be smart about updating 'bare' vs 'definite'
2341
+
2342
+ current_bare = target_dict[slot_key].get('bare', '')
2343
+
2344
+ # Simple clean: remove articles to get bare
2345
+ clean_text = re.sub(r"^(der|die|das|den|dem|des|ein|eine|einen|einem|einer|eines)\s+", "", text, flags=re.IGNORECASE).strip()
2346
+
2347
+ if clean_text != current_bare:
2348
+ log(f"[DEBUG] Merge: Overwriting {gender_key} -> {slot_key} | Old: '{current_bare}' -> New: '{clean_text}' (Source: Wiktionary)")
2349
+ target_dict[slot_key]['bare'] = clean_text
2350
+ # Also update full forms if possible
2351
+ if "definite" in target_dict[slot_key]:
2352
+ # We can reconstruct definite if we know the article, but let's just trust the bare text update
2353
+ # because the HTML renderer often rebuilds the article.
2354
+ # However, let's update 'definite' if the wikt text looks like it has an article
2355
+ if " " in text:
2356
+ target_dict[slot_key]['definite'] = text
2357
+ overwrites_count += 1
2358
+
2359
+ # Handling for Verbs/Adjectives (Nested Dicts)
2360
+ else:
2361
+ # Navigate deep
2362
+ valid_path = True
2363
+ for key in path_keys[:-1]:
2364
+ if key in target:
2365
+ target = target[key]
2366
+ else:
2367
+ valid_path = False
2368
+ break
2369
+
2370
+ if valid_path:
2371
+ last_key = path_keys[-1]
2372
+ if last_key in target and target[last_key] != text:
2373
+ log(f"[DEBUG] Merge: Overwriting {path_keys} | Old: '{target[last_key]}' -> New: '{text}' (Source: Wiktionary)")
2374
+ target[last_key] = text
2375
+ overwrites_count += 1
2376
+
2377
+ log(f"[DEBUG] Merge complete. {overwrites_count} slots updated with Ground Truth.")
2378
+ # Mark the block as hybrid so UI can verify validity
2379
+ pattern_block["is_hybrid"] = True
2380
+
2381
+ # --- D. Build Semantics Block ---
2382
+ # Use lemma from Wiktionary (Ground Truth)
2383
+ semantics_lemma = lemma
2384
  semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
2385
 
2386
+ # --- E. Assemble Final Report ---
2387
  pos_entry_report = {
2388
  "inflections_wiktionary": inflections_wikt_block,
2389
+ "inflections_pattern": pattern_block, # This is now the Hybrid Block
2390
  "semantics_combined": semantics_block,
2391
  "wiktionary_metadata": {
 
2392
  "pos_title": pos_title,
2393
  "etymology": wikt_report.get("etymology_text"),
2394
  "pronunciation": wikt_report.get("sounds"),
 
2396
  "examples": [ex for s in wikt_report.get("senses", []) for ex in s.get("examples", [])],
2397
  "entry_tags": wikt_report.get("entry_tags"),
2398
  "entry_categories": wikt_report.get("entry_categories"),
2399
+ # New fields
 
2400
  "entry_notes": wikt_report.get("entry_notes"),
2401
  "other_pos": wikt_report.get("other_pos"),
2402
  "raw_tags": wikt_report.get("raw_tags"),
 
2406
  "holonyms": wikt_report.get("holonyms"),
2407
  "meronyms": wikt_report.get("meronyms"),
2408
  "coordinate_terms": wikt_report.get("coordinate_terms"),
 
2409
  "expressions": wikt_report.get("expressions"),
2410
  "proverbs": wikt_report.get("proverbs")
 
2411
  }
2412
  }
2413
 
2414
+ # --- F. Validation Filter ---
2415
  is_valid = False
2416
  is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
2417
 
2418
+ # Check 1: Lemma Match
 
 
2419
  if lemma.lower() == word_lower:
2420
  is_valid = True
2421
+ log(f"[DEBUG] Validate: Accepted '{lemma}' (Lemma Match)")
2422
 
2423
+ # Check 2: Form Match
 
 
2424
  if not is_valid and not is_inflected_entry:
2425
+ # Look in Ground Truth (Wiktionary)
2426
+ for form_entry in wikt_forms_list:
2427
  form_text = form_entry.get("form_text", "")
2428
+ clean_form = re.sub(r"\(.*\)", "", form_text).strip() # Remove parens
2429
+ clean_form = re.sub(r"^(der|die|das|ein|eine|...)\s+", "", clean_form, flags=re.IGNORECASE).strip() # Remove articles
2430
+ if word_lower in clean_form.lower():
 
 
2431
  is_valid = True
2432
+ log(f"[DEBUG] Validate: Accepted '{lemma}' (Found in Wiktionary forms)")
2433
  break
2434
+
2435
+ # Look in Pattern Generation (if Wikt failed)
2436
+ if not is_valid and pattern_block:
2437
+ if word_appears_in_inflections(word, pattern_block, pos_key):
2438
+ is_valid = True
2439
+ log(f"[DEBUG] Validate: Accepted '{lemma}' (Found in Pattern forms)")
2440
+
2441
  if is_valid:
2442
  if pos_key not in final_result["analysis"]:
2443
  final_result["analysis"][pos_key] = []
2444
  final_result["analysis"][pos_key].append(pos_entry_report)
2445
  else:
2446
+ log(f"[DEBUG] Validate: Dropped '{lemma}' ({pos_key}) - No match found.")
 
 
2447
 
2448
+ final_result["info"] = f"Analysis from Wiktionary (Hybrid Engine). Found {len(wiktionary_reports)} entries."
2449
  return final_result
2450
 
2451
  # ============================================================================
 
3478
  """
3479
 
3480
def _format_word_analysis_html(data: Dict[str, Any]) -> str:
    """
    Generate the HTML report for a single-word analysis (German UI).

    Renders one card per homonym entry, grouped by part of speech. The
    inflection table is driven by 'inflections_pattern', which the backend
    has already merged with Wiktionary ground truth (the "Hybrid" block);
    'inflections_wiktionary' is used for the lemma fallback and the raw
    observed-forms footer.

    Args:
        data: Analysis result dict; expected keys: "analysis" (mapping of
              POS key -> list of entry dicts), optionally "input_word" and
              "info".  # assumes the backend report schema — TODO confirm

    Returns:
        A single HTML string (prefixed with the shared HTML_CSS stylesheet).
    """
    # Guard: nothing to render (also covers a None/empty payload).
    if not data or "analysis" not in data:
        return f"{HTML_CSS}<div class='ling-card'>Keine Daten verfügbar. {data.get('info', '')}</div>"

    html = HTML_CSS
    analysis = data["analysis"]

    # Iterate over POS categories (noun, verb, etc.)
    for pos_key, entries in analysis.items():
        if not entries: continue

        # We usually display the best candidate, but if there are multiple distinct entries
        # (like "der See" vs "die See"), the backend groups them in the list.
        # We should ideally render ALL entries in the list to show the homonyms.
        # This loop handles that.
        for entry in entries:

            # Data extraction — each sub-block may be absent; normalize to {}.
            inf_wikt = entry.get("inflections_wiktionary") or {}
            inf_pat = entry.get("inflections_pattern") or {}
            sem_comb = entry.get("semantics_combined") or {}
            meta = entry.get("wiktionary_metadata") or {}

            # Lemma preference order: Wiktionary truth > pattern guess >
            # semantics lemma > the raw input word.
            lemma = inf_wikt.get("base_form") or \
                    inf_pat.get("base_form") or \
                    sem_comb.get("lemma") or \
                    data.get("input_word") or "?"

            # --- POS display logic (badge text + CSS class) ---
            display_pos = pos_key.upper()
            css_class = "pos-other"

            if pos_key == 'noun':
                css_class = "pos-noun"
                display_pos = "SUBSTANTIV"
                # Append gender abbreviation to the POS badge if available.
                # NOTE(review): expects English gender labels ("Masculine", ...)
                # as produced by the pattern analysis — confirm against backend.
                if "gender" in inf_pat:
                    gender_map = {"Masculine": "M", "Feminine": "F", "Neuter": "N"}
                    g_short = gender_map.get(inf_pat['gender'], "?")
                    display_pos += f" ({g_short})"

            elif pos_key == 'verb':
                css_class = "pos-verb"
                display_pos = "VERB"
            elif pos_key == 'adj' or pos_key == 'adjective':
                css_class = "pos-adj"
                display_pos = "ADJEKTIV"
            elif pos_key == 'adv' or pos_key == 'adverb':
                css_class = "pos-adv"
                display_pos = "ADVERB"

            # --- Card start: header with lemma + POS badge ---
            html += f"""
            <div class="ling-card">
                <div class="ling-header">
                    <span class="ling-lemma">{lemma}</span>
                    <span class="ling-pos {css_class}">{display_pos}</span>
            """

            # Add small title if available (e.g., "Konjugierte Form")
            if meta.get("pos_title"):
                html += f"<span style='margin-left:10px; color:#6b7280; font-size:0.85em;'>{meta['pos_title']}</span>"

            html += "</div>"  # End Header

            # --- Source badge logic ---
            # Credibility ranking: hybrid (Wiktionary-verified) > raw
            # Wiktionary forms > pattern-generated > nothing.
            is_hybrid = inf_pat.get("is_hybrid", False)
            wikt_forms_count = len(inf_wikt.get("forms_list", []))

            badge_style = "float:right; font-weight:bold; font-size:0.75em; padding:2px 6px; border-radius:4px;"

            if is_hybrid:
                source_html = f"<span style='{badge_style} background:#ecfdf5; color:#065f46; border:1px solid #a7f3d0;'>Quelle: Wiktionary (Verifiziert)</span>"
            elif wikt_forms_count > 0:
                source_html = f"<span style='{badge_style} background:#ecfdf5; color:#065f46; border:1px solid #a7f3d0;'>Quelle: Wiktionary (DB)</span>"
            elif inf_pat and "error" not in inf_pat:
                source_html = f"<span style='{badge_style} background:#fffbeb; color:#92400e; border:1px solid #fcd34d;'>Quelle: Pattern (Generiert)</span>"
            else:
                source_html = ""

            # --- Inflections section ---
            html += f"<div class='ling-section'>{source_html}<div class='ling-subtitle'>Morphologie & Flexion</div>"
            html += "<table class='inflection-table'>"

            # We render the table based on 'inf_pat' because the backend has already merged
            # the Wiktionary truths into this structure.

            if pos_key == 'noun':
                decl = inf_pat.get('declension')
                # Fallback if declension is nested in gender key
                if not decl and inf_pat.get('declension_by_gender'):
                    # If we have a specific gender from the analysis, try to grab that specific table
                    target_gender = inf_pat.get("gender")
                    if target_gender and target_gender in inf_pat['declension_by_gender']:
                        decl = inf_pat['declension_by_gender'][target_gender]
                    else:
                        # Fallback: take the first available
                        first_gender = list(inf_pat['declension_by_gender'].keys())[0]
                        decl = inf_pat['declension_by_gender'][first_gender]

                if decl:
                    # Noun table rows.
                    # NOTE(review): assumes each case entry is a dict with a
                    # 'definite' form — a None value here would raise; confirm
                    # the merged declension schema never stores None.
                    nom_sg = decl.get('Nominativ Singular', {}).get('definite', '-')
                    nom_pl = decl.get('Nominativ Plural', {}).get('definite', '-')
                    gen_sg = decl.get('Genitiv Singular', {}).get('definite', '-')
                    dat_pl = decl.get('Dativ Plural', {}).get('definite', '-')

                    html += f"<tr><td class='inflection-label'>Nom. Singular</td><td>{nom_sg}</td></tr>"
                    html += f"<tr><td class='inflection-label'>Nom. Plural</td><td>{nom_pl}</td></tr>"
                    html += f"<tr><td class='inflection-label'>Gen. Singular</td><td>{gen_sg}</td></tr>"
                    html += f"<tr><td class='inflection-label'>Dat. Plural</td><td>{dat_pl}</td></tr>"
                else:
                    html += f"<tr><td colspan='2'><i>Keine Flexionsdaten verfügbar.</i></td></tr>"

            elif pos_key == 'verb':
                cj = inf_pat.get('conjugation') or {}
                pres = cj.get('Präsens') or {}
                past = cj.get('Präteritum') or {}
                parts = inf_pat.get('participles') or {}

                html += f"<tr><td class='inflection-label'>Infinitiv</td><td>{inf_pat.get('infinitive', lemma)}</td></tr>"
                html += f"<tr><td class='inflection-label'>3. Pers. Sg. (er/sie)</td><td>{pres.get('er/sie/es', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Präteritum (ich)</td><td>{past.get('ich', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Partizip II</td><td>{parts.get('Partizip Perfekt', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Konjunktiv II (ich)</td><td>{cj.get('Konjunktiv II', {}).get('ich', '-')}</td></tr>"

            elif pos_key in ['adjective', 'adj']:
                html += f"<tr><td class='inflection-label'>Positiv</td><td>{inf_pat.get('predicative', lemma)}</td></tr>"
                html += f"<tr><td class='inflection-label'>Komparativ</td><td>{inf_pat.get('comparative', '-')}</td></tr>"
                html += f"<tr><td class='inflection-label'>Superlativ</td><td>{inf_pat.get('superlative', '-')}</td></tr>"

            elif pos_key in ['adverb', 'adv']:
                html += f"<tr><td class='inflection-label'>Form</td><td>{lemma} (unveränderlich)</td></tr>"

            html += "</table>"

            # --- Raw forms footer (the "evidence") ---
            # Display the raw forms list from DB if available, as this proves the ground truth
            forms_list = inf_wikt.get("forms_list") or []
            if forms_list:
                # Deduplicate and sort alphabetically for a stable display.
                unique_forms = sorted(list(set([f.get('form_text') for f in forms_list if f.get('form_text')])))
                # Limit display to avoid wall of text
                display_forms = ", ".join(unique_forms[:12])
                if len(unique_forms) > 12: display_forms += f", ... ({len(unique_forms)-12} weitere)"

                html += f"<div style='font-size:0.8em; color:#6b7280; margin-top:5px;'>"
                html += f"<strong>Beobachtete Formen (DB):</strong> {display_forms}</div>"

            html += "</div>"  # End inflections section

            # --- Semantics section (definitions from Wiktionary + OdeNet) ---
            html += "<div class='ling-section'><div class='ling-subtitle'>Bedeutungen & Definitionen</div>"

            wikt_senses = sem_comb.get("wiktionary_senses") or []
            ode_senses = sem_comb.get("odenet_senses") or []

            if not wikt_senses and not ode_senses:
                html += "<div class='sense-item'><i>Keine Definitionen gefunden.</i></div>"

            # Render Wiktionary senses (capped at 3 to keep the card compact).
            for s in wikt_senses[:3]:
                gloss_raw = s.get("definition") or ""
                # Semicolon-separated sub-glosses become line breaks.
                gloss = str(gloss_raw).replace(";", "<br>")
                if gloss:
                    html += f"<div class='sense-item'><span class='source-badge src-wikt'>Wikt</span> {gloss}</div>"

            # Render OdeNet senses (same cap).
            for s in ode_senses[:3]:
                defi = s.get("definition") or ""
                if defi:
                    html += f"<div class='sense-item'><span class='source-badge src-oewn'>OdeNet</span> {defi}</div>"

            html += "</div>"

            # --- Relations section (ConceptNet knowledge-graph chips) ---
            rels = sem_comb.get("conceptnet_relations") or []
            if rels:
                html += "<div class='ling-section'><div class='ling-subtitle'>Wissensgraph (Kontext)</div>"

                # Show the first few relations inline; fold the rest into a
                # collapsible <details> element.
                top_n_rels = 6
                visible_rels = rels[:top_n_rels]
                hidden_rels = rels[top_n_rels:]

                def render_rel(r):
                    # One relation chip: "RelationName: target".
                    rel_name = r.get("relation", "Rel")
                    target = r.get("other_node") or "?"
                    # Fallback: pull the target out of the surface text when
                    # the node field is missing.
                    if target == "?" and "surface" in r:
                        parts = str(r["surface"]).split()
                        if len(parts) > 2: target = parts[-1]
                    return f"<span class='rel-chip'><span class='rel-type'>{rel_name}:</span> {target}</span>"

                html += "<div>"
                for r in visible_rels:
                    html += render_rel(r)
                html += "</div>"

                if hidden_rels:
                    html += f"""
                    <details class='kg-details'>
                        <summary>Zeige {len(hidden_rels)} weitere Relationen</summary>
                        <div class='kg-content'>
                    """
                    for r in hidden_rels:
                        html += render_rel(r)
                    html += "</div></details>"

                html += "</div>"

            html += "</div>"  # End Card (div.ling-card)

    return html
3698
 
3699
+ def _map_wikt_form_to_pattern_keys(pos_key: str, tags_str: str) -> Optional[List[str]]:
3700
+ """
3701
+ Parses a Wiktionary tag string and returns the corresponding path keys
3702
+ for the Pattern.de dictionary structure.
3703
+ """
3704
+ if not tags_str: return None
3705
+ t = tags_str.lower()
3706
+
3707
+ if pos_key == "noun":
3708
+ # Pattern Structure: [Gender] -> "Nominativ Singular" -> "bare"/"definite"
3709
+ case = ""
3710
+ if "nominative" in t: case = "Nominativ"
3711
+ elif "genitive" in t: case = "Genitiv"
3712
+ elif "dative" in t: case = "Dativ"
3713
+ elif "accusative" in t: case = "Akkusativ"
3714
+
3715
+ number = ""
3716
+ if "singular" in t: number = "Singular"
3717
+ elif "plural" in t: number = "Plural"
3718
+
3719
+ if case and number:
3720
+ return [f"{case} {number}"]
3721
+
3722
+ elif pos_key == "verb":
3723
+ # Pattern Structure: "conjugation" -> "Präsens" -> "ich"
3724
+ tense = ""
3725
+ if "present" in t: tense = "Präsens"
3726
+ elif "past" in t or "preterite" in t: tense = "Präteritum"
3727
+ elif "subjunctive i" in t: tense = "Konjunktiv I"
3728
+ elif "subjunctive ii" in t: tense = "Konjunktiv II"
3729
+ elif "imperative" in t: tense = "Imperativ"
3730
+
3731
+ person_key = ""
3732
+ if "participle" in t:
3733
+ if "past" in t or "perfect" in t: return ["participles", "Partizip Perfekt"]
3734
+ if "present" in t: return ["participles", "Partizip Präsens"]
3735
+
3736
+ if "singular" in t:
3737
+ if "1" in t: person_key = "ich" if tense != "Imperativ" else "du" # 1sg usually not imp, but handling safety
3738
+ elif "2" in t: person_key = "du"
3739
+ elif "3" in t: person_key = "er/sie/es"
3740
+ elif "plural" in t:
3741
+ if "1" in t: person_key = "wir"
3742
+ elif "2" in t: person_key = "ihr"
3743
+ elif "3" in t: person_key = "sie/Sie"
3744
+
3745
+ if tense and person_key:
3746
+ return ["conjugation", tense, person_key]
3747
+
3748
+ elif pos_key == "adjective":
3749
+ # Pattern Structure: "comparative", "superlative"
3750
+ if "comparative" in t and "predicative" in t: return ["comparative"]
3751
+ if "superlative" in t and "predicative" in t: return ["superlative"]
3752
+ if "positive" in t and "predicative" in t: return ["predicative"]
3753
+
3754
+ return None
3755
+
3756
+ def _map_wikt_gender_to_pattern(tags_list: List[str]) -> Optional[int]:
3757
+ """
3758
+ Maps Wiktionary tag strings (e.g., 'masculine') to pattern.de constants.
3759
+ Returns None if no specific gender is found.
3760
+ """
3761
+ if not tags_list:
3762
+ return None
3763
+
3764
+ # Flatten and normalize tags
3765
+ # Wiktionary often provides tags like "masculine", "feminine", "neuter"
3766
+ tags_lower = [str(t).lower() for t in tags_list]
3767
+
3768
+ if "masculine" in tags_lower or "m" in tags_lower:
3769
+ return MALE
3770
+ if "feminine" in tags_lower or "f" in tags_lower:
3771
+ return FEMALE
3772
+ if "neuter" in tags_lower or "n" in tags_lower:
3773
+ return NEUTRAL
3774
+
3775
+ return None
3776
+
3777
  def _format_comprehensive_html(data: Dict[str, Any]) -> str:
3778
  """ Generates HTML for the comprehensive sentence analysis. """
3779
  if "error" in data: