cstr committed on
Commit
5c5cf64
·
verified ·
1 Parent(s): 9da6dfa

path checks for fallback methods

Browse files
Files changed (1) hide show
  1. app.py +221 -250
app.py CHANGED
@@ -2524,183 +2524,177 @@ def comprehensive_german_analysis(text: str, top_n_value: Optional[float] = 0) -
2524
  }
2525
 
2526
  # --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
2527
- def _analyze_word_with_hanta(word: str, top_n_value: int) -> Dict[str, Any]:
2528
  """
2529
- (PUBLIC DISPATCHER) Analyzes a single word for all possible forms.
2530
- (FALLBACK ENGINE 1) Analyzes a single word using HanTa + OdeNet + Pattern.
2531
- This function intelligently selects the best available engine:
2532
- 1. PRIMARY: Attempts to use the HanTa-led engine (V17) for maximum accuracy.
2533
- 2. FALLBACK: If HanTa is not available, it uses the spaCy-IWNLP-led
2534
- engine (V16 logic from 'analyze_word_comprehensively') as a robust fallback.
2535
  """
2536
- if not word or not word.strip():
2537
- return {"info": "Please enter a word."}
2538
-
2539
  top_n = int(top_n_value) if top_n_value is not None else 0
 
 
 
 
 
 
 
 
 
 
 
2540
 
2541
- # --- PRIMARY ENGINE: HanTa-led (V17) ---
2542
- if HANTA_AVAILABLE:
2543
- print(f"\n[Word Encyclopedia] Starting V18 (HanTa) analysis for: \"{word}\"")
2544
- final_result: Dict[str, Any] = {
2545
- "input_word": word,
2546
- "analysis": {}
2547
- }
2548
-
2549
- try:
2550
- hanta_tagger = hanta_get_tagger()
2551
- if not hanta_tagger:
2552
- raise Exception("HanTa Tagger failed to initialize.") # Will be caught and trigger fallback
2553
-
2554
- # --- 1. Get All Grammatical Candidates (HanTa) ---
2555
- hanta_tags = _hanta_get_candidates(word, hanta_tagger)
2556
- if not hanta_tags:
2557
- return {"info": f"No grammatical analysis found for '{word}'."}
2558
-
2559
- # --- 2. Map Tags to POS Groups (with Adverb Heuristic) ---
2560
- pos_groups_map = _hanta_map_tags_to_pos(hanta_tags)
2561
- log(f"Found {len(pos_groups_map)} possible POS group(s): {list(pos_groups_map.keys())}")
2562
 
2563
- # --- 3. Validate and Build Report for each POS Group ---
2564
- for pos_group, specific_tags in pos_groups_map.items():
2565
- print(f"--- Analyzing as: {pos_group.upper()} ---")
2566
-
2567
- # --- 3a. Get Lemma (HanTa) ---
2568
- lemma = _hanta_get_lemma_for_pos(word, pos_group, hanta_tagger)
2569
- log(f"Lemma for {pos_group} is: '{lemma}'")
2570
-
2571
- # --- 3b. Get Semantics & VALIDATE (OdeNet) ---
2572
- # We call the NEW, CORRECTED helper from Section 6c
2573
- all_odenet_senses = _get_odenet_senses_by_pos(lemma)
2574
- pos_odenet_senses = all_odenet_senses.get(pos_group, [])
2575
-
2576
- # We only reject if OdeNet is working and returns no senses.
2577
- # If OdeNet is down, the list will contain a placeholder and we proceed.
2578
- if not pos_odenet_senses:
2579
- log(f"✗ REJECTED {pos_group}: OdeNet is available but has no '{pos_group}' senses for lemma '{lemma}'.")
2580
- continue
2581
-
2582
- # Filter out the placeholder if OdeNet is down
2583
- if pos_odenet_senses and "info" in pos_odenet_senses[0]:
2584
- log(f"✓ VERIFIED {pos_group}: OdeNet is unavailable, proceeding without validation.")
2585
- pos_odenet_senses = [] # Clear the placeholder
2586
- else:
2587
- log(f"✓ VERIFIED {pos_group}: OdeNet found {len(pos_odenet_senses)} sense(s).")
2588
-
2589
- # --- 3c. Get Inflections (Pattern) ---
2590
- inflection_report = {}
2591
- if not PATTERN_DE_AVAILABLE:
2592
- inflection_report = {"info": "pattern.de library not available. No inflections generated."}
2593
- else:
2594
- try:
2595
- if pos_group == "noun":
2596
- inflection_report = pattern_analyze_as_noun(lemma)
2597
- elif pos_group == "verb":
2598
- inflection_report = pattern_analyze_as_verb(lemma)
2599
- elif pos_group == "adjective":
2600
- inflection_report = pattern_analyze_as_adjective(lemma)
2601
- elif pos_group == "adverb":
2602
- inflection_report = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
2603
-
2604
- if not pattern_is_good_analysis(inflection_report, pos_group) and pos_group != "adverb":
2605
- log(f"⚠️ Warning: pattern.de generated a poor inflection table for {lemma} ({pos_group}).")
2606
- inflection_report["warning"] = "Inflection table from pattern.de seems incomplete or invalid."
2607
- except Exception as e:
2608
- log(f"pattern.de inflection failed for {lemma} ({pos_group}): {e}")
2609
- inflection_report = {"error": f"pattern.de failed: {e}", "traceback": traceback.format_exc()}
2610
-
2611
- # --- 3d. Build Final Report Block ---
2612
- final_result["analysis"][pos_group] = [{
2613
- "hanta_analysis": {
2614
- "detected_tags": sorted(list(specific_tags)),
2615
- "lemma": lemma,
2616
- "morphemes": [
2617
- hanta_tagger.analyze(word.capitalize() if pos_group == 'noun' else word.lower(), taglevel=3)
2618
- ]
2619
- },
2620
- "inflections_pattern": inflection_report,
2621
- "semantics_combined": _build_semantics_block_for_lemma(lemma, pos_group, top_n),
2622
- }]
2623
-
2624
- if not final_result["analysis"]:
2625
- return {
2626
- "input_word": word,
2627
- "info": f"No valid, semantically-verified analysis found for '{word}'. It may be a typo or a function word."
2628
- }
2629
 
2630
- final_result["info"] = "Analysis performed by HanTa-led fallback engine."
2631
- return final_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2632
 
2633
- except Exception as e:
2634
- print(f"[Word Encyclopedia] HanTa FALLBACK Engine FAILED: {e}")
2635
- traceback.print_exc()
2636
- return {} # Signal failure
2637
 
2638
- # --- FALLBACK ENGINE: spaCy-IWNLP-led (V16) ---
2639
- if IWNLP_AVAILABLE:
2640
- try:
2641
- log("--- Dispatcher: HanTa not found or failed. Attempting IWNLP Fallback Engine ---")
2642
- result = _analyze_word_with_iwnlp(word, top_n_value)
2643
- result["info"] = result.get("info", "") + " (Analysis performed by IWNLP-based fallback engine)"
2644
- return result
2645
- except Exception as e:
2646
- log(f"--- IWNLP Fallback Engine FAILED: {e} ---")
2647
- traceback.print_exc()
2648
- return {"error": f"IWNLP Fallback Engine failed: {e}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2649
 
2650
- # --- No engines available ---
2651
- log("--- Dispatcher: No valid analysis engines found. ---")
2652
- return {
2653
- "input_word": word,
2654
- "error": "Fatal Error: Neither HanTa nor spacy-iwnlp are available. "
2655
- "Please install at least one to use the Word Encyclopedia."
2656
- }
 
 
 
 
 
 
 
 
 
 
 
2657
 
2658
- def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2659
  """
2660
- (FALLBACK ENGINE 2) Analyzes a single word using IWNLP + OdeNet + Pattern.
2661
- This was the V16 engine.
2662
-
2663
- V19 UPDATE: This function *must* be modified to match the new
2664
- output format: `analysis: { "pos_key": [ ...list... ] }`
2665
-
2666
- (NON-CONTEXTUAL) Analyzes a single word for ALL its possible
2667
- grammatical and semantic forms.
2668
-
2669
- ** Strategy: IWNLP Lemmas + spaCy POS + Pattern.de Validators**
2670
- 1. Get spaCy's primary POS (e.g., "ADV" for "heute").
2671
- 2. Get IWNLP's list of *lemmas* (e.g., "Lauf" -> ['Lauf', 'laufen']).
2672
- 3. Create a unique set of all possible lemmas from spaCy, IWNLP, and the word itself.
2673
- 4. Iterate this lemma set:
2674
- - Try to analyze each lemma as NOUN (capitalized).
2675
- - Try to analyze each lemma as VERB.
2676
- - Try to analyze each lemma as ADJECTIVE.
2677
- - Validate each with pattern_is_good_analysis AND by checking for OdeNet senses.
2678
- 5. After checking inflections, check if spaCy's POS was 'ADV'.
2679
- If so, and OdeNet has 'r' senses, add an 'adverb' report.
2680
- 6. This finds all inflecting forms ("Lauf", "gut") AND non-inflecting
2681
- forms ("heute") while rejecting artifacts ("klauf", "heutst").
2682
  """
2683
  if not word or not word.strip():
2684
- return {"info": "Please enter a word."}
2685
 
2686
  if not IWNLP_AVAILABLE:
2687
- return {"error": "`spacy-iwnlp` library not available. This tab requires it."}
2688
 
2689
  top_n = int(top_n_value) if top_n_value is not None else 0
2690
 
2691
- print(f"\n[Word Encyclopedia] Starting IWNP-fallback analysis for: \"{word}\" (top_n={top_n})")
2692
 
2693
  final_result: Dict[str, Any] = {
2694
  "input_word": word,
2695
  "analysis": {}
2696
  }
 
2697
 
2698
  # --- Helper: Get OdeNet senses ---
2699
- def _get_odenet_senses_by_pos(w):
2700
  """
2701
  (Internal helper for IWNLP fallback)
2702
-
2703
- *** V18 FIX: OdeNet uses 'a' for BOTH Adjective and Adverb. ***
2704
  """
2705
  senses_by_pos: Dict[str, List[Dict]] = {
2706
  "noun": [], "verb": [], "adjective": [], "adverb": []
@@ -2712,68 +2706,36 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2712
  "verb": [{"info": "OdeNet unavailable"}],
2713
  "adjective": [{"info": "OdeNet unavailable"}],
2714
  "adverb": [{"info": "OdeNet unavailable"}]}
2715
-
2716
  try:
2717
  all_senses = odenet_get_thesaurus_info(w).get("senses", [])
2718
  for sense in all_senses:
2719
  if "error" in sense: continue
2720
  pos_tag = sense.get("pos")
2721
-
2722
  if pos_tag == 'n':
2723
  senses_by_pos["noun"].append(sense)
2724
  elif pos_tag == 'v':
2725
  senses_by_pos["verb"].append(sense)
2726
-
2727
- # --- THIS IS THE CRITICAL FIX ---
2728
  elif pos_tag == 'a':
2729
  log(f"[IWNLP Fallback] Found OdeNet 'a' tag (Adj/Adv) for sense: {sense.get('definition', '...')[:30]}")
2730
  senses_by_pos["adjective"].append(sense)
2731
  senses_by_pos["adverb"].append(sense)
2732
- # --- END OF FIX ---
2733
-
2734
  except Exception as e:
2735
  print(f"[Word Encyclopedia] OdeNet check failed: {e}")
2736
  return senses_by_pos
2737
 
2738
- # --- Helper: Build semantics block ---
2739
- def _build_semantics(lemma, odenet_senses, top_n):
2740
- conceptnet_relations = []
2741
- if REQUESTS_AVAILABLE:
2742
- try:
2743
- conceptnet_result = conceptnet_get_relations(lemma, language='de')
2744
- conceptnet_relations = conceptnet_result.get("relations", [])
2745
- except Exception as e:
2746
- conceptnet_relations = [{"error": str(e)}]
2747
-
2748
- if top_n > 0:
2749
- odenet_senses = odenet_senses[:top_n]
2750
- conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
2751
- conceptnet_relations = conceptnet_relations[:top_n]
2752
-
2753
- return {
2754
- "lemma": lemma,
2755
- "odenet_senses": odenet_senses,
2756
- "conceptnet_relations": conceptnet_relations
2757
- }
2758
-
2759
  # --- 1. GET ALL LEMMA CANDIDATES & SPACY POS ---
2760
  try:
2761
  iwnlp = iwnlp_get_pipeline()
2762
  if not iwnlp:
2763
- return {"error": "IWNLP pipeline failed to initialize."}
2764
 
2765
  doc = iwnlp(word)
2766
  token = doc[0]
2767
 
2768
- # Get spaCy's best POS guess
2769
  spacy_pos = token.pos_ # e.g., "NOUN" for "Lauf", "ADV" for "heute"
2770
  spacy_lemma = token.lemma_
2771
-
2772
- # *** THIS IS THE FIX ***
2773
- # Get IWNLP's lemma list (it only registers 'iwnlp_lemmas')
2774
  iwnlp_lemmas_list = token._.iwnlp_lemmas or []
2775
 
2776
- # Combine all possible lemmas
2777
  all_lemmas = set(iwnlp_lemmas_list)
2778
  all_lemmas.add(spacy_lemma)
2779
  all_lemmas.add(word) # Add the word itself
@@ -2783,25 +2745,19 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2783
 
2784
  except Exception as e:
2785
  traceback.print_exc()
2786
- return {"error": f"IWNLP analysis failed: {e}"}
2787
 
2788
  # --- 2. CHECK INFLECTING POSSIBILITIES FOR EACH LEMMA ---
2789
-
2790
- # This dict will hold the *best* analysis for each POS
2791
- # e.g., "gut" -> { 'adjective': {...}, 'noun': {...} }
2792
-
2793
  valid_analyses: Dict[str, Dict[str, Any]] = {}
2794
-
2795
  for lemma in all_lemmas:
2796
  if not lemma: continue
2797
 
2798
- odenet_senses_by_pos = _get_odenet_senses_by_pos(lemma)
2799
 
2800
  # --- Check NOUN ---
2801
  if 'noun' not in valid_analyses:
2802
  noun_inflections = {}
2803
  is_good_noun = False
2804
-
2805
  if not PATTERN_DE_AVAILABLE:
2806
  noun_inflections = {"info": "pattern.de not available."}
2807
  is_good_noun = True
@@ -2816,32 +2772,20 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2816
  if is_good_noun:
2817
  odenet_senses = odenet_senses_by_pos.get('noun', [])
2818
  if not odenet_senses and lemma.lower() == word.lower():
2819
- odenet_senses = _get_odenet_senses_by_pos(lemma.capitalize()).get('noun', [])
2820
-
2821
- # We accept if (senses exist) OR (OdeNet is down and we can't check)
2822
  if odenet_senses:
2823
- # We must filter out the "unavailable" placeholder
2824
- if "info" not in odenet_senses[0]:
2825
  log(f" ✓ [IWNLP Fallback] Valid NOUN found: {lemma}")
2826
  valid_analyses['noun'] = {
2827
  "lemma": noun_inflections.get("base_form", lemma),
2828
  "inflections": noun_inflections,
2829
- "odenet_senses": odenet_senses
2830
- }
2831
- elif not WN_AVAILABLE: # OdeNet is down
2832
- log(f" ✓ [IWNLP Fallback] Accepting NOUN (OdeNet unavailable): {lemma}")
2833
- valid_analyses['noun'] = {
2834
- "lemma": noun_inflections.get("base_form", lemma),
2835
- "inflections": noun_inflections,
2836
- "odenet_senses": [] # No senses to show
2837
  }
2838
 
2839
-
2840
  # --- Check VERB ---
2841
  if 'verb' not in valid_analyses:
2842
  verb_inflections = {}
2843
  is_good_verb = False
2844
-
2845
  if not PATTERN_DE_AVAILABLE:
2846
  verb_inflections = {"info": "pattern.de not available."}
2847
  is_good_verb = True
@@ -2855,28 +2799,19 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2855
 
2856
  if is_good_verb:
2857
  odenet_senses = odenet_senses_by_pos.get('verb', [])
2858
-
2859
  if odenet_senses:
2860
- if "info" not in odenet_senses[0]:
2861
  log(f" ✓ [IWNLP Fallback] Valid VERB found: {lemma}")
2862
  valid_analyses['verb'] = {
2863
  "lemma": verb_inflections.get("infinitive", lemma),
2864
  "inflections": verb_inflections,
2865
- "odenet_senses": odenet_senses
2866
- }
2867
- elif not WN_AVAILABLE:
2868
- log(f" ✓ [IWNLP Fallback] Accepting VERB (OdeNet unavailable): {lemma}")
2869
- valid_analyses['verb'] = {
2870
- "lemma": verb_inflections.get("infinitive", lemma),
2871
- "inflections": verb_inflections,
2872
- "odenet_senses": []
2873
  }
2874
 
2875
  # --- Check ADJECTIVE ---
2876
  if 'adjective' not in valid_analyses:
2877
  adj_inflections = {}
2878
  is_good_adj = False
2879
-
2880
  if not PATTERN_DE_AVAILABLE:
2881
  adj_inflections = {"info": "pattern.de not available."}
2882
  is_good_adj = True
@@ -2890,46 +2825,28 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2890
 
2891
  if is_good_adj:
2892
  odenet_senses = odenet_senses_by_pos.get('adjective', [])
2893
-
2894
  if odenet_senses:
2895
- if "info" not in odenet_senses[0]:
2896
  log(f" ✓ [IWNLP Fallback] Valid ADJECTIVE found: {lemma}")
2897
  valid_analyses['adjective'] = {
2898
  "lemma": adj_inflections.get("predicative", lemma),
2899
  "inflections": adj_inflections,
2900
- "odenet_senses": odenet_senses
2901
- }
2902
- elif not WN_AVAILABLE:
2903
- log(f" ✓ [IWNLP Fallback] Accepting ADJECTIVE (OdeNet unavailable): {lemma}")
2904
- valid_analyses['adjective'] = {
2905
- "lemma": adj_inflections.get("predicative", lemma),
2906
- "inflections": adj_inflections,
2907
- "odenet_senses": []
2908
  }
2909
 
2910
  # --- 3. CHECK NON-INFLECTING POS (ADVERB) ---
2911
  if spacy_pos == "ADV":
2912
- odenet_senses = _get_odenet_senses_by_pos(word).get('adverb', [])
2913
-
2914
  if odenet_senses:
2915
- if "info" not in odenet_senses[0]:
2916
  log(f" ✓ [IWNLP Fallback] Valid ADVERB found: {word}")
2917
  valid_analyses['adverb'] = {
2918
  "lemma": word,
2919
  "inflections": {"base_form": word},
2920
- "odenet_senses": odenet_senses
2921
- }
2922
- elif not WN_AVAILABLE:
2923
- log(f" ✓ [IWNLP Fallback] Accepting ADVERB (OdeNet unavailable): {word}")
2924
- valid_analyses['adverb'] = {
2925
- "lemma": word,
2926
- "inflections": {"base_form": word},
2927
- "odenet_senses": []
2928
  }
2929
 
2930
  # --- 4. CHECK OTHER FUNCTION WORDS (e.g. "mein" -> DET) ---
2931
- # We add this if spaCy found a function word AND we haven't found any
2932
- # content-word analyses (which are more informative).
2933
  FUNCTION_POS = {"DET", "PRON", "ADP", "AUX", "CCONJ", "SCONJ", "PART", "PUNCT", "SYM"}
2934
  if spacy_pos in FUNCTION_POS and not valid_analyses:
2935
  pos_key = spacy_pos.lower()
@@ -2937,25 +2854,79 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2937
  valid_analyses[pos_key] = {
2938
  "lemma": spacy_lemma,
2939
  "inflections": {"base_form": spacy_lemma},
2940
- "odenet_senses": [], # Function words aren't in OdeNet
2941
- "spacy_analysis": { # Add the spaCy info
2942
  "word": token.text, "lemma": token.lemma_,
2943
  "pos_UPOS": token.pos_, "pos_TAG": token.tag_,
2944
  "morphology": str(token.morph)
2945
  }
2946
  }
2947
 
2948
- # --- 5. BUILD FINAL REPORT ---
2949
  for pos_key, analysis_data in valid_analyses.items():
2950
  lemma = analysis_data["lemma"]
2951
- pos_report = {
2952
- "inflections_pattern": analysis_data["inflections"],
2953
- "semantics_combined": _build_semantics_block_for_lemma(lemma, pos_key, top_n)
2954
- }
2955
- if "spacy_analysis" in analysis_data:
2956
- pos_report["spacy_analysis"] = analysis_data["spacy_analysis"]
 
2957
 
2958
- final_result["analysis"][pos_key] = [pos_report] # Wrap in list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2959
 
2960
  if not final_result["analysis"]:
2961
  return {} # No results
 
2524
  }
2525
 
2526
  # --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
2527
+ def _analyze_word_with_hanta(word: str, top_n_value: Optional[float] = 0) -> Dict[str, Any]:
2528
  """
2529
+ (FALLBACK ENGINE 2) Analyzes a single word using HanTa + OdeNet + Pattern.
2530
+ This was the V18 engine. Returns {} on failure.
 
 
 
 
2531
  """
2532
+ if not HANTA_AVAILABLE:
2533
+ return {} # Signal failure
2534
+
2535
  top_n = int(top_n_value) if top_n_value is not None else 0
2536
+ print(f"\n[Word Encyclopedia] Running V18 (HanTa) fallback for: \"{word}\"")
2537
+ final_result: Dict[str, Any] = {
2538
+ "input_word": word,
2539
+ "analysis": {}
2540
+ }
2541
+ word_lower = word.lower() # For validation
2542
+
2543
+ try:
2544
+ hanta_tagger = hanta_get_tagger()
2545
+ if not hanta_tagger:
2546
+ raise Exception("HanTa Tagger failed to initialize.")
2547
 
2548
+ hanta_tags = _hanta_get_candidates(word, hanta_tagger)
2549
+ if not hanta_tags:
2550
+ return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2551
 
2552
+ pos_groups_map = _hanta_map_tags_to_pos(hanta_tags)
2553
+ log(f"Found {len(pos_groups_map)} possible POS group(s): {list(pos_groups_map.keys())}")
2554
+
2555
+ for pos_group, specific_tags in pos_groups_map.items():
2556
+ print(f"--- Analyzing as: {pos_group.upper()} ---")
2557
+
2558
+ lemma = _hanta_get_lemma_for_pos(word, pos_group, hanta_tagger)
2559
+ log(f"Lemma for {pos_group} is: '{lemma}'")
2560
+
2561
+ all_odenet_senses = _get_odenet_senses_by_pos(lemma)
2562
+ pos_odenet_senses = all_odenet_senses.get(pos_group, [])
2563
+
2564
+ if not pos_odenet_senses:
2565
+ log(f"✗ REJECTED {pos_group}: OdeNet is available but has no '{pos_group}' senses for lemma '{lemma}'.")
2566
+ continue
2567
+
2568
+ if pos_odenet_senses and "info" in pos_odenet_senses[0]:
2569
+ log(f"✓ VERIFIED {pos_group}: OdeNet is unavailable, proceeding without validation.")
2570
+ pos_odenet_senses = []
2571
+ else:
2572
+ log(f"✓ VERIFIED {pos_group}: OdeNet found {len(pos_odenet_senses)} sense(s).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2573
 
2574
+ # --- 1. Get Inflections (Pattern) ---
2575
+ inflection_report = {}
2576
+ if not PATTERN_DE_AVAILABLE:
2577
+ inflection_report = {"info": "pattern.de library not available. No inflections generated."}
2578
+ else:
2579
+ try:
2580
+ if pos_group == "noun":
2581
+ inflection_report = pattern_analyze_as_noun(lemma)
2582
+ elif pos_group == "verb":
2583
+ inflection_report = pattern_analyze_as_verb(lemma)
2584
+ elif pos_group == "adjective":
2585
+ inflection_report = pattern_analyze_as_adjective(lemma)
2586
+ elif pos_group == "adverb":
2587
+ inflection_report = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
2588
+
2589
+ if not pattern_is_good_analysis(inflection_report, pos_group) and pos_group != "adverb":
2590
+ log(f"⚠️ Warning: pattern.de generated a poor inflection table for {lemma} ({pos_group}).")
2591
+ inflection_report["warning"] = "Inflection table from pattern.de seems incomplete or invalid."
2592
+ except Exception as e:
2593
+ log(f"pattern.de inflection failed for {lemma} ({pos_group}): {e}")
2594
+ inflection_report = {"error": f"pattern.de failed: {e}", "traceback": traceback.format_exc()}
2595
 
2596
+ # --- 2. Build Semantics Block ---
2597
+ semantics_block = _build_semantics_block_for_lemma(lemma, pos_group, top_n)
 
 
2598
 
2599
+ # --- 3. Build Final Report Block ---
2600
+ pos_entry_report = {
2601
+ "hanta_analysis": {
2602
+ "detected_tags": sorted(list(specific_tags)),
2603
+ "lemma": lemma,
2604
+ "morphemes": [
2605
+ hanta_tagger.analyze(word.capitalize() if pos_group == 'noun' else word.lower(), taglevel=3)
2606
+ ]
2607
+ },
2608
+ "inflections_pattern": inflection_report,
2609
+ "semantics_combined": semantics_block
2610
+ }
2611
+
2612
+ # --- 4. *** VALIDATION FILTER *** ---
2613
+ is_valid = False
2614
+ if lemma.lower() == word_lower:
2615
+ is_valid = True
2616
+ log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word matches lemma.")
2617
+
2618
+ if not is_valid:
2619
+ # Check pattern.de's lexeme (for verbs)
2620
+ for form in inflection_report.get("lexeme", []):
2621
+ if form.lower() == word_lower:
2622
+ is_valid = True
2623
+ log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word found in pattern.de lexeme.")
2624
+ break
2625
+
2626
+ if not is_valid:
2627
+ # Check pattern.de's participles (for "abgeschnitten")
2628
+ for part_form in inflection_report.get("participles", {}).values():
2629
+ if part_form.lower() == word_lower:
2630
+ is_valid = True
2631
+ log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word found in pattern.de participles.")
2632
+ break
2633
+
2634
+ if not is_valid and pos_group == "adjective":
2635
+ # Check adjective forms
2636
+ if word_lower == inflection_report.get("predicative", "").lower() or \
2637
+ word_lower == inflection_report.get("comparative", "").lower() or \
2638
+ word_lower == inflection_report.get("superlative", "").lower():
2639
+ is_valid = True
2640
+ log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word matches adj comparison form.")
2641
+
2642
+ if not is_valid and pos_group == "noun":
2643
+ # Check noun forms
2644
+ if word_lower == inflection_report.get("singular", "").lower() or \
2645
+ word_lower == inflection_report.get("plural", "").lower():
2646
+ is_valid = True
2647
+ log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word matches noun singular/plural.")
2648
+
2649
+ if not is_valid and pos_group == "adverb":
2650
+ is_valid = True # Adverbs are non-inflecting, always keep.
2651
 
2652
+ if is_valid:
2653
+ if pos_group not in final_result["analysis"]:
2654
+ final_result["analysis"][pos_group] = []
2655
+ final_result["analysis"][pos_group].append(pos_entry_report)
2656
+ else:
2657
+ log(f"[DEBUG] HanTa: DROPPING entry '{lemma}' ({pos_group}) because input word '{word}' was not found in its valid forms.")
2658
+ # --- END OF VALIDATION ---
2659
+
2660
+ if not final_result["analysis"]:
2661
+ return {} # No results
2662
+
2663
+ final_result["info"] = "Analysis performed by HanTa-led fallback engine."
2664
+ return final_result
2665
+
2666
+ except Exception as e:
2667
+ print(f"[Word Encyclopedia] HanTa FALLBACK Engine FAILED: {e}")
2668
+ traceback.print_exc()
2669
+ return {} # Signal failure
2670
 
2671
+ def _analyze_word_with_iwnlp(word: str, top_n_value: Optional[float] = 0) -> Dict[str, Any]:
2672
  """
2673
+ (FALLBACK ENGINE 3) Analyzes a single word using IWNLP + OdeNet + Pattern.
2674
+ This is the full V16/V18 logic, restored and with the new validation filter.
2675
+ Returns {} on failure.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2676
  """
2677
  if not word or not word.strip():
2678
+ return {} # Use empty dict for "info"
2679
 
2680
  if not IWNLP_AVAILABLE:
2681
+ return {} # Signal failure
2682
 
2683
  top_n = int(top_n_value) if top_n_value is not None else 0
2684
 
2685
+ print(f"\n[Word Encyclopedia] Running IWNLP-fallback analysis for: \"{word}\" (top_n={top_n})")
2686
 
2687
  final_result: Dict[str, Any] = {
2688
  "input_word": word,
2689
  "analysis": {}
2690
  }
2691
+ word_lower = word.lower() # For validation
2692
 
2693
  # --- Helper: Get OdeNet senses ---
2694
+ def _get_odenet_senses_by_pos_internal(w):
2695
  """
2696
  (Internal helper for IWNLP fallback)
2697
+ OdeNet uses 'a' for BOTH Adjective and Adverb.
 
2698
  """
2699
  senses_by_pos: Dict[str, List[Dict]] = {
2700
  "noun": [], "verb": [], "adjective": [], "adverb": []
 
2706
  "verb": [{"info": "OdeNet unavailable"}],
2707
  "adjective": [{"info": "OdeNet unavailable"}],
2708
  "adverb": [{"info": "OdeNet unavailable"}]}
 
2709
  try:
2710
  all_senses = odenet_get_thesaurus_info(w).get("senses", [])
2711
  for sense in all_senses:
2712
  if "error" in sense: continue
2713
  pos_tag = sense.get("pos")
 
2714
  if pos_tag == 'n':
2715
  senses_by_pos["noun"].append(sense)
2716
  elif pos_tag == 'v':
2717
  senses_by_pos["verb"].append(sense)
 
 
2718
  elif pos_tag == 'a':
2719
  log(f"[IWNLP Fallback] Found OdeNet 'a' tag (Adj/Adv) for sense: {sense.get('definition', '...')[:30]}")
2720
  senses_by_pos["adjective"].append(sense)
2721
  senses_by_pos["adverb"].append(sense)
 
 
2722
  except Exception as e:
2723
  print(f"[Word Encyclopedia] OdeNet check failed: {e}")
2724
  return senses_by_pos
2725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2726
  # --- 1. GET ALL LEMMA CANDIDATES & SPACY POS ---
2727
  try:
2728
  iwnlp = iwnlp_get_pipeline()
2729
  if not iwnlp:
2730
+ return {} # Signal failure
2731
 
2732
  doc = iwnlp(word)
2733
  token = doc[0]
2734
 
 
2735
  spacy_pos = token.pos_ # e.g., "NOUN" for "Lauf", "ADV" for "heute"
2736
  spacy_lemma = token.lemma_
 
 
 
2737
  iwnlp_lemmas_list = token._.iwnlp_lemmas or []
2738
 
 
2739
  all_lemmas = set(iwnlp_lemmas_list)
2740
  all_lemmas.add(spacy_lemma)
2741
  all_lemmas.add(word) # Add the word itself
 
2745
 
2746
  except Exception as e:
2747
  traceback.print_exc()
2748
+ return {} # Signal failure
2749
 
2750
  # --- 2. CHECK INFLECTING POSSIBILITIES FOR EACH LEMMA ---
 
 
 
 
2751
  valid_analyses: Dict[str, Dict[str, Any]] = {}
 
2752
  for lemma in all_lemmas:
2753
  if not lemma: continue
2754
 
2755
+ odenet_senses_by_pos = _get_odenet_senses_by_pos_internal(lemma)
2756
 
2757
  # --- Check NOUN ---
2758
  if 'noun' not in valid_analyses:
2759
  noun_inflections = {}
2760
  is_good_noun = False
 
2761
  if not PATTERN_DE_AVAILABLE:
2762
  noun_inflections = {"info": "pattern.de not available."}
2763
  is_good_noun = True
 
2772
  if is_good_noun:
2773
  odenet_senses = odenet_senses_by_pos.get('noun', [])
2774
  if not odenet_senses and lemma.lower() == word.lower():
2775
+ odenet_senses = _get_odenet_senses_by_pos_internal(lemma.capitalize()).get('noun', [])
 
 
2776
  if odenet_senses:
2777
+ if "info" not in odenet_senses[0] or not WN_AVAILABLE:
 
2778
  log(f" ✓ [IWNLP Fallback] Valid NOUN found: {lemma}")
2779
  valid_analyses['noun'] = {
2780
  "lemma": noun_inflections.get("base_form", lemma),
2781
  "inflections": noun_inflections,
2782
+ "odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
 
 
 
 
 
 
 
2783
  }
2784
 
 
2785
  # --- Check VERB ---
2786
  if 'verb' not in valid_analyses:
2787
  verb_inflections = {}
2788
  is_good_verb = False
 
2789
  if not PATTERN_DE_AVAILABLE:
2790
  verb_inflections = {"info": "pattern.de not available."}
2791
  is_good_verb = True
 
2799
 
2800
  if is_good_verb:
2801
  odenet_senses = odenet_senses_by_pos.get('verb', [])
 
2802
  if odenet_senses:
2803
+ if "info" not in odenet_senses[0] or not WN_AVAILABLE:
2804
  log(f" ✓ [IWNLP Fallback] Valid VERB found: {lemma}")
2805
  valid_analyses['verb'] = {
2806
  "lemma": verb_inflections.get("infinitive", lemma),
2807
  "inflections": verb_inflections,
2808
+ "odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
 
 
 
 
 
 
 
2809
  }
2810
 
2811
  # --- Check ADJECTIVE ---
2812
  if 'adjective' not in valid_analyses:
2813
  adj_inflections = {}
2814
  is_good_adj = False
 
2815
  if not PATTERN_DE_AVAILABLE:
2816
  adj_inflections = {"info": "pattern.de not available."}
2817
  is_good_adj = True
 
2825
 
2826
  if is_good_adj:
2827
  odenet_senses = odenet_senses_by_pos.get('adjective', [])
 
2828
  if odenet_senses:
2829
+ if "info" not in odenet_senses[0] or not WN_AVAILABLE:
2830
  log(f" ✓ [IWNLP Fallback] Valid ADJECTIVE found: {lemma}")
2831
  valid_analyses['adjective'] = {
2832
  "lemma": adj_inflections.get("predicative", lemma),
2833
  "inflections": adj_inflections,
2834
+ "odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
 
 
 
 
 
 
 
2835
  }
2836
 
2837
  # --- 3. CHECK NON-INFLECTING POS (ADVERB) ---
2838
  if spacy_pos == "ADV":
2839
+ odenet_senses = _get_odenet_senses_by_pos_internal(word).get('adverb', [])
 
2840
  if odenet_senses:
2841
+ if "info" not in odenet_senses[0] or not WN_AVAILABLE:
2842
  log(f" ✓ [IWNLP Fallback] Valid ADVERB found: {word}")
2843
  valid_analyses['adverb'] = {
2844
  "lemma": word,
2845
  "inflections": {"base_form": word},
2846
+ "odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
 
 
 
 
 
 
 
2847
  }
2848
 
2849
  # --- 4. CHECK OTHER FUNCTION WORDS (e.g. "mein" -> DET) ---
 
 
2850
  FUNCTION_POS = {"DET", "PRON", "ADP", "AUX", "CCONJ", "SCONJ", "PART", "PUNCT", "SYM"}
2851
  if spacy_pos in FUNCTION_POS and not valid_analyses:
2852
  pos_key = spacy_pos.lower()
 
2854
  valid_analyses[pos_key] = {
2855
  "lemma": spacy_lemma,
2856
  "inflections": {"base_form": spacy_lemma},
2857
+ "odenet_senses": [],
2858
+ "spacy_analysis": {
2859
  "word": token.text, "lemma": token.lemma_,
2860
  "pos_UPOS": token.pos_, "pos_TAG": token.tag_,
2861
  "morphology": str(token.morph)
2862
  }
2863
  }
2864
 
2865
+ # --- 5. BUILD FINAL REPORT (V21 MODIFIED + VALIDATION) ---
2866
  for pos_key, analysis_data in valid_analyses.items():
2867
  lemma = analysis_data["lemma"]
2868
+ inflection_block = analysis_data["inflections"]
2869
+
2870
+ # --- E. VALIDATION FILTER ---
2871
+ is_valid = False
2872
+ if lemma.lower() == word_lower:
2873
+ is_valid = True
2874
+ log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word matches lemma.")
2875
 
2876
+ if not is_valid:
2877
+ # Check pattern.de's lexeme (for verbs)
2878
+ for form in inflection_block.get("lexeme", []):
2879
+ if form.lower() == word_lower:
2880
+ is_valid = True
2881
+ log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word found in pattern.de lexeme.")
2882
+ break
2883
+
2884
+ if not is_valid:
2885
+ # Check pattern.de's participles (for "abgeschnitten")
2886
+ for part_form in inflection_block.get("participles", {}).values():
2887
+ if part_form.lower() == word_lower:
2888
+ is_valid = True
2889
+ log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word found in pattern.de participles.")
2890
+ break
2891
+
2892
+ if not is_valid and pos_key == "adjective":
2893
+ # Check adjective forms
2894
+ if word_lower == inflection_block.get("predicative", "").lower() or \
2895
+ word_lower == inflection_block.get("comparative", "").lower() or \
2896
+ word_lower == inflection_block.get("superlative", "").lower():
2897
+ is_valid = True
2898
+ log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word matches adj comparison form.")
2899
+
2900
+ if not is_valid and pos_key == "noun":
2901
+ # Check noun forms
2902
+ if word_lower == inflection_block.get("singular", "").lower() or \
2903
+ word_lower == inflection_block.get("plural", "").lower():
2904
+ is_valid = True
2905
+ log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word matches noun singular/plural.")
2906
+
2907
+ if not is_valid and (pos_key == "adverb" or "spacy_analysis" in analysis_data):
2908
+ is_valid = True # Adverbs and Function Words are non-inflecting, always keep.
2909
+ log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because it is a non-inflecting word (ADV/FUNC).")
2910
+
2911
+ if is_valid:
2912
+ pos_report = {
2913
+ "inflections_pattern": inflection_block,
2914
+ # Use the new global helper
2915
+ "semantics_combined": _build_semantics_block_for_lemma(
2916
+ lemma,
2917
+ pos_key,
2918
+ top_n
2919
+ )
2920
+ }
2921
+ if "spacy_analysis" in analysis_data:
2922
+ pos_report["spacy_analysis"] = analysis_data["spacy_analysis"]
2923
+
2924
+ if pos_key not in final_result["analysis"]:
2925
+ final_result["analysis"][pos_key] = []
2926
+ final_result["analysis"][pos_key].append(pos_report)
2927
+ else:
2928
+ log(f"[DEBUG] IWNLP: DROPPING entry '{lemma}' ({pos_key}) because input word '{word}' was not found in its valid forms.")
2929
+ # --- END VALIDATION ---
2930
 
2931
  if not final_result["analysis"]:
2932
  return {} # No results