Spaces:
Running
Running
path checks for fallback methods
Browse files
app.py
CHANGED
|
@@ -2524,183 +2524,177 @@ def comprehensive_german_analysis(text: str, top_n_value: Optional[float] = 0) -
|
|
| 2524 |
}
|
| 2525 |
|
| 2526 |
# --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
|
| 2527 |
-
def _analyze_word_with_hanta(word: str, top_n_value:
|
| 2528 |
"""
|
| 2529 |
-
(
|
| 2530 |
-
|
| 2531 |
-
This function intelligently selects the best available engine:
|
| 2532 |
-
1. PRIMARY: Attempts to use the HanTa-led engine (V17) for maximum accuracy.
|
| 2533 |
-
2. FALLBACK: If HanTa is not available, it uses the spaCy-IWNLP-led
|
| 2534 |
-
engine (V16 logic from 'analyze_word_comprehensively') as a robust fallback.
|
| 2535 |
"""
|
| 2536 |
-
if not
|
| 2537 |
-
return {
|
| 2538 |
-
|
| 2539 |
top_n = int(top_n_value) if top_n_value is not None else 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2540 |
|
| 2541 |
-
|
| 2542 |
-
|
| 2543 |
-
|
| 2544 |
-
final_result: Dict[str, Any] = {
|
| 2545 |
-
"input_word": word,
|
| 2546 |
-
"analysis": {}
|
| 2547 |
-
}
|
| 2548 |
-
|
| 2549 |
-
try:
|
| 2550 |
-
hanta_tagger = hanta_get_tagger()
|
| 2551 |
-
if not hanta_tagger:
|
| 2552 |
-
raise Exception("HanTa Tagger failed to initialize.") # Will be caught and trigger fallback
|
| 2553 |
-
|
| 2554 |
-
# --- 1. Get All Grammatical Candidates (HanTa) ---
|
| 2555 |
-
hanta_tags = _hanta_get_candidates(word, hanta_tagger)
|
| 2556 |
-
if not hanta_tags:
|
| 2557 |
-
return {"info": f"No grammatical analysis found for '{word}'."}
|
| 2558 |
-
|
| 2559 |
-
# --- 2. Map Tags to POS Groups (with Adverb Heuristic) ---
|
| 2560 |
-
pos_groups_map = _hanta_map_tags_to_pos(hanta_tags)
|
| 2561 |
-
log(f"Found {len(pos_groups_map)} possible POS group(s): {list(pos_groups_map.keys())}")
|
| 2562 |
|
| 2563 |
-
|
| 2564 |
-
|
| 2565 |
-
|
| 2566 |
-
|
| 2567 |
-
|
| 2568 |
-
|
| 2569 |
-
|
| 2570 |
-
|
| 2571 |
-
|
| 2572 |
-
|
| 2573 |
-
|
| 2574 |
-
|
| 2575 |
-
|
| 2576 |
-
|
| 2577 |
-
|
| 2578 |
-
|
| 2579 |
-
|
| 2580 |
-
|
| 2581 |
-
|
| 2582 |
-
|
| 2583 |
-
|
| 2584 |
-
log(f"✓ VERIFIED {pos_group}: OdeNet is unavailable, proceeding without validation.")
|
| 2585 |
-
pos_odenet_senses = [] # Clear the placeholder
|
| 2586 |
-
else:
|
| 2587 |
-
log(f"✓ VERIFIED {pos_group}: OdeNet found {len(pos_odenet_senses)} sense(s).")
|
| 2588 |
-
|
| 2589 |
-
# --- 3c. Get Inflections (Pattern) ---
|
| 2590 |
-
inflection_report = {}
|
| 2591 |
-
if not PATTERN_DE_AVAILABLE:
|
| 2592 |
-
inflection_report = {"info": "pattern.de library not available. No inflections generated."}
|
| 2593 |
-
else:
|
| 2594 |
-
try:
|
| 2595 |
-
if pos_group == "noun":
|
| 2596 |
-
inflection_report = pattern_analyze_as_noun(lemma)
|
| 2597 |
-
elif pos_group == "verb":
|
| 2598 |
-
inflection_report = pattern_analyze_as_verb(lemma)
|
| 2599 |
-
elif pos_group == "adjective":
|
| 2600 |
-
inflection_report = pattern_analyze_as_adjective(lemma)
|
| 2601 |
-
elif pos_group == "adverb":
|
| 2602 |
-
inflection_report = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 2603 |
-
|
| 2604 |
-
if not pattern_is_good_analysis(inflection_report, pos_group) and pos_group != "adverb":
|
| 2605 |
-
log(f"⚠️ Warning: pattern.de generated a poor inflection table for {lemma} ({pos_group}).")
|
| 2606 |
-
inflection_report["warning"] = "Inflection table from pattern.de seems incomplete or invalid."
|
| 2607 |
-
except Exception as e:
|
| 2608 |
-
log(f"pattern.de inflection failed for {lemma} ({pos_group}): {e}")
|
| 2609 |
-
inflection_report = {"error": f"pattern.de failed: {e}", "traceback": traceback.format_exc()}
|
| 2610 |
-
|
| 2611 |
-
# --- 3d. Build Final Report Block ---
|
| 2612 |
-
final_result["analysis"][pos_group] = [{
|
| 2613 |
-
"hanta_analysis": {
|
| 2614 |
-
"detected_tags": sorted(list(specific_tags)),
|
| 2615 |
-
"lemma": lemma,
|
| 2616 |
-
"morphemes": [
|
| 2617 |
-
hanta_tagger.analyze(word.capitalize() if pos_group == 'noun' else word.lower(), taglevel=3)
|
| 2618 |
-
]
|
| 2619 |
-
},
|
| 2620 |
-
"inflections_pattern": inflection_report,
|
| 2621 |
-
"semantics_combined": _build_semantics_block_for_lemma(lemma, pos_group, top_n),
|
| 2622 |
-
}]
|
| 2623 |
-
|
| 2624 |
-
if not final_result["analysis"]:
|
| 2625 |
-
return {
|
| 2626 |
-
"input_word": word,
|
| 2627 |
-
"info": f"No valid, semantically-verified analysis found for '{word}'. It may be a typo or a function word."
|
| 2628 |
-
}
|
| 2629 |
|
| 2630 |
-
|
| 2631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2632 |
|
| 2633 |
-
|
| 2634 |
-
|
| 2635 |
-
traceback.print_exc()
|
| 2636 |
-
return {} # Signal failure
|
| 2637 |
|
| 2638 |
-
|
| 2639 |
-
|
| 2640 |
-
|
| 2641 |
-
|
| 2642 |
-
|
| 2643 |
-
|
| 2644 |
-
|
| 2645 |
-
|
| 2646 |
-
|
| 2647 |
-
|
| 2648 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2649 |
|
| 2650 |
-
|
| 2651 |
-
|
| 2652 |
-
|
| 2653 |
-
|
| 2654 |
-
|
| 2655 |
-
"
|
| 2656 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2657 |
|
| 2658 |
-
def _analyze_word_with_iwnlp(word: str, top_n_value:
|
| 2659 |
"""
|
| 2660 |
-
(FALLBACK ENGINE
|
| 2661 |
-
This
|
| 2662 |
-
|
| 2663 |
-
V19 UPDATE: This function *must* be modified to match the new
|
| 2664 |
-
output format: `analysis: { "pos_key": [ ...list... ] }`
|
| 2665 |
-
|
| 2666 |
-
(NON-CONTEXTUAL) Analyzes a single word for ALL its possible
|
| 2667 |
-
grammatical and semantic forms.
|
| 2668 |
-
|
| 2669 |
-
** Strategy: IWNLP Lemmas + spaCy POS + Pattern.de Validators**
|
| 2670 |
-
1. Get spaCy's primary POS (e.g., "ADV" for "heute").
|
| 2671 |
-
2. Get IWNLP's list of *lemmas* (e.g., "Lauf" -> ['Lauf', 'laufen']).
|
| 2672 |
-
3. Create a unique set of all possible lemmas from spaCy, IWNLP, and the word itself.
|
| 2673 |
-
4. Iterate this lemma set:
|
| 2674 |
-
- Try to analyze each lemma as NOUN (capitalized).
|
| 2675 |
-
- Try to analyze each lemma as VERB.
|
| 2676 |
-
- Try to analyze each lemma as ADJECTIVE.
|
| 2677 |
-
- Validate each with pattern_is_good_analysis AND by checking for OdeNet senses.
|
| 2678 |
-
5. After checking inflections, check if spaCy's POS was 'ADV'.
|
| 2679 |
-
If so, and OdeNet has 'r' senses, add an 'adverb' report.
|
| 2680 |
-
6. This finds all inflecting forms ("Lauf", "gut") AND non-inflecting
|
| 2681 |
-
forms ("heute") while rejecting artifacts ("klauf", "heutst").
|
| 2682 |
"""
|
| 2683 |
if not word or not word.strip():
|
| 2684 |
-
return {
|
| 2685 |
|
| 2686 |
if not IWNLP_AVAILABLE:
|
| 2687 |
-
return {
|
| 2688 |
|
| 2689 |
top_n = int(top_n_value) if top_n_value is not None else 0
|
| 2690 |
|
| 2691 |
-
print(f"\n[Word Encyclopedia]
|
| 2692 |
|
| 2693 |
final_result: Dict[str, Any] = {
|
| 2694 |
"input_word": word,
|
| 2695 |
"analysis": {}
|
| 2696 |
}
|
|
|
|
| 2697 |
|
| 2698 |
# --- Helper: Get OdeNet senses ---
|
| 2699 |
-
def
|
| 2700 |
"""
|
| 2701 |
(Internal helper for IWNLP fallback)
|
| 2702 |
-
|
| 2703 |
-
*** V18 FIX: OdeNet uses 'a' for BOTH Adjective and Adverb. ***
|
| 2704 |
"""
|
| 2705 |
senses_by_pos: Dict[str, List[Dict]] = {
|
| 2706 |
"noun": [], "verb": [], "adjective": [], "adverb": []
|
|
@@ -2712,68 +2706,36 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2712 |
"verb": [{"info": "OdeNet unavailable"}],
|
| 2713 |
"adjective": [{"info": "OdeNet unavailable"}],
|
| 2714 |
"adverb": [{"info": "OdeNet unavailable"}]}
|
| 2715 |
-
|
| 2716 |
try:
|
| 2717 |
all_senses = odenet_get_thesaurus_info(w).get("senses", [])
|
| 2718 |
for sense in all_senses:
|
| 2719 |
if "error" in sense: continue
|
| 2720 |
pos_tag = sense.get("pos")
|
| 2721 |
-
|
| 2722 |
if pos_tag == 'n':
|
| 2723 |
senses_by_pos["noun"].append(sense)
|
| 2724 |
elif pos_tag == 'v':
|
| 2725 |
senses_by_pos["verb"].append(sense)
|
| 2726 |
-
|
| 2727 |
-
# --- THIS IS THE CRITICAL FIX ---
|
| 2728 |
elif pos_tag == 'a':
|
| 2729 |
log(f"[IWNLP Fallback] Found OdeNet 'a' tag (Adj/Adv) for sense: {sense.get('definition', '...')[:30]}")
|
| 2730 |
senses_by_pos["adjective"].append(sense)
|
| 2731 |
senses_by_pos["adverb"].append(sense)
|
| 2732 |
-
# --- END OF FIX ---
|
| 2733 |
-
|
| 2734 |
except Exception as e:
|
| 2735 |
print(f"[Word Encyclopedia] OdeNet check failed: {e}")
|
| 2736 |
return senses_by_pos
|
| 2737 |
|
| 2738 |
-
# --- Helper: Build semantics block ---
|
| 2739 |
-
def _build_semantics(lemma, odenet_senses, top_n):
|
| 2740 |
-
conceptnet_relations = []
|
| 2741 |
-
if REQUESTS_AVAILABLE:
|
| 2742 |
-
try:
|
| 2743 |
-
conceptnet_result = conceptnet_get_relations(lemma, language='de')
|
| 2744 |
-
conceptnet_relations = conceptnet_result.get("relations", [])
|
| 2745 |
-
except Exception as e:
|
| 2746 |
-
conceptnet_relations = [{"error": str(e)}]
|
| 2747 |
-
|
| 2748 |
-
if top_n > 0:
|
| 2749 |
-
odenet_senses = odenet_senses[:top_n]
|
| 2750 |
-
conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
|
| 2751 |
-
conceptnet_relations = conceptnet_relations[:top_n]
|
| 2752 |
-
|
| 2753 |
-
return {
|
| 2754 |
-
"lemma": lemma,
|
| 2755 |
-
"odenet_senses": odenet_senses,
|
| 2756 |
-
"conceptnet_relations": conceptnet_relations
|
| 2757 |
-
}
|
| 2758 |
-
|
| 2759 |
# --- 1. GET ALL LEMMA CANDIDATES & SPACY POS ---
|
| 2760 |
try:
|
| 2761 |
iwnlp = iwnlp_get_pipeline()
|
| 2762 |
if not iwnlp:
|
| 2763 |
-
return {
|
| 2764 |
|
| 2765 |
doc = iwnlp(word)
|
| 2766 |
token = doc[0]
|
| 2767 |
|
| 2768 |
-
# Get spaCy's best POS guess
|
| 2769 |
spacy_pos = token.pos_ # e.g., "NOUN" for "Lauf", "ADV" for "heute"
|
| 2770 |
spacy_lemma = token.lemma_
|
| 2771 |
-
|
| 2772 |
-
# *** THIS IS THE FIX ***
|
| 2773 |
-
# Get IWNLP's lemma list (it only registers 'iwnlp_lemmas')
|
| 2774 |
iwnlp_lemmas_list = token._.iwnlp_lemmas or []
|
| 2775 |
|
| 2776 |
-
# Combine all possible lemmas
|
| 2777 |
all_lemmas = set(iwnlp_lemmas_list)
|
| 2778 |
all_lemmas.add(spacy_lemma)
|
| 2779 |
all_lemmas.add(word) # Add the word itself
|
|
@@ -2783,25 +2745,19 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2783 |
|
| 2784 |
except Exception as e:
|
| 2785 |
traceback.print_exc()
|
| 2786 |
-
return {
|
| 2787 |
|
| 2788 |
# --- 2. CHECK INFLECTING POSSIBILITIES FOR EACH LEMMA ---
|
| 2789 |
-
|
| 2790 |
-
# This dict will hold the *best* analysis for each POS
|
| 2791 |
-
# e.g., "gut" -> { 'adjective': {...}, 'noun': {...} }
|
| 2792 |
-
|
| 2793 |
valid_analyses: Dict[str, Dict[str, Any]] = {}
|
| 2794 |
-
|
| 2795 |
for lemma in all_lemmas:
|
| 2796 |
if not lemma: continue
|
| 2797 |
|
| 2798 |
-
odenet_senses_by_pos =
|
| 2799 |
|
| 2800 |
# --- Check NOUN ---
|
| 2801 |
if 'noun' not in valid_analyses:
|
| 2802 |
noun_inflections = {}
|
| 2803 |
is_good_noun = False
|
| 2804 |
-
|
| 2805 |
if not PATTERN_DE_AVAILABLE:
|
| 2806 |
noun_inflections = {"info": "pattern.de not available."}
|
| 2807 |
is_good_noun = True
|
|
@@ -2816,32 +2772,20 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2816 |
if is_good_noun:
|
| 2817 |
odenet_senses = odenet_senses_by_pos.get('noun', [])
|
| 2818 |
if not odenet_senses and lemma.lower() == word.lower():
|
| 2819 |
-
odenet_senses =
|
| 2820 |
-
|
| 2821 |
-
# We accept if (senses exist) OR (OdeNet is down and we can't check)
|
| 2822 |
if odenet_senses:
|
| 2823 |
-
|
| 2824 |
-
if "info" not in odenet_senses[0]:
|
| 2825 |
log(f" ✓ [IWNLP Fallback] Valid NOUN found: {lemma}")
|
| 2826 |
valid_analyses['noun'] = {
|
| 2827 |
"lemma": noun_inflections.get("base_form", lemma),
|
| 2828 |
"inflections": noun_inflections,
|
| 2829 |
-
"odenet_senses": odenet_senses
|
| 2830 |
-
}
|
| 2831 |
-
elif not WN_AVAILABLE: # OdeNet is down
|
| 2832 |
-
log(f" ✓ [IWNLP Fallback] Accepting NOUN (OdeNet unavailable): {lemma}")
|
| 2833 |
-
valid_analyses['noun'] = {
|
| 2834 |
-
"lemma": noun_inflections.get("base_form", lemma),
|
| 2835 |
-
"inflections": noun_inflections,
|
| 2836 |
-
"odenet_senses": [] # No senses to show
|
| 2837 |
}
|
| 2838 |
|
| 2839 |
-
|
| 2840 |
# --- Check VERB ---
|
| 2841 |
if 'verb' not in valid_analyses:
|
| 2842 |
verb_inflections = {}
|
| 2843 |
is_good_verb = False
|
| 2844 |
-
|
| 2845 |
if not PATTERN_DE_AVAILABLE:
|
| 2846 |
verb_inflections = {"info": "pattern.de not available."}
|
| 2847 |
is_good_verb = True
|
|
@@ -2855,28 +2799,19 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2855 |
|
| 2856 |
if is_good_verb:
|
| 2857 |
odenet_senses = odenet_senses_by_pos.get('verb', [])
|
| 2858 |
-
|
| 2859 |
if odenet_senses:
|
| 2860 |
-
if "info" not in odenet_senses[0]:
|
| 2861 |
log(f" ✓ [IWNLP Fallback] Valid VERB found: {lemma}")
|
| 2862 |
valid_analyses['verb'] = {
|
| 2863 |
"lemma": verb_inflections.get("infinitive", lemma),
|
| 2864 |
"inflections": verb_inflections,
|
| 2865 |
-
"odenet_senses": odenet_senses
|
| 2866 |
-
}
|
| 2867 |
-
elif not WN_AVAILABLE:
|
| 2868 |
-
log(f" ✓ [IWNLP Fallback] Accepting VERB (OdeNet unavailable): {lemma}")
|
| 2869 |
-
valid_analyses['verb'] = {
|
| 2870 |
-
"lemma": verb_inflections.get("infinitive", lemma),
|
| 2871 |
-
"inflections": verb_inflections,
|
| 2872 |
-
"odenet_senses": []
|
| 2873 |
}
|
| 2874 |
|
| 2875 |
# --- Check ADJECTIVE ---
|
| 2876 |
if 'adjective' not in valid_analyses:
|
| 2877 |
adj_inflections = {}
|
| 2878 |
is_good_adj = False
|
| 2879 |
-
|
| 2880 |
if not PATTERN_DE_AVAILABLE:
|
| 2881 |
adj_inflections = {"info": "pattern.de not available."}
|
| 2882 |
is_good_adj = True
|
|
@@ -2890,46 +2825,28 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2890 |
|
| 2891 |
if is_good_adj:
|
| 2892 |
odenet_senses = odenet_senses_by_pos.get('adjective', [])
|
| 2893 |
-
|
| 2894 |
if odenet_senses:
|
| 2895 |
-
if "info" not in odenet_senses[0]:
|
| 2896 |
log(f" ✓ [IWNLP Fallback] Valid ADJECTIVE found: {lemma}")
|
| 2897 |
valid_analyses['adjective'] = {
|
| 2898 |
"lemma": adj_inflections.get("predicative", lemma),
|
| 2899 |
"inflections": adj_inflections,
|
| 2900 |
-
"odenet_senses": odenet_senses
|
| 2901 |
-
}
|
| 2902 |
-
elif not WN_AVAILABLE:
|
| 2903 |
-
log(f" ✓ [IWNLP Fallback] Accepting ADJECTIVE (OdeNet unavailable): {lemma}")
|
| 2904 |
-
valid_analyses['adjective'] = {
|
| 2905 |
-
"lemma": adj_inflections.get("predicative", lemma),
|
| 2906 |
-
"inflections": adj_inflections,
|
| 2907 |
-
"odenet_senses": []
|
| 2908 |
}
|
| 2909 |
|
| 2910 |
# --- 3. CHECK NON-INFLECTING POS (ADVERB) ---
|
| 2911 |
if spacy_pos == "ADV":
|
| 2912 |
-
odenet_senses =
|
| 2913 |
-
|
| 2914 |
if odenet_senses:
|
| 2915 |
-
if "info" not in odenet_senses[0]:
|
| 2916 |
log(f" ✓ [IWNLP Fallback] Valid ADVERB found: {word}")
|
| 2917 |
valid_analyses['adverb'] = {
|
| 2918 |
"lemma": word,
|
| 2919 |
"inflections": {"base_form": word},
|
| 2920 |
-
"odenet_senses": odenet_senses
|
| 2921 |
-
}
|
| 2922 |
-
elif not WN_AVAILABLE:
|
| 2923 |
-
log(f" ✓ [IWNLP Fallback] Accepting ADVERB (OdeNet unavailable): {word}")
|
| 2924 |
-
valid_analyses['adverb'] = {
|
| 2925 |
-
"lemma": word,
|
| 2926 |
-
"inflections": {"base_form": word},
|
| 2927 |
-
"odenet_senses": []
|
| 2928 |
}
|
| 2929 |
|
| 2930 |
# --- 4. CHECK OTHER FUNCTION WORDS (e.g. "mein" -> DET) ---
|
| 2931 |
-
# We add this if spaCy found a function word AND we haven't found any
|
| 2932 |
-
# content-word analyses (which are more informative).
|
| 2933 |
FUNCTION_POS = {"DET", "PRON", "ADP", "AUX", "CCONJ", "SCONJ", "PART", "PUNCT", "SYM"}
|
| 2934 |
if spacy_pos in FUNCTION_POS and not valid_analyses:
|
| 2935 |
pos_key = spacy_pos.lower()
|
|
@@ -2937,25 +2854,79 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2937 |
valid_analyses[pos_key] = {
|
| 2938 |
"lemma": spacy_lemma,
|
| 2939 |
"inflections": {"base_form": spacy_lemma},
|
| 2940 |
-
"odenet_senses": [],
|
| 2941 |
-
"spacy_analysis": {
|
| 2942 |
"word": token.text, "lemma": token.lemma_,
|
| 2943 |
"pos_UPOS": token.pos_, "pos_TAG": token.tag_,
|
| 2944 |
"morphology": str(token.morph)
|
| 2945 |
}
|
| 2946 |
}
|
| 2947 |
|
| 2948 |
-
# --- 5. BUILD FINAL REPORT ---
|
| 2949 |
for pos_key, analysis_data in valid_analyses.items():
|
| 2950 |
lemma = analysis_data["lemma"]
|
| 2951 |
-
|
| 2952 |
-
|
| 2953 |
-
|
| 2954 |
-
|
| 2955 |
-
if
|
| 2956 |
-
|
|
|
|
| 2957 |
|
| 2958 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2959 |
|
| 2960 |
if not final_result["analysis"]:
|
| 2961 |
return {} # No results
|
|
|
|
| 2524 |
}
|
| 2525 |
|
| 2526 |
# --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
|
| 2527 |
+
def _analyze_word_with_hanta(word: str, top_n_value: Optional[float] = 0) -> Dict[str, Any]:
|
| 2528 |
"""
|
| 2529 |
+
(FALLBACK ENGINE 2) Analyzes a single word using HanTa + OdeNet + Pattern.
|
| 2530 |
+
This was the V18 engine. Returns {} on failure.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2531 |
"""
|
| 2532 |
+
if not HANTA_AVAILABLE:
|
| 2533 |
+
return {} # Signal failure
|
| 2534 |
+
|
| 2535 |
top_n = int(top_n_value) if top_n_value is not None else 0
|
| 2536 |
+
print(f"\n[Word Encyclopedia] Running V18 (HanTa) fallback for: \"{word}\"")
|
| 2537 |
+
final_result: Dict[str, Any] = {
|
| 2538 |
+
"input_word": word,
|
| 2539 |
+
"analysis": {}
|
| 2540 |
+
}
|
| 2541 |
+
word_lower = word.lower() # For validation
|
| 2542 |
+
|
| 2543 |
+
try:
|
| 2544 |
+
hanta_tagger = hanta_get_tagger()
|
| 2545 |
+
if not hanta_tagger:
|
| 2546 |
+
raise Exception("HanTa Tagger failed to initialize.")
|
| 2547 |
|
| 2548 |
+
hanta_tags = _hanta_get_candidates(word, hanta_tagger)
|
| 2549 |
+
if not hanta_tags:
|
| 2550 |
+
return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2551 |
|
| 2552 |
+
pos_groups_map = _hanta_map_tags_to_pos(hanta_tags)
|
| 2553 |
+
log(f"Found {len(pos_groups_map)} possible POS group(s): {list(pos_groups_map.keys())}")
|
| 2554 |
+
|
| 2555 |
+
for pos_group, specific_tags in pos_groups_map.items():
|
| 2556 |
+
print(f"--- Analyzing as: {pos_group.upper()} ---")
|
| 2557 |
+
|
| 2558 |
+
lemma = _hanta_get_lemma_for_pos(word, pos_group, hanta_tagger)
|
| 2559 |
+
log(f"Lemma for {pos_group} is: '{lemma}'")
|
| 2560 |
+
|
| 2561 |
+
all_odenet_senses = _get_odenet_senses_by_pos(lemma)
|
| 2562 |
+
pos_odenet_senses = all_odenet_senses.get(pos_group, [])
|
| 2563 |
+
|
| 2564 |
+
if not pos_odenet_senses:
|
| 2565 |
+
log(f"✗ REJECTED {pos_group}: OdeNet is available but has no '{pos_group}' senses for lemma '{lemma}'.")
|
| 2566 |
+
continue
|
| 2567 |
+
|
| 2568 |
+
if pos_odenet_senses and "info" in pos_odenet_senses[0]:
|
| 2569 |
+
log(f"✓ VERIFIED {pos_group}: OdeNet is unavailable, proceeding without validation.")
|
| 2570 |
+
pos_odenet_senses = []
|
| 2571 |
+
else:
|
| 2572 |
+
log(f"✓ VERIFIED {pos_group}: OdeNet found {len(pos_odenet_senses)} sense(s).")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2573 |
|
| 2574 |
+
# --- 1. Get Inflections (Pattern) ---
|
| 2575 |
+
inflection_report = {}
|
| 2576 |
+
if not PATTERN_DE_AVAILABLE:
|
| 2577 |
+
inflection_report = {"info": "pattern.de library not available. No inflections generated."}
|
| 2578 |
+
else:
|
| 2579 |
+
try:
|
| 2580 |
+
if pos_group == "noun":
|
| 2581 |
+
inflection_report = pattern_analyze_as_noun(lemma)
|
| 2582 |
+
elif pos_group == "verb":
|
| 2583 |
+
inflection_report = pattern_analyze_as_verb(lemma)
|
| 2584 |
+
elif pos_group == "adjective":
|
| 2585 |
+
inflection_report = pattern_analyze_as_adjective(lemma)
|
| 2586 |
+
elif pos_group == "adverb":
|
| 2587 |
+
inflection_report = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 2588 |
+
|
| 2589 |
+
if not pattern_is_good_analysis(inflection_report, pos_group) and pos_group != "adverb":
|
| 2590 |
+
log(f"⚠️ Warning: pattern.de generated a poor inflection table for {lemma} ({pos_group}).")
|
| 2591 |
+
inflection_report["warning"] = "Inflection table from pattern.de seems incomplete or invalid."
|
| 2592 |
+
except Exception as e:
|
| 2593 |
+
log(f"pattern.de inflection failed for {lemma} ({pos_group}): {e}")
|
| 2594 |
+
inflection_report = {"error": f"pattern.de failed: {e}", "traceback": traceback.format_exc()}
|
| 2595 |
|
| 2596 |
+
# --- 2. Build Semantics Block ---
|
| 2597 |
+
semantics_block = _build_semantics_block_for_lemma(lemma, pos_group, top_n)
|
|
|
|
|
|
|
| 2598 |
|
| 2599 |
+
# --- 3. Build Final Report Block ---
|
| 2600 |
+
pos_entry_report = {
|
| 2601 |
+
"hanta_analysis": {
|
| 2602 |
+
"detected_tags": sorted(list(specific_tags)),
|
| 2603 |
+
"lemma": lemma,
|
| 2604 |
+
"morphemes": [
|
| 2605 |
+
hanta_tagger.analyze(word.capitalize() if pos_group == 'noun' else word.lower(), taglevel=3)
|
| 2606 |
+
]
|
| 2607 |
+
},
|
| 2608 |
+
"inflections_pattern": inflection_report,
|
| 2609 |
+
"semantics_combined": semantics_block
|
| 2610 |
+
}
|
| 2611 |
+
|
| 2612 |
+
# --- 4. *** VALIDATION FILTER *** ---
|
| 2613 |
+
is_valid = False
|
| 2614 |
+
if lemma.lower() == word_lower:
|
| 2615 |
+
is_valid = True
|
| 2616 |
+
log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word matches lemma.")
|
| 2617 |
+
|
| 2618 |
+
if not is_valid:
|
| 2619 |
+
# Check pattern.de's lexeme (for verbs)
|
| 2620 |
+
for form in inflection_report.get("lexeme", []):
|
| 2621 |
+
if form.lower() == word_lower:
|
| 2622 |
+
is_valid = True
|
| 2623 |
+
log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word found in pattern.de lexeme.")
|
| 2624 |
+
break
|
| 2625 |
+
|
| 2626 |
+
if not is_valid:
|
| 2627 |
+
# Check pattern.de's participles (for "abgeschnitten")
|
| 2628 |
+
for part_form in inflection_report.get("participles", {}).values():
|
| 2629 |
+
if part_form.lower() == word_lower:
|
| 2630 |
+
is_valid = True
|
| 2631 |
+
log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word found in pattern.de participles.")
|
| 2632 |
+
break
|
| 2633 |
+
|
| 2634 |
+
if not is_valid and pos_group == "adjective":
|
| 2635 |
+
# Check adjective forms
|
| 2636 |
+
if word_lower == inflection_report.get("predicative", "").lower() or \
|
| 2637 |
+
word_lower == inflection_report.get("comparative", "").lower() or \
|
| 2638 |
+
word_lower == inflection_report.get("superlative", "").lower():
|
| 2639 |
+
is_valid = True
|
| 2640 |
+
log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word matches adj comparison form.")
|
| 2641 |
+
|
| 2642 |
+
if not is_valid and pos_group == "noun":
|
| 2643 |
+
# Check noun forms
|
| 2644 |
+
if word_lower == inflection_report.get("singular", "").lower() or \
|
| 2645 |
+
word_lower == inflection_report.get("plural", "").lower():
|
| 2646 |
+
is_valid = True
|
| 2647 |
+
log(f"[DEBUG] HanTa: KEEPING entry '{lemma}' ({pos_group}) because input word matches noun singular/plural.")
|
| 2648 |
+
|
| 2649 |
+
if not is_valid and pos_group == "adverb":
|
| 2650 |
+
is_valid = True # Adverbs are non-inflecting, always keep.
|
| 2651 |
|
| 2652 |
+
if is_valid:
|
| 2653 |
+
if pos_group not in final_result["analysis"]:
|
| 2654 |
+
final_result["analysis"][pos_group] = []
|
| 2655 |
+
final_result["analysis"][pos_group].append(pos_entry_report)
|
| 2656 |
+
else:
|
| 2657 |
+
log(f"[DEBUG] HanTa: DROPPING entry '{lemma}' ({pos_group}) because input word '{word}' was not found in its valid forms.")
|
| 2658 |
+
# --- END OF VALIDATION ---
|
| 2659 |
+
|
| 2660 |
+
if not final_result["analysis"]:
|
| 2661 |
+
return {} # No results
|
| 2662 |
+
|
| 2663 |
+
final_result["info"] = "Analysis performed by HanTa-led fallback engine."
|
| 2664 |
+
return final_result
|
| 2665 |
+
|
| 2666 |
+
except Exception as e:
|
| 2667 |
+
print(f"[Word Encyclopedia] HanTa FALLBACK Engine FAILED: {e}")
|
| 2668 |
+
traceback.print_exc()
|
| 2669 |
+
return {} # Signal failure
|
| 2670 |
|
| 2671 |
+
def _analyze_word_with_iwnlp(word: str, top_n_value: Optional[float] = 0) -> Dict[str, Any]:
|
| 2672 |
"""
|
| 2673 |
+
(FALLBACK ENGINE 3) Analyzes a single word using IWNLP + OdeNet + Pattern.
|
| 2674 |
+
This is the full V16/V18 logic, restored and with the new validation filter.
|
| 2675 |
+
Returns {} on failure.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2676 |
"""
|
| 2677 |
if not word or not word.strip():
|
| 2678 |
+
return {} # Use empty dict for "info"
|
| 2679 |
|
| 2680 |
if not IWNLP_AVAILABLE:
|
| 2681 |
+
return {} # Signal failure
|
| 2682 |
|
| 2683 |
top_n = int(top_n_value) if top_n_value is not None else 0
|
| 2684 |
|
| 2685 |
+
print(f"\n[Word Encyclopedia] Running IWNLP-fallback analysis for: \"{word}\" (top_n={top_n})")
|
| 2686 |
|
| 2687 |
final_result: Dict[str, Any] = {
|
| 2688 |
"input_word": word,
|
| 2689 |
"analysis": {}
|
| 2690 |
}
|
| 2691 |
+
word_lower = word.lower() # For validation
|
| 2692 |
|
| 2693 |
# --- Helper: Get OdeNet senses ---
|
| 2694 |
+
def _get_odenet_senses_by_pos_internal(w):
|
| 2695 |
"""
|
| 2696 |
(Internal helper for IWNLP fallback)
|
| 2697 |
+
OdeNet uses 'a' for BOTH Adjective and Adverb.
|
|
|
|
| 2698 |
"""
|
| 2699 |
senses_by_pos: Dict[str, List[Dict]] = {
|
| 2700 |
"noun": [], "verb": [], "adjective": [], "adverb": []
|
|
|
|
| 2706 |
"verb": [{"info": "OdeNet unavailable"}],
|
| 2707 |
"adjective": [{"info": "OdeNet unavailable"}],
|
| 2708 |
"adverb": [{"info": "OdeNet unavailable"}]}
|
|
|
|
| 2709 |
try:
|
| 2710 |
all_senses = odenet_get_thesaurus_info(w).get("senses", [])
|
| 2711 |
for sense in all_senses:
|
| 2712 |
if "error" in sense: continue
|
| 2713 |
pos_tag = sense.get("pos")
|
|
|
|
| 2714 |
if pos_tag == 'n':
|
| 2715 |
senses_by_pos["noun"].append(sense)
|
| 2716 |
elif pos_tag == 'v':
|
| 2717 |
senses_by_pos["verb"].append(sense)
|
|
|
|
|
|
|
| 2718 |
elif pos_tag == 'a':
|
| 2719 |
log(f"[IWNLP Fallback] Found OdeNet 'a' tag (Adj/Adv) for sense: {sense.get('definition', '...')[:30]}")
|
| 2720 |
senses_by_pos["adjective"].append(sense)
|
| 2721 |
senses_by_pos["adverb"].append(sense)
|
|
|
|
|
|
|
| 2722 |
except Exception as e:
|
| 2723 |
print(f"[Word Encyclopedia] OdeNet check failed: {e}")
|
| 2724 |
return senses_by_pos
|
| 2725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2726 |
# --- 1. GET ALL LEMMA CANDIDATES & SPACY POS ---
|
| 2727 |
try:
|
| 2728 |
iwnlp = iwnlp_get_pipeline()
|
| 2729 |
if not iwnlp:
|
| 2730 |
+
return {} # Signal failure
|
| 2731 |
|
| 2732 |
doc = iwnlp(word)
|
| 2733 |
token = doc[0]
|
| 2734 |
|
|
|
|
| 2735 |
spacy_pos = token.pos_ # e.g., "NOUN" for "Lauf", "ADV" for "heute"
|
| 2736 |
spacy_lemma = token.lemma_
|
|
|
|
|
|
|
|
|
|
| 2737 |
iwnlp_lemmas_list = token._.iwnlp_lemmas or []
|
| 2738 |
|
|
|
|
| 2739 |
all_lemmas = set(iwnlp_lemmas_list)
|
| 2740 |
all_lemmas.add(spacy_lemma)
|
| 2741 |
all_lemmas.add(word) # Add the word itself
|
|
|
|
| 2745 |
|
| 2746 |
except Exception as e:
|
| 2747 |
traceback.print_exc()
|
| 2748 |
+
return {} # Signal failure
|
| 2749 |
|
| 2750 |
# --- 2. CHECK INFLECTING POSSIBILITIES FOR EACH LEMMA ---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2751 |
valid_analyses: Dict[str, Dict[str, Any]] = {}
|
|
|
|
| 2752 |
for lemma in all_lemmas:
|
| 2753 |
if not lemma: continue
|
| 2754 |
|
| 2755 |
+
odenet_senses_by_pos = _get_odenet_senses_by_pos_internal(lemma)
|
| 2756 |
|
| 2757 |
# --- Check NOUN ---
|
| 2758 |
if 'noun' not in valid_analyses:
|
| 2759 |
noun_inflections = {}
|
| 2760 |
is_good_noun = False
|
|
|
|
| 2761 |
if not PATTERN_DE_AVAILABLE:
|
| 2762 |
noun_inflections = {"info": "pattern.de not available."}
|
| 2763 |
is_good_noun = True
|
|
|
|
| 2772 |
if is_good_noun:
|
| 2773 |
odenet_senses = odenet_senses_by_pos.get('noun', [])
|
| 2774 |
if not odenet_senses and lemma.lower() == word.lower():
|
| 2775 |
+
odenet_senses = _get_odenet_senses_by_pos_internal(lemma.capitalize()).get('noun', [])
|
|
|
|
|
|
|
| 2776 |
if odenet_senses:
|
| 2777 |
+
if "info" not in odenet_senses[0] or not WN_AVAILABLE:
|
|
|
|
| 2778 |
log(f" ✓ [IWNLP Fallback] Valid NOUN found: {lemma}")
|
| 2779 |
valid_analyses['noun'] = {
|
| 2780 |
"lemma": noun_inflections.get("base_form", lemma),
|
| 2781 |
"inflections": noun_inflections,
|
| 2782 |
+
"odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2783 |
}
|
| 2784 |
|
|
|
|
| 2785 |
# --- Check VERB ---
|
| 2786 |
if 'verb' not in valid_analyses:
|
| 2787 |
verb_inflections = {}
|
| 2788 |
is_good_verb = False
|
|
|
|
| 2789 |
if not PATTERN_DE_AVAILABLE:
|
| 2790 |
verb_inflections = {"info": "pattern.de not available."}
|
| 2791 |
is_good_verb = True
|
|
|
|
| 2799 |
|
| 2800 |
if is_good_verb:
|
| 2801 |
odenet_senses = odenet_senses_by_pos.get('verb', [])
|
|
|
|
| 2802 |
if odenet_senses:
|
| 2803 |
+
if "info" not in odenet_senses[0] or not WN_AVAILABLE:
|
| 2804 |
log(f" ✓ [IWNLP Fallback] Valid VERB found: {lemma}")
|
| 2805 |
valid_analyses['verb'] = {
|
| 2806 |
"lemma": verb_inflections.get("infinitive", lemma),
|
| 2807 |
"inflections": verb_inflections,
|
| 2808 |
+
"odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2809 |
}
|
| 2810 |
|
| 2811 |
# --- Check ADJECTIVE ---
|
| 2812 |
if 'adjective' not in valid_analyses:
|
| 2813 |
adj_inflections = {}
|
| 2814 |
is_good_adj = False
|
|
|
|
| 2815 |
if not PATTERN_DE_AVAILABLE:
|
| 2816 |
adj_inflections = {"info": "pattern.de not available."}
|
| 2817 |
is_good_adj = True
|
|
|
|
| 2825 |
|
| 2826 |
if is_good_adj:
|
| 2827 |
odenet_senses = odenet_senses_by_pos.get('adjective', [])
|
|
|
|
| 2828 |
if odenet_senses:
|
| 2829 |
+
if "info" not in odenet_senses[0] or not WN_AVAILABLE:
|
| 2830 |
log(f" ✓ [IWNLP Fallback] Valid ADJECTIVE found: {lemma}")
|
| 2831 |
valid_analyses['adjective'] = {
|
| 2832 |
"lemma": adj_inflections.get("predicative", lemma),
|
| 2833 |
"inflections": adj_inflections,
|
| 2834 |
+
"odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2835 |
}
|
| 2836 |
|
| 2837 |
# --- 3. CHECK NON-INFLECTING POS (ADVERB) ---
|
| 2838 |
if spacy_pos == "ADV":
|
| 2839 |
+
odenet_senses = _get_odenet_senses_by_pos_internal(word).get('adverb', [])
|
|
|
|
| 2840 |
if odenet_senses:
|
| 2841 |
+
if "info" not in odenet_senses[0] or not WN_AVAILABLE:
|
| 2842 |
log(f" ✓ [IWNLP Fallback] Valid ADVERB found: {word}")
|
| 2843 |
valid_analyses['adverb'] = {
|
| 2844 |
"lemma": word,
|
| 2845 |
"inflections": {"base_form": word},
|
| 2846 |
+
"odenet_senses": [] if "info" in odenet_senses[0] else odenet_senses
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2847 |
}
|
| 2848 |
|
| 2849 |
# --- 4. CHECK OTHER FUNCTION WORDS (e.g. "mein" -> DET) ---
|
|
|
|
|
|
|
| 2850 |
FUNCTION_POS = {"DET", "PRON", "ADP", "AUX", "CCONJ", "SCONJ", "PART", "PUNCT", "SYM"}
|
| 2851 |
if spacy_pos in FUNCTION_POS and not valid_analyses:
|
| 2852 |
pos_key = spacy_pos.lower()
|
|
|
|
| 2854 |
valid_analyses[pos_key] = {
|
| 2855 |
"lemma": spacy_lemma,
|
| 2856 |
"inflections": {"base_form": spacy_lemma},
|
| 2857 |
+
"odenet_senses": [],
|
| 2858 |
+
"spacy_analysis": {
|
| 2859 |
"word": token.text, "lemma": token.lemma_,
|
| 2860 |
"pos_UPOS": token.pos_, "pos_TAG": token.tag_,
|
| 2861 |
"morphology": str(token.morph)
|
| 2862 |
}
|
| 2863 |
}
|
| 2864 |
|
| 2865 |
+
# --- 5. BUILD FINAL REPORT (V21 MODIFIED + VALIDATION) ---
|
| 2866 |
for pos_key, analysis_data in valid_analyses.items():
|
| 2867 |
lemma = analysis_data["lemma"]
|
| 2868 |
+
inflection_block = analysis_data["inflections"]
|
| 2869 |
+
|
| 2870 |
+
# --- E. VALIDATION FILTER ---
|
| 2871 |
+
is_valid = False
|
| 2872 |
+
if lemma.lower() == word_lower:
|
| 2873 |
+
is_valid = True
|
| 2874 |
+
log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word matches lemma.")
|
| 2875 |
|
| 2876 |
+
if not is_valid:
|
| 2877 |
+
# Check pattern.de's lexeme (for verbs)
|
| 2878 |
+
for form in inflection_block.get("lexeme", []):
|
| 2879 |
+
if form.lower() == word_lower:
|
| 2880 |
+
is_valid = True
|
| 2881 |
+
log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word found in pattern.de lexeme.")
|
| 2882 |
+
break
|
| 2883 |
+
|
| 2884 |
+
if not is_valid:
|
| 2885 |
+
# Check pattern.de's participles (for "abgeschnitten")
|
| 2886 |
+
for part_form in inflection_block.get("participles", {}).values():
|
| 2887 |
+
if part_form.lower() == word_lower:
|
| 2888 |
+
is_valid = True
|
| 2889 |
+
log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word found in pattern.de participles.")
|
| 2890 |
+
break
|
| 2891 |
+
|
| 2892 |
+
if not is_valid and pos_key == "adjective":
|
| 2893 |
+
# Check adjective forms
|
| 2894 |
+
if word_lower == inflection_block.get("predicative", "").lower() or \
|
| 2895 |
+
word_lower == inflection_block.get("comparative", "").lower() or \
|
| 2896 |
+
word_lower == inflection_block.get("superlative", "").lower():
|
| 2897 |
+
is_valid = True
|
| 2898 |
+
log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word matches adj comparison form.")
|
| 2899 |
+
|
| 2900 |
+
if not is_valid and pos_key == "noun":
|
| 2901 |
+
# Check noun forms
|
| 2902 |
+
if word_lower == inflection_block.get("singular", "").lower() or \
|
| 2903 |
+
word_lower == inflection_block.get("plural", "").lower():
|
| 2904 |
+
is_valid = True
|
| 2905 |
+
log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because input word matches noun singular/plural.")
|
| 2906 |
+
|
| 2907 |
+
if not is_valid and (pos_key == "adverb" or "spacy_analysis" in analysis_data):
|
| 2908 |
+
is_valid = True # Adverbs and Function Words are non-inflecting, always keep.
|
| 2909 |
+
log(f"[DEBUG] IWNLP: KEEPING entry '{lemma}' ({pos_key}) because it is a non-inflecting word (ADV/FUNC).")
|
| 2910 |
+
|
| 2911 |
+
if is_valid:
|
| 2912 |
+
pos_report = {
|
| 2913 |
+
"inflections_pattern": inflection_block,
|
| 2914 |
+
# Use the new global helper
|
| 2915 |
+
"semantics_combined": _build_semantics_block_for_lemma(
|
| 2916 |
+
lemma,
|
| 2917 |
+
pos_key,
|
| 2918 |
+
top_n
|
| 2919 |
+
)
|
| 2920 |
+
}
|
| 2921 |
+
if "spacy_analysis" in analysis_data:
|
| 2922 |
+
pos_report["spacy_analysis"] = analysis_data["spacy_analysis"]
|
| 2923 |
+
|
| 2924 |
+
if pos_key not in final_result["analysis"]:
|
| 2925 |
+
final_result["analysis"][pos_key] = []
|
| 2926 |
+
final_result["analysis"][pos_key].append(pos_report)
|
| 2927 |
+
else:
|
| 2928 |
+
log(f"[DEBUG] IWNLP: DROPPING entry '{lemma}' ({pos_key}) because input word '{word}' was not found in its valid forms.")
|
| 2929 |
+
# --- END VALIDATION ---
|
| 2930 |
|
| 2931 |
if not final_result["analysis"]:
|
| 2932 |
return {} # No results
|