cstr committed on
Commit
934468e
·
verified ·
1 Parent(s): 9de49fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +332 -55
app.py CHANGED
@@ -150,6 +150,24 @@ except ImportError:
150
  print("Install with: pip install HanTa")
151
  print("="*70)
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # ============================================================================
154
  # 2. SHARED GLOBALS & CONFIG
155
  # ============================================================================
@@ -174,6 +192,10 @@ CONCEPTNET_LOCK = threading.Lock()
174
  HANTA_TAGGER_INSTANCE: Optional[HanoverTagger] = None
175
  HANTA_TAGGER_LOCK = threading.Lock()
176
 
 
 
 
 
177
  # --- Helper ---
178
  def _html_wrap(content: str, line_height: str = "2.0") -> str:
179
  """Wraps displaCy HTML in a consistent, scrollable div."""
@@ -1624,11 +1646,7 @@ def _build_semantics(lemma: str, odenet_senses: List[Dict], top_n: int) -> Dict[
1624
  }
1625
 
1626
  # ============================================================================
1627
- # 6d. WIKTIONARY DATABASE LOGIC (NEW PRIMARY ENGINE)
1628
- # ============================================================================
1629
-
1630
- # ============================================================================
1631
- # 6d. WIKTIONARY DATABASE LOGIC (NEW PRIMARY ENGINE)
1632
  # ============================================================================
1633
 
1634
  def wiktionary_download_db() -> bool:
@@ -1913,7 +1931,7 @@ def _wiktionary_format_semantics_block(
1913
 
1914
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1915
  """
1916
- (NEW PRIMARY ENGINE) Analyzes a word using the Wiktionary DB.
1917
  Returns {} on failure to signal dispatcher to fall back.
1918
  """
1919
  final_result: Dict[str, Any] = {
@@ -2033,7 +2051,7 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2033
  }
2034
  }
2035
 
2036
- # --- E. *** YOUR NEW VALIDATION FILTER (Corrected) *** ---
2037
  is_valid = False
2038
  is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
2039
 
@@ -2069,6 +2087,199 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
2069
  final_result["info"] = f"Analysis from Wiktionary (Primary Engine). Found {len(wiktionary_reports)} matching entries, kept {sum(len(v) for v in final_result.get('analysis', {}).values())}."
2070
  return final_result
2071
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2072
  # ============================================================================
2073
  # 7. CONSOLIDATED ANALYZER LOGIC
2074
  # ============================================================================
@@ -2079,7 +2290,7 @@ def comprehensive_german_analysis(text: str, top_n_value: Optional[float] = 0) -
2079
  """
2080
  (CONTEXTUAL) Combines NLP tools for a deep analysis of German text.
2081
 
2082
- ** V19 UPDATE: ** Reads the new list-based, multi-engine output
2083
  from `analyze_word_encyclopedia` and combines all senses for ranking.
2084
  """
2085
 
@@ -2256,7 +2467,6 @@ def comprehensive_german_analysis(text: str, top_n_value: Optional[float] = 0) -
2256
  if key.endswith("_senses") and nlp_de:
2257
  ranked_senses = []
2258
  for sense in semantic_analysis[key]:
2259
- # ... (your existing re-ranking code) ...
2260
  if "error" in sense: continue
2261
  definition = sense.get("definition", "")
2262
  relevance = 0.0
@@ -2278,7 +2488,6 @@ def comprehensive_german_analysis(text: str, top_n_value: Optional[float] = 0) -
2278
  # ConceptNet Relations
2279
  if "conceptnet_relations" in semantic_analysis and nlp_de:
2280
  ranked_relations = []
2281
- # ... (your existing re-ranking code) ...
2282
  for rel in semantic_analysis["conceptnet_relations"]:
2283
  if "error" in rel: continue
2284
  text_to_score = rel.get('surface') or rel.get('other_node', '')
@@ -2400,7 +2609,7 @@ def _analyze_word_with_hanta(word: str, top_n_value: int) -> Dict[str, Any]:
2400
  inflection_report = {"error": f"pattern.de failed: {e}", "traceback": traceback.format_exc()}
2401
 
2402
  # --- 3d. Build Final Report Block ---
2403
- final_result["analysis"][pos_group] = {
2404
  "hanta_analysis": {
2405
  "detected_tags": sorted(list(specific_tags)),
2406
  "lemma": lemma,
@@ -2408,9 +2617,9 @@ def _analyze_word_with_hanta(word: str, top_n_value: int) -> Dict[str, Any]:
2408
  hanta_tagger.analyze(word.capitalize() if pos_group == 'noun' else word.lower(), taglevel=3)
2409
  ]
2410
  },
2411
- "inflections": inflection_report,
2412
- "semantics": _build_semantics(lemma, pos_odenet_senses, top_n)
2413
- }
2414
 
2415
  if not final_result["analysis"]:
2416
  return {
@@ -2430,7 +2639,6 @@ def _analyze_word_with_hanta(word: str, top_n_value: int) -> Dict[str, Any]:
2430
  if IWNLP_AVAILABLE:
2431
  try:
2432
  log("--- Dispatcher: HanTa not found or failed. Attempting IWNLP Fallback Engine ---")
2433
- # We call your existing V16 function, which we just made robust in Step 2.
2434
  result = _analyze_word_with_iwnlp(word, top_n_value)
2435
  result["info"] = result.get("info", "") + " (Analysis performed by IWNLP-based fallback engine)"
2436
  return result
@@ -2739,20 +2947,15 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2739
 
2740
  # --- 5. BUILD FINAL REPORT ---
2741
  for pos_key, analysis_data in valid_analyses.items():
 
2742
  pos_report = {
2743
  "inflections_pattern": analysis_data["inflections"],
2744
- "semantics_combined": _build_semantics(
2745
- analysis_data["lemma"],
2746
- analysis_data["odenet_senses"],
2747
- top_n
2748
- )
2749
  }
2750
- # Add spaCy analysis if it was included
2751
  if "spacy_analysis" in analysis_data:
2752
  pos_report["spacy_analysis"] = analysis_data["spacy_analysis"]
2753
 
2754
- # Wrap it in a list
2755
- final_result["analysis"][pos_key] = [pos_report] # <--- THE CHANGE
2756
 
2757
  if not final_result["analysis"]:
2758
  return {} # No results
@@ -2761,14 +2964,16 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
2761
  return final_result
2762
 
2763
 
2764
- # --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
2765
 
2766
- # --- THIS IS THE NEW PUBLIC DISPATCHER FUNCTION ---
2767
  # --- THIS IS THE NEW PUBLIC DISPATCHER FUNCTION ---
2768
  def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0, engine_choice: str = "wiktionary") -> Dict[str, Any]:
2769
  """
2770
- (PUBLIC DISPATCHER V21) Analyzes a single word using the selected engine
2771
  as a starting point, then automatically falls back if no results are found.
 
 
2772
  """
2773
  if not word or not word.strip():
2774
  return {"info": "Please enter a word."}
@@ -2781,37 +2986,41 @@ def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0, engin
2781
  log(f"\n[Word Encyclopedia] User selected engine: '{engine_choice}' for word: '{word}'")
2782
 
2783
  try:
2784
- # --- 1. Try Wiktionary (if selected) ---
2785
  if engine_choice == "wiktionary":
2786
- log(f"[DEBUG] V21 Dispatcher: Trying Wiktionary (Primary) for '{word}'...")
2787
  result = _analyze_word_with_wiktionary(word, top_n)
2788
  if result and result.get("analysis"):
2789
  return result # Success
2790
  info_log.append("Wiktionary found no results.")
2791
- log(f"[DEBUG] V21 Dispatcher: Wiktionary found no results for '{word}'. Falling back to HanTa...")
2792
-
2793
- # --- 2. Try HanTa (if selected or as fallback) ---
2794
- # This block runs if:
2795
- # a) User selected "hanta" OR
2796
- # b) User selected "wiktionary" and it found nothing
2797
- if engine_choice == "hanta" or (engine_choice == "wiktionary" and not result.get("analysis")):
2798
- log(f"[DEBUG] V21 Dispatcher: Trying HanTa (Fallback 1) for '{word}'...")
 
 
 
 
 
 
 
2799
  result = _analyze_word_with_hanta(word, top_n)
2800
  if result and result.get("analysis"):
2801
- result["info"] = f"Analysis from HanTa (Fallback 1). {(' '.join(info_log))}"
2802
  return result # Success
2803
  info_log.append("HanTa found no results.")
2804
- log(f"[DEBUG] V21 Dispatcher: HanTa found no results for '{word}'. Falling back to IWNLP...")
2805
 
2806
- # --- 3. Try IWNLP (if selected or as fallback) ---
2807
- # This block runs if:
2808
- # a) User selected "iwnlp" OR
2809
- # b) The previous engines were tried and all failed (result['analysis'] is still empty)
2810
  if engine_choice == "iwnlp" or (not result.get("analysis")):
2811
- log(f"[DEBUG] V21 Dispatcher: Trying IWNLP (Fallback 2) for '{word}'...")
2812
  result = _analyze_word_with_iwnlp(word, top_n)
2813
  if result and result.get("analysis"):
2814
- result["info"] = f"Analysis from IWNLP (Fallback 2). {(' '.join(info_log))}"
2815
  return result # Success
2816
  info_log.append("IWNLP found no results.")
2817
 
@@ -2825,7 +3034,7 @@ def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0, engin
2825
  }
2826
 
2827
  # --- No engines found anything ---
2828
- log(f"[DEBUG] V21 Dispatcher: All engines failed to find results for '{word}'.")
2829
  return {
2830
  "input_word": word,
2831
  "info": f"No analysis found. All engines failed. ({' '.join(info_log)})"
@@ -3023,7 +3232,7 @@ def create_combined_tab():
3023
  )
3024
 
3025
  def create_word_encyclopedia_tab():
3026
- """--- NEW: Creates the UI for the NON-CONTEXTUAL Word Analyzer tab ---"""
3027
  gr.Markdown("# 📖 Word Encyclopedia (Non-Contextual)")
3028
  gr.Markdown("This tool analyzes a **single word** for *all possible* grammatical and semantic forms. It finds ambiguities (e.g., 'Lauf' as noun and verb) and groups all data by Part-of-Speech.")
3029
 
@@ -3042,39 +3251,37 @@ def create_word_encyclopedia_tab():
3042
  interactive=True
3043
  )
3044
 
3045
- # --- THIS IS THE NEW UI ELEMENT ---
3046
  engine_radio = gr.Radio(
3047
- label="Select Analysis Engine",
3048
  choices=[
3049
  ("Wiktionary (Default)", "wiktionary"),
3050
- ("HanTa (Fallback 1)", "hanta"),
3051
- ("IWNLP (Fallback 2)", "iwnlp")
 
3052
  ],
3053
  value="wiktionary",
3054
  interactive=True
3055
  )
3056
- # --- END OF NEW UI ELEMENT ---
3057
 
3058
  analyze_button = gr.Button("Analyze Word", variant="primary")
3059
 
3060
  output = gr.JSON(label="Word Encyclopedia Analysis (JSON)")
3061
 
3062
- # --- UPDATE THE CLICK FUNCTION ---
3063
  analyze_button.click(
3064
  fn=analyze_word_encyclopedia,
3065
- # Add 'engine_radio' to the inputs
3066
  inputs=[word_input, top_n_number, engine_radio],
3067
  outputs=[output],
3068
  api_name="analyze_word"
3069
  )
3070
 
3071
- # Update the examples to include the radio button
3072
  gr.Examples(
3073
  [["Lauf", 3, "wiktionary"],
3074
  ["See", 0, "wiktionary"],
3075
  ["schnell", 3, "wiktionary"],
3076
  ["heute", 0, "wiktionary"],
3077
- ["heute", 0, "hanta"]], # Example to show a different engine
3078
  inputs=[word_input, top_n_number, engine_radio],
3079
  outputs=[output],
3080
  fn=analyze_word_encyclopedia
@@ -3105,6 +3312,61 @@ def create_wiktionary_tab():
3105
  inputs=[word_input], outputs=[output], fn=lambda word: _analyze_word_with_wiktionary(word, 0)
3106
  )
3107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3108
  def create_hanta_tab():
3109
  """Creates the UI for the standalone HanTa Engine tab."""
3110
  gr.Markdown("# 🤖 HanTa Lookup (Raw Engine)")
@@ -3185,6 +3447,9 @@ def create_consolidated_interface():
3185
 
3186
  with gr.Tab("🔬 Engine: IWNLP-spaCy (DE)"):
3187
  create_iwnlp_tab()
 
 
 
3188
 
3189
  # --- Standalone Component Tabs ---
3190
  with gr.Tab("📚 Component: Inflections (DE)"):
@@ -3237,6 +3502,18 @@ if __name__ == "__main__":
3237
  print(f"✗ FAILED to initialize Wiktionary: {e}")
3238
  print("--- Wiktionary Done ---\n")
3239
 
 
 
 
 
 
 
 
 
 
 
 
 
3240
  # --- 4. Initialize HanTa Tagger ---
3241
  print("--- Initializing HanTa Tagger ---")
3242
  if HANTA_AVAILABLE:
 
150
  print("Install with: pip install HanTa")
151
  print("="*70)
152
 
153
+ # --- DWDSmor Import ---
154
+ DWDSMOR_AVAILABLE = False
155
+ DwdsmorLemmatizerClass = object # Dummy definition
156
+ try:
157
+ import dwdsmor
158
+ import dwdsmor.spacy # Test this import
159
+ DWDSMOR_AVAILABLE = True
160
+ print("✓ Successfully imported dwdsmor")
161
+ except ImportError as e:
162
+ DWDSMOR_AVAILABLE = False
163
+ print("="*70)
164
+ print(f"WARNING: `dwdsmor` or a dependency failed to import: {e}")
165
+ print("The DWDSmor engine will not be available.")
166
+ print("On macOS, run: brew install sfst")
167
+ print("On Debian/Ubuntu, run: apt-get install sfst")
168
+ print("Then, run: pip install dwdsmor")
169
+ print("="*70)
170
+
171
  # ============================================================================
172
  # 2. SHARED GLOBALS & CONFIG
173
  # ============================================================================
 
192
  HANTA_TAGGER_INSTANCE: Optional[HanoverTagger] = None
193
  HANTA_TAGGER_LOCK = threading.Lock()
194
 
195
+ # --- DWDSmor Cache & Lock ---
196
+ DWDSMOR_LEMMATIZER: Optional[Any] = None
197
+ DWDSMOR_LEMMATIZER_LOCK = threading.Lock()
198
+
199
  # --- Helper ---
200
  def _html_wrap(content: str, line_height: str = "2.0") -> str:
201
  """Wraps displaCy HTML in a consistent, scrollable div."""
 
1646
  }
1647
 
1648
  # ============================================================================
1649
+ # 6d. WIKTIONARY DATABASE LOGIC (PRIMARY ENGINE)
 
 
 
 
1650
  # ============================================================================
1651
 
1652
  def wiktionary_download_db() -> bool:
 
1931
 
1932
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1933
  """
1934
+ (PRIMARY ENGINE) Analyzes a word using the Wiktionary DB.
1935
  Returns {} on failure to signal dispatcher to fall back.
1936
  """
1937
  final_result: Dict[str, Any] = {
 
2051
  }
2052
  }
2053
 
2054
+ # --- E. VALIDATION FILTER ---
2055
  is_valid = False
2056
  is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
2057
 
 
2087
  final_result["info"] = f"Analysis from Wiktionary (Primary Engine). Found {len(wiktionary_reports)} matching entries, kept {sum(len(v) for v in final_result.get('analysis', {}).values())}."
2088
  return final_result
2089
 
2090
+ # ============================================================================
2091
+ # 6e. SHARED SEMANTIC HELPER
2092
+ # ============================================================================
2093
+
2094
+ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int) -> Dict[str, Any]:
2095
+ """
2096
+ (REUSABLE HELPER)
2097
+ Fetches OdeNet and ConceptNet data for a given lemma and POS.
2098
+ """
2099
+ log(f"[DEBUG] Building semantics for lemma='{lemma}', pos='{pos_key}'")
2100
+
2101
+ # 1. Get OdeNet senses for this lemma + POS
2102
+ odenet_senses = []
2103
+ if WN_AVAILABLE:
2104
+ try:
2105
+ senses_by_pos = _get_odenet_senses_by_pos(lemma)
2106
+ odenet_senses_raw = senses_by_pos.get(pos_key, [])
2107
+
2108
+ # Filter out placeholder
2109
+ if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
2110
+ odenet_senses = odenet_senses_raw
2111
+ except Exception as e:
2112
+ log(f"[DEBUG] OdeNet lookup failed for {lemma} ({pos_key}): {e}")
2113
+
2114
+ # 2. Get ConceptNet relations for this lemma
2115
+ conceptnet_relations = []
2116
+ if REQUESTS_AVAILABLE:
2117
+ try:
2118
+ conceptnet_result = conceptnet_get_relations(lemma, language='de')
2119
+ conceptnet_relations = conceptnet_result.get("relations", [])
2120
+ except Exception as e:
2121
+ conceptnet_relations = [{"error": str(e)}]
2122
+
2123
+ # 3. Apply top_n limit
2124
+ if top_n > 0:
2125
+ odenet_senses = odenet_senses[:top_n]
2126
+ conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
2127
+ conceptnet_relations = conceptnet_relations[:top_n]
2128
+
2129
+ return {
2130
+ "lemma": lemma,
2131
+ "wiktionary_senses": [], # This block is for non-Wiktionary engines
2132
+ "odenet_senses": odenet_senses,
2133
+ "conceptnet_relations": conceptnet_relations,
2134
+ "wiktionary_synonyms": [],
2135
+ "wiktionary_antonyms": []
2136
+ }
2137
+
2138
+ # ============================================================================
2139
+ # 6f. DWDSMOR ENGINE (NEW FALLBACK 1)
2140
+ # ============================================================================
2141
+
2142
+ def dwdsmor_get_lemmatizer() -> Optional[Any]: # Return type is 'sfst.Transducer'
2143
+ """
2144
+ Thread-safe function to get a single instance of the DWDSmor analyzer.
2145
+ It will automatically download/cache the 'open' automata from Hugging Face Hub.
2146
+ """
2147
+ global DWDSMOR_LEMMATIZER
2148
+ if not DWDSMOR_AVAILABLE:
2149
+ raise ImportError("dwdsmor library is not installed.")
2150
+
2151
+ if DWDSMOR_LEMMATIZER:
2152
+ return DWDSMOR_LEMMATIZER
2153
+
2154
+ with DWDSMOR_LEMMATIZER_LOCK:
2155
+ if DWDSMOR_LEMMATIZER:
2156
+ return DWDSMOR_LEMMATIZER
2157
+
2158
+ try:
2159
+ print("Initializing DWDSmor lemmatizer (loading automata)...")
2160
+
2161
+ # --- THIS IS THE FIX ---
2162
+ # Use the correct API from dwdsmor's own tools (analysis.py)
2163
+ # This will find and download the HF repo automatically
2164
+ from dwdsmor import automaton
2165
+ automata = automaton.automata()
2166
+ analyzer = automata.analyzer("lemma") # Use the 'lemma' automaton
2167
+ # --- END OF FIX ---
2168
+
2169
+ # Force the traversal to actually run by converting to a list.
2170
+ print("[DEBUG] DWDSmor: Running warm-up call...")
2171
+ _ = list(analyzer.analyze("Test", join_tags=True))
2172
+
2173
+ print("✓ DWDSmor lemmatizer initialized successfully.")
2174
+ DWDSMOR_LEMMATIZER = analyzer
2175
+ return DWDSMOR_LEMMATIZER
2176
+ except Exception as e:
2177
+ print(f"✗ CRITICAL: Failed to initialize DWDSmor: {e}")
2178
+ traceback.print_exc()
2179
+ return None
2180
+
2181
+ def _dwdsmor_map_pos_key(dwdsmor_pos: str) -> str:
2182
+ """Maps DWDSmor POS tags to our internal keys."""
2183
+ if dwdsmor_pos == "V": return "verb"
2184
+ if dwdsmor_pos == "NN": return "noun"
2185
+ if dwdsmor_pos == "NPROP": return "noun" # Proper Noun
2186
+ if dwdsmor_pos == "ADJ": return "adjective"
2187
+ if dwdsmor_pos == "ADV": return "adverb"
2188
+ return dwdsmor_pos.lower() # Fallback for others
2189
+
2190
+ def _analyze_word_with_dwdsmor(word: str, top_n: int) -> Dict[str, Any]:
2191
+ """
2192
+ (FALLBACK ENGINE 1) Analyzes a single word using DWDSmor + Pattern + Semantics.
2193
+ Returns {} on failure.
2194
+ """
2195
+ if not DWDSMOR_AVAILABLE:
2196
+ return {} # Signal failure
2197
+
2198
+ print(f"\n[Word Encyclopedia] Running V21 (DWDSmor) engine for: \"{word}\"")
2199
+ final_result: Dict[str, Any] = {
2200
+ "input_word": word,
2201
+ "analysis": {}
2202
+ }
2203
+
2204
+ try:
2205
+ analyzer = dwdsmor_get_lemmatizer()
2206
+ if not analyzer:
2207
+ raise Exception("DWDSmor lemmatizer failed to initialize.")
2208
+
2209
+ analyses = list(analyzer.analyze(word, join_tags=True))
2210
+
2211
+ if not analyses:
2212
+ return {} # No results
2213
+
2214
+ log(f"[DEBUG] DWDSmor: Found {len(analyses)} potential analyses.")
2215
+
2216
+ processed_lemmas_pos: Set[Tuple[str, str]] = set()
2217
+
2218
+ for analysis in analyses:
2219
+
2220
+ # --- THIS IS THE FIX ---
2221
+ # The 'Traversal' object from analyzer.analyze() uses:
2222
+ # .analysis -> for the lemma string (e.g., "Haus")
2223
+ # .pos -> for the POS tag (e.g., "NN")
2224
+ # .spec -> for the full analysis string
2225
+ if not analysis.analysis or not analysis.pos:
2226
+ continue
2227
+
2228
+ lemma = analysis.analysis # Use .analysis, not .lemma
2229
+ pos_key = _dwdsmor_map_pos_key(analysis.pos)
2230
+ # --- END OF FIX ---
2231
+
2232
+ if (lemma, pos_key) in processed_lemmas_pos:
2233
+ continue
2234
+ processed_lemmas_pos.add((lemma, pos_key))
2235
+
2236
+ log(f"--- Analyzing DWDSmor path: lemma='{lemma}', pos='{pos_key}' ---")
2237
+
2238
+ # --- 1. Get Inflections (Pattern) ---
2239
+ pattern_block = {}
2240
+ if PATTERN_DE_AVAILABLE:
2241
+ try:
2242
+ if pos_key == "noun":
2243
+ pattern_block = pattern_analyze_as_noun(lemma)
2244
+ elif pos_key == "verb":
2245
+ pattern_block = pattern_analyze_as_verb(lemma)
2246
+ elif pos_key == "adjective":
2247
+ pattern_block = pattern_analyze_as_adjective(lemma)
2248
+ elif pos_key == "adverb":
2249
+ pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
2250
+ except Exception as e:
2251
+ pattern_block = {"error": f"Pattern.de analysis for {pos_key}('{lemma}') failed: {e}"}
2252
+
2253
+ # --- 2. Build Semantics Block ---
2254
+ semantics_block = _build_semantics_block_for_lemma(lemma, pos_key, top_n)
2255
+
2256
+ # --- 3. Build Final Report Block ---
2257
+ pos_entry_report = {
2258
+ "dwdsmor_analysis": {
2259
+ "lemma": lemma,
2260
+ "pos": analysis.pos,
2261
+ "analysis_string": analysis.spec, # .spec is the full string
2262
+ "source": "dwdsmor"
2263
+ },
2264
+ "inflections_pattern": pattern_block,
2265
+ "semantics_combined": semantics_block
2266
+ }
2267
+
2268
+ if pos_key not in final_result["analysis"]:
2269
+ final_result["analysis"][pos_key] = []
2270
+ final_result["analysis"][pos_key].append(pos_entry_report)
2271
+
2272
+ if not final_result["analysis"]:
2273
+ return {} # No valid paths found
2274
+
2275
+ final_result["info"] = "Analysis performed by DWDSmor-led engine."
2276
+ return final_result
2277
+
2278
+ except Exception as e:
2279
+ print(f"[Word Encyclopedia] DWDSmor Engine FAILED: {e}")
2280
+ traceback.print_exc()
2281
+ return {} # Signal failure
2282
+
2283
  # ============================================================================
2284
  # 7. CONSOLIDATED ANALYZER LOGIC
2285
  # ============================================================================
 
2290
  """
2291
  (CONTEXTUAL) Combines NLP tools for a deep analysis of German text.
2292
 
2293
+ Reads the list-based, multi-engine output
2294
  from `analyze_word_encyclopedia` and combines all senses for ranking.
2295
  """
2296
 
 
2467
  if key.endswith("_senses") and nlp_de:
2468
  ranked_senses = []
2469
  for sense in semantic_analysis[key]:
 
2470
  if "error" in sense: continue
2471
  definition = sense.get("definition", "")
2472
  relevance = 0.0
 
2488
  # ConceptNet Relations
2489
  if "conceptnet_relations" in semantic_analysis and nlp_de:
2490
  ranked_relations = []
 
2491
  for rel in semantic_analysis["conceptnet_relations"]:
2492
  if "error" in rel: continue
2493
  text_to_score = rel.get('surface') or rel.get('other_node', '')
 
2609
  inflection_report = {"error": f"pattern.de failed: {e}", "traceback": traceback.format_exc()}
2610
 
2611
  # --- 3d. Build Final Report Block ---
2612
+ final_result["analysis"][pos_group] = [{
2613
  "hanta_analysis": {
2614
  "detected_tags": sorted(list(specific_tags)),
2615
  "lemma": lemma,
 
2617
  hanta_tagger.analyze(word.capitalize() if pos_group == 'noun' else word.lower(), taglevel=3)
2618
  ]
2619
  },
2620
+ "inflections_pattern": inflection_report,
2621
+ "semantics_combined": _build_semantics_block_for_lemma(lemma, pos_group, top_n),
2622
+ }]
2623
 
2624
  if not final_result["analysis"]:
2625
  return {
 
2639
  if IWNLP_AVAILABLE:
2640
  try:
2641
  log("--- Dispatcher: HanTa not found or failed. Attempting IWNLP Fallback Engine ---")
 
2642
  result = _analyze_word_with_iwnlp(word, top_n_value)
2643
  result["info"] = result.get("info", "") + " (Analysis performed by IWNLP-based fallback engine)"
2644
  return result
 
2947
 
2948
  # --- 5. BUILD FINAL REPORT ---
2949
  for pos_key, analysis_data in valid_analyses.items():
2950
+ lemma = analysis_data["lemma"]
2951
  pos_report = {
2952
  "inflections_pattern": analysis_data["inflections"],
2953
+ "semantics_combined": _build_semantics_block_for_lemma(lemma, pos_key, top_n)
 
 
 
 
2954
  }
 
2955
  if "spacy_analysis" in analysis_data:
2956
  pos_report["spacy_analysis"] = analysis_data["spacy_analysis"]
2957
 
2958
+ final_result["analysis"][pos_key] = [pos_report] # Wrap in list
 
2959
 
2960
  if not final_result["analysis"]:
2961
  return {} # No results
 
2964
  return final_result
2965
 
2966
 
2967
+ # --- 7b. Word Encyclopedia (Non-Contextual) Analyzer ---
2968
 
2969
+ # --- PUBLIC DISPATCHER FUNCTION ---
2970
  # --- THIS IS THE NEW PUBLIC DISPATCHER FUNCTION ---
2971
  def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0, engine_choice: str = "wiktionary") -> Dict[str, Any]:
2972
  """
2973
+ (PUBLIC DISPATCHER V22) Analyzes a single word using the selected engine
2974
  as a starting point, then automatically falls back if no results are found.
2975
+
2976
+ Chain: Wiktionary -> DWDSmor -> HanTa -> IWNLP
2977
  """
2978
  if not word or not word.strip():
2979
  return {"info": "Please enter a word."}
 
2986
  log(f"\n[Word Encyclopedia] User selected engine: '{engine_choice}' for word: '{word}'")
2987
 
2988
  try:
2989
+ # --- 1. Try Wiktionary ---
2990
  if engine_choice == "wiktionary":
2991
+ log(f"[DEBUG] V22 Dispatcher: Trying Wiktionary (Primary) for '{word}'...")
2992
  result = _analyze_word_with_wiktionary(word, top_n)
2993
  if result and result.get("analysis"):
2994
  return result # Success
2995
  info_log.append("Wiktionary found no results.")
2996
+ log(f"[DEBUG] V22 Dispatcher: Wiktionary found no results. Falling back to DWDSmor...")
2997
+
2998
+ # --- 2. Try DWDSmor (NEW) ---
2999
+ if engine_choice == "dwdsmor" or (engine_choice == "wiktionary" and not result.get("analysis")):
3000
+ log(f"[DEBUG] V22 Dispatcher: Trying DWDSmor (Fallback 1) for '{word}'...")
3001
+ result = _analyze_word_with_dwdsmor(word, top_n)
3002
+ if result and result.get("analysis"):
3003
+ result["info"] = f"Analysis from DWDSmor (Fallback 1). {(' '.join(info_log))}"
3004
+ return result # Success
3005
+ info_log.append("DWDSmor found no results.")
3006
+ log(f"[DEBUG] V22 Dispatcher: DWDSmor found no results. Falling back to HanTa...")
3007
+
3008
+ # --- 3. Try HanTa ---
3009
+ if engine_choice == "hanta" or (not result.get("analysis")):
3010
+ log(f"[DEBUG] V22 Dispatcher: Trying HanTa (Fallback 2) for '{word}'...")
3011
  result = _analyze_word_with_hanta(word, top_n)
3012
  if result and result.get("analysis"):
3013
+ result["info"] = f"Analysis from HanTa (Fallback 2). {(' '.join(info_log))}"
3014
  return result # Success
3015
  info_log.append("HanTa found no results.")
3016
+ log(f"[DEBUG] V22 Dispatcher: HanTa found no results. Falling back to IWNLP...")
3017
 
3018
+ # --- 4. Try IWNLP ---
 
 
 
3019
  if engine_choice == "iwnlp" or (not result.get("analysis")):
3020
+ log(f"[DEBUG] V22 Dispatcher: Trying IWNLP (Fallback 3) for '{word}'...")
3021
  result = _analyze_word_with_iwnlp(word, top_n)
3022
  if result and result.get("analysis"):
3023
+ result["info"] = f"Analysis from IWNLP (Fallback 3). {(' '.join(info_log))}"
3024
  return result # Success
3025
  info_log.append("IWNLP found no results.")
3026
 
 
3034
  }
3035
 
3036
  # --- No engines found anything ---
3037
+ log(f"[DEBUG] V22 Dispatcher: All engines failed to find results for '{word}'.")
3038
  return {
3039
  "input_word": word,
3040
  "info": f"No analysis found. All engines failed. ({' '.join(info_log)})"
 
3232
  )
3233
 
3234
  def create_word_encyclopedia_tab():
3235
+ """--- UI for the NON-CONTEXTUAL Word Analyzer tab ---"""
3236
  gr.Markdown("# 📖 Word Encyclopedia (Non-Contextual)")
3237
  gr.Markdown("This tool analyzes a **single word** for *all possible* grammatical and semantic forms. It finds ambiguities (e.g., 'Lauf' as noun and verb) and groups all data by Part-of-Speech.")
3238
 
 
3251
  interactive=True
3252
  )
3253
 
3254
+ # --- ADD DWDSMOR TO THE RADIO BUTTONS ---
3255
  engine_radio = gr.Radio(
3256
+ label="Select Analysis Engine (will auto-fallback)",
3257
  choices=[
3258
  ("Wiktionary (Default)", "wiktionary"),
3259
+ ("DWDSmor (New)", "dwdsmor"),
3260
+ ("HanTa (Fallback 2)", "hanta"),
3261
+ ("IWNLP (Fallback 3)", "iwnlp")
3262
  ],
3263
  value="wiktionary",
3264
  interactive=True
3265
  )
3266
+ # --- END OF CHANGE ---
3267
 
3268
  analyze_button = gr.Button("Analyze Word", variant="primary")
3269
 
3270
  output = gr.JSON(label="Word Encyclopedia Analysis (JSON)")
3271
 
 
3272
  analyze_button.click(
3273
  fn=analyze_word_encyclopedia,
 
3274
  inputs=[word_input, top_n_number, engine_radio],
3275
  outputs=[output],
3276
  api_name="analyze_word"
3277
  )
3278
 
 
3279
  gr.Examples(
3280
  [["Lauf", 3, "wiktionary"],
3281
  ["See", 0, "wiktionary"],
3282
  ["schnell", 3, "wiktionary"],
3283
  ["heute", 0, "wiktionary"],
3284
+ ["gebildet", 0, "dwdsmor"]], # Example to show the new engine
3285
  inputs=[word_input, top_n_number, engine_radio],
3286
  outputs=[output],
3287
  fn=analyze_word_encyclopedia
 
3312
  inputs=[word_input], outputs=[output], fn=lambda word: _analyze_word_with_wiktionary(word, 0)
3313
  )
3314
 
3315
+ def create_dwdsmor_tab():
3316
+ """Creates the UI for the standalone DWDSmor lookup tab."""
3317
+ gr.Markdown("# 🏛️ DWDSmor Morphology (Raw Engine)")
3318
+ gr.Markdown("Directly query the `dwdsmor` FST-based engine. This is a high-precision morphological analyzer.")
3319
+
3320
+ def dwdsmor_raw_analysis(word):
3321
+ """Wrapper to get raw DWDSmor analysis as JSON."""
3322
+ if not DWDSMOR_AVAILABLE:
3323
+ return {"error": "DWDSmor library not installed."}
3324
+ try:
3325
+ analyzer = dwdsmor_get_lemmatizer()
3326
+ if not analyzer:
3327
+ return {"error": "DWDSmor lemmatizer failed to initialize."}
3328
+
3329
+ # --- THIS IS THE FIX ---
3330
+ # The analyzer.analyze() returns a Traversal object, which is iterable
3331
+ analyses = list(analyzer.analyze(word, join_tags=True))
3332
+ # --- END OF FIX ---
3333
+
3334
+ if not analyses:
3335
+ return {"info": f"No analysis found for '{word}'."}
3336
+
3337
+ # Convert Traversal objects to plain dicts for JSON output
3338
+ results = []
3339
+ for analysis in analyses:
3340
+ results.append({
3341
+ "lemma": analysis.analysis, # In this object, .analysis is the lemma
3342
+ "pos": analysis.pos,
3343
+ "analysis_string": analysis.spec, # .spec is the full string
3344
+ "tags": analysis.tags
3345
+ })
3346
+ return {"input_word": word, "analyses": results}
3347
+ except Exception as e:
3348
+ return {"error": str(e), "traceback": traceback.format_exc()}
3349
+
3350
+ with gr.Column():
3351
+ word_input = gr.Textbox(
3352
+ label="Single German Word",
3353
+ placeholder="e.g., gebildet, schnell, Häuser"
3354
+ )
3355
+ analyze_button = gr.Button("Analyze Word with DWDSmor", variant="primary")
3356
+
3357
+ output = gr.JSON(label="DWDSmor Raw Analysis (JSON)")
3358
+
3359
+ analyze_button.click(
3360
+ fn=dwdsmor_raw_analysis,
3361
+ inputs=[word_input],
3362
+ outputs=[output],
3363
+ api_name="dwdsmor_lookup"
3364
+ )
3365
+ gr.Examples(
3366
+ [["gebildet"], ["schnell"], ["Häuser"], ["gehe"]],
3367
+ inputs=[word_input], outputs=[output], fn=dwdsmor_raw_analysis
3368
+ )
3369
+
3370
  def create_hanta_tab():
3371
  """Creates the UI for the standalone HanTa Engine tab."""
3372
  gr.Markdown("# 🤖 HanTa Lookup (Raw Engine)")
 
3447
 
3448
  with gr.Tab("🔬 Engine: IWNLP-spaCy (DE)"):
3449
  create_iwnlp_tab()
3450
+
3451
+ with gr.Tab("🏛️ Engine: DWDSmor (DE)"):
3452
+ create_dwdsmor_tab()
3453
 
3454
  # --- Standalone Component Tabs ---
3455
  with gr.Tab("📚 Component: Inflections (DE)"):
 
3502
  print(f"✗ FAILED to initialize Wiktionary: {e}")
3503
  print("--- Wiktionary Done ---\n")
3504
 
3505
+ # --- Initialize DWDSmor ---
3506
+ print("--- Initializing DWDSmor Lemmatizer ---")
3507
+ if DWDSMOR_AVAILABLE:
3508
+ try:
3509
+ dwdsmor_get_lemmatizer() # Call the function to load the model
3510
+ except Exception as e:
3511
+ print(f"✗ FAILED to start DWDSmor: {e}")
3512
+ print(" 'Word Encyclopedia' DWDSmor engine will fail.")
3513
+ else:
3514
+ print("INFO: DWDSmor library not available, skipping lemmatizer.")
3515
+ print("--- DWDSmor Done ---\n")
3516
+
3517
  # --- 4. Initialize HanTa Tagger ---
3518
  print("--- Initializing HanTa Tagger ---")
3519
  if HANTA_AVAILABLE: