Spaces:
Running
Running
Drop the results path if the queried word is not even present in the inflections data
Browse files
app.py
CHANGED
|
@@ -1837,20 +1837,32 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
|
|
| 1837 |
|
| 1838 |
def _wiktionary_format_semantics_block(
|
| 1839 |
wikt_report: Dict[str, Any],
|
|
|
|
| 1840 |
top_n: int
|
| 1841 |
) -> Dict[str, Any]:
|
| 1842 |
"""
|
| 1843 |
Combines Wiktionary senses with OdeNet/ConceptNet senses,
|
| 1844 |
-
using the
|
| 1845 |
"""
|
| 1846 |
|
| 1847 |
# --- THIS IS THE FIX ---
|
| 1848 |
-
#
|
| 1849 |
-
#
|
| 1850 |
-
# e.g., for the "heute" entry, this will be "heute".
|
| 1851 |
-
semantic_lemma = wikt_report.get("lemma", "")
|
| 1852 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1853 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1854 |
# --- END OF FIX ---
|
| 1855 |
|
| 1856 |
# 1. Get Wiktionary senses (from the original report)
|
|
@@ -1872,7 +1884,7 @@ def _wiktionary_format_semantics_block(
|
|
| 1872 |
if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
|
| 1873 |
odenet_senses = odenet_senses_raw
|
| 1874 |
except Exception as e:
|
| 1875 |
-
log(f"OdeNet lookup failed for {semantic_lemma} ({pos_key}): {e}")
|
| 1876 |
|
| 1877 |
# 3. Get ConceptNet relations for the *semantic_lemma*
|
| 1878 |
conceptnet_relations = []
|
|
@@ -1922,17 +1934,24 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1922 |
if iwnlp:
|
| 1923 |
doc = iwnlp(word)
|
| 1924 |
token = doc[0]
|
| 1925 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1926 |
spacy_lemma_hint = token.lemma_
|
| 1927 |
-
log(f"Wiktionary Priority Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
|
| 1928 |
except Exception as e:
|
| 1929 |
-
log(f"Wiktionary Priority Hint: spaCy/IWNLP failed: {e}")
|
| 1930 |
|
| 1931 |
# --- 2. FIND ALL WIKTIONARY ENTRIES ---
|
| 1932 |
try:
|
| 1933 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
| 1934 |
except Exception as e:
|
| 1935 |
-
log(f"Wiktionary query failed: {e}")
|
| 1936 |
return {} # Signal failure
|
| 1937 |
|
| 1938 |
if not wiktionary_reports:
|
|
@@ -1945,11 +1964,11 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1945 |
|
| 1946 |
# Priority 1: Exact POS match with spaCy hint
|
| 1947 |
if spacy_pos_hint and wikt_pos == spacy_pos_hint:
|
| 1948 |
-
|
| 1949 |
-
|
| 1950 |
-
|
| 1951 |
-
|
| 1952 |
-
|
| 1953 |
# Priority 2: Input word is the lemma (e.g., "Haus" -> "Haus")
|
| 1954 |
if wikt_lemma.lower() == word.lower():
|
| 1955 |
return 3
|
|
@@ -1958,59 +1977,96 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1958 |
return 4
|
| 1959 |
|
| 1960 |
wiktionary_reports.sort(key=get_priority_score)
|
| 1961 |
-
log(f"Wiktionary: Sorted entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
|
| 1962 |
|
| 1963 |
|
| 1964 |
-
# --- 4. BUILD THE FINAL REPORT (PATH-PURE) ---
|
|
|
|
|
|
|
| 1965 |
for wikt_report in wiktionary_reports:
|
| 1966 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1967 |
lemma = wikt_report.get("lemma", word)
|
|
|
|
| 1968 |
|
| 1969 |
-
# --- A. Build
|
| 1970 |
-
# This is the pure path. "heuen" entry looks up "heuen". "heute" entry looks up "heute".
|
| 1971 |
-
semantics_block = _wiktionary_format_semantics_block(wikt_report, top_n)
|
| 1972 |
-
|
| 1973 |
-
# --- B. Build Wiktionary Inflection Block ---
|
| 1974 |
inflections_wikt_block = {
|
| 1975 |
"base_form": lemma,
|
| 1976 |
"forms_list": wikt_report.get("forms", []),
|
| 1977 |
"source": "wiktionary"
|
| 1978 |
}
|
| 1979 |
|
| 1980 |
-
# ---
|
| 1981 |
pattern_block = {}
|
| 1982 |
if PATTERN_DE_AVAILABLE:
|
| 1983 |
try:
|
| 1984 |
-
if pos_key == "noun":
|
| 1985 |
pattern_block = pattern_analyze_as_noun(lemma)
|
| 1986 |
-
elif pos_key == "verb":
|
| 1987 |
-
|
| 1988 |
-
|
| 1989 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1990 |
elif pos_key == "adverb":
|
| 1991 |
pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 1992 |
except Exception as e:
|
| 1993 |
pattern_block = {"error": f"Pattern.de analysis for {pos_key}('{lemma}') failed: {e}"}
|
| 1994 |
|
| 1995 |
-
# ---
|
|
|
|
|
|
|
|
|
|
| 1996 |
pos_entry_report = {
|
| 1997 |
"inflections_wiktionary": inflections_wikt_block,
|
| 1998 |
"inflections_pattern": pattern_block,
|
| 1999 |
"semantics_combined": semantics_block,
|
| 2000 |
"wiktionary_metadata": {
|
| 2001 |
-
"pos_title":
|
| 2002 |
"pronunciation": wikt_report.get("sounds"),
|
| 2003 |
"examples": wikt_report.get("examples")
|
| 2004 |
}
|
| 2005 |
}
|
| 2006 |
|
| 2007 |
-
#
|
| 2008 |
-
|
| 2009 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2010 |
|
| 2011 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2012 |
|
| 2013 |
-
final_result["info"] = f"Analysis from Wiktionary (Primary Engine). Found {len(wiktionary_reports)} matching
|
| 2014 |
return final_result
|
| 2015 |
|
| 2016 |
# ============================================================================
|
|
@@ -2708,58 +2764,57 @@ def _analyze_word_with_iwnlp(word: str, top_n_value: int) -> Dict[str, Any]:
|
|
| 2708 |
# --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
|
| 2709 |
|
| 2710 |
# --- THIS IS THE NEW PUBLIC DISPATCHER FUNCTION ---
|
| 2711 |
-
def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0) -> Dict[str, Any]:
|
| 2712 |
"""
|
| 2713 |
-
(PUBLIC DISPATCHER
|
| 2714 |
|
| 2715 |
-
|
| 2716 |
-
1. PRIMARY: Wiktionary DB (Accurate, pre-compiled data)
|
| 2717 |
-
2. FALLBACK 1: HanTa-led engine (Good heuristics)
|
| 2718 |
-
3. FALLBACK 2: IWNLP-led engine (Different heuristics)
|
| 2719 |
"""
|
| 2720 |
if not word or not word.strip():
|
| 2721 |
return {"info": "Please enter a word."}
|
| 2722 |
|
| 2723 |
word = word.strip()
|
| 2724 |
top_n = int(top_n_value) if top_n_value is not None else 0
|
|
|
|
| 2725 |
|
| 2726 |
-
|
| 2727 |
-
if WIKTIONARY_AVAILABLE:
|
| 2728 |
-
wikt_result = _analyze_word_with_wiktionary(word, top_n)
|
| 2729 |
-
if wikt_result and wikt_result.get("analysis"):
|
| 2730 |
-
log("V19 Dispatcher: Returning Wiktionary result.")
|
| 2731 |
-
return wikt_result
|
| 2732 |
-
elif WIKTIONARY_AVAILABLE:
|
| 2733 |
-
log("V19 Dispatcher: Wiktionary is available but found no results.")
|
| 2734 |
-
else:
|
| 2735 |
-
log("V19 Dispatcher: Wiktionary failed to initialize, falling back.")
|
| 2736 |
|
| 2737 |
-
|
| 2738 |
-
|
| 2739 |
-
|
| 2740 |
-
|
| 2741 |
-
|
| 2742 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2743 |
else:
|
| 2744 |
-
|
| 2745 |
|
| 2746 |
-
|
| 2747 |
-
|
| 2748 |
-
|
| 2749 |
-
|
| 2750 |
-
|
| 2751 |
-
|
| 2752 |
-
|
| 2753 |
-
|
| 2754 |
|
| 2755 |
-
#
|
| 2756 |
-
|
| 2757 |
-
|
| 2758 |
-
|
| 2759 |
-
|
| 2760 |
-
|
| 2761 |
-
|
| 2762 |
-
|
| 2763 |
|
| 2764 |
|
| 2765 |
# ============================================================================
|
|
@@ -2955,40 +3010,136 @@ def create_combined_tab():
|
|
| 2955 |
def create_word_encyclopedia_tab():
|
| 2956 |
"""--- NEW: Creates the UI for the NON-CONTEXTUAL Word Analyzer tab ---"""
|
| 2957 |
gr.Markdown("# π Word Encyclopedia (Non-Contextual)")
|
| 2958 |
-
gr.Markdown("This tool analyzes a **single word** for *all possible* grammatical and semantic forms. It
|
|
|
|
| 2959 |
with gr.Column():
|
| 2960 |
word_input = gr.Textbox(
|
| 2961 |
label="Single German Word",
|
| 2962 |
-
placeholder="e.g., Lauf, See, schnell"
|
| 2963 |
-
)
|
| 2964 |
-
top_n_number = gr.Number(
|
| 2965 |
-
label="Limit Semantic Senses per POS (0 for all)",
|
| 2966 |
-
value=0,
|
| 2967 |
-
step=1,
|
| 2968 |
-
minimum=0,
|
| 2969 |
-
interactive=True
|
| 2970 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2971 |
analyze_button = gr.Button("Analyze Word", variant="primary")
|
| 2972 |
|
| 2973 |
output = gr.JSON(label="Word Encyclopedia Analysis (JSON)")
|
| 2974 |
|
|
|
|
| 2975 |
analyze_button.click(
|
| 2976 |
fn=analyze_word_encyclopedia,
|
| 2977 |
-
inputs
|
|
|
|
| 2978 |
outputs=[output],
|
| 2979 |
api_name="analyze_word"
|
| 2980 |
)
|
| 2981 |
|
|
|
|
| 2982 |
gr.Examples(
|
| 2983 |
-
[["Lauf", 3],
|
| 2984 |
-
["See", 0],
|
| 2985 |
-
["schnell", 3],
|
| 2986 |
-
["
|
| 2987 |
-
|
|
|
|
| 2988 |
outputs=[output],
|
| 2989 |
fn=analyze_word_encyclopedia
|
| 2990 |
)
|
| 2991 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2992 |
# --- Main UI Builder ---
|
| 2993 |
def create_consolidated_interface():
|
| 2994 |
"""Builds the final Gradio app with all tabs."""
|
|
@@ -2997,7 +3148,7 @@ def create_consolidated_interface():
|
|
| 2997 |
gr.Markdown("A suite of advanced tools for German linguistics, providing both contextual and non-contextual analysis.")
|
| 2998 |
|
| 2999 |
with gr.Tabs():
|
| 3000 |
-
# ---
|
| 3001 |
with gr.Tab("π Word Encyclopedia (DE)"):
|
| 3002 |
create_word_encyclopedia_tab()
|
| 3003 |
|
|
@@ -3009,14 +3160,25 @@ def create_consolidated_interface():
|
|
| 3009 |
|
| 3010 |
with gr.Tab("β
Grammar Check (DE)"):
|
| 3011 |
create_languagetool_tab()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3012 |
|
| 3013 |
-
with gr.Tab("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3014 |
create_pattern_tab()
|
| 3015 |
|
| 3016 |
-
with gr.Tab("π Thesaurus (DE)"):
|
| 3017 |
create_odenet_tab()
|
| 3018 |
|
| 3019 |
-
with gr.Tab("π ConceptNet (Direct)"):
|
| 3020 |
create_conceptnet_tab()
|
| 3021 |
|
| 3022 |
return demo
|
|
|
|
| 1837 |
|
| 1838 |
def _wiktionary_format_semantics_block(
|
| 1839 |
wikt_report: Dict[str, Any],
|
| 1840 |
+
pattern_block: Dict[str, Any],
|
| 1841 |
top_n: int
|
| 1842 |
) -> Dict[str, Any]:
|
| 1843 |
"""
|
| 1844 |
Combines Wiktionary senses with OdeNet/ConceptNet senses,
|
| 1845 |
+
using the CORRECT lemma from the pattern.de analysis block.
|
| 1846 |
"""
|
| 1847 |
|
| 1848 |
# --- THIS IS THE FIX ---
|
| 1849 |
+
# Determine the true lemma from the pattern.de block, as it's more reliable
|
| 1850 |
+
# for semantic lookup than the wiktionary lemma (which could be an inflected form).
|
|
|
|
|
|
|
| 1851 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1852 |
+
|
| 1853 |
+
semantic_lemma = ""
|
| 1854 |
+
if pos_key == "verb":
|
| 1855 |
+
semantic_lemma = pattern_block.get("infinitive")
|
| 1856 |
+
elif pos_key == "noun":
|
| 1857 |
+
semantic_lemma = pattern_block.get("base_form")
|
| 1858 |
+
elif pos_key == "adjective":
|
| 1859 |
+
semantic_lemma = pattern_block.get("predicative")
|
| 1860 |
+
|
| 1861 |
+
# Fallback if pattern.de fails or it's a non-inflecting POS
|
| 1862 |
+
if not semantic_lemma:
|
| 1863 |
+
semantic_lemma = wikt_report.get("lemma", "")
|
| 1864 |
+
|
| 1865 |
+
log(f"[DEBUG] Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
|
| 1866 |
# --- END OF FIX ---
|
| 1867 |
|
| 1868 |
# 1. Get Wiktionary senses (from the original report)
|
|
|
|
| 1884 |
if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
|
| 1885 |
odenet_senses = odenet_senses_raw
|
| 1886 |
except Exception as e:
|
| 1887 |
+
log(f"[DEBUG] OdeNet lookup failed for {semantic_lemma} ({pos_key}): {e}")
|
| 1888 |
|
| 1889 |
# 3. Get ConceptNet relations for the *semantic_lemma*
|
| 1890 |
conceptnet_relations = []
|
|
|
|
| 1934 |
if iwnlp:
|
| 1935 |
doc = iwnlp(word)
|
| 1936 |
token = doc[0]
|
| 1937 |
+
# Map spaCy POS to our internal keys
|
| 1938 |
+
spacy_pos_raw = token.pos_.lower()
|
| 1939 |
+
if spacy_pos_raw == "adj": spacy_pos_hint = "adjective"
|
| 1940 |
+
elif spacy_pos_raw == "adv": spacy_pos_hint = "adverb"
|
| 1941 |
+
elif spacy_pos_raw == "verb": spacy_pos_hint = "verb"
|
| 1942 |
+
elif spacy_pos_raw == "noun": spacy_pos_hint = "noun"
|
| 1943 |
+
else: spacy_pos_hint = spacy_pos_raw
|
| 1944 |
+
|
| 1945 |
spacy_lemma_hint = token.lemma_
|
| 1946 |
+
log(f"[DEBUG] Wiktionary Priority Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
|
| 1947 |
except Exception as e:
|
| 1948 |
+
log(f"[DEBUG] Wiktionary Priority Hint: spaCy/IWNLP failed: {e}")
|
| 1949 |
|
| 1950 |
# --- 2. FIND ALL WIKTIONARY ENTRIES ---
|
| 1951 |
try:
|
| 1952 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
| 1953 |
except Exception as e:
|
| 1954 |
+
log(f"[DEBUG] Wiktionary query failed: {e}")
|
| 1955 |
return {} # Signal failure
|
| 1956 |
|
| 1957 |
if not wiktionary_reports:
|
|
|
|
| 1964 |
|
| 1965 |
# Priority 1: Exact POS match with spaCy hint
|
| 1966 |
if spacy_pos_hint and wikt_pos == spacy_pos_hint:
|
| 1967 |
+
# Bonus if lemma also matches
|
| 1968 |
+
if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint:
|
| 1969 |
+
return 1
|
| 1970 |
+
return 2
|
| 1971 |
+
|
| 1972 |
# Priority 2: Input word is the lemma (e.g., "Haus" -> "Haus")
|
| 1973 |
if wikt_lemma.lower() == word.lower():
|
| 1974 |
return 3
|
|
|
|
| 1977 |
return 4
|
| 1978 |
|
| 1979 |
wiktionary_reports.sort(key=get_priority_score)
|
| 1980 |
+
log(f"[DEBUG] Wiktionary: Sorted entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
|
| 1981 |
|
| 1982 |
|
| 1983 |
+
# --- 4. BUILD AND *VALIDATE* THE FINAL REPORT (PATH-PURE) ---
|
| 1984 |
+
word_lower = word.lower()
|
| 1985 |
+
|
| 1986 |
for wikt_report in wiktionary_reports:
|
| 1987 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1988 |
lemma = wikt_report.get("lemma", word)
|
| 1989 |
+
pos_title = wikt_report.get("pos_title", "")
|
| 1990 |
|
| 1991 |
+
# --- A. Build Wiktionary Inflection Block ---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1992 |
inflections_wikt_block = {
|
| 1993 |
"base_form": lemma,
|
| 1994 |
"forms_list": wikt_report.get("forms", []),
|
| 1995 |
"source": "wiktionary"
|
| 1996 |
}
|
| 1997 |
|
| 1998 |
+
# --- B. Build Pattern Inflection Block (CRITICAL for finding true lemma) ---
|
| 1999 |
pattern_block = {}
|
| 2000 |
if PATTERN_DE_AVAILABLE:
|
| 2001 |
try:
|
| 2002 |
+
if pos_key == "noun" or "Substantiv" in pos_title:
|
| 2003 |
pattern_block = pattern_analyze_as_noun(lemma)
|
| 2004 |
+
elif pos_key == "verb" or "Verb" in pos_title or "Konjugierte Form" in pos_title:
|
| 2005 |
+
# Use the *input word* for inflected forms to find the right lemma
|
| 2006 |
+
if "Konjugierte Form" in pos_title:
|
| 2007 |
+
pattern_block = pattern_analyze_as_verb(word)
|
| 2008 |
+
else:
|
| 2009 |
+
pattern_block = pattern_analyze_as_verb(lemma)
|
| 2010 |
+
elif pos_key == "adjective" or "Adjektiv" in pos_title or "Deklinierte Form" in pos_title:
|
| 2011 |
+
# Use the *input word* for inflected forms
|
| 2012 |
+
if "Deklinierte Form" in pos_title:
|
| 2013 |
+
pattern_block = pattern_analyze_as_adjective(word)
|
| 2014 |
+
else:
|
| 2015 |
+
pattern_block = pattern_analyze_as_adjective(lemma)
|
| 2016 |
elif pos_key == "adverb":
|
| 2017 |
pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 2018 |
except Exception as e:
|
| 2019 |
pattern_block = {"error": f"Pattern.de analysis for {pos_key}('{lemma}') failed: {e}"}
|
| 2020 |
|
| 2021 |
+
# --- C. Build Semantics Block (using correct lemma from pattern_block) ---
|
| 2022 |
+
semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
|
| 2023 |
+
|
| 2024 |
+
# --- D. Assemble the report (pre-validation) ---
|
| 2025 |
pos_entry_report = {
|
| 2026 |
"inflections_wiktionary": inflections_wikt_block,
|
| 2027 |
"inflections_pattern": pattern_block,
|
| 2028 |
"semantics_combined": semantics_block,
|
| 2029 |
"wiktionary_metadata": {
|
| 2030 |
+
"pos_title": pos_title,
|
| 2031 |
"pronunciation": wikt_report.get("sounds"),
|
| 2032 |
"examples": wikt_report.get("examples")
|
| 2033 |
}
|
| 2034 |
}
|
| 2035 |
|
| 2036 |
+
# --- E. *** YOUR NEW VALIDATION FILTER (Corrected) *** ---
|
| 2037 |
+
is_valid = False
|
| 2038 |
+
is_inflected_entry = "Konjugierte Form" in pos_title or "Deklinierte Form" in pos_title
|
| 2039 |
+
|
| 2040 |
+
# Check 1: Is the input word the lemma OF A BASE FORM entry?
|
| 2041 |
+
if not is_inflected_entry and lemma.lower() == word_lower:
|
| 2042 |
+
is_valid = True
|
| 2043 |
+
log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches lemma of a base entry.")
|
| 2044 |
|
| 2045 |
+
# Check 2: Is the input word in the *bare* forms list?
|
| 2046 |
+
# (This is the only check that should apply to inflected entries)
|
| 2047 |
+
if not is_valid:
|
| 2048 |
+
for form_entry in inflections_wikt_block.get("forms_list", []):
|
| 2049 |
+
form_text = form_entry.get("form_text", "")
|
| 2050 |
+
bare_form = re.sub(r"\(.*\)", "", form_text).strip()
|
| 2051 |
+
bare_form = re.sub(r"^(der|die|das|ein|eine|am)\s+", "", bare_form, flags=re.IGNORECASE).strip()
|
| 2052 |
+
bare_form = bare_form.rstrip("!.")
|
| 2053 |
+
|
| 2054 |
+
if bare_form.lower() == word_lower:
|
| 2055 |
+
is_valid = True
|
| 2056 |
+
log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word found in form: '{form_text}'")
|
| 2057 |
+
break
|
| 2058 |
+
|
| 2059 |
+
# --- F. Add to final result if valid ---
|
| 2060 |
+
if is_valid:
|
| 2061 |
+
if pos_key not in final_result["analysis"]:
|
| 2062 |
+
final_result["analysis"][pos_key] = []
|
| 2063 |
+
final_result["analysis"][pos_key].append(pos_entry_report)
|
| 2064 |
+
else:
|
| 2065 |
+
log(f"[DEBUG] Wiktionary: DROPPING entry '{lemma}' ({pos_key}, {pos_title}) because input word '{word}' was not found in its valid forms.")
|
| 2066 |
+
|
| 2067 |
+
# --- END OF VALIDATION ---
|
| 2068 |
|
| 2069 |
+
final_result["info"] = f"Analysis from Wiktionary (Primary Engine). Found {len(wiktionary_reports)} matching entries, kept {sum(len(v) for v in final_result.get('analysis', {}).values())}."
|
| 2070 |
return final_result
|
| 2071 |
|
| 2072 |
# ============================================================================
|
|
|
|
| 2764 |
# --- 7b. NEW: Word Encyclopedia (Non-Contextual) Analyzer ---
|
| 2765 |
|
| 2766 |
# --- THIS IS THE NEW PUBLIC DISPATCHER FUNCTION ---
|
| 2767 |
+
def analyze_word_encyclopedia(word: str, top_n_value: Optional[float] = 0, engine_choice: str = "wiktionary") -> Dict[str, Any]:
|
| 2768 |
"""
|
| 2769 |
+
(PUBLIC DISPATCHER V20) Analyzes a single word using the selected engine.
|
| 2770 |
|
| 2771 |
+
The user can now choose which engine to run.
|
|
|
|
|
|
|
|
|
|
| 2772 |
"""
|
| 2773 |
if not word or not word.strip():
|
| 2774 |
return {"info": "Please enter a word."}
|
| 2775 |
|
| 2776 |
word = word.strip()
|
| 2777 |
top_n = int(top_n_value) if top_n_value is not None else 0
|
| 2778 |
+
result = {}
|
| 2779 |
|
| 2780 |
+
log(f"\n[Word Encyclopedia] User selected engine: '{engine_choice}' for word: '{word}'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2781 |
|
| 2782 |
+
try:
|
| 2783 |
+
if engine_choice == "wiktionary":
|
| 2784 |
+
result = _analyze_word_with_wiktionary(word, top_n)
|
| 2785 |
+
if not result or not result.get("analysis"):
|
| 2786 |
+
result["info"] = f"Wiktionary (Primary Engine) found no results for '{word}'. You can try a fallback engine."
|
| 2787 |
+
|
| 2788 |
+
elif engine_choice == "hanta":
|
| 2789 |
+
result = _analyze_word_with_hanta(word, top_n)
|
| 2790 |
+
if not result or not result.get("analysis"):
|
| 2791 |
+
result["info"] = f"HanTa (Fallback 1) found no results for '{word}'."
|
| 2792 |
+
|
| 2793 |
+
elif engine_choice == "iwnlp":
|
| 2794 |
+
result = _analyze_word_with_iwnlp(word, top_n)
|
| 2795 |
+
if not result or not result.get("analysis"):
|
| 2796 |
+
result["info"] = f"IWNLP (Fallback 2) found no results for '{word}'."
|
| 2797 |
+
|
| 2798 |
else:
|
| 2799 |
+
result = {"error": f"Unknown engine choice: {engine_choice}"}
|
| 2800 |
|
| 2801 |
+
except Exception as e:
|
| 2802 |
+
log(f"--- Dispatcher FAILED for engine {engine_choice}: {e} ---")
|
| 2803 |
+
traceback.print_exc()
|
| 2804 |
+
return {
|
| 2805 |
+
"input_word": word,
|
| 2806 |
+
"error": f"The '{engine_choice}' engine failed during analysis.",
|
| 2807 |
+
"traceback": traceback.format_exc()
|
| 2808 |
+
}
|
| 2809 |
|
| 2810 |
+
# If the engine ran but found nothing, return a clear info message
|
| 2811 |
+
if not result.get("analysis"):
|
| 2812 |
+
return {
|
| 2813 |
+
"input_word": word,
|
| 2814 |
+
"info": result.get("info", f"The selected engine '{engine_choice}' found no valid analysis for this word.")
|
| 2815 |
+
}
|
| 2816 |
+
|
| 2817 |
+
return result
|
| 2818 |
|
| 2819 |
|
| 2820 |
# ============================================================================
|
|
|
|
| 3010 |
def create_word_encyclopedia_tab():
|
| 3011 |
"""--- NEW: Creates the UI for the NON-CONTEXTUAL Word Analyzer tab ---"""
|
| 3012 |
gr.Markdown("# π Word Encyclopedia (Non-Contextual)")
|
| 3013 |
+
gr.Markdown("This tool analyzes a **single word** for *all possible* grammatical and semantic forms. It finds ambiguities (e.g., 'Lauf' as noun and verb) and groups all data by Part-of-Speech.")
|
| 3014 |
+
|
| 3015 |
with gr.Column():
|
| 3016 |
word_input = gr.Textbox(
|
| 3017 |
label="Single German Word",
|
| 3018 |
+
placeholder="e.g., Lauf, See, schnell, heute"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3019 |
)
|
| 3020 |
+
|
| 3021 |
+
with gr.Row():
|
| 3022 |
+
top_n_number = gr.Number(
|
| 3023 |
+
label="Limit Semantic Senses per POS (0 for all)",
|
| 3024 |
+
value=0,
|
| 3025 |
+
step=1,
|
| 3026 |
+
minimum=0,
|
| 3027 |
+
interactive=True
|
| 3028 |
+
)
|
| 3029 |
+
|
| 3030 |
+
# --- THIS IS THE NEW UI ELEMENT ---
|
| 3031 |
+
engine_radio = gr.Radio(
|
| 3032 |
+
label="Select Analysis Engine",
|
| 3033 |
+
choices=[
|
| 3034 |
+
("Wiktionary (Default)", "wiktionary"),
|
| 3035 |
+
("HanTa (Fallback 1)", "hanta"),
|
| 3036 |
+
("IWNLP (Fallback 2)", "iwnlp")
|
| 3037 |
+
],
|
| 3038 |
+
value="wiktionary",
|
| 3039 |
+
interactive=True
|
| 3040 |
+
)
|
| 3041 |
+
# --- END OF NEW UI ELEMENT ---
|
| 3042 |
+
|
| 3043 |
analyze_button = gr.Button("Analyze Word", variant="primary")
|
| 3044 |
|
| 3045 |
output = gr.JSON(label="Word Encyclopedia Analysis (JSON)")
|
| 3046 |
|
| 3047 |
+
# --- UPDATE THE CLICK FUNCTION ---
|
| 3048 |
analyze_button.click(
|
| 3049 |
fn=analyze_word_encyclopedia,
|
| 3050 |
+
# Add 'engine_radio' to the inputs
|
| 3051 |
+
inputs=[word_input, top_n_number, engine_radio],
|
| 3052 |
outputs=[output],
|
| 3053 |
api_name="analyze_word"
|
| 3054 |
)
|
| 3055 |
|
| 3056 |
+
# Update the examples to include the radio button
|
| 3057 |
gr.Examples(
|
| 3058 |
+
[["Lauf", 3, "wiktionary"],
|
| 3059 |
+
["See", 0, "wiktionary"],
|
| 3060 |
+
["schnell", 3, "wiktionary"],
|
| 3061 |
+
["heute", 0, "wiktionary"],
|
| 3062 |
+
["heute", 0, "hanta"]], # Example to show a different engine
|
| 3063 |
+
inputs=[word_input, top_n_number, engine_radio],
|
| 3064 |
outputs=[output],
|
| 3065 |
fn=analyze_word_encyclopedia
|
| 3066 |
)
|
| 3067 |
|
| 3068 |
+
def create_wiktionary_tab():
|
| 3069 |
+
"""Creates the UI for the standalone Wiktionary lookup tab."""
|
| 3070 |
+
gr.Markdown("# π Wiktionary Lookup (Raw Engine)")
|
| 3071 |
+
gr.Markdown("Directly query the Wiktionary (Primary) engine. This shows the raw, combined data from the database, Pattern.de, and semantic sources.")
|
| 3072 |
+
with gr.Column():
|
| 3073 |
+
word_input = gr.Textbox(
|
| 3074 |
+
label="Single German Word",
|
| 3075 |
+
placeholder="e.g., Haus, gehe, heute"
|
| 3076 |
+
)
|
| 3077 |
+
analyze_button = gr.Button("Lookup Word in Wiktionary", variant="primary")
|
| 3078 |
+
|
| 3079 |
+
output = gr.JSON(label="Wiktionary Engine Analysis (JSON)")
|
| 3080 |
+
|
| 3081 |
+
# Call the internal engine function directly, hardcoding top_n=0
|
| 3082 |
+
analyze_button.click(
|
| 3083 |
+
fn=lambda word: _analyze_word_with_wiktionary(word, 0),
|
| 3084 |
+
inputs=[word_input],
|
| 3085 |
+
outputs=[output],
|
| 3086 |
+
api_name="wiktionary_lookup"
|
| 3087 |
+
)
|
| 3088 |
+
gr.Examples(
|
| 3089 |
+
[["Haus"], ["gehe"], ["heute"], ["Lauf"]],
|
| 3090 |
+
inputs=[word_input], outputs=[output], fn=lambda word: _analyze_word_with_wiktionary(word, 0)
|
| 3091 |
+
)
|
| 3092 |
+
|
| 3093 |
+
def create_hanta_tab():
|
| 3094 |
+
"""Creates the UI for the standalone HanTa Engine tab."""
|
| 3095 |
+
gr.Markdown("# π€ HanTa Lookup (Raw Engine)")
|
| 3096 |
+
gr.Markdown("Directly query the HanTa (Fallback 1) engine. This shows the raw, combined data from HanTa, Pattern.de, and semantic sources.")
|
| 3097 |
+
with gr.Column():
|
| 3098 |
+
word_input = gr.Textbox(
|
| 3099 |
+
label="Single German Word",
|
| 3100 |
+
placeholder="e.g., Haus, gehe, heute"
|
| 3101 |
+
)
|
| 3102 |
+
analyze_button = gr.Button("Lookup Word with HanTa", variant="primary")
|
| 3103 |
+
|
| 3104 |
+
output = gr.JSON(label="HanTa Engine Analysis (JSON)")
|
| 3105 |
+
|
| 3106 |
+
# Call the internal engine function directly, hardcoding top_n=0
|
| 3107 |
+
analyze_button.click(
|
| 3108 |
+
fn=lambda word: _analyze_word_with_hanta(word, 0),
|
| 3109 |
+
inputs=[word_input],
|
| 3110 |
+
outputs=[output],
|
| 3111 |
+
api_name="hanta_lookup"
|
| 3112 |
+
)
|
| 3113 |
+
gr.Examples(
|
| 3114 |
+
[["Haus"], ["gehe"], ["heute"], ["Lauf"]],
|
| 3115 |
+
inputs=[word_input], outputs=[output], fn=lambda word: _analyze_word_with_hanta(word, 0)
|
| 3116 |
+
)
|
| 3117 |
+
|
| 3118 |
+
def create_iwnlp_tab():
|
| 3119 |
+
"""Creates the UI for the standalone IWNLP Engine tab."""
|
| 3120 |
+
gr.Markdown("# π¬ IWNLP-spaCy Lookup (Raw Engine)")
|
| 3121 |
+
gr.Markdown("Directly query the IWNLP-spaCy (Fallback 2) engine. This shows the raw, combined data from spaCy, IWNLP, Pattern.de, and semantic sources.")
|
| 3122 |
+
with gr.Column():
|
| 3123 |
+
word_input = gr.Textbox(
|
| 3124 |
+
label="Single German Word",
|
| 3125 |
+
placeholder="e.g., Haus, gehe, heute"
|
| 3126 |
+
)
|
| 3127 |
+
analyze_button = gr.Button("Lookup Word with IWNLP", variant="primary")
|
| 3128 |
+
|
| 3129 |
+
output = gr.JSON(label="IWNLP Engine Analysis (JSON)")
|
| 3130 |
+
|
| 3131 |
+
# Call the internal engine function directly, hardcoding top_n=0
|
| 3132 |
+
analyze_button.click(
|
| 3133 |
+
fn=lambda word: _analyze_word_with_iwnlp(word, 0),
|
| 3134 |
+
inputs=[word_input],
|
| 3135 |
+
outputs=[output],
|
| 3136 |
+
api_name="iwnlp_lookup"
|
| 3137 |
+
)
|
| 3138 |
+
gr.Examples(
|
| 3139 |
+
[["Haus"], ["gehe"], ["heute"], ["Lauf"]],
|
| 3140 |
+
inputs=[word_input], outputs=[output], fn=lambda word: _analyze_word_with_iwnlp(word, 0)
|
| 3141 |
+
)
|
| 3142 |
+
|
| 3143 |
# --- Main UI Builder ---
|
| 3144 |
def create_consolidated_interface():
|
| 3145 |
"""Builds the final Gradio app with all tabs."""
|
|
|
|
| 3148 |
gr.Markdown("A suite of advanced tools for German linguistics, providing both contextual and non-contextual analysis.")
|
| 3149 |
|
| 3150 |
with gr.Tabs():
|
| 3151 |
+
# --- Main Tools ---
|
| 3152 |
with gr.Tab("π Word Encyclopedia (DE)"):
|
| 3153 |
create_word_encyclopedia_tab()
|
| 3154 |
|
|
|
|
| 3160 |
|
| 3161 |
with gr.Tab("β
Grammar Check (DE)"):
|
| 3162 |
create_languagetool_tab()
|
| 3163 |
+
|
| 3164 |
+
# --- Standalone Engine Tabs (NEW) ---
|
| 3165 |
+
with gr.Tab("π Engine: Wiktionary (DE)"):
|
| 3166 |
+
create_wiktionary_tab()
|
| 3167 |
|
| 3168 |
+
with gr.Tab("π€ Engine: HanTa (DE)"):
|
| 3169 |
+
create_hanta_tab()
|
| 3170 |
+
|
| 3171 |
+
with gr.Tab("π¬ Engine: IWNLP-spaCy (DE)"):
|
| 3172 |
+
create_iwnlp_tab()
|
| 3173 |
+
|
| 3174 |
+
# --- Standalone Component Tabs ---
|
| 3175 |
+
with gr.Tab("π Component: Inflections (DE)"):
|
| 3176 |
create_pattern_tab()
|
| 3177 |
|
| 3178 |
+
with gr.Tab("π Component: Thesaurus (DE)"):
|
| 3179 |
create_odenet_tab()
|
| 3180 |
|
| 3181 |
+
with gr.Tab("π Component: ConceptNet (Direct)"):
|
| 3182 |
create_conceptnet_tab()
|
| 3183 |
|
| 3184 |
return demo
|