Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1837,13 +1837,23 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
|
|
| 1837 |
|
| 1838 |
def _wiktionary_format_semantics_block(
|
| 1839 |
wikt_report: Dict[str, Any],
|
| 1840 |
-
lemma: str,
|
| 1841 |
top_n: int
|
| 1842 |
) -> Dict[str, Any]:
|
| 1843 |
"""
|
| 1844 |
-
Combines Wiktionary senses with OdeNet/ConceptNet senses
|
|
|
|
| 1845 |
"""
|
| 1846 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1847 |
wiktionary_senses = []
|
| 1848 |
for sense in wikt_report.get("senses", []):
|
| 1849 |
wiktionary_senses.append({
|
|
@@ -1851,26 +1861,24 @@ def _wiktionary_format_semantics_block(
|
|
| 1851 |
"source": "wiktionary"
|
| 1852 |
})
|
| 1853 |
|
| 1854 |
-
# 2. Get OdeNet senses for
|
| 1855 |
-
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1856 |
odenet_senses = []
|
| 1857 |
if WN_AVAILABLE:
|
| 1858 |
try:
|
| 1859 |
-
|
| 1860 |
-
senses_by_pos = _get_odenet_senses_by_pos(lemma)
|
| 1861 |
odenet_senses_raw = senses_by_pos.get(pos_key, [])
|
| 1862 |
|
| 1863 |
# Filter out placeholder
|
| 1864 |
if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
|
| 1865 |
odenet_senses = odenet_senses_raw
|
| 1866 |
except Exception as e:
|
| 1867 |
-
log(f"OdeNet lookup failed for {
|
| 1868 |
|
| 1869 |
-
# 3. Get ConceptNet relations
|
| 1870 |
conceptnet_relations = []
|
| 1871 |
if REQUESTS_AVAILABLE:
|
| 1872 |
try:
|
| 1873 |
-
conceptnet_result = conceptnet_get_relations(
|
| 1874 |
conceptnet_relations = conceptnet_result.get("relations", [])
|
| 1875 |
except Exception as e:
|
| 1876 |
conceptnet_relations = [{"error": str(e)}]
|
|
@@ -1883,7 +1891,7 @@ def _wiktionary_format_semantics_block(
|
|
| 1883 |
conceptnet_relations = conceptnet_relations[:top_n]
|
| 1884 |
|
| 1885 |
return {
|
| 1886 |
-
"lemma": lemma
|
| 1887 |
"wiktionary_senses": wiktionary_senses,
|
| 1888 |
"odenet_senses": odenet_senses,
|
| 1889 |
"conceptnet_relations": conceptnet_relations,
|
|
@@ -1905,6 +1913,22 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1905 |
if not conn:
|
| 1906 |
return {} # Return empty dict to signal failure
|
| 1907 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1908 |
try:
|
| 1909 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
| 1910 |
except Exception as e:
|
|
@@ -1914,21 +1938,46 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1914 |
if not wiktionary_reports:
|
| 1915 |
return {} # No results, signal to fallback
|
| 1916 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1917 |
for wikt_report in wiktionary_reports:
|
| 1918 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1919 |
lemma = wikt_report.get("lemma", word)
|
| 1920 |
|
| 1921 |
-
# Build
|
| 1922 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1923 |
"base_form": lemma,
|
| 1924 |
"forms_list": wikt_report.get("forms", []),
|
| 1925 |
"source": "wiktionary"
|
| 1926 |
}
|
| 1927 |
|
| 1928 |
-
# Build
|
| 1929 |
-
semantics_block = _wiktionary_format_semantics_block(wikt_report, lemma, top_n)
|
| 1930 |
-
|
| 1931 |
-
# Add Pattern.de analysis for comparison/completeness
|
| 1932 |
pattern_block = {}
|
| 1933 |
if PATTERN_DE_AVAILABLE:
|
| 1934 |
try:
|
|
@@ -1938,12 +1987,14 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1938 |
pattern_block = pattern_analyze_as_verb(lemma)
|
| 1939 |
elif pos_key == "adjective":
|
| 1940 |
pattern_block = pattern_analyze_as_adjective(lemma)
|
| 1941 |
-
|
| 1942 |
-
|
|
|
|
|
|
|
| 1943 |
|
| 1944 |
-
#
|
| 1945 |
pos_entry_report = {
|
| 1946 |
-
"inflections_wiktionary":
|
| 1947 |
"inflections_pattern": pattern_block,
|
| 1948 |
"semantics_combined": semantics_block,
|
| 1949 |
"wiktionary_metadata": {
|
|
|
|
| 1837 |
|
| 1838 |
def _wiktionary_format_semantics_block(
|
| 1839 |
wikt_report: Dict[str, Any],
|
|
|
|
| 1840 |
top_n: int
|
| 1841 |
) -> Dict[str, Any]:
|
| 1842 |
"""
|
| 1843 |
+
Combines Wiktionary senses with OdeNet/ConceptNet senses,
|
| 1844 |
+
using the ground-truth lemma from THIS Wiktionary report.
|
| 1845 |
"""
|
| 1846 |
+
|
| 1847 |
+
# --- THIS IS THE FIX ---
|
| 1848 |
+
# The semantic lemma IS the lemma from this specific wiktionary entry.
|
| 1849 |
+
# e.g., for the "heuen" entry, this will be "heuen".
|
| 1850 |
+
# e.g., for the "heute" entry, this will be "heute".
|
| 1851 |
+
semantic_lemma = wikt_report.get("lemma", "")
|
| 1852 |
+
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1853 |
+
log(f"Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
|
| 1854 |
+
# --- END OF FIX ---
|
| 1855 |
+
|
| 1856 |
+
# 1. Get Wiktionary senses (from the original report)
|
| 1857 |
wiktionary_senses = []
|
| 1858 |
for sense in wikt_report.get("senses", []):
|
| 1859 |
wiktionary_senses.append({
|
|
|
|
| 1861 |
"source": "wiktionary"
|
| 1862 |
})
|
| 1863 |
|
| 1864 |
+
# 2. Get OdeNet senses for the *semantic_lemma*
|
|
|
|
| 1865 |
odenet_senses = []
|
| 1866 |
if WN_AVAILABLE:
|
| 1867 |
try:
|
| 1868 |
+
senses_by_pos = _get_odenet_senses_by_pos(semantic_lemma)
|
|
|
|
| 1869 |
odenet_senses_raw = senses_by_pos.get(pos_key, [])
|
| 1870 |
|
| 1871 |
# Filter out placeholder
|
| 1872 |
if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
|
| 1873 |
odenet_senses = odenet_senses_raw
|
| 1874 |
except Exception as e:
|
| 1875 |
+
log(f"OdeNet lookup failed for {semantic_lemma} ({pos_key}): {e}")
|
| 1876 |
|
| 1877 |
+
# 3. Get ConceptNet relations for the *semantic_lemma*
|
| 1878 |
conceptnet_relations = []
|
| 1879 |
if REQUESTS_AVAILABLE:
|
| 1880 |
try:
|
| 1881 |
+
conceptnet_result = conceptnet_get_relations(semantic_lemma, language='de')
|
| 1882 |
conceptnet_relations = conceptnet_result.get("relations", [])
|
| 1883 |
except Exception as e:
|
| 1884 |
conceptnet_relations = [{"error": str(e)}]
|
|
|
|
| 1891 |
conceptnet_relations = conceptnet_relations[:top_n]
|
| 1892 |
|
| 1893 |
return {
|
| 1894 |
+
"lemma": semantic_lemma, # Return the *correct* lemma for this path
|
| 1895 |
"wiktionary_senses": wiktionary_senses,
|
| 1896 |
"odenet_senses": odenet_senses,
|
| 1897 |
"conceptnet_relations": conceptnet_relations,
|
|
|
|
| 1913 |
if not conn:
|
| 1914 |
return {} # Return empty dict to signal failure
|
| 1915 |
|
| 1916 |
+
# --- 1. GET SPACY/IWNLP HINT FOR PRIORITIZATION ---
|
| 1917 |
+
spacy_pos_hint = None
|
| 1918 |
+
spacy_lemma_hint = None
|
| 1919 |
+
if IWNLP_AVAILABLE:
|
| 1920 |
+
try:
|
| 1921 |
+
iwnlp = iwnlp_get_pipeline()
|
| 1922 |
+
if iwnlp:
|
| 1923 |
+
doc = iwnlp(word)
|
| 1924 |
+
token = doc[0]
|
| 1925 |
+
spacy_pos_hint = token.pos_.lower()
|
| 1926 |
+
spacy_lemma_hint = token.lemma_
|
| 1927 |
+
log(f"Wiktionary Priority Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
|
| 1928 |
+
except Exception as e:
|
| 1929 |
+
log(f"Wiktionary Priority Hint: spaCy/IWNLP failed: {e}")
|
| 1930 |
+
|
| 1931 |
+
# --- 2. FIND ALL WIKTIONARY ENTRIES ---
|
| 1932 |
try:
|
| 1933 |
wiktionary_reports = _wiktionary_find_all_entries(word, conn)
|
| 1934 |
except Exception as e:
|
|
|
|
| 1938 |
if not wiktionary_reports:
|
| 1939 |
return {} # No results, signal to fallback
|
| 1940 |
|
| 1941 |
+
# --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
|
| 1942 |
+
def get_priority_score(report):
    """
    Sort key for one Wiktionary entry; a lower score sorts earlier.

    Reads `word`, `spacy_pos_hint` and `spacy_lemma_hint` from the
    enclosing scope.

    Returns:
        1 -- mapped POS equals the spaCy POS hint and lemma equals the spaCy lemma hint
        2 -- mapped POS equals the spaCy POS hint only
        3 -- the entry's lemma equals the input word (case-insensitive)
        4 -- anything else (e.g. "gehe" -> "gehen")
    """
    wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
    # An entry may have no lemma (missing key or None). Fall back to the
    # input word, the same default the report-building loop uses via
    # `wikt_report.get("lemma", word)`, so `.lower()` below cannot raise
    # AttributeError and abort the whole sort.
    wikt_lemma = report.get("lemma") or word

    # Priority 1: Exact POS match with spaCy hint
    if spacy_pos_hint and wikt_pos == spacy_pos_hint:
        # Bonus if lemma also matches
        if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint:
            return 1
        return 2

    # Priority 2: Input word is the lemma (e.g., "Haus" -> "Haus")
    if wikt_lemma.lower() == word.lower():
        return 3

    # Priority 3: Other inflected forms (e.g. "gehe" -> "gehen")
    return 4
|
| 1959 |
+
|
| 1960 |
+
wiktionary_reports.sort(key=get_priority_score)
|
| 1961 |
+
log(f"Wiktionary: Sorted entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
|
| 1962 |
+
|
| 1963 |
+
|
| 1964 |
+
# --- 4. BUILD THE FINAL REPORT (PATH-PURE) ---
|
| 1965 |
for wikt_report in wiktionary_reports:
|
| 1966 |
pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
|
| 1967 |
lemma = wikt_report.get("lemma", word)
|
| 1968 |
|
| 1969 |
+
# --- A. Build Semantics Block (USING WIKT LEMMA) ---
|
| 1970 |
+
# This is the pure path. "heuen" entry looks up "heuen". "heute" entry looks up "heute".
|
| 1971 |
+
semantics_block = _wiktionary_format_semantics_block(wikt_report, top_n)
|
| 1972 |
+
|
| 1973 |
+
# --- B. Build Wiktionary Inflection Block ---
|
| 1974 |
+
inflections_wikt_block = {
|
| 1975 |
"base_form": lemma,
|
| 1976 |
"forms_list": wikt_report.get("forms", []),
|
| 1977 |
"source": "wiktionary"
|
| 1978 |
}
|
| 1979 |
|
| 1980 |
+
# --- C. Build Pattern Inflection Block (for comparison) ---
|
|
|
|
|
|
|
|
|
|
| 1981 |
pattern_block = {}
|
| 1982 |
if PATTERN_DE_AVAILABLE:
|
| 1983 |
try:
|
|
|
|
| 1987 |
pattern_block = pattern_analyze_as_verb(lemma)
|
| 1988 |
elif pos_key == "adjective":
|
| 1989 |
pattern_block = pattern_analyze_as_adjective(lemma)
|
| 1990 |
+
elif pos_key == "adverb":
|
| 1991 |
+
pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
|
| 1992 |
+
except Exception as e:
|
| 1993 |
+
pattern_block = {"error": f"Pattern.de analysis for {pos_key}('{lemma}') failed: {e}"}
|
| 1994 |
|
| 1995 |
+
# --- D. Assemble the final report for this entry ---
|
| 1996 |
pos_entry_report = {
|
| 1997 |
+
"inflections_wiktionary": inflections_wikt_block,
|
| 1998 |
"inflections_pattern": pattern_block,
|
| 1999 |
"semantics_combined": semantics_block,
|
| 2000 |
"wiktionary_metadata": {
|