cstr committed on
Commit
3613031
·
verified ·
1 Parent(s): dc2f3ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -21
app.py CHANGED
@@ -1837,13 +1837,23 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
1837
 
1838
  def _wiktionary_format_semantics_block(
1839
  wikt_report: Dict[str, Any],
1840
- lemma: str,
1841
  top_n: int
1842
  ) -> Dict[str, Any]:
1843
  """
1844
- Combines Wiktionary senses with OdeNet/ConceptNet senses.
 
1845
  """
1846
- # 1. Get Wiktionary senses
 
 
 
 
 
 
 
 
 
 
1847
  wiktionary_senses = []
1848
  for sense in wikt_report.get("senses", []):
1849
  wiktionary_senses.append({
@@ -1851,26 +1861,24 @@ def _wiktionary_format_semantics_block(
1851
  "source": "wiktionary"
1852
  })
1853
 
1854
- # 2. Get OdeNet senses for this lemma
1855
- pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1856
  odenet_senses = []
1857
  if WN_AVAILABLE:
1858
  try:
1859
- # Use the corrected helper from your V18 code
1860
- senses_by_pos = _get_odenet_senses_by_pos(lemma)
1861
  odenet_senses_raw = senses_by_pos.get(pos_key, [])
1862
 
1863
  # Filter out placeholder
1864
  if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
1865
  odenet_senses = odenet_senses_raw
1866
  except Exception as e:
1867
- log(f"OdeNet lookup failed for {lemma} ({pos_key}): {e}")
1868
 
1869
- # 3. Get ConceptNet relations
1870
  conceptnet_relations = []
1871
  if REQUESTS_AVAILABLE:
1872
  try:
1873
- conceptnet_result = conceptnet_get_relations(lemma, language='de')
1874
  conceptnet_relations = conceptnet_result.get("relations", [])
1875
  except Exception as e:
1876
  conceptnet_relations = [{"error": str(e)}]
@@ -1883,7 +1891,7 @@ def _wiktionary_format_semantics_block(
1883
  conceptnet_relations = conceptnet_relations[:top_n]
1884
 
1885
  return {
1886
- "lemma": lemma,
1887
  "wiktionary_senses": wiktionary_senses,
1888
  "odenet_senses": odenet_senses,
1889
  "conceptnet_relations": conceptnet_relations,
@@ -1905,6 +1913,22 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1905
  if not conn:
1906
  return {} # Return empty dict to signal failure
1907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1908
  try:
1909
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
1910
  except Exception as e:
@@ -1914,21 +1938,46 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1914
  if not wiktionary_reports:
1915
  return {} # No results, signal to fallback
1916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1917
  for wikt_report in wiktionary_reports:
1918
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1919
  lemma = wikt_report.get("lemma", word)
1920
 
1921
- # Build the inflection block from Wiktionary data
1922
- inflections_block = {
 
 
 
 
1923
  "base_form": lemma,
1924
  "forms_list": wikt_report.get("forms", []),
1925
  "source": "wiktionary"
1926
  }
1927
 
1928
- # Build the semantics block
1929
- semantics_block = _wiktionary_format_semantics_block(wikt_report, lemma, top_n)
1930
-
1931
- # Add Pattern.de analysis for comparison/completeness
1932
  pattern_block = {}
1933
  if PATTERN_DE_AVAILABLE:
1934
  try:
@@ -1938,12 +1987,14 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1938
  pattern_block = pattern_analyze_as_verb(lemma)
1939
  elif pos_key == "adjective":
1940
  pattern_block = pattern_analyze_as_adjective(lemma)
1941
- except Exception:
1942
- pattern_block = {"error": "Pattern.de analysis failed."}
 
 
1943
 
1944
- # Build the final report for this POS entry
1945
  pos_entry_report = {
1946
- "inflections_wiktionary": inflections_block,
1947
  "inflections_pattern": pattern_block,
1948
  "semantics_combined": semantics_block,
1949
  "wiktionary_metadata": {
 
1837
 
1838
  def _wiktionary_format_semantics_block(
1839
  wikt_report: Dict[str, Any],
 
1840
  top_n: int
1841
  ) -> Dict[str, Any]:
1842
  """
1843
+ Combines Wiktionary senses with OdeNet/ConceptNet senses,
1844
+ using the ground-truth lemma from THIS Wiktionary report.
1845
  """
1846
+
1847
+ # --- THIS IS THE FIX ---
1848
+ # The semantic lemma IS the lemma from this specific wiktionary entry.
1849
+ # e.g., for the "heuen" entry, this will be "heuen".
1850
+ # e.g., for the "heute" entry, this will be "heute".
1851
+ semantic_lemma = wikt_report.get("lemma", "")
1852
+ pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1853
+ log(f"Wiktionary Semantics: Building block for lemma='{semantic_lemma}', pos='{pos_key}'")
1854
+ # --- END OF FIX ---
1855
+
1856
+ # 1. Get Wiktionary senses (from the original report)
1857
  wiktionary_senses = []
1858
  for sense in wikt_report.get("senses", []):
1859
  wiktionary_senses.append({
 
1861
  "source": "wiktionary"
1862
  })
1863
 
1864
+ # 2. Get OdeNet senses for the *semantic_lemma*
 
1865
  odenet_senses = []
1866
  if WN_AVAILABLE:
1867
  try:
1868
+ senses_by_pos = _get_odenet_senses_by_pos(semantic_lemma)
 
1869
  odenet_senses_raw = senses_by_pos.get(pos_key, [])
1870
 
1871
  # Filter out placeholder
1872
  if odenet_senses_raw and "info" not in odenet_senses_raw[0]:
1873
  odenet_senses = odenet_senses_raw
1874
  except Exception as e:
1875
+ log(f"OdeNet lookup failed for {semantic_lemma} ({pos_key}): {e}")
1876
 
1877
+ # 3. Get ConceptNet relations for the *semantic_lemma*
1878
  conceptnet_relations = []
1879
  if REQUESTS_AVAILABLE:
1880
  try:
1881
+ conceptnet_result = conceptnet_get_relations(semantic_lemma, language='de')
1882
  conceptnet_relations = conceptnet_result.get("relations", [])
1883
  except Exception as e:
1884
  conceptnet_relations = [{"error": str(e)}]
 
1891
  conceptnet_relations = conceptnet_relations[:top_n]
1892
 
1893
  return {
1894
+ "lemma": semantic_lemma, # Return the *correct* lemma for this path
1895
  "wiktionary_senses": wiktionary_senses,
1896
  "odenet_senses": odenet_senses,
1897
  "conceptnet_relations": conceptnet_relations,
 
1913
  if not conn:
1914
  return {} # Return empty dict to signal failure
1915
 
1916
+ # --- 1. GET SPACY/IWNLP HINT FOR PRIORITIZATION ---
1917
+ spacy_pos_hint = None
1918
+ spacy_lemma_hint = None
1919
+ if IWNLP_AVAILABLE:
1920
+ try:
1921
+ iwnlp = iwnlp_get_pipeline()
1922
+ if iwnlp:
1923
+ doc = iwnlp(word)
1924
+ token = doc[0]
1925
+ spacy_pos_hint = token.pos_.lower()
1926
+ spacy_lemma_hint = token.lemma_
1927
+ log(f"Wiktionary Priority Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
1928
+ except Exception as e:
1929
+ log(f"Wiktionary Priority Hint: spaCy/IWNLP failed: {e}")
1930
+
1931
+ # --- 2. FIND ALL WIKTIONARY ENTRIES ---
1932
  try:
1933
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
1934
  except Exception as e:
 
1938
  if not wiktionary_reports:
1939
  return {} # No results, signal to fallback
1940
 
1941
+ # --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
1942
def get_priority_score(report):
    """
    Rank one Wiktionary entry for sorting; a lower score sorts earlier.

    Reads ``spacy_pos_hint``, ``spacy_lemma_hint`` and ``word`` from the
    enclosing scope.

    Args:
        report: A Wiktionary entry dict; its "pos" and "lemma" keys are read.

    Returns:
        1 if the entry's POS matches the spaCy hint and the lemma matches too,
        2 if only the POS matches the spaCy hint,
        3 if the entry's lemma equals the input word (case-insensitive),
        4 otherwise (e.g. inflected forms such as "gehe" -> "gehen").
    """
    wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
    # An entry may have no lemma (other call sites also treat it as
    # optional); fall back to "" so .lower() below cannot fail on None.
    wikt_lemma = report.get("lemma") or ""

    # The hint is a lower-cased spaCy coarse tag, which is abbreviated
    # ("adj", "adv"), while the surrounding code compares the mapped POS key
    # against the long names "adjective" / "adverb". Normalise the hint so
    # those word classes can match.
    # NOTE(review): confirm against _wiktionary_map_pos_key's return values.
    pos_aliases = {"adj": "adjective", "adv": "adverb"}
    pos_hint = pos_aliases.get(spacy_pos_hint, spacy_pos_hint)

    # Priority 1: Exact POS match with spaCy hint
    if pos_hint and wikt_pos == pos_hint:
        # Bonus if lemma also matches
        if spacy_lemma_hint and wikt_lemma == spacy_lemma_hint:
            return 1
        return 2

    # Priority 2: Input word is the lemma (e.g., "Haus" -> "Haus")
    if wikt_lemma.lower() == word.lower():
        return 3

    # Priority 3: Other inflected forms (e.g. "gehe" -> "gehen")
    return 4
1959
+
1960
+ wiktionary_reports.sort(key=get_priority_score)
1961
+ log(f"Wiktionary: Sorted entries: {[r.get('lemma') + ' (' + r.get('pos') + ')' for r in wiktionary_reports]}")
1962
+
1963
+
1964
+ # --- 4. BUILD THE FINAL REPORT (PATH-PURE) ---
1965
  for wikt_report in wiktionary_reports:
1966
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1967
  lemma = wikt_report.get("lemma", word)
1968
 
1969
+ # --- A. Build Semantics Block (USING WIKT LEMMA) ---
1970
+ # This is the pure path. "heuen" entry looks up "heuen". "heute" entry looks up "heute".
1971
+ semantics_block = _wiktionary_format_semantics_block(wikt_report, top_n)
1972
+
1973
+ # --- B. Build Wiktionary Inflection Block ---
1974
+ inflections_wikt_block = {
1975
  "base_form": lemma,
1976
  "forms_list": wikt_report.get("forms", []),
1977
  "source": "wiktionary"
1978
  }
1979
 
1980
+ # --- C. Build Pattern Inflection Block (for comparison) ---
 
 
 
1981
  pattern_block = {}
1982
  if PATTERN_DE_AVAILABLE:
1983
  try:
 
1987
  pattern_block = pattern_analyze_as_verb(lemma)
1988
  elif pos_key == "adjective":
1989
  pattern_block = pattern_analyze_as_adjective(lemma)
1990
+ elif pos_key == "adverb":
1991
+ pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
1992
+ except Exception as e:
1993
+ pattern_block = {"error": f"Pattern.de analysis for {pos_key}('{lemma}') failed: {e}"}
1994
 
1995
+ # --- D. Assemble the final report for this entry ---
1996
  pos_entry_report = {
1997
+ "inflections_wiktionary": inflections_wikt_block,
1998
  "inflections_pattern": pattern_block,
1999
  "semantics_combined": semantics_block,
2000
  "wiktionary_metadata": {