Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -85,40 +85,64 @@ except ImportError:
|
|
| 85 |
WN_AVAILABLE = False
|
| 86 |
print("CRITICAL WARNING: `wn` library not found.")
|
| 87 |
|
|
|
|
| 88 |
# --- Pattern.en Import (ENGLISH) ---
|
| 89 |
PATTERN_EN_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
try:
|
| 91 |
-
# Try importing from the standard pattern library
|
| 92 |
import pattern.en
|
|
|
|
| 93 |
from pattern.en import (
|
| 94 |
-
pluralize, singularize,
|
| 95 |
-
|
| 96 |
-
article, MALE, FEMALE, NEUTRAL, PLURAL,
|
| 97 |
-
INFINITIVE, PRESENT, PAST, PARTICIPLE,
|
| 98 |
-
FIRST, SECOND, THIRD, SINGULAR, PLURAL as PL,
|
| 99 |
-
INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
|
| 100 |
comparative, superlative,
|
| 101 |
-
|
|
|
|
| 102 |
parse, split
|
| 103 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
PATTERN_EN_AVAILABLE = True
|
| 105 |
print("✓ Successfully imported pattern.en")
|
|
|
|
| 106 |
except ImportError:
|
| 107 |
-
print("Using PatternLite fallback...")
|
| 108 |
try:
|
| 109 |
-
#
|
| 110 |
import pattern.en
|
| 111 |
-
from pattern.en import
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
|
| 118 |
-
comparative, superlative,
|
| 119 |
-
NOUN, VERB, ADJECTIVE,
|
| 120 |
-
parse, split
|
| 121 |
-
)
|
| 122 |
PATTERN_EN_AVAILABLE = True
|
| 123 |
print("✓ Successfully imported pattern.en (via PatternLite)")
|
| 124 |
except ImportError as e:
|
|
@@ -136,20 +160,45 @@ except ImportError:
|
|
| 136 |
HANTA_AVAILABLE = False
|
| 137 |
print("CRITICAL WARNING: `HanTa` library not found.")
|
| 138 |
|
| 139 |
-
# --- NLTK Import ---
|
| 140 |
try:
|
| 141 |
import nltk
|
| 142 |
from nltk.corpus import wordnet as nltk_wn
|
| 143 |
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
NLTK_AVAILABLE = True
|
| 145 |
-
print("✓ Successfully imported nltk")
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
nltk.download('punkt', quiet=True, raise_on_error=True) # For TextBlob
|
| 150 |
except Exception as e:
|
| 151 |
NLTK_AVAILABLE = False
|
| 152 |
-
print(f"WARNING: `nltk`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
# --- Stanza Import ---
|
| 155 |
try:
|
|
@@ -160,14 +209,6 @@ except ImportError:
|
|
| 160 |
STANZA_AVAILABLE = False
|
| 161 |
print("WARNING: `stanza` library not found.")
|
| 162 |
|
| 163 |
-
# --- TextBlob Import ---
|
| 164 |
-
try:
|
| 165 |
-
from textblob import TextBlob
|
| 166 |
-
TEXTBLOB_AVAILABLE = True
|
| 167 |
-
print("✓ Successfully imported textblob")
|
| 168 |
-
except ImportError:
|
| 169 |
-
TEXTBLOB_AVAILABLE = False
|
| 170 |
-
print("WARNING: `textblob` library not found.")
|
| 171 |
|
| 172 |
# --- German-specific imports are not needed ---
|
| 173 |
IWNLP_AVAILABLE = False
|
|
@@ -220,7 +261,10 @@ def _conjugate_to_person_number_en(verb_lemma: str, person: str, number: str) ->
|
|
| 220 |
return None
|
| 221 |
try:
|
| 222 |
p_num = int(person)
|
|
|
|
| 223 |
n_num = SINGULAR if number == 'sg' else PLURAL
|
|
|
|
|
|
|
| 224 |
return conjugate(verb_lemma, tense=PRESENT, person=p_num, number=n_num)
|
| 225 |
except Exception:
|
| 226 |
return None
|
|
@@ -748,80 +792,122 @@ def pattern_is_good_analysis(analysis, analysis_type):
|
|
| 748 |
|
| 749 |
def pattern_analyze_as_noun_en(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 750 |
"""Comprehensive noun inflection analysis for English."""
|
| 751 |
-
log(f"
|
| 752 |
if not PATTERN_EN_AVAILABLE: return {'error': 'pattern.en not available'}
|
| 753 |
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 758 |
analysis = {
|
| 759 |
-
"base_form":
|
| 760 |
-
"singular":
|
| 761 |
"plural": plural_form,
|
|
|
|
| 762 |
"declension": {
|
| 763 |
-
"Singular": {"form":
|
| 764 |
"Plural": {"form": plural_form}
|
| 765 |
},
|
| 766 |
-
"gender": "
|
| 767 |
}
|
| 768 |
return analysis
|
| 769 |
|
| 770 |
def pattern_analyze_as_verb_en(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 771 |
"""Comprehensive verb conjugation analysis for English."""
|
| 772 |
-
log(f"
|
| 773 |
if not PATTERN_EN_AVAILABLE: return {'error': 'pattern.en not available'}
|
| 774 |
|
| 775 |
-
|
| 776 |
-
|
|
|
|
|
|
|
| 777 |
verb_lemma = word
|
| 778 |
|
| 779 |
analysis = {"infinitive": verb_lemma}
|
|
|
|
|
|
|
| 780 |
try:
|
| 781 |
-
|
|
|
|
|
|
|
|
|
|
| 782 |
except Exception as e:
|
| 783 |
-
log(f"
|
|
|
|
| 784 |
|
|
|
|
| 785 |
analysis["conjugation"] = {}
|
| 786 |
try:
|
|
|
|
| 787 |
analysis["conjugation"]["Present"] = {
|
| 788 |
-
"I":
|
| 789 |
-
"you":
|
| 790 |
-
"he/she
|
| 791 |
-
"we":
|
| 792 |
-
"you (
|
| 793 |
-
"they":
|
| 794 |
}
|
|
|
|
|
|
|
| 795 |
analysis["conjugation"]["Past"] = {
|
| 796 |
-
"I":
|
| 797 |
-
"he/she
|
|
|
|
| 798 |
}
|
|
|
|
|
|
|
| 799 |
analysis["participles"] = {
|
| 800 |
-
"Present Participle": conjugate(verb_lemma,
|
| 801 |
-
"Past Participle": conjugate(verb_lemma,
|
| 802 |
}
|
| 803 |
except Exception as e:
|
| 804 |
-
log(f"
|
| 805 |
|
| 806 |
return analysis
|
| 807 |
|
| 808 |
def pattern_analyze_as_adjective_en(word: str, hint_lemma: str = None) -> Dict[str, Any]:
|
| 809 |
"""Comprehensive adjective inflection analysis for English."""
|
| 810 |
-
log(f"
|
| 811 |
if not PATTERN_EN_AVAILABLE: return {'error': 'pattern.en not available'}
|
| 812 |
|
| 813 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
analysis = {}
|
| 815 |
analysis["predicative"] = base
|
|
|
|
| 816 |
try:
|
| 817 |
-
|
| 818 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 819 |
except Exception as e:
|
| 820 |
-
log(f"
|
|
|
|
| 821 |
|
| 822 |
-
analysis["attributive"] = {
|
| 823 |
-
"Base": {"form": base, "example": f"a {base} [noun]"}
|
| 824 |
-
}
|
| 825 |
return analysis
|
| 826 |
|
| 827 |
# --- Public API (Adapted) ---
|
|
@@ -1635,15 +1721,29 @@ def nltk_get_lemmatizer() -> Optional[WordNetLemmatizer]:
|
|
| 1635 |
""" Thread-safe function to get the NLTK Lemmatizer. """
|
| 1636 |
global NLTK_LEMMATIZER
|
| 1637 |
if not NLTK_AVAILABLE:
|
| 1638 |
-
raise
|
|
|
|
| 1639 |
if NLTK_LEMMATIZER:
|
| 1640 |
return NLTK_LEMMATIZER
|
|
|
|
| 1641 |
with NLTK_LEMMATIZER_LOCK:
|
| 1642 |
if NLTK_LEMMATIZER:
|
| 1643 |
return NLTK_LEMMATIZER
|
| 1644 |
-
|
| 1645 |
-
|
| 1646 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1647 |
|
| 1648 |
def _nltk_get_wordnet_pos(treebank_tag):
|
| 1649 |
"""Converts NLTK's Treebank POS tag to a WordNet tag."""
|
|
@@ -1654,7 +1754,7 @@ def _nltk_get_wordnet_pos(treebank_tag):
|
|
| 1654 |
return None
|
| 1655 |
|
| 1656 |
def _analyze_word_with_nltk(word: str, top_n: int) -> Dict[str, Any]:
|
| 1657 |
-
""" (FALLBACK ENGINE 3) Analyzes with NLTK.
|
| 1658 |
if not NLTK_AVAILABLE: return {}
|
| 1659 |
print(f"\n[Word Encyclopedia] Running NLTK fallback for: \"{word}\"")
|
| 1660 |
final_result = {"input_word": word, "analysis": {}}
|
|
@@ -1664,7 +1764,14 @@ def _analyze_word_with_nltk(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1664 |
if not lemmatizer: return {}
|
| 1665 |
|
| 1666 |
# NLTK's POS tagger needs a list
|
| 1667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1668 |
wn_pos = _nltk_get_wordnet_pos(tag)
|
| 1669 |
|
| 1670 |
if not wn_pos:
|
|
@@ -1672,13 +1779,17 @@ def _analyze_word_with_nltk(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1672 |
return {}
|
| 1673 |
|
| 1674 |
lemma = lemmatizer.lemmatize(word, wn_pos)
|
| 1675 |
-
pos_map = {nltk_wn.NOUN: "noun", nltk_wn.VERB: "verb", nltk_wn.ADJ: "adjective", nltk_wn.ADV: "adverb"}
|
| 1676 |
-
pos_key = pos_map[wn_pos]
|
| 1677 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1678 |
log(f"--- Analyzing NLTK path: lemma='{lemma}', pos='{pos_key}' ---")
|
| 1679 |
|
| 1680 |
pattern_block = {}
|
| 1681 |
if PATTERN_EN_AVAILABLE:
|
|
|
|
| 1682 |
if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(lemma)
|
| 1683 |
elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(lemma)
|
| 1684 |
elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(lemma)
|
|
@@ -1709,18 +1820,17 @@ def _analyze_word_with_nltk(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1709 |
return final_result
|
| 1710 |
except Exception as e:
|
| 1711 |
log(f"NLTK Engine FAILED: {e}")
|
| 1712 |
-
traceback.print_exc()
|
| 1713 |
return {}
|
| 1714 |
|
| 1715 |
# --- FALLBACK 4: TEXTBLOB ---
|
| 1716 |
def _analyze_word_with_textblob(word: str, top_n: int) -> Dict[str, Any]:
|
| 1717 |
-
""" (FALLBACK ENGINE 4) Analyzes with TextBlob.
|
| 1718 |
if not TEXTBLOB_AVAILABLE: return {}
|
| 1719 |
print(f"\n[Word Encyclopedia] Running TextBlob fallback for: \"{word}\"")
|
| 1720 |
final_result = {"input_word": word, "analysis": {}}
|
| 1721 |
|
| 1722 |
def get_wordnet_pos_tb(treebank_tag):
|
| 1723 |
-
""" Maps Treebank to TextBlob's lemmatizer tags (n, v, a, r) """
|
| 1724 |
if treebank_tag.startswith('J'): return 'a'
|
| 1725 |
if treebank_tag.startswith('V'): return 'v'
|
| 1726 |
if treebank_tag.startswith('N'): return 'n'
|
|
@@ -1728,19 +1838,33 @@ def _analyze_word_with_textblob(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1728 |
return None
|
| 1729 |
|
| 1730 |
try:
|
| 1731 |
-
|
| 1732 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1733 |
|
| 1734 |
-
# Process each tag TextBlob finds
|
| 1735 |
processed_lemmas_pos: Set[Tuple[str, str]] = set()
|
| 1736 |
|
| 1737 |
-
for tb_word, tag in
|
| 1738 |
tb_pos = get_wordnet_pos_tb(tag)
|
| 1739 |
if not tb_pos: continue
|
| 1740 |
|
| 1741 |
lemma = tb_word.lemmatize(tb_pos)
|
| 1742 |
pos_map = {'n': "noun", 'v': "verb", 'a': "adjective", 'r': "adverb"}
|
| 1743 |
-
pos_key = pos_map
|
|
|
|
| 1744 |
|
| 1745 |
if (lemma, pos_key) in processed_lemmas_pos: continue
|
| 1746 |
processed_lemmas_pos.add((lemma, pos_key))
|
|
@@ -1777,7 +1901,6 @@ def _analyze_word_with_textblob(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1777 |
return final_result
|
| 1778 |
except Exception as e:
|
| 1779 |
log(f"TextBlob Engine FAILED: {e}")
|
| 1780 |
-
traceback.print_exc()
|
| 1781 |
return {}
|
| 1782 |
|
| 1783 |
|
|
|
|
| 85 |
WN_AVAILABLE = False
|
| 86 |
print("CRITICAL WARNING: `wn` library not found.")
|
| 87 |
|
| 88 |
+
# --- Pattern.en Import (ENGLISH) ---
|
| 89 |
# --- Pattern.en Import (ENGLISH) ---
|
| 90 |
PATTERN_EN_AVAILABLE = False
|
| 91 |
+
|
| 92 |
+
# Local fallbacks for Pattern's grammar constants, so the names exist even
# when `from pattern.en import ...` cannot supply them.
# The values mirror the ones Pattern itself defines (pattern.text): the full
# words, not abbreviations. Abbreviations such as "pres" or "pst" are not
# recognised as tense aliases, so conjugate() could not resolve them.
# Tenses
INFINITIVE = "infinitive"
PRESENT = "present"
PAST = "past"
FUTURE = "future"
PARTICIPLE = "participle"
# Person/Number
FIRST = 1
SECOND = 2
THIRD = 3
SINGULAR = "singular"
PLURAL = "plural"
# POS (Penn Treebank style prefixes, as used by pattern.en)
NOUN = "NN"
VERB = "VB"
ADJECTIVE = "JJ"
|
| 109 |
+
|
| 110 |
try:
|
|
|
|
| 111 |
import pattern.en
|
| 112 |
+
# Import functions safely
|
| 113 |
from pattern.en import (
|
| 114 |
+
pluralize, singularize,
|
| 115 |
+
conjugate, lemma, lexeme, tenses,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
comparative, superlative,
|
| 117 |
+
predicative, attributive,
|
| 118 |
+
article,
|
| 119 |
parse, split
|
| 120 |
)
|
| 121 |
+
|
| 122 |
+
# Try to import constants, but don't fail if they are missing (we use fallbacks)
|
| 123 |
+
try:
|
| 124 |
+
from pattern.en import (
|
| 125 |
+
INFINITIVE, PRESENT, PAST, PARTICIPLE,
|
| 126 |
+
FIRST, SECOND, THIRD, SINGULAR, PLURAL,
|
| 127 |
+
NOUN, VERB, ADJECTIVE
|
| 128 |
+
)
|
| 129 |
+
except ImportError:
|
| 130 |
+
print("Using local fallback constants for Pattern.en")
|
| 131 |
+
|
| 132 |
PATTERN_EN_AVAILABLE = True
|
| 133 |
print("✓ Successfully imported pattern.en")
|
| 134 |
+
|
| 135 |
except ImportError:
|
| 136 |
+
print("Using PatternLite fallback logic...")
|
| 137 |
try:
|
| 138 |
+
# Attempt simple import for PatternLite structure
|
| 139 |
import pattern.en
|
| 140 |
+
from pattern.en import pluralize, singularize, conjugate, lemma, lexeme
|
| 141 |
+
|
| 142 |
+
# Manually map functions if they are missing in Lite but available under different names
|
| 143 |
+
if not 'comparative' in dir(pattern.en):
|
| 144 |
+
from pattern.en import comparative, superlative
|
| 145 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
PATTERN_EN_AVAILABLE = True
|
| 147 |
print("✓ Successfully imported pattern.en (via PatternLite)")
|
| 148 |
except ImportError as e:
|
|
|
|
| 160 |
HANTA_AVAILABLE = False
|
| 161 |
print("CRITICAL WARNING: `HanTa` library not found.")
|
| 162 |
|
| 163 |
+
# --- NLTK & TextBlob Import ---
|
| 164 |
try:
    import nltk
    from nltk.corpus import wordnet as nltk_wn
    from nltk.stem import WordNetLemmatizer

    # Fetch the NLTK data packages this module relies on (WordNet, the POS
    # tagger models and the punkt tokenizer data); a missing package shows
    # up later as a LookupError / MissingCorpusError.
    print("Downloading NLTK data...")
    _nltk_packages = [
        'wordnet',
        'omw-1.4',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',  # Specific for newer NLTK
        'punkt',
        'punkt_tab',  # Specific for newer TextBlob/NLTK
    ]
    for pkg in _nltk_packages:
        try:
            # nltk.download() signals most failures by returning False
            # instead of raising, so the return value is checked too.
            if not nltk.download(pkg, quiet=True):
                print(f"Warning: Failed to download NLTK package '{pkg}'")
        except Exception as e:
            print(f"Warning: Failed to download NLTK package '{pkg}': {e}")

    NLTK_AVAILABLE = True
    print("✓ Successfully imported nltk and downloaded data")
except ImportError:
    NLTK_AVAILABLE = False
    print("WARNING: `nltk` library not found.")
except Exception as e:
    # Anything else raised during import/setup also disables the NLTK engine.
    NLTK_AVAILABLE = False
    print(f"WARNING: `nltk` data download failed: {e}")

try:
    from textblob import TextBlob
    TEXTBLOB_AVAILABLE = True
    print("✓ Successfully imported textblob")
except ImportError:
    TEXTBLOB_AVAILABLE = False
    print("WARNING: `textblob` library not found.")
|
| 202 |
|
| 203 |
# --- Stanza Import ---
|
| 204 |
try:
|
|
|
|
| 209 |
STANZA_AVAILABLE = False
|
| 210 |
print("WARNING: `stanza` library not found.")
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
# --- German-specific imports are not needed ---
|
| 214 |
IWNLP_AVAILABLE = False
|
|
|
|
| 261 |
return None
|
| 262 |
try:
|
| 263 |
p_num = int(person)
|
| 264 |
+
# Use the constants defined in the import block
|
| 265 |
n_num = SINGULAR if number == 'sg' else PLURAL
|
| 266 |
+
|
| 267 |
+
# Explicitly name arguments for safety across Pattern versions
|
| 268 |
return conjugate(verb_lemma, tense=PRESENT, person=p_num, number=n_num)
|
| 269 |
except Exception:
|
| 270 |
return None
|
|
|
|
| 792 |
|
| 793 |
def pattern_analyze_as_noun_en(word: str, hint_lemma: str = None) -> Dict[str, Any]:
    """Return singular/plural inflection data for an English noun.

    Args:
        word: Noun to analyze. It is singularized first, so a plural form
            is accepted as well.
        hint_lemma: Only reported in the log line; not used for inflection.

    Returns:
        A dict with ``base_form``, ``singular``, ``plural``, ``article``,
        ``declension`` and ``gender``, or ``{'error': ...}`` when pattern.en
        is unavailable or singularize/pluralize raises.
    """
    log(f" Analyzing as noun (hint_lemma={hint_lemma})")
    if not PATTERN_EN_AVAILABLE:
        return {'error': 'pattern.en not available'}

    # Normalize to the singular, then derive the plural from it.
    try:
        sg = singularize(word)
        pl = pluralize(sg)
    except Exception as exc:
        return {'error': f'Inflection failed: {exc}'}

    # Indefinite article; fall back to a generic "a/an" if the lookup fails.
    try:
        with_article = f"{article(sg)} {sg}"
    except Exception:
        with_article = f"a/an {sg}"

    declension = {"Singular": {"form": sg}, "Plural": {"form": pl}}
    return {
        "base_form": sg,
        "singular": sg,
        "plural": pl,
        "article": with_article,
        "declension": declension,
        "gender": "N/A",  # English nouns strictly do not have grammatical gender
    }
|
| 825 |
|
| 826 |
def pattern_analyze_as_verb_en(word: str, hint_lemma: str = None) -> Dict[str, Any]:
    """Return lexeme, conjugation table and participles for an English verb.

    Args:
        word: Verb form to analyze; it is reduced to its lemma first.
        hint_lemma: Only reported in the log line; not used for conjugation.

    Returns:
        ``{'error': ...}`` when pattern.en is unavailable. Otherwise a dict
        with ``infinitive``, ``lexeme`` (empty list if the lookup raised),
        ``conjugation`` and, unless conjugation raised, ``participles``.
        ``conjugation`` is left empty or partial when conjugation raised.
    """
    log(f" Analyzing as verb (hint_lemma={hint_lemma})")
    if not PATTERN_EN_AVAILABLE: return {'error': 'pattern.en not available'}

    # 1. Get Lemma (fall back to the input if the lookup fails).
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt
    # and SystemExit are not swallowed.
    try:
        verb_lemma = lemma(word)
    except Exception:
        verb_lemma = word

    analysis = {"infinitive": verb_lemma}

    # 2. Get Lexeme (list of all forms of the verb)
    try:
        analysis["lexeme"] = lexeme(verb_lemma)
    except Exception as e:
        log(f" Failed to get lexeme: {e}")
        analysis["lexeme"] = []

    # 3. Conjugation Table
    analysis["conjugation"] = {}
    try:
        # Present Tense
        analysis["conjugation"]["Present"] = {
            "I (1sg)": conjugate(verb_lemma, tense=PRESENT, person=1, number=SINGULAR),
            "you (2sg)": conjugate(verb_lemma, tense=PRESENT, person=2, number=SINGULAR),
            "he/she (3sg)": conjugate(verb_lemma, tense=PRESENT, person=3, number=SINGULAR),
            "we (1pl)": conjugate(verb_lemma, tense=PRESENT, person=1, number=PLURAL),
            "you (2pl)": conjugate(verb_lemma, tense=PRESENT, person=2, number=PLURAL),
            "they (3pl)": conjugate(verb_lemma, tense=PRESENT, person=3, number=PLURAL),
        }

        # Past Tense
        analysis["conjugation"]["Past"] = {
            "I (1sg)": conjugate(verb_lemma, tense=PAST, person=1, number=SINGULAR),
            "he/she (3sg)": conjugate(verb_lemma, tense=PAST, person=3, number=SINGULAR),
            "General": conjugate(verb_lemma, tense=PAST),  # For regular verbs where all are same
        }

        # Participles. Pattern's tense table models BOTH participles with the
        # progressive aspect (aliases "part" / "ppart"). PAST + perfective is
        # the preterite, which the English verb table does not define, so
        # asking for it does not yield the past participle.
        analysis["participles"] = {
            "Present Participle (gerund)": conjugate(verb_lemma, tense=PRESENT, aspect="progressive"),
            "Past Participle": conjugate(verb_lemma, tense=PAST, aspect="progressive"),
        }
    except Exception as e:
        log(f" Failed to conjugate: {e}")

    return analysis
|
| 878 |
|
| 879 |
def pattern_analyze_as_adjective_en(word: str, hint_lemma: str = None) -> Dict[str, Any]:
    """Return the comparison (grading) forms of an English adjective.

    Args:
        word: Adjective to analyze. It is used as the base (positive) form
            as-is; no attempt is made to reduce a comparative or superlative
            input back to its base.
        hint_lemma: Only reported in the log line; not used for grading.

    Returns:
        ``{'error': ...}`` when pattern.en is unavailable. Otherwise a dict
        with ``predicative`` and ``grading``; on success also ``comparative``
        and ``superlative``. If grading raises, ``grading`` holds an error
        entry and the comparative/superlative keys are absent.
    """
    log(f" Analyzing as adjective (hint_lemma={hint_lemma})")
    if not PATTERN_EN_AVAILABLE: return {'error': 'pattern.en not available'}

    # The input is taken as the base form. The former try/except around this
    # plain assignment could never trigger and has been dropped.
    base = word

    analysis = {"predicative": base}

    try:
        comp = comparative(base)
        sup = superlative(base)

        analysis["comparative"] = comp
        analysis["superlative"] = sup
        analysis["grading"] = {
            "Positive": base,
            "Comparative": comp,
            "Superlative": sup,
        }
    except Exception as e:
        log(f" Failed to get comparison: {e}")
        analysis["grading"] = {"error": "Could not grade adjective"}

    return analysis
|
| 912 |
|
| 913 |
# --- Public API (Adapted) ---
|
|
|
|
| 1721 |
""" Thread-safe function to get the NLTK Lemmatizer. """
|
| 1722 |
global NLTK_LEMMATIZER
|
| 1723 |
if not NLTK_AVAILABLE:
|
| 1724 |
+
return None # Don't raise error, just return None to trigger graceful fallback
|
| 1725 |
+
|
| 1726 |
if NLTK_LEMMATIZER:
|
| 1727 |
return NLTK_LEMMATIZER
|
| 1728 |
+
|
| 1729 |
with NLTK_LEMMATIZER_LOCK:
|
| 1730 |
if NLTK_LEMMATIZER:
|
| 1731 |
return NLTK_LEMMATIZER
|
| 1732 |
+
try:
|
| 1733 |
+
# Ensure data is present one last time before init
|
| 1734 |
+
try:
|
| 1735 |
+
nltk.data.find('corpora/wordnet.zip')
|
| 1736 |
+
except LookupError:
|
| 1737 |
+
nltk.download('wordnet', quiet=True)
|
| 1738 |
+
|
| 1739 |
+
NLTK_LEMMATIZER = WordNetLemmatizer()
|
| 1740 |
+
# Warm up
|
| 1741 |
+
_ = NLTK_LEMMATIZER.lemmatize("cats")
|
| 1742 |
+
print("✓ NLTK Lemmatizer initialized.")
|
| 1743 |
+
return NLTK_LEMMATIZER
|
| 1744 |
+
except Exception as e:
|
| 1745 |
+
print(f"✗ NLTK Init Failed: {e}")
|
| 1746 |
+
return None
|
| 1747 |
|
| 1748 |
def _nltk_get_wordnet_pos(treebank_tag):
|
| 1749 |
"""Converts NLTK's Treebank POS tag to a WordNet tag."""
|
|
|
|
| 1754 |
return None
|
| 1755 |
|
| 1756 |
def _analyze_word_with_nltk(word: str, top_n: int) -> Dict[str, Any]:
|
| 1757 |
+
""" (FALLBACK ENGINE 3) Analyzes with NLTK. """
|
| 1758 |
if not NLTK_AVAILABLE: return {}
|
| 1759 |
print(f"\n[Word Encyclopedia] Running NLTK fallback for: \"{word}\"")
|
| 1760 |
final_result = {"input_word": word, "analysis": {}}
|
|
|
|
| 1764 |
if not lemmatizer: return {}
|
| 1765 |
|
| 1766 |
# NLTK's POS tagger needs a list
|
| 1767 |
+
# This specific call was crashing because 'averaged_perceptron_tagger_eng' was missing
|
| 1768 |
+
try:
|
| 1769 |
+
tag = nltk.pos_tag([word])[0][1]
|
| 1770 |
+
except LookupError:
|
| 1771 |
+
# Last ditch attempt to download if it was missing
|
| 1772 |
+
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
|
| 1773 |
+
tag = nltk.pos_tag([word])[0][1]
|
| 1774 |
+
|
| 1775 |
wn_pos = _nltk_get_wordnet_pos(tag)
|
| 1776 |
|
| 1777 |
if not wn_pos:
|
|
|
|
| 1779 |
return {}
|
| 1780 |
|
| 1781 |
lemma = lemmatizer.lemmatize(word, wn_pos)
|
|
|
|
|
|
|
| 1782 |
|
| 1783 |
+
# Map NLTK WN constants to strings
|
| 1784 |
+
pos_map_rev = {nltk_wn.NOUN: "noun", nltk_wn.VERB: "verb", nltk_wn.ADJ: "adjective", nltk_wn.ADV: "adverb"}
|
| 1785 |
+
pos_key = pos_map_rev.get(wn_pos)
|
| 1786 |
+
if not pos_key: return {}
|
| 1787 |
+
|
| 1788 |
log(f"--- Analyzing NLTK path: lemma='{lemma}', pos='{pos_key}' ---")
|
| 1789 |
|
| 1790 |
pattern_block = {}
|
| 1791 |
if PATTERN_EN_AVAILABLE:
|
| 1792 |
+
# Use the fixed pattern functions from previous step
|
| 1793 |
if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(lemma)
|
| 1794 |
elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(lemma)
|
| 1795 |
elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(lemma)
|
|
|
|
| 1820 |
return final_result
|
| 1821 |
except Exception as e:
|
| 1822 |
log(f"NLTK Engine FAILED: {e}")
|
| 1823 |
+
# traceback.print_exc() # Optional: Uncomment for deep debugging
|
| 1824 |
return {}
|
| 1825 |
|
| 1826 |
# --- FALLBACK 4: TEXTBLOB ---
|
| 1827 |
def _analyze_word_with_textblob(word: str, top_n: int) -> Dict[str, Any]:
|
| 1828 |
+
""" (FALLBACK ENGINE 4) Analyzes with TextBlob. """
|
| 1829 |
if not TEXTBLOB_AVAILABLE: return {}
|
| 1830 |
print(f"\n[Word Encyclopedia] Running TextBlob fallback for: \"{word}\"")
|
| 1831 |
final_result = {"input_word": word, "analysis": {}}
|
| 1832 |
|
| 1833 |
def get_wordnet_pos_tb(treebank_tag):
|
|
|
|
| 1834 |
if treebank_tag.startswith('J'): return 'a'
|
| 1835 |
if treebank_tag.startswith('V'): return 'v'
|
| 1836 |
if treebank_tag.startswith('N'): return 'n'
|
|
|
|
| 1838 |
return None
|
| 1839 |
|
| 1840 |
try:
|
| 1841 |
+
try:
|
| 1842 |
+
blob = TextBlob(word)
|
| 1843 |
+
# This access triggers the tokenizer
|
| 1844 |
+
tags = blob.tags
|
| 1845 |
+
except (LookupError, Exception) as e:
|
| 1846 |
+
if "punkt" in str(e):
|
| 1847 |
+
print("Attempting to download missing TextBlob/NLTK data...")
|
| 1848 |
+
import nltk
|
| 1849 |
+
nltk.download('punkt_tab', quiet=True)
|
| 1850 |
+
nltk.download('punkt', quiet=True)
|
| 1851 |
+
blob = TextBlob(word)
|
| 1852 |
+
tags = blob.tags
|
| 1853 |
+
else:
|
| 1854 |
+
raise e
|
| 1855 |
+
|
| 1856 |
+
if not tags: return {}
|
| 1857 |
|
|
|
|
| 1858 |
processed_lemmas_pos: Set[Tuple[str, str]] = set()
|
| 1859 |
|
| 1860 |
+
for tb_word, tag in tags:
|
| 1861 |
tb_pos = get_wordnet_pos_tb(tag)
|
| 1862 |
if not tb_pos: continue
|
| 1863 |
|
| 1864 |
lemma = tb_word.lemmatize(tb_pos)
|
| 1865 |
pos_map = {'n': "noun", 'v': "verb", 'a': "adjective", 'r': "adverb"}
|
| 1866 |
+
pos_key = pos_map.get(tb_pos)
|
| 1867 |
+
if not pos_key: continue
|
| 1868 |
|
| 1869 |
if (lemma, pos_key) in processed_lemmas_pos: continue
|
| 1870 |
processed_lemmas_pos.add((lemma, pos_key))
|
|
|
|
| 1901 |
return final_result
|
| 1902 |
except Exception as e:
|
| 1903 |
log(f"TextBlob Engine FAILED: {e}")
|
|
|
|
| 1904 |
return {}
|
| 1905 |
|
| 1906 |
|