Commit: no atd for now

app.py (CHANGED)
@@ -2,7 +2,7 @@
 # ENGLISH LINGUISTICS HUB (CONSOLIDATED APP V23-EN)
 #
 # This script adapts the German Linguistics Hub for English analysis,
-# adding NLTK, Stanza, TextBlob, HanTa(EN), OEWN,
+# adding NLTK, Stanza, TextBlob, HanTa(EN), OEWN, and OpenBLP.
 # It maintains the exact same JSON output structure as the German app.
 #
 # ============================================================================
@@ -76,15 +76,6 @@ except ImportError:
     LT_AVAILABLE = False
     print("CRITICAL WARNING: `language-tool-python` library not found.")
 
-# --- After the Deadline (AtD) Import ---
-try:
-    import AtD
-    ATD_AVAILABLE = True
-    print("✓ Successfully imported pyAtD")
-except ImportError:
-    ATD_AVAILABLE = False
-    print("WARNING: `pyAtD` library not found. Grammar check will be LT-only.")
-
 # --- WordNet (wn) Import (for OEWN) ---
 try:
     import wn
@@ -194,10 +185,6 @@ STANZA_PIPELINE_LOCK = threading.Lock()
 NLTK_LEMMATIZER: Optional[WordNetLemmatizer] = None
 NLTK_LEMMATIZER_LOCK = threading.Lock()
 
-# --- After the Deadline (AtD) ---
-ATD_SERVICE: Optional[AtD.AtD] = None
-ATD_LOCK = threading.Lock()
-
 # --- Helper ---
 def _html_wrap(content: str, line_height: str = "2.0") -> str:
     return f'<div style="overflow-x:auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; line-height: {line_height};">{content}</div>'
@@ -226,17 +213,78 @@ SPACY_MODEL_INFO: Dict[str, Tuple[str, str, str]] = {
     "de": ("German", "de_core_news_md", "spacy"),
     "es": ("Spanish", "es_core_news_md", "spacy"),
     "grc-proiel-trf": ("Ancient Greek (PROIEL TRF)", "grc_proiel_trf", "grecy"),
-…
+    "grc-perseus-trf": ("Ancient Greek (Perseus TRF)", "grc_perseus_trf", "grecy"),
+    "grc_ner_trf": ("Ancient Greek (NER TRF)", "grc_ner_trf", "grecy"),
+    "grc-proiel-lg": ("Ancient Greek (PROIEL LG)", "grc_proiel_lg", "grecy"),
+    "grc-perseus-lg": ("Ancient Greek (Perseus LG)", "grc_perseus_lg", "grecy"),
+    "grc-proiel-sm": ("Ancient Greek (PROIEL SM)", "grc_proiel_sm", "grecy"),
+    "grc-perseus-sm": ("Ancient Greek (Perseus SM)", "grc_perseus_sm", "grecy"),
 }
 SPACY_UI_TEXT = {
-    "de": {
-…
-…
+    "de": {
+        "title": "# 🔍 Mehrsprachiger Morpho-Syntaktischer Analysator",
+        "subtitle": "Analysieren Sie Texte auf Deutsch, Englisch, Spanisch und Altgriechisch",
+        "ui_lang_label": "Benutzeroberflächensprache",
+        "model_lang_label": "Textsprache für Analyse",
+        "input_label": "Text eingeben",
+        "input_placeholder": "Geben Sie hier Ihren Text ein...",
+        "button_text": "Text analysieren",
+        "button_processing_text": "Verarbeitung läuft...",
+        "tab_graphic": "Grafische Darstellung",
+        "tab_table": "Tabelle",
+        "tab_json": "JSON",
+        "tab_ner": "Entitäten",
+        "html_label": "Abhängigkeitsparsing",
+        "table_label": "Morphologische Analyse",
+        "table_headers": ["Wort", "Lemma", "POS", "Tag", "Morphologie", "Abhängigkeit"],
+        "json_label": "JSON-Ausgabe",
+        "ner_label": "Benannte Entitäten",
+        "error_message": "Fehler: "
+    },
+    "en": {
+        "title": "# 🔍 Multilingual Morpho-Syntactic Analyzer",
+        "subtitle": "Analyze texts in German, English, Spanish, and Ancient Greek",
+        "ui_lang_label": "Interface Language",
+        "model_lang_label": "Text Language for Analysis",
+        "input_label": "Enter Text",
+        "input_placeholder": "Enter your text here...",
+        "button_text": "Analyze Text",
+        "button_processing_text": "Processing...",
+        "tab_graphic": "Graphic View",
+        "tab_table": "Table",
+        "tab_json": "JSON",
+        "tab_ner": "Entities",
+        "html_label": "Dependency Parsing",
+        "table_label": "Morphological Analysis",
+        "table_headers": ["Word", "Lemma", "POS", "Tag", "Morphology", "Dependency"],
+        "json_label": "JSON Output",
+        "ner_label": "Named Entities",
+        "error_message": "Error: "
+    },
+    "es": {
+        "title": "# 🔍 Analizador Morfo-Sintáctico Multilingüe",
+        "subtitle": "Analice textos en alemán, inglés, español y griego antiguo",
+        "ui_lang_label": "Idioma de la Interfaz",
+        "model_lang_label": "Idioma del Texto para Análisis",
+        "input_label": "Introducir Texto",
+        "input_placeholder": "Ingrese su texto aquí...",
+        "button_text": "Analizar Texto",
+        "button_processing_text": "Procesando...",
+        "tab_graphic": "Vista Gráfica",
+        "tab_table": "Tabla",
+        "tab_json": "JSON",
+        "tab_ner": "Entidades",
+        "html_label": "Análisis de Dependencias",
+        "table_label": "Análisis Morfológico",
+        "table_headers": ["Palabra", "Lema", "POS", "Etiqueta", "Morfología", "Dependencia"],
+        "json_label": "Salida JSON",
+        "ner_label": "Entidades Nombradas",
+        "error_message": "Error: "
+    }
 }
 SPACY_MODELS: Dict[str, Optional[spacy.Language]] = {}
 
 # --- Dependency Installation & Model Loading ---
-# (All spacy_... functions are identical to the German app)
 def spacy_install_spacy_transformers_once():
     """ Installs spacy-transformers, required for all _trf models. """
     marker_file = Path(".spacy_transformers_installed")
@@ -255,9 +303,40 @@ def spacy_install_spacy_transformers_once():
         return False
 
 def spacy_install_grecy_model_from_github(model_name: str) -> bool:
-…
-…
-…
+    """ Installs a greCy model from GitHub Release. """
+    marker_file = Path(f".{model_name}_installed")
+    if marker_file.exists():
+        print(f"✓ {model_name} already installed (marker found)")
+        return True
+    print(f"Installing grecy model: {model_name}...")
+    if model_name == "grc_proiel_trf":
+        wheel_filename = "grc_proiel_trf-3.7.5-py3-none-any.whl"
+    elif model_name in ["grc_perseus_trf", "grc_proiel_lg", "grc_perseus_lg",
+                        "grc_proiel_sm", "grc_perseus_sm", "grc_ner_trf"]:
+        wheel_filename = f"{model_name}-0.0.0-py3-none-any.whl"
+    else:
+        print(f"✗ Unknown grecy model: {model_name}")
+        return False
+    install_url = f"https://github.com/CrispStrobe/greCy/releases/download/v1.0-models/{wheel_filename}"
+    cmd = [sys.executable, "-m", "pip", "install", install_url, "--no-deps"]
+    print(f"Running: {' '.join(cmd)}")
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=900)
+        if result.stdout: print("STDOUT:", result.stdout)
+        if result.stderr: print("STDERR:", result.stderr)
+        print(f"✓ Successfully installed {model_name} from GitHub")
+        marker_file.touch()
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"✗ Installation subprocess FAILED with code {e.returncode}")
+        print("STDOUT:", e.stdout)
+        print("STDERR:", e.stderr)
+        return False
+    except Exception as e:
+        print(f"✗ Installation exception: {e}")
+        traceback.print_exc()
+        return False
+
 def spacy_load_spacy_model(model_name: str) -> Optional[spacy.Language]:
     """Load or install a standard spaCy model."""
     try:
@@ -272,8 +351,24 @@ def spacy_load_spacy_model(model_name: str) -> Optional[spacy.Language]:
         return None
 
 def spacy_load_grecy_model(model_name: str) -> Optional[spacy.Language]:
-…
-…
+    """ Load a grecy model, installing from GitHub if needed. """
+    if not spacy_install_grecy_model_from_github(model_name):
+        print(f"✗ Cannot load {model_name} because installation failed.")
+        return None
+    try:
+        print("Refreshing importlib to find new package...")
+        importlib.invalidate_caches()
+        try: importlib.reload(site)
+        except Exception: pass
+        print(f"Trying: spacy.load('{model_name}')")
+        nlp = spacy.load(model_name)
+        print(f"✓ Successfully loaded {model_name}")
+        return nlp
+    except Exception as e:
+        print(f"✗ Model {model_name} is installed but FAILED to load.")
+        print(f"  Error: {e}")
+        traceback.print_exc()
+        return None
 
 def spacy_initialize_models():
     """ Pre-load standard models and ensure _trf dependencies are ready. """
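A minimal usage sketch of the two greCy helpers added above, assuming this app.py is importable as a module (the model name is one of the wheels registered in SPACY_MODEL_INFO in this commit; the Homeric sample text is illustrative only):

    # Hypothetical standalone use of the new loader; the first call
    # downloads and pip-installs the wheel, so it can take minutes.
    from app import spacy_load_grecy_model

    nlp = spacy_load_grecy_model("grc_proiel_sm")
    if nlp is not None:
        doc = nlp("μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος")
        print([(t.text, t.pos_, t.lemma_) for t in doc])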
@@ -303,32 +398,77 @@ def spacy_initialize_models():
 
 def spacy_get_analysis(ui_lang: str, model_lang_key: str, text: str):
     """Analyze text and return results."""
-    # (Identical to German app)
     ui_config = SPACY_UI_TEXT.get(ui_lang.lower(), SPACY_UI_TEXT["en"])
-    error_prefix = ui_config
+    error_prefix = ui_config.get("error_message", "Error: ")
     try:
-…
+        if not text.strip():
+            return ([], [], "<p style='color: orange;'>No text provided.</p>", "<p>No text provided.</p>",
+                    gr.Button(value=ui_config.get("button_text", "Analyze"), interactive=True))
+
         nlp = SPACY_MODELS.get(model_lang_key)
-…
+        if nlp is None:
+            # Try loading one last time
+            if model_lang_key in SPACY_MODEL_INFO:
+                _, model_name, model_type = SPACY_MODEL_INFO[model_lang_key]
+                if model_type == 'grecy': nlp = spacy_load_grecy_model(model_name)
+                else: nlp = spacy_load_spacy_model(model_name)
+                SPACY_MODELS[model_lang_key] = nlp
+
+        if nlp is None:
+            return ([], {"error": "Model load failed"}, "Error", "Error", gr.Button(interactive=True))
+
         doc = nlp(text)
-…
-…
+        dataframe_output = []
+        json_output = []
+        for token in doc:
+            lemma_str = token.lemma_
+            morph_str = str(token.morph) if token.morph else ''
+            dep_str = token.dep_ if doc.is_parsed else ''
+            tag_str = token.tag_ or ''
+            pos_str = token.pos_ or ''
+            json_output.append({
+                "word": token.text, "lemma": lemma_str, "pos": pos_str,
+                "tag": tag_str, "morphology": morph_str, "dependency": dep_str,
+                "is_stopword": token.is_stop
+            })
+            dataframe_output.append([token.text, lemma_str, pos_str, tag_str, morph_str, dep_str])
+
+        html_dep_out = ""
+        if "parser" in nlp.pipe_names and doc.is_parsed:
+            try:
+                options = {"compact": True, "bg": "#ffffff", "color": "#000000", "font": "Source Sans Pro"}
+                html_svg = displacy.render(doc, style="dep", jupyter=False, options=options)
+                html_dep_out = _html_wrap(html_svg, line_height="2.5")
+            except Exception as e:
+                html_dep_out = f"<p>Visualization error: {e}</p>"
+
+        html_ner_out = ""
+        if "ner" in nlp.pipe_names:
+            if doc.ents:
+                try:
+                    html_ner = displacy.render(doc, style="ent", jupyter=False)
+                    html_ner_out = _html_wrap(html_ner, line_height="2.5")
+                except Exception: html_ner_out = "<p>Error rendering NER</p>"
+            else: html_ner_out = "<p>No entities found.</p>"
+
+        return (dataframe_output, json_output, html_dep_out, html_ner_out,
+                gr.Button(value=ui_config.get("button_text", "Analyze"), interactive=True))
     except Exception as e:
         traceback.print_exc()
-        error_html = f"
-        return ([], {"error": str(e)}, error_html, error_html, gr.Button(
+        error_html = f"<div style='color: red;'>{error_prefix} {str(e)}</div>"
+        return ([], {"error": str(e)}, error_html, error_html, gr.Button(interactive=True))
 
 
 def spacy_update_ui(ui_lang: str):
     """Update UI language for the spaCy tab."""
-    #
-…
+    # Placeholder - actual implementation would update labels
+    return [gr.update()] * 14
 
 # ============================================================================
-# 4. GRAMMAR CHECKER LOGIC (
+# 4. GRAMMAR CHECKER LOGIC (LanguageTool Only)
 # ============================================================================
 
-# --- Globals for LanguageTool
+# --- Globals for LanguageTool ---
 LT_TOOL_INSTANCES: Dict[str, Optional[language_tool_python.LanguageTool]] = {}
 LT_TOOL_LOCK = threading.Lock()
 
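For reference, each token in the rewritten spacy_get_analysis serializes to a dict of this shape (illustrative values, not real model output):

    # {"word": "cats", "lemma": "cat", "pos": "NOUN", "tag": "NNS",
    #  "morphology": "Number=Plur", "dependency": "nsubj", "is_stopword": False}

Note that `doc.is_parsed`, used twice above, was removed in spaCy 3.x in favor of `doc.has_annotation("DEP")`, so those branches may need adjusting on current spaCy versions.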
@@ -338,7 +478,7 @@ def lt_get_language_tool(lang: str = 'en') -> Optional[language_tool_python.LanguageTool]:
     if not LT_AVAILABLE:
         raise ImportError("language-tool-python library is not installed.")
 
-    lang_code = 'en-US' if lang == 'en' else 'de-DE'
+    lang_code = 'en-US' if lang == 'en' else 'de-DE'
 
     if lang_code in LT_TOOL_INSTANCES:
         return LT_TOOL_INSTANCES[lang_code]
@@ -387,55 +527,6 @@ def lt_check_grammar(text: str, lang: str = 'en') -> List[Dict[str, Any]]:
         traceback.print_exc()
         return [{"error": f"An unexpected error occurred: {str(e)}"}]
 
-# --- After the Deadline (AtD) Logic ---
-def atd_get_service() -> Optional[AtD.AtD]:
-    """ Thread-safe function to get AtD service. """
-    global ATD_SERVICE
-    if not ATD_AVAILABLE:
-        raise ImportError("pyAtD library is not installed.")
-    if ATD_SERVICE:
-        return ATD_SERVICE
-    with ATD_LOCK:
-        if ATD_SERVICE:
-            return ATD_SERVICE
-        try:
-            print("Initializing After the Deadline (AtD) service...")
-            ATD_SERVICE = AtD.AtD()
-            # Test call
-            _ = ATD_SERVICE.check("this is a test")
-            print("✓ AtD service initialized.")
-            return ATD_SERVICE
-        except Exception as e:
-            print(f"✗ FAILED to initialize AtD service: {e}")
-            return None
-
-def atd_check_grammar(text: str) -> List[Dict[str, Any]]:
-    """ Checks text using After the Deadline. """
-    try:
-        service = atd_get_service()
-        if not service:
-            return [{"error": "AtD service failed to initialize."}]
-        if not text or not text.strip():
-            return [{"info": "No text provided to check."}]
-
-        errors = service.check(text)
-        error_list = []
-        for error in errors:
-            error_list.append({
-                "message": error.description,
-                "rule_id": error.type,
-                "category": error.url,
-                "incorrect_text": error.string,
-                "replacements": error.suggestions,
-                "offset": error.precontext_start,
-                "length": len(error.string)
-            })
-        if not error_list:
-            return [{"info": "No errors found!", "status": "perfect"}]
-        return error_list
-    except Exception as e:
-        return [{"error": f"AtD check failed: {str(e)}"}]
-
 # ============================================================================
 # 5. WORDNET THESAURUS LOGIC (OEWN)
 # ============================================================================
@@ -534,7 +625,6 @@ def wordnet_worker_loop():
 
 def wordnet_process_word_lookup(word: str, wn_instance: wn.Wordnet) -> Dict[str, Any]:
     """ Process a single word lookup. Runs in the worker thread. """
-    # (This function is identical to the German app)
     if not word or not word.strip():
         return {"info": "No word provided to check."}
     word = word.strip().lower()
@@ -583,7 +673,13 @@ def wordnet_start_worker():
         return
     wordnet_worker_thread = threading.Thread(target=wordnet_worker_loop, daemon=True, name="WordNetWorker")
     wordnet_worker_thread.start()
-…
+    timeout = 30
+    for _ in range(timeout * 10):
+        if wordnet_worker_state in (WordNetWorkerState.READY, WordNetWorkerState.ERROR):
+            break
+        threading.Event().wait(0.1)
+    if wordnet_worker_state != WordNetWorkerState.READY:
+        raise Exception("OdeNet Worker failed to initialize")
 
 # --- Public API (Adapted) ---
 def wordnet_get_thesaurus_info(word: str, lang: str = 'en') -> Dict[str, Any]:
@@ -807,7 +903,6 @@ def get_conceptnet_client() -> Optional[Client]:
 def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
     """
     Fetches relations from the cstr/conceptnet_normalized Gradio API.
-    (Identical robust V2 parser from German app)
     """
     if not GRADIO_CLIENT_AVAILABLE:
         return {"error": "`gradio_client` library is not installed."}
@@ -818,7 +913,6 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
     word_lower = word.strip().lower()
     cache_key = (word_lower, language)
 
-    # --- 1. Check Cache ---
     with CONCEPTNET_LOCK:
         if cache_key in CONCEPTNET_CACHE:
             log(f"ConceptNet: Found '{word_lower}' in cache.")
@@ -827,7 +921,6 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
     log(f"ConceptNet: Fetching '{word_lower}' from Gradio API...")
 
     try:
-        # --- 2. Call Gradio API ---
         client = get_conceptnet_client()
         if not client:
             return {"error": "ConceptNet Gradio Client is not available."}
@@ -841,9 +934,7 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
             api_name="/get_semantic_profile"
         )
 
-        # --- 3. Parse the Markdown Result (Robustly) ---
         relations_list = []
-        # ... (Identical parsing logic from German app) ...
         if not isinstance(result_markdown, str):
             raise TypeError(f"ConceptNet API returned type {type(result_markdown)}, expected str.")
 
@@ -877,7 +968,7 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
                 elif node2.lower() == word_lower and node1.lower() != word_lower:
                     other_node, direction = node1, "<-"
                 else:
-                    continue
+                    continue
 
                 relations_list.append({
                     "relation": relation, "direction": direction, "other_node": other_node,
@@ -887,7 +978,6 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
             except Exception as e:
                 log(f"ConceptNet Parser: Error parsing line '{line}': {e}")
 
-        # --- 4. Finalize and Cache Result ---
         if not relations_list:
             final_result = {"info": f"No valid relations found for '{word_lower}'."}
         else:
@@ -910,7 +1000,7 @@ def openblp_get_relations(lemma: str) -> List[Dict[str, Any]]:
     Stub function to query OpenBLP.
     Replace this with your actual OpenBLP database/API query.
     """
-    #
+    # Placeholder logic
     if lemma == "dog":
         return [
             {"relation": "HasProperty", "other_node": "loyal", "weight": 0.9, "source": "openblp"},
@@ -962,10 +1052,7 @@ def _hanta_pos_to_key(hanta_pos: str) -> Optional[str]:
     return None
 
 def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
-    """
-    (FALLBACK ENGINE 1) Analyzes a single word using HanTa (EN).
-    This function MUST return the standard JSON structure.
-    """
+    """ (FALLBACK ENGINE 1) Analyzes a single word using HanTa (EN). """
     if not HANTA_AVAILABLE: return {}
     print(f"\n[Word Encyclopedia] Running HanTa (EN) fallback for: \"{word}\"")
     final_result = {"input_word": word, "analysis": {}}
@@ -974,8 +1061,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
     tagger = hanta_get_tagger_en()
     if not tagger: return {}
 
-    # HanTa 'tag_word' gives all possibilities
-    # e.g., [('VBG', 0.9), ('NN', 0.1)]
     possible_tags = tagger.tag_word(word.lower())
     possible_tags.extend(tagger.tag_word(word.capitalize()))
 
@@ -985,8 +1070,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
         pos_key = _hanta_pos_to_key(hanta_pos)
         if not pos_key: continue
 
-        # Get the lemma for this specific POS analysis
-        # HanTa's 'analyze' gives the single best lemma
         raw_analysis = tagger.analyze(word.lower() if pos_key != 'noun' else word.capitalize())
         lemma = raw_analysis[0]  # The lemma
 
@@ -995,7 +1078,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
         processed_lemmas_pos.add((lemma, pos_key))
         log(f"--- Analyzing HanTa (EN) path: lemma='{lemma}', pos='{pos_key}' ---")
 
-        # --- 1. Get Inflections (Pattern) ---
         pattern_block = {}
         if PATTERN_EN_AVAILABLE:
             if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(lemma)
@@ -1003,12 +1085,10 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
             elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(lemma)
             elif pos_key == "adverb": pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
 
-        # --- 2. Build Semantics Block ---
         semantics_block = _build_semantics_block_for_lemma(lemma, pos_key, top_n, 'en')
 
-        # --- 3. Build Final Report Block ---
         pos_entry_report = {
-            "hanta_analysis": {
+            "hanta_analysis": {
                 "lemma": lemma,
                 "pos_tag": hanta_pos,
                 "analysis_string": str(raw_analysis),
@@ -1018,7 +1098,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
             "semantics_combined": semantics_block
         }
 
-        # --- 4. VALIDATION FILTER ---
         if word_appears_in_inflections_en(word, pattern_block, pos_key):
             if pos_key not in final_result["analysis"]:
                 final_result["analysis"][pos_key] = []
@@ -1038,7 +1117,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
 # ============================================================================
 # 6d. WIKTIONARY DATABASE LOGIC (EN)
 # ============================================================================
-# (This assumes an English DB with the *exact same schema*)
 def wiktionary_download_db() -> bool:
     """ Downloads the English Wiktionary DB. """
     global WIKTIONARY_AVAILABLE
@@ -1050,7 +1128,7 @@ def wiktionary_download_db() -> bool:
     print(f"English Wiktionary DB not found. Downloading from '{WIKTIONARY_REPO_ID}'...")
     try:
         hf_hub_download(
-            repo_id=WIKTIONARY_REPO_ID,
+            repo_id=WIKTIONARY_REPO_ID,
             filename=WIKTIONARY_DB_PATH,
             repo_type="dataset",
             local_dir=".",
@@ -1086,7 +1164,7 @@ def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
         log("Creating new read-only connection to Wiktionary DB...")
         db_uri = f"file:{WIKTIONARY_DB_PATH}?mode=ro"
         conn = sqlite3.connect(db_uri, uri=True, check_same_thread=False)
-        conn.row_factory = sqlite3.Row
+        conn.row_factory = sqlite3.Row
         _ = conn.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1").fetchone()
         print("✓ Wiktionary DB connection successful.")
         WIKTIONARY_CONN = conn
@@ -1109,7 +1187,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
     """ (REVISED FOR FULL DB V3) Fetches ALL data for a single entry_id. """
     report = {}
 
-    # 1. Get Base Entry Info
     entry_data = conn.execute(
         "SELECT word, title, redirect, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
     ).fetchone()
@@ -1119,7 +1196,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
     report["entry_id"] = entry_id
     report["lemma"] = entry_data["word"]
 
-    # 2. Get Senses (with Glosses, Tags, Topics, and Examples)
     senses_q = conn.execute(
         """
         SELECT
@@ -1136,7 +1212,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
     for sense_row in senses_q:
         sense_dict = dict(sense_row)
         sense_id = sense_dict["sense_id"]
-
         examples_q = conn.execute(
             "SELECT text, ref FROM examples WHERE sense_id = ?", (sense_id,)
         ).fetchall()
@@ -1144,7 +1219,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
         senses_list.append(sense_dict)
     report["senses"] = senses_list
 
-    # 3. Get Inflected Forms
     forms_q = conn.execute(
         """
         SELECT f.form_text, f.sense_index,
@@ -1155,24 +1229,16 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
     ).fetchall()
     report["forms"] = [dict(f) for f in forms_q]
 
-    # ... (All other queries for sounds, synonyms, antonyms, etc. are IDENTICAL to the German app) ...
-
     return report
 
 def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
-    """
-    Finds all entries related to an English word.
-    """
+    """ Finds all entries related to an English word. """
     log(f"Wiktionary (EN): Querying for '{word}'...")
     found_entry_ids: Set[int] = set()
 
-    # --- ENGLISH REPLACEMENT ---
     lang_query = 'English'
-    # These titles are specific to the English Wiktionary dump
     form_titles = ("Inflected form", "verb form", "noun form", "adjective form", "Comparative", "Superlative")
-    # --- END REPLACEMENT ---
 
-    # 1. Check if the word is a lemma (base form)
     lemma_q = conn.execute(
         f"SELECT id, pos_title FROM entries WHERE word = ? AND lang = '{lang_query}'", (word,)
     ).fetchall()
@@ -1202,7 +1268,6 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
         except json.JSONDecodeError:
             log(f"Wiktionary: Failed to parse form_of JSON: {form_of_json}")
 
-    # 2. Check if the word is an inflected form (in the `forms` table)
     form_q = conn.execute(
         f"""
         SELECT DISTINCT e.id
@@ -1231,7 +1296,6 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
 
     log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
 
-    # 3. Build a full report for each unique entry
     all_reports = []
     for entry_id in found_entry_ids:
         try:
@@ -1242,19 +1306,61 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
 
     return all_reports
 
+def _wiktionary_format_semantics_block(wikt_report: Dict[str, Any], pattern_block: Dict[str, Any], top_n: int) -> Dict[str, Any]:
+    """ Combines English Wiktionary senses with OEWN/ConceptNet. """
+    pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
+    semantic_lemma = wikt_report.get("lemma")
+
+    wiktionary_senses = []
+    for sense in wikt_report.get("senses", []):
+        wiktionary_senses.append({
+            "definition": sense.get("glosses"),
+            "source": "wiktionary"
+        })
+
+    oewn_senses = []
+    if WN_AVAILABLE:
+        try:
+            senses_by_pos = _get_wordnet_senses_by_pos(semantic_lemma, 'en')
+            oewn_senses_raw = senses_by_pos.get(pos_key, [])
+            if oewn_senses_raw and "info" not in oewn_senses_raw[0]:
+                oewn_senses = oewn_senses_raw
+        except Exception as e:
+            log(f"[DEBUG] OEWN lookup failed for {semantic_lemma} ({pos_key}): {e}")
+
+    conceptnet_relations = []
+    if REQUESTS_AVAILABLE:
+        try:
+            conceptnet_result = conceptnet_get_relations(semantic_lemma, language='en')
+            conceptnet_relations = conceptnet_result.get("relations", [])
+        except Exception: pass
+
+    if top_n > 0:
+        wiktionary_senses = wiktionary_senses[:top_n]
+        oewn_senses = oewn_senses[:top_n]
+        conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
+        conceptnet_relations = conceptnet_relations[:top_n]
+
+    return {
+        "lemma": semantic_lemma,
+        "wiktionary_senses": wiktionary_senses,
+        "odenet_senses": oewn_senses,  # Key name preserved
+        "conceptnet_relations": conceptnet_relations,
+        "wiktionary_synonyms": wikt_report.get("synonyms", []),
+        "wiktionary_antonyms": wikt_report.get("antonyms", [])
+    }
+
+
 # ============================================================================
 # 6e. SHARED SEMANTIC HELPER (OEWN + OpenBLP)
 # ============================================================================
 
 def _get_wordnet_senses_by_pos(word: str, lang: str = 'en') -> Dict[str, List[Dict[str, Any]]]:
-    """
-    (Helper) Fetches WordNet (OEWN) senses for a word and groups them by POS.
-    """
+    """ (Helper) Fetches WordNet (OEWN) senses for a word and groups them by POS. """
     senses_by_pos: Dict[str, List[Dict]] = {
         "noun": [], "verb": [], "adjective": [], "adverb": []
     }
     if not WN_AVAILABLE:
-        # Fail-open strategy
         return {"noun": [{"info": "WordNet unavailable"}], "verb": [{"info": "WordNet unavailable"}],
                 "adjective": [{"info": "WordNet unavailable"}], "adverb": [{"info": "WordNet unavailable"}]}
 
@@ -1264,36 +1370,19 @@ def _get_wordnet_senses_by_pos(word: str, lang: str = 'en') -> Dict[str, List[Dict[str, Any]]]:
             if "error" in sense: continue
             pos_tag = sense.get("pos")
 
-            if pos_tag == 'n':
-…
-            elif pos_tag == '
-…
-            # --- ENGLISH WORDNET (OEWN) FIX ---
-            # 'a' is Adjective, 's' is Adjective Satellite
-            # 'r' is Adverb
-            elif pos_tag == 'a' or pos_tag == 's':
-                senses_by_pos["adjective"].append(sense)
-            elif pos_tag == 'r':
-                senses_by_pos["adverb"].append(sense)
-            # --- END OF FIX ---
-
+            if pos_tag == 'n': senses_by_pos["noun"].append(sense)
+            elif pos_tag == 'v': senses_by_pos["verb"].append(sense)
+            elif pos_tag == 'a' or pos_tag == 's': senses_by_pos["adjective"].append(sense)
+            elif pos_tag == 'r': senses_by_pos["adverb"].append(sense)
     except Exception as e:
         log(f"WordNet helper check failed for '{word}': {e}")
 
-    log(f"WordNet (EN) senses for '{word}': "
-        f"{len(senses_by_pos['noun'])}N, "
-        f"{len(senses_by_pos['verb'])}V, "
-        f"{len(senses_by_pos['adjective'])}Adj, "
-        f"{len(senses_by_pos['adverb'])}Adv")
     return senses_by_pos
 
 def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
-    """
-    (REUSABLE HELPER) Fetches OEWN, ConceptNet, and OpenBLP data.
-    """
+    """ (REUSABLE HELPER) Fetches OEWN, ConceptNet, and OpenBLP data. """
     log(f"[DEBUG] Building semantics for lemma='{lemma}', pos='{pos_key}', lang='{lang}'")
 
-    # 1. Get OEWN senses
     oewn_senses = []
     if WN_AVAILABLE:
         try:
@@ -1304,7 +1393,6 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
     except Exception as e:
         log(f"[DEBUG] OEWN lookup failed for {lemma} ({pos_key}): {e}")
 
-    # 2. Get ConceptNet relations
     conceptnet_relations = []
     if REQUESTS_AVAILABLE:
         try:
@@ -1313,14 +1401,12 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
         except Exception as e:
             conceptnet_relations = [{"error": str(e)}]
 
-    # 3. Get OpenBLP relations
     openblp_relations = []
     try:
         openblp_relations = openblp_get_relations(lemma)
     except Exception as e:
         openblp_relations = [{"error": f"OpenBLP stub failed: {e}"}]
 
-    # 4. Apply top_n limit
     if top_n > 0:
         oewn_senses = oewn_senses[:top_n]
         conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
@@ -1328,13 +1414,12 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
         openblp_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
         openblp_relations = openblp_relations[:top_n]
 
-    # --- MUST MATCH GERMAN JSON STRUCTURE ---
     return {
         "lemma": lemma,
-        "wiktionary_senses": [],
-        "odenet_senses": oewn_senses,
+        "wiktionary_senses": [],
+        "odenet_senses": oewn_senses,
         "conceptnet_relations": conceptnet_relations,
-        "openblp_relations": openblp_relations,
+        "openblp_relations": openblp_relations,
         "wiktionary_synonyms": [],
        "wiktionary_antonyms": []
     }
@@ -1346,16 +1431,12 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
 
 # --- PRIMARY ENGINE: WIKTIONARY (EN) ---
 def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
-    """
-    (PRIMARY ENGINE) Analyzes an English word using the Wiktionary DB.
-    """
+    """ (PRIMARY ENGINE) Analyzes an English word using the Wiktionary DB. """
     final_result: Dict[str, Any] = {"input_word": word, "analysis": {}}
 
     conn = wiktionary_get_connection()
-    if not conn:
-        return {} # Signal failure
+    if not conn: return {}
 
-    # --- 1. GET SPACY HINT ---
     spacy_pos_hint, spacy_lemma_hint = None, None
     try:
         nlp_en = spacy_load_spacy_model("en_core_web_md")
@@ -1364,20 +1445,16 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
         token = doc[0]
         spacy_pos_hint = token.pos_.lower()
         spacy_lemma_hint = token.lemma_
-        log(f"[DEBUG] Wiktionary (EN) Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
     except Exception as e:
         log(f"[DEBUG] Wiktionary (EN) Hint: spaCy failed: {e}")
 
-    # --- 2. FIND ALL WIKTIONARY ENTRIES ---
     try:
         wiktionary_reports = _wiktionary_find_all_entries(word, conn)
     except Exception as e:
         log(f"[DEBUG] Wiktionary (EN) query failed: {e}")
-        return {}
-    if not wiktionary_reports:
-        return {} # No results, signal to fallback
+        return {}
+    if not wiktionary_reports: return {}
 
-    # --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
     def get_priority_score(report):
         wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
         wikt_lemma = report.get("lemma")
@@ -1388,25 +1465,21 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
         return 4
     wiktionary_reports.sort(key=get_priority_score)
 
-    # --- 4. BUILD AND VALIDATE THE FINAL REPORT ---
     word_lower = word.lower()
     for wikt_report in wiktionary_reports:
         pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
         lemma = wikt_report.get("lemma", word)
         pos_title = wikt_report.get("pos_title", "")
 
-        # --- A. Build Wiktionary Inflection Block ---
         inflections_wikt_block = {
             "base_form": lemma,
             "forms_list": wikt_report.get("forms", []),
             "source": "wiktionary"
         }
 
-        # --- B. Build Pattern Inflection Block (using pattern.en) ---
         pattern_block = {}
         if PATTERN_EN_AVAILABLE:
             try:
-                # Use input 'word' for inflected forms to find right lemma
                 use_word = word if "form" in pos_title.lower() else lemma
                 if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
                 elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
@@ -1415,10 +1488,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
             except Exception as e:
                 pattern_block = {"error": f"Pattern.en analysis failed: {e}"}
 
-        # --- C. Build Semantics Block ---
         semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
 
-        # --- D. Assemble the report (pre-validation) ---
         pos_entry_report = {
             "inflections_wiktionary": inflections_wikt_block,
             "inflections_pattern": pattern_block,
@@ -1427,27 +1498,21 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
             "pos_title": pos_title,
             "etymology": wikt_report.get("etymology_text"),
             "pronunciation": wikt_report.get("sounds"),
-            # ... (all other metadata fields) ...
             }
         }
 
-        # --- E. VALIDATION FILTER ---
         is_valid = False
         is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
 
-        if lemma.lower() == word_lower:
-            is_valid = True
-            log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches entry lemma.")
+        if lemma.lower() == word_lower: is_valid = True
 
         if not is_valid and not is_inflected_entry:
             for form_entry in inflections_wikt_block.get("forms_list", []):
                 form_text = form_entry.get("form_text", "").strip()
                 if form_text.lower() == word_lower:
                     is_valid = True
-                    log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word found in form: '{form_text}'")
                     break
 
-        # --- F. Add to final result if valid ---
         if is_valid:
             if pos_key not in final_result["analysis"]:
                 final_result["analysis"][pos_key] = []
@@ -1998,9 +2063,9 @@ def create_spacy_tab():
                                      html_dep_out, df_out, json_out, html_ner_out])
 
 def create_languagetool_tab():
-    """Creates the UI for the Grammar Checker tab with LT
+    """Creates the UI for the Grammar Checker tab with LT."""
     gr.Markdown("# 🇬🇧 English Grammar & Spelling Checker")
-    gr.Markdown("Powered by `LanguageTool
+    gr.Markdown("Powered by `LanguageTool`.")
 
     with gr.Row():
         text_input = gr.Textbox(
@@ -2009,33 +2074,20 @@ def create_languagetool_tab():
             lines=5,
             scale=3
         )
-        checker_choice = gr.Radio(
-            label="Checker Engine",
-            choices=["LanguageTool", "After the Deadline"],
-            value="LanguageTool",
-            scale=1
-        )
 
     check_button = gr.Button("Check Text", variant="primary")
     output = gr.JSON(label="Detected Errors (JSON)")
 
-    def dispatch_grammar_check(text, choice):
-        if choice == "LanguageTool":
-            return lt_check_grammar(text, 'en')
-        elif choice == "After the Deadline":
-            return atd_check_grammar(text)
-        return [{"error": "Invalid checker selected."}]
-
     check_button.click(
-        fn=
-        inputs=[text_input
+        fn=lambda text: lt_check_grammar(text, 'en'),
+        inputs=[text_input],
         outputs=[output],
         api_name="check_grammar"
     )
     gr.Examples(
-        [["This is a houze."
-        ["The cat sleep on the table."
-        inputs=[text_input
+        [["This is a houze."], ["I seen the man."],
+         ["The cat sleep on the table."], ["He asks if he can go."]],
+        inputs=[text_input], outputs=[output], fn=lambda text: lt_check_grammar(text, 'en'),
         cache_examples=False
     )
 
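With the Radio control and dispatcher removed, the `/check_grammar` endpoint now takes a single text argument. A minimal client-side sketch (the Space id here is hypothetical; `gradio_client` is already used elsewhere in this app):

    from gradio_client import Client

    client = Client("user/english-linguistics-hub")  # hypothetical Space id
    errors = client.predict("The cat sleep on the table.", api_name="/check_grammar")
    print(errors)  # JSON list of LanguageTool matches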
@@ -2422,17 +2474,6 @@ if __name__ == "__main__":
     print("INFO: NLTK library not available, skipping lemmatizer.")
     print("--- NLTK Done ---\n")
 
-    # --- 7. Check AtD Service ---
-    print("--- Initializing AtD Service ---")
-    if ATD_AVAILABLE:
-        try:
-            atd_get_service()
-        except Exception as e:
-            print(f"✗ FAILED to start AtD: {e}")
-    else:
-        print("INFO: AtD library not available, skipping service.")
-    print("--- AtD Done ---\n")
-
     # --- 8. Check Pattern.en ---
     print("--- Checking Pattern.en ---")
     if not PATTERN_EN_AVAILABLE:
|