cstr committed
Commit 86efd62 · verified · 1 Parent(s): eeea917

no atd for now

Files changed (1)
  1. app.py +257 -216
app.py CHANGED
@@ -2,7 +2,7 @@
2
  # ENGLISH LINGUISTICS HUB (CONSOLIDATED APP V23-EN)
3
  #
4
  # This script adapts the German Linguistics Hub for English analysis,
5
- # adding NLTK, Stanza, TextBlob, HanTa(EN), OEWN, OpenBLP, and AtD.
6
  # It maintains the exact same JSON output structure as the German app.
7
  #
8
  # ============================================================================
@@ -76,15 +76,6 @@ except ImportError:
76
  LT_AVAILABLE = False
77
  print("CRITICAL WARNING: `language-tool-python` library not found.")
78
 
79
- # --- After the Deadline (AtD) Import ---
80
- try:
81
- import AtD
82
- ATD_AVAILABLE = True
83
- print("✓ Successfully imported pyAtD")
84
- except ImportError:
85
- ATD_AVAILABLE = False
86
- print("WARNING: `pyAtD` library not found. Grammar check will be LT-only.")
87
-
88
  # --- WordNet (wn) Import (for OEWN) ---
89
  try:
90
  import wn
@@ -194,10 +185,6 @@ STANZA_PIPELINE_LOCK = threading.Lock()
194
  NLTK_LEMMATIZER: Optional[WordNetLemmatizer] = None
195
  NLTK_LEMMATIZER_LOCK = threading.Lock()
196
 
197
- # --- After the Deadline (AtD) ---
198
- ATD_SERVICE: Optional[AtD.AtD] = None
199
- ATD_LOCK = threading.Lock()
200
-
201
  # --- Helper ---
202
  def _html_wrap(content: str, line_height: str = "2.0") -> str:
203
  return f'<div style="overflow-x:auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; line-height: {line_height};">{content}</div>'
@@ -226,17 +213,78 @@ SPACY_MODEL_INFO: Dict[str, Tuple[str, str, str]] = {
226
  "de": ("German", "de_core_news_md", "spacy"),
227
  "es": ("Spanish", "es_core_news_md", "spacy"),
228
  "grc-proiel-trf": ("Ancient Greek (PROIEL TRF)", "grc_proiel_trf", "grecy"),
229
- # ... (other models) ...
230
  }
231
  SPACY_UI_TEXT = {
232
- "de": { "title": "# 🔍 Mehrsprachiger Morpho-Syntaktischer Analysator", "subtitle": "Analysieren Sie Texte...", "input_label": "Text eingeben", "...": "..." },
233
- "en": { "title": "# 🔍 Multilingual Morpho-Syntactic Analyzer", "subtitle": "Analyze texts in...", "input_label": "Enter Text", "...": "..." },
234
- "es": { "title": "# 🔍 Analizador Morfo-Sintáctico Multilingüe", "subtitle": "Analice textos en...", "input_label": "Introducir Texto", "...": "..." }
235
  }
236
  SPACY_MODELS: Dict[str, Optional[spacy.Language]] = {}
237
 
238
  # --- Dependency Installation & Model Loading ---
239
- # (All spacy_... functions are identical to the German app)
240
  def spacy_install_spacy_transformers_once():
241
  """ Installs spacy-transformers, required for all _trf models. """
242
  marker_file = Path(".spacy_transformers_installed")
@@ -255,9 +303,40 @@ def spacy_install_spacy_transformers_once():
255
  return False
256
 
257
  def spacy_install_grecy_model_from_github(model_name: str) -> bool:
258
- # ... (identical) ...
259
- pass
260
-
261
  def spacy_load_spacy_model(model_name: str) -> Optional[spacy.Language]:
262
  """Load or install a standard spaCy model."""
263
  try:
@@ -272,8 +351,24 @@ def spacy_load_spacy_model(model_name: str) -> Optional[spacy.Language]:
272
  return None
273
 
274
  def spacy_load_grecy_model(model_name: str) -> Optional[spacy.Language]:
275
- # ... (identical) ...
276
- pass
277
 
278
  def spacy_initialize_models():
279
  """ Pre-load standard models and ensure _trf dependencies are ready. """
@@ -303,32 +398,77 @@ def spacy_initialize_models():
303
 
304
  def spacy_get_analysis(ui_lang: str, model_lang_key: str, text: str):
305
  """Analyze text and return results."""
306
- # (Identical to German app)
307
  ui_config = SPACY_UI_TEXT.get(ui_lang.lower(), SPACY_UI_TEXT["en"])
308
- error_prefix = ui_config["error_message"]
309
  try:
310
- # ... (identical model loading logic) ...
311
  nlp = SPACY_MODELS.get(model_lang_key)
312
- # ...
313
  doc = nlp(text)
314
- # ... (identical dataframe/json/html output generation) ...
315
- return ([], {}, "", "", gr.Button(value=ui_config["button_text"], interactive=True)) # Placeholder
316
  except Exception as e:
317
  traceback.print_exc()
318
- error_html = f"..."
319
- return ([], {"error": str(e)}, error_html, error_html, gr.Button(value=ui_config["button_text"], interactive=True))
320
 
321
 
322
  def spacy_update_ui(ui_lang: str):
323
  """Update UI language for the spaCy tab."""
324
- # (Identical to German app)
325
- pass
326
 
327
  # ============================================================================
328
- # 4. GRAMMAR CHECKER LOGIC (LT + AtD)
329
  # ============================================================================
330
 
331
- # --- Globals for LanguageTool (Adapted for multi-language) ---
332
  LT_TOOL_INSTANCES: Dict[str, Optional[language_tool_python.LanguageTool]] = {}
333
  LT_TOOL_LOCK = threading.Lock()
334
 
@@ -338,7 +478,7 @@ def lt_get_language_tool(lang: str = 'en') -> Optional[language_tool_python.Lang
338
  if not LT_AVAILABLE:
339
  raise ImportError("language-tool-python library is not installed.")
340
 
341
- lang_code = 'en-US' if lang == 'en' else 'de-DE' # <-- ENGLISH DEFAULT
342
 
343
  if lang_code in LT_TOOL_INSTANCES:
344
  return LT_TOOL_INSTANCES[lang_code]
@@ -387,55 +527,6 @@ def lt_check_grammar(text: str, lang: str = 'en') -> List[Dict[str, Any]]:
387
  traceback.print_exc()
388
  return [{"error": f"An unexpected error occurred: {str(e)}"}]
389
 
390
- # --- After the Deadline (AtD) Logic ---
391
- def atd_get_service() -> Optional[AtD.AtD]:
392
- """ Thread-safe function to get AtD service. """
393
- global ATD_SERVICE
394
- if not ATD_AVAILABLE:
395
- raise ImportError("pyAtD library is not installed.")
396
- if ATD_SERVICE:
397
- return ATD_SERVICE
398
- with ATD_LOCK:
399
- if ATD_SERVICE:
400
- return ATD_SERVICE
401
- try:
402
- print("Initializing After the Deadline (AtD) service...")
403
- ATD_SERVICE = AtD.AtD()
404
- # Test call
405
- _ = ATD_SERVICE.check("this is a test")
406
- print("✓ AtD service initialized.")
407
- return ATD_SERVICE
408
- except Exception as e:
409
- print(f"✗ FAILED to initialize AtD service: {e}")
410
- return None
411
-
412
- def atd_check_grammar(text: str) -> List[Dict[str, Any]]:
413
- """ Checks text using After the Deadline. """
414
- try:
415
- service = atd_get_service()
416
- if not service:
417
- return [{"error": "AtD service failed to initialize."}]
418
- if not text or not text.strip():
419
- return [{"info": "No text provided to check."}]
420
-
421
- errors = service.check(text)
422
- error_list = []
423
- for error in errors:
424
- error_list.append({
425
- "message": error.description,
426
- "rule_id": error.type,
427
- "category": error.url,
428
- "incorrect_text": error.string,
429
- "replacements": error.suggestions,
430
- "offset": error.precontext_start,
431
- "length": len(error.string)
432
- })
433
- if not error_list:
434
- return [{"info": "No errors found!", "status": "perfect"}]
435
- return error_list
436
- except Exception as e:
437
- return [{"error": f"AtD check failed: {str(e)}"}]
438
-
439
  # ============================================================================
440
  # 5. WORDNET THESAURUS LOGIC (OEWN)
441
  # ============================================================================
@@ -534,7 +625,6 @@ def wordnet_worker_loop():
534
 
535
  def wordnet_process_word_lookup(word: str, wn_instance: wn.Wordnet) -> Dict[str, Any]:
536
  """ Process a single word lookup. Runs in the worker thread. """
537
- # (This function is identical to the German app)
538
  if not word or not word.strip():
539
  return {"info": "No word provided to check."}
540
  word = word.strip().lower()
@@ -583,7 +673,13 @@ def wordnet_start_worker():
583
  return
584
  wordnet_worker_thread = threading.Thread(target=wordnet_worker_loop, daemon=True, name="WordNetWorker")
585
  wordnet_worker_thread.start()
586
- # ... (identical timeout logic) ...
587
 
588
  # --- Public API (Adapted) ---
589
  def wordnet_get_thesaurus_info(word: str, lang: str = 'en') -> Dict[str, Any]:
@@ -807,7 +903,6 @@ def get_conceptnet_client() -> Optional[Client]:
807
  def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
808
  """
809
  Fetches relations from the cstr/conceptnet_normalized Gradio API.
810
- (Identical robust V2 parser from German app)
811
  """
812
  if not GRADIO_CLIENT_AVAILABLE:
813
  return {"error": "`gradio_client` library is not installed."}
@@ -818,7 +913,6 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
818
  word_lower = word.strip().lower()
819
  cache_key = (word_lower, language)
820
 
821
- # --- 1. Check Cache ---
822
  with CONCEPTNET_LOCK:
823
  if cache_key in CONCEPTNET_CACHE:
824
  log(f"ConceptNet: Found '{word_lower}' in cache.")
@@ -827,7 +921,6 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
827
  log(f"ConceptNet: Fetching '{word_lower}' from Gradio API...")
828
 
829
  try:
830
- # --- 2. Call Gradio API ---
831
  client = get_conceptnet_client()
832
  if not client:
833
  return {"error": "ConceptNet Gradio Client is not available."}
@@ -841,9 +934,7 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
841
  api_name="/get_semantic_profile"
842
  )
843
 
844
- # --- 3. Parse the Markdown Result (Robustly) ---
845
  relations_list = []
846
- # ... (Identical parsing logic from German app) ...
847
  if not isinstance(result_markdown, str):
848
  raise TypeError(f"ConceptNet API returned type {type(result_markdown)}, expected str.")
849
 
@@ -877,7 +968,7 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
877
  elif node2.lower() == word_lower and node1.lower() != word_lower:
878
  other_node, direction = node1, "<-"
879
  else:
880
- continue # Skip self-references
881
 
882
  relations_list.append({
883
  "relation": relation, "direction": direction, "other_node": other_node,
@@ -887,7 +978,6 @@ def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
887
  except Exception as e:
888
  log(f"ConceptNet Parser: Error parsing line '{line}': {e}")
889
 
890
- # --- 4. Finalize and Cache Result ---
891
  if not relations_list:
892
  final_result = {"info": f"No valid relations found for '{word_lower}'."}
893
  else:
@@ -910,7 +1000,7 @@ def openblp_get_relations(lemma: str) -> List[Dict[str, Any]]:
910
  Stub function to query OpenBLP.
911
  Replace this with your actual OpenBLP database/API query.
912
  """
913
- # --- !! Implement your OpenBLP query logic here !! ---
914
  if lemma == "dog":
915
  return [
916
  {"relation": "HasProperty", "other_node": "loyal", "weight": 0.9, "source": "openblp"},
@@ -962,10 +1052,7 @@ def _hanta_pos_to_key(hanta_pos: str) -> Optional[str]:
962
  return None
963
 
964
  def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
965
- """
966
- (FALLBACK ENGINE 1) Analyzes a single word using HanTa (EN).
967
- This function MUST return the standard JSON structure.
968
- """
969
  if not HANTA_AVAILABLE: return {}
970
  print(f"\n[Word Encyclopedia] Running HanTa (EN) fallback for: \"{word}\"")
971
  final_result = {"input_word": word, "analysis": {}}
@@ -974,8 +1061,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
974
  tagger = hanta_get_tagger_en()
975
  if not tagger: return {}
976
 
977
- # HanTa 'tag_word' gives all possibilities
978
- # e.g., [('VBG', 0.9), ('NN', 0.1)]
979
  possible_tags = tagger.tag_word(word.lower())
980
  possible_tags.extend(tagger.tag_word(word.capitalize()))
981
 
@@ -985,8 +1070,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
985
  pos_key = _hanta_pos_to_key(hanta_pos)
986
  if not pos_key: continue
987
 
988
- # Get the lemma for this specific POS analysis
989
- # HanTa's 'analyze' gives the single best lemma
990
  raw_analysis = tagger.analyze(word.lower() if pos_key != 'noun' else word.capitalize())
991
  lemma = raw_analysis[0] # The lemma
992
 
@@ -995,7 +1078,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
995
  processed_lemmas_pos.add((lemma, pos_key))
996
  log(f"--- Analyzing HanTa (EN) path: lemma='{lemma}', pos='{pos_key}' ---")
997
 
998
- # --- 1. Get Inflections (Pattern) ---
999
  pattern_block = {}
1000
  if PATTERN_EN_AVAILABLE:
1001
  if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(lemma)
@@ -1003,12 +1085,10 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
1003
  elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(lemma)
1004
  elif pos_key == "adverb": pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
1005
 
1006
- # --- 2. Build Semantics Block ---
1007
  semantics_block = _build_semantics_block_for_lemma(lemma, pos_key, top_n, 'en')
1008
 
1009
- # --- 3. Build Final Report Block ---
1010
  pos_entry_report = {
1011
- "hanta_analysis": { # <-- Key name preserved
1012
  "lemma": lemma,
1013
  "pos_tag": hanta_pos,
1014
  "analysis_string": str(raw_analysis),
@@ -1018,7 +1098,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
1018
  "semantics_combined": semantics_block
1019
  }
1020
 
1021
- # --- 4. VALIDATION FILTER ---
1022
  if word_appears_in_inflections_en(word, pattern_block, pos_key):
1023
  if pos_key not in final_result["analysis"]:
1024
  final_result["analysis"][pos_key] = []
@@ -1038,7 +1117,6 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
1038
  # ============================================================================
1039
  # 6d. WIKTIONARY DATABASE LOGIC (EN)
1040
  # ============================================================================
1041
- # (This assumes an English DB with the *exact same schema*)
1042
  def wiktionary_download_db() -> bool:
1043
  """ Downloads the English Wiktionary DB. """
1044
  global WIKTIONARY_AVAILABLE
@@ -1050,7 +1128,7 @@ def wiktionary_download_db() -> bool:
1050
  print(f"English Wiktionary DB not found. Downloading from '{WIKTIONARY_REPO_ID}'...")
1051
  try:
1052
  hf_hub_download(
1053
- repo_id=WIKTIONARY_REPO_ID, # <-- Uses English repo ID
1054
  filename=WIKTIONARY_DB_PATH,
1055
  repo_type="dataset",
1056
  local_dir=".",
@@ -1086,7 +1164,7 @@ def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
1086
  log("Creating new read-only connection to Wiktionary DB...")
1087
  db_uri = f"file:{WIKTIONARY_DB_PATH}?mode=ro"
1088
  conn = sqlite3.connect(db_uri, uri=True, check_same_thread=False)
1089
- conn.row_factory = sqlite3.Row # Makes results dict-like
1090
  _ = conn.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1").fetchone()
1091
  print("✓ Wiktionary DB connection successful.")
1092
  WIKTIONARY_CONN = conn
@@ -1109,7 +1187,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1109
  """ (REVISED FOR FULL DB V3) Fetches ALL data for a single entry_id. """
1110
  report = {}
1111
 
1112
- # 1. Get Base Entry Info
1113
  entry_data = conn.execute(
1114
  "SELECT word, title, redirect, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
1115
  ).fetchone()
@@ -1119,7 +1196,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1119
  report["entry_id"] = entry_id
1120
  report["lemma"] = entry_data["word"]
1121
 
1122
- # 2. Get Senses (with Glosses, Tags, Topics, and Examples)
1123
  senses_q = conn.execute(
1124
  """
1125
  SELECT
@@ -1136,7 +1212,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1136
  for sense_row in senses_q:
1137
  sense_dict = dict(sense_row)
1138
  sense_id = sense_dict["sense_id"]
1139
-
1140
  examples_q = conn.execute(
1141
  "SELECT text, ref FROM examples WHERE sense_id = ?", (sense_id,)
1142
  ).fetchall()
@@ -1144,7 +1219,6 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1144
  senses_list.append(sense_dict)
1145
  report["senses"] = senses_list
1146
 
1147
- # 3. Get Inflected Forms
1148
  forms_q = conn.execute(
1149
  """
1150
  SELECT f.form_text, f.sense_index,
@@ -1155,24 +1229,16 @@ def _wiktionary_build_report_for_entry(entry_id: int, conn: sqlite3.Connection)
1155
  ).fetchall()
1156
  report["forms"] = [dict(f) for f in forms_q]
1157
 
1158
- # ... (All other queries for sounds, synonyms, antonyms, etc. are IDENTICAL to the German app) ...
1159
-
1160
  return report
1161
 
1162
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
1163
- """
1164
- Finds all entries related to an English word.
1165
- """
1166
  log(f"Wiktionary (EN): Querying for '{word}'...")
1167
  found_entry_ids: Set[int] = set()
1168
 
1169
- # --- ENGLISH REPLACEMENT ---
1170
  lang_query = 'English'
1171
- # These titles are specific to the English Wiktionary dump
1172
  form_titles = ("Inflected form", "verb form", "noun form", "adjective form", "Comparative", "Superlative")
1173
- # --- END REPLACEMENT ---
1174
 
1175
- # 1. Check if the word is a lemma (base form)
1176
  lemma_q = conn.execute(
1177
  f"SELECT id, pos_title FROM entries WHERE word = ? AND lang = '{lang_query}'", (word,)
1178
  ).fetchall()
@@ -1202,7 +1268,6 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
1202
  except json.JSONDecodeError:
1203
  log(f"Wiktionary: Failed to parse form_of JSON: {form_of_json}")
1204
 
1205
- # 2. Check if the word is an inflected form (in the `forms` table)
1206
  form_q = conn.execute(
1207
  f"""
1208
  SELECT DISTINCT e.id
@@ -1231,7 +1296,6 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
1231
 
1232
  log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
1233
 
1234
- # 3. Build a full report for each unique entry
1235
  all_reports = []
1236
  for entry_id in found_entry_ids:
1237
  try:
@@ -1242,19 +1306,61 @@ def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Di
1242
 
1243
  return all_reports
1244
 
1245
  # ============================================================================
1246
  # 6e. SHARED SEMANTIC HELPER (OEWN + OpenBLP)
1247
  # ============================================================================
1248
 
1249
  def _get_wordnet_senses_by_pos(word: str, lang: str = 'en') -> Dict[str, List[Dict[str, Any]]]:
1250
- """
1251
- (Helper) Fetches WordNet (OEWN) senses for a word and groups them by POS.
1252
- """
1253
  senses_by_pos: Dict[str, List[Dict]] = {
1254
  "noun": [], "verb": [], "adjective": [], "adverb": []
1255
  }
1256
  if not WN_AVAILABLE:
1257
- # Fail-open strategy
1258
  return {"noun": [{"info": "WordNet unavailable"}], "verb": [{"info": "WordNet unavailable"}],
1259
  "adjective": [{"info": "WordNet unavailable"}], "adverb": [{"info": "WordNet unavailable"}]}
1260
 
@@ -1264,36 +1370,19 @@ def _get_wordnet_senses_by_pos(word: str, lang: str = 'en') -> Dict[str, List[Di
1264
  if "error" in sense: continue
1265
  pos_tag = sense.get("pos")
1266
 
1267
- if pos_tag == 'n':
1268
- senses_by_pos["noun"].append(sense)
1269
- elif pos_tag == 'v':
1270
- senses_by_pos["verb"].append(sense)
1271
- # --- ENGLISH WORDNET (OEWN) FIX ---
1272
- # 'a' is Adjective, 's' is Adjective Satellite
1273
- # 'r' is Adverb
1274
- elif pos_tag == 'a' or pos_tag == 's':
1275
- senses_by_pos["adjective"].append(sense)
1276
- elif pos_tag == 'r':
1277
- senses_by_pos["adverb"].append(sense)
1278
- # --- END OF FIX ---
1279
-
1280
  except Exception as e:
1281
  log(f"WordNet helper check failed for '{word}': {e}")
1282
 
1283
- log(f"WordNet (EN) senses for '{word}': "
1284
- f"{len(senses_by_pos['noun'])}N, "
1285
- f"{len(senses_by_pos['verb'])}V, "
1286
- f"{len(senses_by_pos['adjective'])}Adj, "
1287
- f"{len(senses_by_pos['adverb'])}Adv")
1288
  return senses_by_pos
1289
 
1290
  def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
1291
- """
1292
- (REUSABLE HELPER) Fetches OEWN, ConceptNet, and OpenBLP data.
1293
- """
1294
  log(f"[DEBUG] Building semantics for lemma='{lemma}', pos='{pos_key}', lang='{lang}'")
1295
 
1296
- # 1. Get OEWN senses
1297
  oewn_senses = []
1298
  if WN_AVAILABLE:
1299
  try:
@@ -1304,7 +1393,6 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang:
1304
  except Exception as e:
1305
  log(f"[DEBUG] OEWN lookup failed for {lemma} ({pos_key}): {e}")
1306
 
1307
- # 2. Get ConceptNet relations
1308
  conceptnet_relations = []
1309
  if REQUESTS_AVAILABLE:
1310
  try:
@@ -1313,14 +1401,12 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang:
1313
  except Exception as e:
1314
  conceptnet_relations = [{"error": str(e)}]
1315
 
1316
- # 3. Get OpenBLP relations
1317
  openblp_relations = []
1318
  try:
1319
  openblp_relations = openblp_get_relations(lemma)
1320
  except Exception as e:
1321
  openblp_relations = [{"error": f"OpenBLP stub failed: {e}"}]
1322
 
1323
- # 4. Apply top_n limit
1324
  if top_n > 0:
1325
  oewn_senses = oewn_senses[:top_n]
1326
  conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
@@ -1328,13 +1414,12 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang:
1328
  openblp_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
1329
  openblp_relations = openblp_relations[:top_n]
1330
 
1331
- # --- MUST MATCH GERMAN JSON STRUCTURE ---
1332
  return {
1333
  "lemma": lemma,
1334
- "wiktionary_senses": [], # This block is for non-Wiktionary engines
1335
- "odenet_senses": oewn_senses, # <-- Key name is preserved
1336
  "conceptnet_relations": conceptnet_relations,
1337
- "openblp_relations": openblp_relations, # <-- NEW KEY
1338
  "wiktionary_synonyms": [],
1339
  "wiktionary_antonyms": []
1340
  }
@@ -1346,16 +1431,12 @@ def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang:
1346
 
1347
  # --- PRIMARY ENGINE: WIKTIONARY (EN) ---
1348
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1349
- """
1350
- (PRIMARY ENGINE) Analyzes an English word using the Wiktionary DB.
1351
- """
1352
  final_result: Dict[str, Any] = {"input_word": word, "analysis": {}}
1353
 
1354
  conn = wiktionary_get_connection()
1355
- if not conn:
1356
- return {} # Signal failure
1357
 
1358
- # --- 1. GET SPACY HINT ---
1359
  spacy_pos_hint, spacy_lemma_hint = None, None
1360
  try:
1361
  nlp_en = spacy_load_spacy_model("en_core_web_md")
@@ -1364,20 +1445,16 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1364
  token = doc[0]
1365
  spacy_pos_hint = token.pos_.lower()
1366
  spacy_lemma_hint = token.lemma_
1367
- log(f"[DEBUG] Wiktionary (EN) Hint: spaCy POS is '{spacy_pos_hint}', lemma is '{spacy_lemma_hint}'")
1368
  except Exception as e:
1369
  log(f"[DEBUG] Wiktionary (EN) Hint: spaCy failed: {e}")
1370
 
1371
- # --- 2. FIND ALL WIKTIONARY ENTRIES ---
1372
  try:
1373
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
1374
  except Exception as e:
1375
  log(f"[DEBUG] Wiktionary (EN) query failed: {e}")
1376
- return {} # Signal failure
1377
- if not wiktionary_reports:
1378
- return {} # No results, signal to fallback
1379
 
1380
- # --- 3. PRIORITIZE/SORT THE WIKTIONARY ENTRIES ---
1381
  def get_priority_score(report):
1382
  wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
1383
  wikt_lemma = report.get("lemma")
@@ -1388,25 +1465,21 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1388
  return 4
1389
  wiktionary_reports.sort(key=get_priority_score)
1390
 
1391
- # --- 4. BUILD AND VALIDATE THE FINAL REPORT ---
1392
  word_lower = word.lower()
1393
  for wikt_report in wiktionary_reports:
1394
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1395
  lemma = wikt_report.get("lemma", word)
1396
  pos_title = wikt_report.get("pos_title", "")
1397
 
1398
- # --- A. Build Wiktionary Inflection Block ---
1399
  inflections_wikt_block = {
1400
  "base_form": lemma,
1401
  "forms_list": wikt_report.get("forms", []),
1402
  "source": "wiktionary"
1403
  }
1404
 
1405
- # --- B. Build Pattern Inflection Block (using pattern.en) ---
1406
  pattern_block = {}
1407
  if PATTERN_EN_AVAILABLE:
1408
  try:
1409
- # Use input 'word' for inflected forms to find right lemma
1410
  use_word = word if "form" in pos_title.lower() else lemma
1411
  if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
1412
  elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
@@ -1415,10 +1488,8 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1415
  except Exception as e:
1416
  pattern_block = {"error": f"Pattern.en analysis failed: {e}"}
1417
 
1418
- # --- C. Build Semantics Block ---
1419
  semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
1420
 
1421
- # --- D. Assemble the report (pre-validation) ---
1422
  pos_entry_report = {
1423
  "inflections_wiktionary": inflections_wikt_block,
1424
  "inflections_pattern": pattern_block,
@@ -1427,27 +1498,21 @@ def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1427
  "pos_title": pos_title,
1428
  "etymology": wikt_report.get("etymology_text"),
1429
  "pronunciation": wikt_report.get("sounds"),
1430
- # ... (all other metadata fields) ...
1431
  }
1432
  }
1433
 
1434
- # --- E. VALIDATION FILTER ---
1435
  is_valid = False
1436
  is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
1437
 
1438
- if lemma.lower() == word_lower:
1439
- is_valid = True
1440
- log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word matches entry lemma.")
1441
 
1442
  if not is_valid and not is_inflected_entry:
1443
  for form_entry in inflections_wikt_block.get("forms_list", []):
1444
  form_text = form_entry.get("form_text", "").strip()
1445
  if form_text.lower() == word_lower:
1446
  is_valid = True
1447
- log(f"[DEBUG] Wiktionary: KEEPING entry '{lemma}' ({pos_key}) because input word found in form: '{form_text}'")
1448
  break
1449
 
1450
- # --- F. Add to final result if valid ---
1451
  if is_valid:
1452
  if pos_key not in final_result["analysis"]:
1453
  final_result["analysis"][pos_key] = []
@@ -1998,9 +2063,9 @@ def create_spacy_tab():
1998
  html_dep_out, df_out, json_out, html_ner_out])
1999
 
2000
  def create_languagetool_tab():
2001
- """Creates the UI for the Grammar Checker tab with LT and AtD."""
2002
  gr.Markdown("# 🇬🇧 English Grammar & Spelling Checker")
2003
- gr.Markdown("Powered by `LanguageTool` and `After the Deadline (AtD)`.")
2004
 
2005
  with gr.Row():
2006
  text_input = gr.Textbox(
@@ -2009,33 +2074,20 @@ def create_languagetool_tab():
2009
  lines=5,
2010
  scale=3
2011
  )
2012
- checker_choice = gr.Radio(
2013
- label="Checker Engine",
2014
- choices=["LanguageTool", "After the Deadline"],
2015
- value="LanguageTool",
2016
- scale=1
2017
- )
2018
 
2019
  check_button = gr.Button("Check Text", variant="primary")
2020
  output = gr.JSON(label="Detected Errors (JSON)")
2021
 
2022
- def dispatch_grammar_check(text, choice):
2023
- if choice == "LanguageTool":
2024
- return lt_check_grammar(text, 'en')
2025
- elif choice == "After the Deadline":
2026
- return atd_check_grammar(text)
2027
- return [{"error": "Invalid checker selected."}]
2028
-
2029
  check_button.click(
2030
- fn=dispatch_grammar_check,
2031
- inputs=[text_input, checker_choice],
2032
  outputs=[output],
2033
  api_name="check_grammar"
2034
  )
2035
  gr.Examples(
2036
- [["This is a houze.", "LanguageTool"], ["I seen the man.", "LanguageTool"],
2037
- ["The cat sleep on the table.", "After the Deadline"], ["He asks if he can go.", "LanguageTool"]],
2038
- inputs=[text_input, checker_choice], outputs=[output], fn=dispatch_grammar_check,
2039
  cache_examples=False
2040
  )
2041
 
@@ -2422,17 +2474,6 @@ if __name__ == "__main__":
2422
  print("INFO: NLTK library not available, skipping lemmatizer.")
2423
  print("--- NLTK Done ---\n")
2424
 
2425
- # --- 7. Check AtD Service ---
2426
- print("--- Initializing AtD Service ---")
2427
- if ATD_AVAILABLE:
2428
- try:
2429
- atd_get_service()
2430
- except Exception as e:
2431
- print(f"✗ FAILED to start AtD: {e}")
2432
- else:
2433
- print("INFO: AtD library not available, skipping service.")
2434
- print("--- AtD Done ---\n")
2435
-
2436
  # --- 8. Check Pattern.en ---
2437
  print("--- Checking Pattern.en ---")
2438
  if not PATTERN_EN_AVAILABLE:
 
2
  # ENGLISH LINGUISTICS HUB (CONSOLIDATED APP V23-EN)
3
  #
4
  # This script adapts the German Linguistics Hub for English analysis,
5
+ # adding NLTK, Stanza, TextBlob, HanTa(EN), OEWN, and OpenBLP.
6
  # It maintains the exact same JSON output structure as the German app.
7
  #
8
  # ============================================================================
 
76
  LT_AVAILABLE = False
77
  print("CRITICAL WARNING: `language-tool-python` library not found.")
78
 
79
  # --- WordNet (wn) Import (for OEWN) ---
80
  try:
81
  import wn
 
185
  NLTK_LEMMATIZER: Optional[WordNetLemmatizer] = None
186
  NLTK_LEMMATIZER_LOCK = threading.Lock()
187
 
188
  # --- Helper ---
189
  def _html_wrap(content: str, line_height: str = "2.0") -> str:
190
  return f'<div style="overflow-x:auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; line-height: {line_height};">{content}</div>'
 
213
  "de": ("German", "de_core_news_md", "spacy"),
214
  "es": ("Spanish", "es_core_news_md", "spacy"),
215
  "grc-proiel-trf": ("Ancient Greek (PROIEL TRF)", "grc_proiel_trf", "grecy"),
216
+ "grc-perseus-trf": ("Ancient Greek (Perseus TRF)", "grc_perseus_trf", "grecy"),
217
+ "grc_ner_trf": ("Ancient Greek (NER TRF)", "grc_ner_trf", "grecy"),
218
+ "grc-proiel-lg": ("Ancient Greek (PROIEL LG)", "grc_proiel_lg", "grecy"),
219
+ "grc-perseus-lg": ("Ancient Greek (Perseus LG)", "grc_perseus_lg", "grecy"),
220
+ "grc-proiel-sm": ("Ancient Greek (PROIEL SM)", "grc_proiel_sm", "grecy"),
221
+ "grc-perseus-sm": ("Ancient Greek (Perseus SM)", "grc_perseus_sm", "grecy"),
222
  }
223
  SPACY_UI_TEXT = {
224
+ "de": {
225
+ "title": "# 🔍 Mehrsprachiger Morpho-Syntaktischer Analysator",
226
+ "subtitle": "Analysieren Sie Texte auf Deutsch, Englisch, Spanisch und Altgriechisch",
227
+ "ui_lang_label": "Benutzeroberflächensprache",
228
+ "model_lang_label": "Textsprache für Analyse",
229
+ "input_label": "Text eingeben",
230
+ "input_placeholder": "Geben Sie hier Ihren Text ein...",
231
+ "button_text": "Text analysieren",
232
+ "button_processing_text": "Verarbeitung läuft...",
233
+ "tab_graphic": "Grafische Darstellung",
234
+ "tab_table": "Tabelle",
235
+ "tab_json": "JSON",
236
+ "tab_ner": "Entitäten",
237
+ "html_label": "Abhängigkeitsparsing",
238
+ "table_label": "Morphologische Analyse",
239
+ "table_headers": ["Wort", "Lemma", "POS", "Tag", "Morphologie", "Abhängigkeit"],
240
+ "json_label": "JSON-Ausgabe",
241
+ "ner_label": "Benannte Entitäten",
242
+ "error_message": "Fehler: "
243
+ },
244
+ "en": {
245
+ "title": "# 🔍 Multilingual Morpho-Syntactic Analyzer",
246
+ "subtitle": "Analyze texts in German, English, Spanish, and Ancient Greek",
247
+ "ui_lang_label": "Interface Language",
248
+ "model_lang_label": "Text Language for Analysis",
249
+ "input_label": "Enter Text",
250
+ "input_placeholder": "Enter your text here...",
251
+ "button_text": "Analyze Text",
252
+ "button_processing_text": "Processing...",
253
+ "tab_graphic": "Graphic View",
254
+ "tab_table": "Table",
255
+ "tab_json": "JSON",
256
+ "tab_ner": "Entities",
257
+ "html_label": "Dependency Parsing",
258
+ "table_label": "Morphological Analysis",
259
+ "table_headers": ["Word", "Lemma", "POS", "Tag", "Morphology", "Dependency"],
260
+ "json_label": "JSON Output",
261
+ "ner_label": "Named Entities",
262
+ "error_message": "Error: "
263
+ },
264
+ "es": {
265
+ "title": "# 🔍 Analizador Morfo-Sintáctico Multilingüe",
266
+ "subtitle": "Analice textos en alemán, inglés, español y griego antiguo",
267
+ "ui_lang_label": "Idioma de la Interfaz",
268
+ "model_lang_label": "Idioma del Texto para Análisis",
269
+ "input_label": "Introducir Texto",
270
+ "input_placeholder": "Ingrese su texto aquí...",
271
+ "button_text": "Analizar Texto",
272
+ "button_processing_text": "Procesando...",
273
+ "tab_graphic": "Vista Gráfica",
274
+ "tab_table": "Tabla",
275
+ "tab_json": "JSON",
276
+ "tab_ner": "Entidades",
277
+ "html_label": "Análisis de Dependencias",
278
+ "table_label": "Análisis Morfológico",
279
+ "table_headers": ["Palabra", "Lema", "POS", "Etiqueta", "Morfología", "Dependencia"],
280
+ "json_label": "Salida JSON",
281
+ "ner_label": "Entidades Nombradas",
282
+ "error_message": "Error: "
283
+ }
284
  }
285
  SPACY_MODELS: Dict[str, Optional[spacy.Language]] = {}
286
 
287
  # --- Dependency Installation & Model Loading ---
 
288
  def spacy_install_spacy_transformers_once():
289
  """ Installs spacy-transformers, required for all _trf models. """
290
  marker_file = Path(".spacy_transformers_installed")
 
303
  return False
304
 
305
  def spacy_install_grecy_model_from_github(model_name: str) -> bool:
306
+ """ Installs a greCy model from GitHub Release. """
307
+ marker_file = Path(f".{model_name}_installed")
308
+ if marker_file.exists():
309
+ print(f"✓ {model_name} already installed (marker found)")
310
+ return True
311
+ print(f"Installing grecy model: {model_name}...")
312
+ if model_name == "grc_proiel_trf":
313
+ wheel_filename = "grc_proiel_trf-3.7.5-py3-none-any.whl"
314
+ elif model_name in ["grc_perseus_trf", "grc_proiel_lg", "grc_perseus_lg",
315
+ "grc_proiel_sm", "grc_perseus_sm", "grc_ner_trf"]:
316
+ wheel_filename = f"{model_name}-0.0.0-py3-none-any.whl"
317
+ else:
318
+ print(f"✗ Unknown grecy model: {model_name}")
319
+ return False
320
+ install_url = f"https://github.com/CrispStrobe/greCy/releases/download/v1.0-models/{wheel_filename}"
321
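+ # --no-deps: install only the model wheel itself; spaCy is already present in this environment.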
+ cmd = [sys.executable, "-m", "pip", "install", install_url, "--no-deps"]
322
+ print(f"Running: {' '.join(cmd)}")
323
+ try:
324
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=900)
325
+ if result.stdout: print("STDOUT:", result.stdout)
326
+ if result.stderr: print("STDERR:", result.stderr)
327
+ print(f"✓ Successfully installed {model_name} from GitHub")
328
+ marker_file.touch()
329
+ return True
330
+ except subprocess.CalledProcessError as e:
331
+ print(f"✗ Installation subprocess FAILED with code {e.returncode}")
332
+ print("STDOUT:", e.stdout)
333
+ print("STDERR:", e.stderr)
334
+ return False
335
+ except Exception as e:
336
+ print(f"✗ Installation exception: {e}")
337
+ traceback.print_exc()
338
+ return False
339
+
340
  def spacy_load_spacy_model(model_name: str) -> Optional[spacy.Language]:
341
  """Load or install a standard spaCy model."""
342
  try:
 
351
  return None
352
 
353
  def spacy_load_grecy_model(model_name: str) -> Optional[spacy.Language]:
354
+ """ Load a grecy model, installing from GitHub if needed. """
355
+ if not spacy_install_grecy_model_from_github(model_name):
356
+ print(f"✗ Cannot load {model_name} because installation failed.")
357
+ return None
358
+ try:
359
+ print("Refreshing importlib to find new package...")
360
+ importlib.invalidate_caches()
361
+ try: importlib.reload(site)
362
+ except Exception: pass
363
+ print(f"Trying: spacy.load('{model_name}')")
364
+ nlp = spacy.load(model_name)
365
+ print(f"✓ Successfully loaded {model_name}")
366
+ return nlp
367
+ except Exception as e:
368
+ print(f"✗ Model {model_name} is installed but FAILED to load.")
369
+ print(f" Error: {e}")
370
+ traceback.print_exc()
371
+ return None
372
 
373
  def spacy_initialize_models():
374
  """ Pre-load standard models and ensure _trf dependencies are ready. """
 
398
 
399
  def spacy_get_analysis(ui_lang: str, model_lang_key: str, text: str):
400
  """Analyze text and return results."""
 
401
  ui_config = SPACY_UI_TEXT.get(ui_lang.lower(), SPACY_UI_TEXT["en"])
402
+ error_prefix = ui_config.get("error_message", "Error: ")
403
  try:
404
+ if not text.strip():
405
+ return ([], [], "<p style='color: orange;'>No text provided.</p>", "<p>No text provided.</p>",
406
+ gr.Button(value=ui_config.get("button_text", "Analyze"), interactive=True))
407
+
408
  nlp = SPACY_MODELS.get(model_lang_key)
409
+ if nlp is None:
410
+ # Try loading one last time
411
+ if model_lang_key in SPACY_MODEL_INFO:
412
+ _, model_name, model_type = SPACY_MODEL_INFO[model_lang_key]
413
+ if model_type == 'grecy': nlp = spacy_load_grecy_model(model_name)
414
+ else: nlp = spacy_load_spacy_model(model_name)
415
+ SPACY_MODELS[model_lang_key] = nlp
416
+
417
+ if nlp is None:
418
+ return ([], {"error": "Model load failed"}, "Error", "Error", gr.Button(interactive=True))
419
+
420
  doc = nlp(text)
421
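+ # Build the table rows and the JSON records in a single pass over the tokens.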
+ dataframe_output = []
422
+ json_output = []
423
+ for token in doc:
424
+ lemma_str = token.lemma_
425
+ morph_str = str(token.morph) if token.morph else ''
426
+ dep_str = token.dep_ if doc.is_parsed else ''
427
+ tag_str = token.tag_ or ''
428
+ pos_str = token.pos_ or ''
429
+ json_output.append({
430
+ "word": token.text, "lemma": lemma_str, "pos": pos_str,
431
+ "tag": tag_str, "morphology": morph_str, "dependency": dep_str,
432
+ "is_stopword": token.is_stop
433
+ })
434
+ dataframe_output.append([token.text, lemma_str, pos_str, tag_str, morph_str, dep_str])
435
+
436
+ html_dep_out = ""
437
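+ # Only attempt the dependency SVG when the pipeline actually includes a parser.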
+ if "parser" in nlp.pipe_names and doc.is_parsed:
438
+ try:
439
+ options = {"compact": True, "bg": "#ffffff", "color": "#000000", "font": "Source Sans Pro"}
440
+ html_svg = displacy.render(doc, style="dep", jupyter=False, options=options)
441
+ html_dep_out = _html_wrap(html_svg, line_height="2.5")
442
+ except Exception as e:
443
+ html_dep_out = f"<p>Visualization error: {e}</p>"
444
+
445
+ html_ner_out = ""
446
+ if "ner" in nlp.pipe_names:
447
+ if doc.ents:
448
+ try:
449
+ html_ner = displacy.render(doc, style="ent", jupyter=False)
450
+ html_ner_out = _html_wrap(html_ner, line_height="2.5")
451
+ except Exception: html_ner_out = "<p>Error rendering NER</p>"
452
+ else: html_ner_out = "<p>No entities found.</p>"
453
+
454
+ return (dataframe_output, json_output, html_dep_out, html_ner_out,
455
+ gr.Button(value=ui_config.get("button_text", "Analyze"), interactive=True))
456
  except Exception as e:
457
  traceback.print_exc()
458
+ error_html = f"<div style='color: red;'>{error_prefix} {str(e)}</div>"
459
+ return ([], {"error": str(e)}, error_html, error_html, gr.Button(interactive=True))
460
 
461
 
462
  def spacy_update_ui(ui_lang: str):
463
  """Update UI language for the spaCy tab."""
464
+ # Placeholder - actual implementation would update labels
465
+ return [gr.update()] * 14
466
 
467
  # ============================================================================
468
+ # 4. GRAMMAR CHECKER LOGIC (LanguageTool Only)
469
  # ============================================================================
470
 
471
+ # --- Globals for LanguageTool ---
472
  LT_TOOL_INSTANCES: Dict[str, Optional[language_tool_python.LanguageTool]] = {}
473
  LT_TOOL_LOCK = threading.Lock()
474
 
 
478
  if not LT_AVAILABLE:
479
  raise ImportError("language-tool-python library is not installed.")
480
 
481
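+ # Default English requests to en-US; anything else falls back to de-DE.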
+ lang_code = 'en-US' if lang == 'en' else 'de-DE'
482
 
483
  if lang_code in LT_TOOL_INSTANCES:
484
  return LT_TOOL_INSTANCES[lang_code]
 
527
  traceback.print_exc()
528
  return [{"error": f"An unexpected error occurred: {str(e)}"}]
529
 
530
  # ============================================================================
531
  # 5. WORDNET THESAURUS LOGIC (OEWN)
532
  # ============================================================================
 
625
 
626
  def wordnet_process_word_lookup(word: str, wn_instance: wn.Wordnet) -> Dict[str, Any]:
627
  """ Process a single word lookup. Runs in the worker thread. """
 
628
  if not word or not word.strip():
629
  return {"info": "No word provided to check."}
630
  word = word.strip().lower()
 
673
  return
674
  wordnet_worker_thread = threading.Thread(target=wordnet_worker_loop, daemon=True, name="WordNetWorker")
675
  wordnet_worker_thread.start()
676
+ timeout = 30
677
+ for _ in range(timeout * 10):
678
+ if wordnet_worker_state in (WordNetWorkerState.READY, WordNetWorkerState.ERROR):
679
+ break
680
+ threading.Event().wait(0.1)
681
+ if wordnet_worker_state != WordNetWorkerState.READY:
682
+ raise Exception("OdeNet Worker failed to initialize")
683
 
684
  # --- Public API (Adapted) ---
685
  def wordnet_get_thesaurus_info(word: str, lang: str = 'en') -> Dict[str, Any]:
 
903
  def conceptnet_get_relations(word: str, language: str = 'en') -> Dict[str, Any]:
904
  """
905
  Fetches relations from the cstr/conceptnet_normalized Gradio API.
 
906
  """
907
  if not GRADIO_CLIENT_AVAILABLE:
908
  return {"error": "`gradio_client` library is not installed."}
 
913
  word_lower = word.strip().lower()
914
  cache_key = (word_lower, language)
915
 
 
916
  with CONCEPTNET_LOCK:
917
  if cache_key in CONCEPTNET_CACHE:
918
  log(f"ConceptNet: Found '{word_lower}' in cache.")
 
921
  log(f"ConceptNet: Fetching '{word_lower}' from Gradio API...")
922
 
923
  try:
 
924
  client = get_conceptnet_client()
925
  if not client:
926
  return {"error": "ConceptNet Gradio Client is not available."}
 
934
  api_name="/get_semantic_profile"
935
  )
936
 
 
937
  relations_list = []
 
938
  if not isinstance(result_markdown, str):
939
  raise TypeError(f"ConceptNet API returned type {type(result_markdown)}, expected str.")
940
 
 
968
  elif node2.lower() == word_lower and node1.lower() != word_lower:
969
  other_node, direction = node1, "<-"
970
  else:
971
+ continue
972
 
973
  relations_list.append({
974
  "relation": relation, "direction": direction, "other_node": other_node,
 
978
  except Exception as e:
979
  log(f"ConceptNet Parser: Error parsing line '{line}': {e}")
980
 
 
981
  if not relations_list:
982
  final_result = {"info": f"No valid relations found for '{word_lower}'."}
983
  else:
 
1000
  Stub function to query OpenBLP.
1001
  Replace this with your actual OpenBLP database/API query.
1002
  """
1003
+ # Placeholder logic
1004
  if lemma == "dog":
1005
  return [
1006
  {"relation": "HasProperty", "other_node": "loyal", "weight": 0.9, "source": "openblp"},
 
1052
  return None
1053
 
1054
  def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
1055
+ """ (FALLBACK ENGINE 1) Analyzes a single word using HanTa (EN). """
1056
  if not HANTA_AVAILABLE: return {}
1057
  print(f"\n[Word Encyclopedia] Running HanTa (EN) fallback for: \"{word}\"")
1058
  final_result = {"input_word": word, "analysis": {}}
 
1061
  tagger = hanta_get_tagger_en()
1062
  if not tagger: return {}
1063
 
 
 
1064
  possible_tags = tagger.tag_word(word.lower())
1065
  possible_tags.extend(tagger.tag_word(word.capitalize()))
1066
 
 
1070
  pos_key = _hanta_pos_to_key(hanta_pos)
1071
  if not pos_key: continue
1072
 
 
 
1073
  raw_analysis = tagger.analyze(word.lower() if pos_key != 'noun' else word.capitalize())
1074
  lemma = raw_analysis[0] # The lemma
1075
 
 
1078
  processed_lemmas_pos.add((lemma, pos_key))
1079
  log(f"--- Analyzing HanTa (EN) path: lemma='{lemma}', pos='{pos_key}' ---")
1080
 
 
1081
  pattern_block = {}
1082
  if PATTERN_EN_AVAILABLE:
1083
  if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(lemma)
 
1085
  elif pos_key == "adjective": pattern_block = pattern_analyze_as_adjective_en(lemma)
1086
  elif pos_key == "adverb": pattern_block = {"base_form": lemma, "info": "Adverbs are non-inflecting."}
1087
 
 
1088
  semantics_block = _build_semantics_block_for_lemma(lemma, pos_key, top_n, 'en')
1089
 
 
1090
  pos_entry_report = {
1091
+ "hanta_analysis": {
1092
  "lemma": lemma,
1093
  "pos_tag": hanta_pos,
1094
  "analysis_string": str(raw_analysis),
 
1098
  "semantics_combined": semantics_block
1099
  }
1100
 
 
1101
  if word_appears_in_inflections_en(word, pattern_block, pos_key):
1102
  if pos_key not in final_result["analysis"]:
1103
  final_result["analysis"][pos_key] = []
 
1117
  # ============================================================================
1118
  # 6d. WIKTIONARY DATABASE LOGIC (EN)
1119
  # ============================================================================
 
1120
  def wiktionary_download_db() -> bool:
1121
  """ Downloads the English Wiktionary DB. """
1122
  global WIKTIONARY_AVAILABLE
 
1128
  print(f"English Wiktionary DB not found. Downloading from '{WIKTIONARY_REPO_ID}'...")
1129
  try:
1130
  hf_hub_download(
1131
+ repo_id=WIKTIONARY_REPO_ID,
1132
  filename=WIKTIONARY_DB_PATH,
1133
  repo_type="dataset",
1134
  local_dir=".",
 
1164
  log("Creating new read-only connection to Wiktionary DB...")
1165
  db_uri = f"file:{WIKTIONARY_DB_PATH}?mode=ro"
1166
  conn = sqlite3.connect(db_uri, uri=True, check_same_thread=False)
1167
+ conn.row_factory = sqlite3.Row
1168
  _ = conn.execute("SELECT name FROM sqlite_master WHERE type='table' LIMIT 1").fetchone()
1169
  print("✓ Wiktionary DB connection successful.")
1170
  WIKTIONARY_CONN = conn
 
1187
  """ (REVISED FOR FULL DB V3) Fetches ALL data for a single entry_id. """
1188
  report = {}
1189
 
 
1190
  entry_data = conn.execute(
1191
  "SELECT word, title, redirect, pos, pos_title, lang, etymology_text FROM entries WHERE id = ?", (entry_id,)
1192
  ).fetchone()
 
1196
  report["entry_id"] = entry_id
1197
  report["lemma"] = entry_data["word"]
1198
 
 
1199
  senses_q = conn.execute(
1200
  """
1201
  SELECT
 
1212
  for sense_row in senses_q:
1213
  sense_dict = dict(sense_row)
1214
  sense_id = sense_dict["sense_id"]
 
1215
  examples_q = conn.execute(
1216
  "SELECT text, ref FROM examples WHERE sense_id = ?", (sense_id,)
1217
  ).fetchall()
 
1219
  senses_list.append(sense_dict)
1220
  report["senses"] = senses_list
1221
 
 
1222
  forms_q = conn.execute(
1223
  """
1224
  SELECT f.form_text, f.sense_index,
 
1229
  ).fetchall()
1230
  report["forms"] = [dict(f) for f in forms_q]
1231
 
 
 
1232
  return report
1233
 
1234
  def _wiktionary_find_all_entries(word: str, conn: sqlite3.Connection) -> List[Dict[str, Any]]:
1235
+ """ Finds all entries related to an English word. """
 
 
1236
  log(f"Wiktionary (EN): Querying for '{word}'...")
1237
  found_entry_ids: Set[int] = set()
1238
 
 
1239
  lang_query = 'English'
 
1240
  form_titles = ("Inflected form", "verb form", "noun form", "adjective form", "Comparative", "Superlative")
 
1241
 
 
1242
  lemma_q = conn.execute(
1243
  f"SELECT id, pos_title FROM entries WHERE word = ? AND lang = '{lang_query}'", (word,)
1244
  ).fetchall()
 
1268
  except json.JSONDecodeError:
1269
  log(f"Wiktionary: Failed to parse form_of JSON: {form_of_json}")
1270
 
 
1271
  form_q = conn.execute(
1272
  f"""
1273
  SELECT DISTINCT e.id
 
1296
 
1297
  log(f"Wiktionary: Found {len(found_entry_ids)} unique matching entries.")
1298
 
 
1299
  all_reports = []
1300
  for entry_id in found_entry_ids:
1301
  try:
 
1306
 
1307
  return all_reports
1308
 
1309
+ def _wiktionary_format_semantics_block(wikt_report: Dict[str, Any], pattern_block: Dict[str, Any], top_n: int) -> Dict[str, Any]:
1310
+ """ Combines English Wiktionary senses with OEWN/ConceptNet. """
1311
+ pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1312
+ semantic_lemma = wikt_report.get("lemma")
1313
+
1314
+ wiktionary_senses = []
1315
+ for sense in wikt_report.get("senses", []):
1316
+ wiktionary_senses.append({
1317
+ "definition": sense.get("glosses"),
1318
+ "source": "wiktionary"
1319
+ })
1320
+
1321
+ oewn_senses = []
1322
+ if WN_AVAILABLE:
1323
+ try:
1324
+ senses_by_pos = _get_wordnet_senses_by_pos(semantic_lemma, 'en')
1325
+ oewn_senses_raw = senses_by_pos.get(pos_key, [])
1326
+ if oewn_senses_raw and "info" not in oewn_senses_raw[0]:
1327
+ oewn_senses = oewn_senses_raw
1328
+ except Exception as e:
1329
+ log(f"[DEBUG] OEWN lookup failed for {semantic_lemma} ({pos_key}): {e}")
1330
+
1331
+ conceptnet_relations = []
1332
+ if REQUESTS_AVAILABLE:
1333
+ try:
1334
+ conceptnet_result = conceptnet_get_relations(semantic_lemma, language='en')
1335
+ conceptnet_relations = conceptnet_result.get("relations", [])
1336
+ except Exception: pass
1337
+
1338
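+ # A non-positive top_n disables truncation.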
+ if top_n > 0:
1339
+ wiktionary_senses = wiktionary_senses[:top_n]
1340
+ oewn_senses = oewn_senses[:top_n]
1341
+ conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
1342
+ conceptnet_relations = conceptnet_relations[:top_n]
1343
+
1344
+ return {
1345
+ "lemma": semantic_lemma,
1346
+ "wiktionary_senses": wiktionary_senses,
1347
+ "odenet_senses": oewn_senses, # Key name preserved
1348
+ "conceptnet_relations": conceptnet_relations,
1349
+ "wiktionary_synonyms": wikt_report.get("synonyms", []),
1350
+ "wiktionary_antonyms": wikt_report.get("antonyms", [])
1351
+ }
1352
+
1353
+
1354
  # ============================================================================
1355
  # 6e. SHARED SEMANTIC HELPER (OEWN + OpenBLP)
1356
  # ============================================================================
1357
 
1358
  def _get_wordnet_senses_by_pos(word: str, lang: str = 'en') -> Dict[str, List[Dict[str, Any]]]:
1359
+ """ (Helper) Fetches WordNet (OEWN) senses for a word and groups them by POS. """
 
 
1360
  senses_by_pos: Dict[str, List[Dict]] = {
1361
  "noun": [], "verb": [], "adjective": [], "adverb": []
1362
  }
1363
  if not WN_AVAILABLE:
 
1364
  return {"noun": [{"info": "WordNet unavailable"}], "verb": [{"info": "WordNet unavailable"}],
1365
  "adjective": [{"info": "WordNet unavailable"}], "adverb": [{"info": "WordNet unavailable"}]}
1366
 
 
1370
  if "error" in sense: continue
1371
  pos_tag = sense.get("pos")
1372
 
1373
+ if pos_tag == 'n': senses_by_pos["noun"].append(sense)
1374
+ elif pos_tag == 'v': senses_by_pos["verb"].append(sense)
1375
+ elif pos_tag == 'a' or pos_tag == 's': senses_by_pos["adjective"].append(sense)
1376
+ elif pos_tag == 'r': senses_by_pos["adverb"].append(sense)
1377
  except Exception as e:
1378
  log(f"WordNet helper check failed for '{word}': {e}")
1379
 
1380
  return senses_by_pos
1381
 
1382
  def _build_semantics_block_for_lemma(lemma: str, pos_key: str, top_n: int, lang: str = 'en') -> Dict[str, Any]:
1383
+ """ (REUSABLE HELPER) Fetches OEWN, ConceptNet, and OpenBLP data. """
 
 
1384
  log(f"[DEBUG] Building semantics for lemma='{lemma}', pos='{pos_key}', lang='{lang}'")
1385
 
 
1386
  oewn_senses = []
1387
  if WN_AVAILABLE:
1388
  try:
 
1393
  except Exception as e:
1394
  log(f"[DEBUG] OEWN lookup failed for {lemma} ({pos_key}): {e}")
1395
 
 
1396
  conceptnet_relations = []
1397
  if REQUESTS_AVAILABLE:
1398
  try:
 
1401
  except Exception as e:
1402
  conceptnet_relations = [{"error": str(e)}]
1403
 
 
1404
  openblp_relations = []
1405
  try:
1406
  openblp_relations = openblp_get_relations(lemma)
1407
  except Exception as e:
1408
  openblp_relations = [{"error": f"OpenBLP stub failed: {e}"}]
1409
 
 
1410
  if top_n > 0:
1411
  oewn_senses = oewn_senses[:top_n]
1412
  conceptnet_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
 
1414
  openblp_relations.sort(key=lambda x: x.get('weight', 0.0), reverse=True)
1415
  openblp_relations = openblp_relations[:top_n]
1416
 
 
1417
  return {
1418
  "lemma": lemma,
1419
+ "wiktionary_senses": [],
1420
+ "odenet_senses": oewn_senses,
1421
  "conceptnet_relations": conceptnet_relations,
1422
+ "openblp_relations": openblp_relations,
1423
  "wiktionary_synonyms": [],
1424
  "wiktionary_antonyms": []
1425
  }
 
1431
 
1432
  # --- PRIMARY ENGINE: WIKTIONARY (EN) ---
1433
  def _analyze_word_with_wiktionary(word: str, top_n: int) -> Dict[str, Any]:
1434
+ """ (PRIMARY ENGINE) Analyzes an English word using the Wiktionary DB. """
 
 
1435
  final_result: Dict[str, Any] = {"input_word": word, "analysis": {}}
1436
 
1437
  conn = wiktionary_get_connection()
1438
+ if not conn: return {}
 
1439
 
 
1440
  spacy_pos_hint, spacy_lemma_hint = None, None
1441
  try:
1442
  nlp_en = spacy_load_spacy_model("en_core_web_md")
 
1445
  token = doc[0]
1446
  spacy_pos_hint = token.pos_.lower()
1447
  spacy_lemma_hint = token.lemma_
 
1448
  except Exception as e:
1449
  log(f"[DEBUG] Wiktionary (EN) Hint: spaCy failed: {e}")
1450
 
 
1451
  try:
1452
  wiktionary_reports = _wiktionary_find_all_entries(word, conn)
1453
  except Exception as e:
1454
  log(f"[DEBUG] Wiktionary (EN) query failed: {e}")
1455
+ return {}
1456
+ if not wiktionary_reports: return {}
 
1457
 
 
1458
  def get_priority_score(report):
1459
  wikt_pos = _wiktionary_map_pos_key(report.get("pos"))
1460
  wikt_lemma = report.get("lemma")
 
1465
  return 4
1466
  wiktionary_reports.sort(key=get_priority_score)
1467
 
 
1468
  word_lower = word.lower()
1469
  for wikt_report in wiktionary_reports:
1470
  pos_key = _wiktionary_map_pos_key(wikt_report.get("pos"))
1471
  lemma = wikt_report.get("lemma", word)
1472
  pos_title = wikt_report.get("pos_title", "")
1473
 
 
1474
  inflections_wikt_block = {
1475
  "base_form": lemma,
1476
  "forms_list": wikt_report.get("forms", []),
1477
  "source": "wiktionary"
1478
  }
1479
 
 
1480
  pattern_block = {}
1481
  if PATTERN_EN_AVAILABLE:
1482
  try:
 
1483
  use_word = word if "form" in pos_title.lower() else lemma
1484
  if pos_key == "noun": pattern_block = pattern_analyze_as_noun_en(use_word)
1485
  elif pos_key == "verb": pattern_block = pattern_analyze_as_verb_en(use_word)
 
1488
  except Exception as e:
1489
  pattern_block = {"error": f"Pattern.en analysis failed: {e}"}
1490
 
 
1491
  semantics_block = _wiktionary_format_semantics_block(wikt_report, pattern_block, top_n)
1492
 
 
1493
  pos_entry_report = {
1494
  "inflections_wiktionary": inflections_wikt_block,
1495
  "inflections_pattern": pattern_block,
 
1498
  "pos_title": pos_title,
1499
  "etymology": wikt_report.get("etymology_text"),
1500
  "pronunciation": wikt_report.get("sounds"),
 
1501
  }
1502
  }
1503
 
 
1504
  is_valid = False
1505
  is_inflected_entry = any(ft in pos_title for ft in ["form", "Comparative", "Superlative"])
1506
 
1507
+ if lemma.lower() == word_lower: is_valid = True
 
 
1508
 
1509
  if not is_valid and not is_inflected_entry:
1510
  for form_entry in inflections_wikt_block.get("forms_list", []):
1511
  form_text = form_entry.get("form_text", "").strip()
1512
  if form_text.lower() == word_lower:
1513
  is_valid = True
 
1514
  break
1515
 
 
1516
  if is_valid:
1517
  if pos_key not in final_result["analysis"]:
1518
  final_result["analysis"][pos_key] = []
 
2063
  html_dep_out, df_out, json_out, html_ner_out])
2064
 
2065
  def create_languagetool_tab():
2066
+ """Creates the UI for the Grammar Checker tab with LT."""
2067
  gr.Markdown("# 🇬🇧 English Grammar & Spelling Checker")
2068
+ gr.Markdown("Powered by `LanguageTool`.")
2069
 
2070
  with gr.Row():
2071
  text_input = gr.Textbox(
 
2074
  lines=5,
2075
  scale=3
2076
  )
2077
 
2078
  check_button = gr.Button("Check Text", variant="primary")
2079
  output = gr.JSON(label="Detected Errors (JSON)")
2080
 
2081
  check_button.click(
2082
+ fn=lambda text: lt_check_grammar(text, 'en'),
2083
+ inputs=[text_input],
2084
  outputs=[output],
2085
  api_name="check_grammar"
2086
  )
2087
  gr.Examples(
2088
+ [["This is a houze."], ["I seen the man."],
2089
+ ["The cat sleep on the table."], ["He asks if he can go."]],
2090
+ inputs=[text_input], outputs=[output], fn=lambda text: lt_check_grammar(text, 'en'),
2091
  cache_examples=False
2092
  )
2093
 
 
2474
  print("INFO: NLTK library not available, skipping lemmatizer.")
2475
  print("--- NLTK Done ---\n")
2476
 
2477
  # --- 8. Check Pattern.en ---
2478
  print("--- Checking Pattern.en ---")
2479
  if not PATTERN_EN_AVAILABLE:
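
For reference, the endpoint this commit keeps (api_name="check_grammar") can be exercised from Python with gradio_client; a minimal sketch, with a placeholder Space id rather than one taken from the diff:

from gradio_client import Client

client = Client("user/space-id")  # hypothetical Space id; substitute the actual deployment
matches = client.predict("I seen the man.", api_name="/check_grammar")
print(matches)  # list of match dicts (message, rule_id, replacements, offset, length)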