|
|
import base64
import html
import importlib
import os
import site
import subprocess
import sys
import traceback
from pathlib import Path
from typing import Dict, Optional, Tuple

import gradio as gr
import spacy
from spacy import displacy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Registry of the languages/models offered for analysis.
# Each entry maps a model key to a 3-tuple:
#   (display name, installable model/package name, loader family)
# The loader family is either "spacy" or "grecy" and selects which loader
# function is used for the model.
MODEL_INFO: Dict[str, Tuple[str, str, str]] = {
    # Standard spaCy pipelines
    "de": ("German", "de_core_news_md", "spacy"),
    "en": ("English", "en_core_web_md", "spacy"),
    "es": ("Spanish", "es_core_news_md", "spacy"),
    # Ancient Greek (greCy) pipelines
    "grc-proiel-trf": ("Ancient Greek (PROIEL TRF)", "grc_proiel_trf", "grecy"),
    "grc-perseus-trf": ("Ancient Greek (Perseus TRF)", "grc_perseus_trf", "grecy"),
    "grc_ner_trf": ("Ancient Greek (NER TRF)", "grc_ner_trf", "grecy"),
    "grc-proiel-lg": ("Ancient Greek (PROIEL LG)", "grc_proiel_lg", "grecy"),
    "grc-perseus-lg": ("Ancient Greek (Perseus LG)", "grc_perseus_lg", "grecy"),
    "grc-proiel-sm": ("Ancient Greek (PROIEL SM)", "grc_proiel_sm", "grecy"),
    "grc-perseus-sm": ("Ancient Greek (Perseus SM)", "grc_perseus_sm", "grecy"),
}
|
|
|
|
|
|
|
|
|
|
|
# Localized interface strings, keyed by lower-case UI language code.
# Every language provides the same set of keys; "table_headers" is the only
# list-valued entry (column captions of the morphology table).
UI_TEXT = {
    # --- German ---
    "de": {
        "title": "# 🔍 Mehrsprachiger Morpho-Syntaktischer Analysator",
        "subtitle": "Analysieren Sie Texte auf Deutsch, Englisch, Spanisch und Altgriechisch",
        "ui_lang_label": "Benutzeroberflächensprache",
        "model_lang_label": "Textsprache für Analyse",
        "input_label": "Text eingeben",
        "input_placeholder": "Geben Sie hier Ihren Text ein...",
        "button_text": "Text analysieren",
        "button_processing_text": "Verarbeitung läuft...",
        "tab_graphic": "Grafische Darstellung",
        "tab_table": "Tabelle",
        "tab_json": "JSON",
        "tab_ner": "Entitäten",
        "html_label": "Abhängigkeitsparsing",
        "table_label": "Morphologische Analyse",
        "table_headers": ["Wort", "Lemma", "POS", "Tag", "Morphologie", "Abhängigkeit"],
        "json_label": "JSON-Ausgabe",
        "ner_label": "Benannte Entitäten",
        "error_message": "Fehler: ",
    },
    # --- English ---
    "en": {
        "title": "# 🔍 Multilingual Morpho-Syntactic Analyzer",
        "subtitle": "Analyze texts in German, English, Spanish, and Ancient Greek",
        "ui_lang_label": "Interface Language",
        "model_lang_label": "Text Language for Analysis",
        "input_label": "Enter Text",
        "input_placeholder": "Enter your text here...",
        "button_text": "Analyze Text",
        "button_processing_text": "Processing...",
        "tab_graphic": "Graphic View",
        "tab_table": "Table",
        "tab_json": "JSON",
        "tab_ner": "Entities",
        "html_label": "Dependency Parsing",
        "table_label": "Morphological Analysis",
        "table_headers": ["Word", "Lemma", "POS", "Tag", "Morphology", "Dependency"],
        "json_label": "JSON Output",
        "ner_label": "Named Entities",
        "error_message": "Error: ",
    },
    # --- Spanish ---
    "es": {
        "title": "# 🔍 Analizador Morfo-Sintáctico Multilingüe",
        "subtitle": "Analice textos en alemán, inglés, español y griego antiguo",
        "ui_lang_label": "Idioma de la Interfaz",
        "model_lang_label": "Idioma del Texto para Análisis",
        "input_label": "Introducir Texto",
        "input_placeholder": "Ingrese su texto aquí...",
        "button_text": "Analizar Texto",
        "button_processing_text": "Procesando...",
        "tab_graphic": "Vista Gráfica",
        "tab_table": "Tabla",
        "tab_json": "JSON",
        "tab_ner": "Entidades",
        "html_label": "Análisis de Dependencias",
        "table_label": "Análisis Morfológico",
        "table_headers": ["Palabra", "Lema", "POS", "Etiqueta", "Morfología", "Dependencia"],
        "json_label": "Salida JSON",
        "ner_label": "Entidades Nombradas",
        "error_message": "Error: ",
    },
}
|
|
|
|
|
|
|
|
# Cache of loaded pipelines, keyed by the same keys as MODEL_INFO.
# A value of None (or a missing key) means the model has not been loaded yet.
MODELS: Dict[str, Optional[spacy.Language]] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def install_spacy_transformers_once():
    """Install ``spacy-transformers`` (required by the ``_trf`` models) at most once.

    A marker file in the current working directory records an earlier
    successful install; when it exists the pip run is skipped.

    Returns:
        bool: ``True`` when the marker already exists or pip succeeded,
        ``False`` when the pip run failed, timed out or could not be started.
    """
    marker_file = Path(".spacy_transformers_installed")
    if marker_file.exists():
        print("✓ spacy-transformers already installed (marker found)")
        return True

    print("Installing spacy-transformers (for _trf models)...")
    cmd = [sys.executable, "-m", "pip", "install", "spacy-transformers"]
    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=300)
    except (subprocess.SubprocessError, OSError) as e:
        # SubprocessError covers a non-zero exit status and the timeout;
        # OSError covers failure to launch the interpreter at all.
        print(f"✗ FAILED to install spacy-transformers: {e}")
        captured_stderr = getattr(e, "stderr", None)
        if captured_stderr:
            print(captured_stderr)
        return False

    print("✓ Successfully installed spacy-transformers")

    # The marker is only an optimisation for later calls. Failing to write it
    # must not turn a successful install into a reported failure.
    try:
        marker_file.touch()
    except OSError as e:
        print(f"⚠ Could not write marker file {marker_file}: {e}")
    return True
|
|
|
|
|
def install_grecy_model_from_github(model_name: str) -> bool:
    """Install a greCy model wheel from the project's GitHub release.

    A marker file ``.<model_name>_installed`` in the current working
    directory records an earlier successful install; when it exists the pip
    run is skipped.

    Args:
        model_name: Package name of the greCy model, e.g. ``"grc_proiel_sm"``.

    Returns:
        ``True`` when the marker already exists or pip succeeded, ``False``
        for an unknown model name or when the pip run failed.
    """
    marker_file = Path(f".{model_name}_installed")
    if marker_file.exists():
        print(f"✓ {model_name} already installed (marker found)")
        return True

    print(f"Installing grecy model: {model_name}...")

    # Version component of each wheel file name in the release:
    # grc_proiel_trf is published as 3.7.5, every other wheel as 0.0.0.
    wheel_versions = {
        "grc_proiel_trf": "3.7.5",
        "grc_perseus_trf": "0.0.0",
        "grc_proiel_lg": "0.0.0",
        "grc_perseus_lg": "0.0.0",
        "grc_proiel_sm": "0.0.0",
        "grc_perseus_sm": "0.0.0",
        "grc_ner_trf": "0.0.0",
    }
    version = wheel_versions.get(model_name)
    if version is None:
        print(f"✗ Unknown grecy model: {model_name}")
        return False
    wheel_filename = f"{model_name}-{version}-py3-none-any.whl"

    install_url = f"https://github.com/CrispStrobe/greCy/releases/download/v1.0-models/{wheel_filename}"
    cmd = [sys.executable, "-m", "pip", "install", install_url, "--no-deps"]

    print(f"Running: {' '.join(cmd)}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=900)
    except subprocess.CalledProcessError as e:
        print(f"✗ Installation subprocess FAILED with code {e.returncode}")
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)
        return False
    except Exception as e:
        # Includes the timeout and failure to launch the interpreter.
        print(f"✗ Installation exception: {e}")
        traceback.print_exc()
        return False

    if result.stdout:
        print("STDOUT:", result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)
    print(f"✓ Successfully installed {model_name} from GitHub")

    # The marker is only an optimisation for later calls. Failing to write it
    # must not turn a successful install into a reported failure.
    try:
        marker_file.touch()
    except OSError as e:
        print(f"⚠ Could not write marker file {marker_file}: {e}")
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_spacy_model(model_name: str) -> Optional[spacy.Language]:
    """Load a standard spaCy model, downloading it first if it is missing.

    Args:
        model_name: spaCy package name, e.g. ``"en_core_web_md"``.

    Returns:
        The loaded pipeline, or ``None`` when the download or the subsequent
        load failed (the error is printed).
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # Handled as "model not installed": fall through to the download.
        print(f"Installing {model_name}...")

    try:
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
        # The package was installed by a child process after this interpreter
        # started; invalidate the import system's caches so the new package
        # is noticed (same step load_grecy_model performs).
        importlib.invalidate_caches()
        return spacy.load(model_name)
    except Exception as e:
        print(f"✗ Failed to install {model_name}: {e}")
        return None
|
|
|
|
|
def load_grecy_model(model_name: str) -> Optional[spacy.Language]:
    """Load a greCy model, installing it from GitHub first if needed.

    Args:
        model_name: Package name of the greCy model, e.g. ``"grc_proiel_sm"``.

    Returns:
        The loaded pipeline, or ``None`` when installation or loading failed
        (details are printed).
    """
    if not install_grecy_model_from_github(model_name):
        print(f"✗ Cannot load {model_name} because installation failed.")
        return None

    try:
        print("Refreshing importlib to find new package...")
        importlib.invalidate_caches()
        try:
            # Best effort; presumably meant to pick up site-packages changes
            # made by the install that just ran.
            importlib.reload(site)
        except Exception as reload_error:
            # Not fatal for loading, but report it instead of hiding it.
            print(f"  (could not reload 'site': {reload_error})")

        print(f"Trying: spacy.load('{model_name}')")
        nlp = spacy.load(model_name)
        print(f"✓ Successfully loaded {model_name}")
        return nlp
    except Exception as e:
        print(f"✗ Model {model_name} is installed but FAILED to load.")
        print(f"  Error: {e}")
        traceback.print_exc()
        return None
|
|
|
|
|
def initialize_models():
    """Pre-load the standard spaCy models and prepare the ``_trf`` dependency.

    Models whose loader family is not ``"spacy"`` are only registered in
    ``MODELS`` with ``None`` and are loaded on first use.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("INITIALIZING MODELS")
    print(rule + "\n")

    install_spacy_transformers_once()

    standard_total = 0
    standard_ready = 0

    for key, (display_name, package, family) in MODEL_INFO.items():
        if family != "spacy":
            # Deferred: register the key so the first analysis triggers loading.
            print(f"✓ {display_name} ({package}) will be loaded on first use.\n")
            MODELS[key] = None
            continue

        standard_total += 1
        print(f"Loading {display_name} ({package})...")
        pipeline = load_spacy_model(package)
        MODELS[key] = pipeline
        if pipeline:
            print(f"✓ {display_name} ready\n")
            standard_ready += 1
        else:
            print(f"✗ {display_name} FAILED\n")

    print(f"Pre-loaded {standard_ready}/{standard_total} standard models.")
    print(rule + "\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _resolve_pipeline(model_lang_key):
    """Return the cached pipeline for *model_lang_key*, loading it on first use.

    Raises:
        ValueError: if the key is not in ``MODEL_INFO`` or the model fails to load.
    """
    nlp = MODELS.get(model_lang_key)
    if nlp is not None:
        return nlp

    print(f"First use of {model_lang_key}. Loading model...")
    if model_lang_key not in MODEL_INFO:
        raise ValueError(f"Unknown model key: {model_lang_key}")
    _, model_name, model_type = MODEL_INFO[model_lang_key]

    if model_type == "grecy":
        nlp = load_grecy_model(model_name)
    else:
        nlp = load_spacy_model(model_name)

    if nlp is None:
        # Drop the entry so that a later request retries the load.
        MODELS.pop(model_lang_key, None)
        raise ValueError(f"Model for {model_lang_key} ({model_name}) FAILED to load. Check logs.")

    MODELS[model_lang_key] = nlp
    print(f"✓ {model_lang_key} is now loaded and cached.")
    return nlp


def _collect_token_rows(doc):
    """Build the table rows and JSON records for every token of *doc*."""
    # Document-level property: query it once instead of once per token.
    has_dep = doc.has_annotation("DEP")

    table_rows = []
    json_records = []
    for token in doc:
        morph_str = str(token.morph) if token.has_morph() else ""
        dep_str = token.dep_ if has_dep else ""
        json_records.append({
            "word": token.text, "lemma": token.lemma_, "pos": token.pos_,
            "tag": token.tag_, "morphology": morph_str, "dependency": dep_str,
            "is_stopword": token.is_stop,
        })
        table_rows.append([token.text, token.lemma_, token.pos_, token.tag_, morph_str, dep_str])
    return table_rows, json_records


def _render_dependency_html(nlp, doc):
    """Return HTML embedding the displaCy dependency SVG, or a notice/error paragraph."""
    if "parser" not in nlp.pipe_names:
        return "<p style='color: orange;'>Dependency parsing ('parser') not available for this model.</p>"
    try:
        options = {"compact": True, "bg": "#ffffff", "color": "#000000", "font": "Source Sans Pro"}
        html_svg = displacy.render(doc, style="dep", jupyter=False, options=options)
        # The SVG is embedded as a base64 data URI inside an <img> element.
        svg_b64 = base64.b64encode(html_svg.encode("utf-8")).decode("utf-8")
        return f'<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; line-height: 2.5;"><img src="data:image/svg+xml;base64,{svg_b64}" /></div>'
    except Exception as e:
        return f"<p style='color: orange;'>Visualization error (DEP): {html.escape(str(e))}</p>"


def _render_entity_html(nlp, doc):
    """Return HTML with the displaCy entity markup, or a notice/error paragraph."""
    if "ner" not in nlp.pipe_names:
        return "<p style='color: orange;'>Named Entity Recognition ('ner') not available for this model.</p>"
    if not doc.ents:
        return "<p>No named entities found in this text.</p>"
    try:
        ent_markup = displacy.render(doc, style="ent", jupyter=False)
        return f'<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; line-height: 2.5;">{ent_markup}</div>'
    except Exception as e:
        return f"<p style='color: orange;'>Visualization error (NER): {html.escape(str(e))}</p>"


def get_analysis(ui_lang: str, model_lang_key: str, text: str):
    """Analyze *text* with the selected model and return the UI outputs.

    Args:
        ui_lang: Interface language code (case-insensitive); unknown or empty
            values fall back to English.
        model_lang_key: Key into ``MODEL_INFO`` selecting the pipeline.
        text: The text to analyze.

    Returns:
        A 5-tuple ``(table_rows, json_records, dependency_html, entity_html,
        button)`` where ``button`` re-enables the analyze button. Failures are
        not raised: they are printed and reported inside the returned values.
    """
    ui_config = UI_TEXT.get((ui_lang or "en").lower(), UI_TEXT["en"])
    error_prefix = ui_config["error_message"]

    try:
        if not text or not text.strip():
            return ([], [], "<p style='color: orange;'>No text provided.</p>", "",
                    gr.Button(value=ui_config["button_text"], interactive=True))

        nlp = _resolve_pipeline(model_lang_key)
        doc = nlp(text)

        dataframe_output, json_output = _collect_token_rows(doc)
        html_dep_out = _render_dependency_html(nlp, doc)
        html_ner_out = _render_entity_html(nlp, doc)

        return (dataframe_output, json_output, html_dep_out, html_ner_out,
                gr.Button(value=ui_config["button_text"], interactive=True))

    except Exception as e:
        traceback.print_exc()
        # The message can contain caller-supplied text (e.g. the model key),
        # so it is escaped before being placed into HTML.
        safe_message = html.escape(str(e))
        error_html = f"<div style='color: red; border: 1px solid red; padding: 10px; border-radius: 5px; background-color: #fff5f5;'><strong>{error_prefix}</strong> {safe_message}</div>"

        return ([[f"{error_prefix}{str(e)}"]], {"error": str(e)}, error_html, error_html,
                gr.Button(value=ui_config["button_text"], interactive=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_ui(ui_lang: str):
    """Return component updates that relabel the interface in *ui_lang*.

    Unknown language codes fall back to English. The returned list has 14
    entries in this order: title, subtitle, the two radios, text box, button,
    four tabs (graphic, table, JSON, entities), dependency HTML, table, JSON
    and entity HTML.
    """
    strings = UI_TEXT.get(ui_lang.lower(), UI_TEXT["en"])

    updates = [
        gr.Markdown(value=strings["title"]),
        gr.Markdown(value=strings["subtitle"]),
        gr.Radio(label=strings["ui_lang_label"]),
        gr.Radio(label=strings["model_lang_label"]),
        gr.Textbox(label=strings["input_label"], placeholder=strings["input_placeholder"]),
        gr.Button(value=strings["button_text"]),
    ]

    # Tab captions, in the order the caller lists the tab components.
    for tab_key in ("tab_graphic", "tab_table", "tab_json", "tab_ner"):
        updates.append(gr.Tab(label=strings[tab_key]))

    # Output component labels.
    updates.append(gr.HTML(label=strings["html_label"]))
    updates.append(
        gr.DataFrame(label=strings["table_label"], headers=strings["table_headers"], interactive=False)
    )
    updates.append(gr.JSON(label=strings["json_label"]))
    updates.append(gr.HTML(label=strings["ner_label"]))
    return updates
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks app and wire its two events.

    The interface starts in English. Clicking the analyze button runs
    ``get_analysis``; changing the interface language runs ``update_ui``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    strings = UI_TEXT["en"]
    model_keys = list(MODEL_INFO.keys())
    # (label, value) pairs: the display name is shown, the key is submitted.
    model_options = [(MODEL_INFO[key][0], key) for key in model_keys]

    with gr.Blocks(title="Multilingual Morpho-Syntactic Analyzer") as demo:
        with gr.Row():
            ui_lang_radio = gr.Radio(["DE", "EN", "ES"], label=strings["ui_lang_label"], value="EN")
            model_lang_radio = gr.Radio(
                choices=model_options,
                label=strings["model_lang_label"],
                value=model_keys[0],
            )

        markdown_title = gr.Markdown(strings["title"])
        markdown_subtitle = gr.Markdown(strings["subtitle"])
        text_input = gr.Textbox(
            label=strings["input_label"],
            placeholder=strings["input_placeholder"],
            lines=5,
        )
        analyze_button = gr.Button(strings["button_text"], variant="primary")

        with gr.Tabs():
            with gr.Tab(strings["tab_graphic"]) as tab_graphic:
                html_dep_out = gr.HTML(label=strings["html_label"])
            with gr.Tab(strings["tab_ner"]) as tab_ner:
                html_ner_out = gr.HTML(label=strings["ner_label"])
            with gr.Tab(strings["tab_table"]) as tab_table:
                df_out = gr.DataFrame(
                    label=strings["table_label"],
                    headers=strings["table_headers"],
                    interactive=False,
                )
            with gr.Tab(strings["tab_json"]) as tab_json:
                json_out = gr.JSON(label=strings["json_label"])

        # Order matches the 5-tuple returned by get_analysis.
        analysis_outputs = [df_out, json_out, html_dep_out, html_ner_out, analyze_button]
        analyze_button.click(
            fn=get_analysis,
            inputs=[ui_lang_radio, model_lang_radio, text_input],
            outputs=analysis_outputs,
            api_name="get_morphology",
        )

        # Order matches the list returned by update_ui.
        localized_components = [
            markdown_title, markdown_subtitle, ui_lang_radio, model_lang_radio,
            text_input, analyze_button, tab_graphic, tab_table, tab_json, tab_ner,
            html_dep_out, df_out, json_out, html_ner_out,
        ]
        ui_lang_radio.change(
            fn=update_ui,
            inputs=ui_lang_radio,
            outputs=localized_components,
        )
    return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: print a banner, prepare the models, then serve the UI
# on all interfaces at port 7860.
if __name__ == "__main__":
    rule = "=" * 70
    print(f"\n{rule}")
    print("MULTILINGUAL MORPHO-SYNTACTIC ANALYZER")
    print(f"{rule}\n")

    initialize_models()

    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)