Spaces:

AniseF
/

ddgp-plus-morpho

Sleeping

App Files Files Community

AniseF committed on Jan 11

Commit

8652fbd

verified ·

1 Parent(s): f8a7ed5

Create app.py

Browse files

Files changed (1) hide show

app.py +292 -0

app.py ADDED Viewed

	@@ -0,0 +1,292 @@

+# app.py — DDGP Plus (Gradio)
+# WIC (Morph_Raw / UD) + DDGP
+# -*- coding: utf-8 -*-
+import json
+import logging
+import os
+import re
+import unicodedata
+
+import gradio as gr
+# ============================================================
+# STANZA / UD (Morph_Raw)
+# ============================================================
+import stanza
+# baixa modelo uma vez (HF-friendly)
+try:
+    stanza.download(
+        "grc",
+        processors="tokenize,pos,lemma,morph",
+        verbose=False
+    )
+except Exception:
+    pass
+ud_nlp = None
+UD_AVAILABLE = False
+try:
+    ud_nlp = stanza.Pipeline(
+        lang="grc",
+        processors="tokenize,pos,lemma,morph",
+        tokenize_no_ssplit=True,
+        use_gpu=False
+    )
+    UD_AVAILABLE = True
+except Exception:
+    UD_AVAILABLE = False
+def morph_ud_analyze(sentence: str):
+    """Morph_Raw UD: frase -> lista de tokens com lema, UPOS e feats."""
+    if not UD_AVAILABLE or not sentence.strip():
+        return None
+    doc = ud_nlp(sentence)
+    results = []
+    for sent in doc.sentences:
+        for w in sent.words:
+            results.append({
+                "token": w.text,
+                "lema": w.lemma,
+                "upos": w.upos,
+                "feats": w.feats
+            })
+    return results
+# ============================================================
+# DDGP — UTILITÁRIOS
+# ============================================================
+def normalize(text):
+    return unicodedata.normalize("NFC", (text or "")).strip()
+def simplify(text):
+    s = normalize(text)
+    s = unicodedata.normalize("NFD", s)
+    s = "".join(ch for ch in s if not unicodedata.combining(ch))
+    s = "".join(ch for ch in s if not ch.isdigit())
+    s = s.replace(".", "").replace("-", "").replace("/", "").replace(" ", "")
+    return s.lower()
+# ============================================================
+# DDGP — CARREGAMENTO DE DADOS
+# ============================================================
+BASE_DIR = os.path.dirname(__file__)
+DATA_DIR = os.path.join(BASE_DIR, "ddgp", "data")
+def load_json(path):
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+DDGP_ENTRY = load_json(os.path.join(DATA_DIR, "ddgp3x_entry.json"))
+DDGP_INDEX_LEMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_lemas.json"))
+DDGP_INDEX_FORMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_formas_final.json"))
+DDGP_FORMA_TO_LEMA = load_json(os.path.join(DATA_DIR, "ddgp_forma_to_lema.json"))
+# ============================================================
+# DDGP — FORMATAÇÃO
+# ============================================================
+def format_pdesc(pdesc: str) -> str:
+    if not pdesc:
+        return ""
+    p = pdesc.replace("\r\n", "\n").replace("\r", "\n")
+    p = re.sub(r"♦\s+", "\n\n**♦** ", p)
+    p = p.replace("\n", "<br/>")
+    return p
+def find_entry_ids_for_lemma_candidate(cand: str):
+    if not cand:
+        return []
+    base = simplify(cand)
+    results = []
+    seen = set()
+    if base in DDGP_INDEX_LEMAS:
+        eid = DDGP_INDEX_LEMAS[base]
+        results.append(eid)
+        seen.add(eid)
+    for k, eid in DDGP_INDEX_LEMAS.items():
+        k_simp = simplify(k)
+        if k_simp.startswith(base) and eid not in seen:
+            results.append(eid)
+            seen.add(eid)
+    return results
+# ============================================================
+# DDGP — FUNÇÃO DE LOOKUP (GRADIO)
+# ============================================================
+def ddgp_lookup(query: str) -> str:
+    if not query or not query.strip():
+        return "⚠️ Digite uma forma ou lema em grego."
+    palavra = query.strip()
+    simp_form = simplify(palavra)
+    found_entries = []
+    # 1) lookup por forma
+    if simp_form in DDGP_INDEX_FORMAS:
+        for i in DDGP_INDEX_FORMAS[simp_form][:10]:
+            ent = DDGP_ENTRY.get(str(i))
+            if ent:
+                found_entries.append(ent)
+    results = []
+    if found_entries:
+        for ent in found_entries:
+            results.append(
+                f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"
+            )
+    else:
+        # 2) forma → lema
+        lemma_candidates = []
+        if simp_form in DDGP_FORMA_TO_LEMA:
+            lemma_candidates.append(DDGP_FORMA_TO_LEMA[simp_form])
+        for cand in lemma_candidates:
+            for eid in find_entry_ids_for_lemma_candidate(cand):
+                ent = DDGP_ENTRY.get(str(eid))
+                if ent:
+                    results.append(
+                        f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"
+                    )
+    if not results:
+        return "❌ Nenhuma entrada do DDGP encontrada."
+    return "\n\n---\n\n".join(results)
+# ============================================================
+# WIC — FUNÇÃO GRADIO
+# ============================================================
+def wic_ud(sentence: str):
+    if not sentence or not sentence.strip():
+        return (
+            "Cole uma frase curta em grego antigo.",
+            None
+        )
+    if not UD_AVAILABLE:
+        return (
+            "⚠️ A análise morfológica em contexto (UD) não está disponível neste ambiente.",
+            None
+        )
+    results = morph_ud_analyze(sentence)
+    if not results:
+        return (
+            "Não foi possível analisar a frase com o pipeline UD.",
+            None
+        )
+    table = []
+    for tok in results:
+        table.append([
+            tok["token"],
+            tok["lema"],
+            tok["upos"],
+            tok["feats"]
+        ])
+    return None, table
+# ============================================================
+# INTERFACE GRADIO
+# ============================================================
+with gr.Blocks(
+    css="""
+    #ddgp-logo { max-width:120px; }
+    """
+) as demo:
+    # LOGO + TÍTULO
+    gr.Markdown(
+        """
+        <div style="display:flex; align-items:center; gap:16px;">
+            <img id="ddgp-logo"
+                 src="https://raw.githubusercontent.com/aniseferreira/DDGP_Plus/main/ddgp/logo.png">
+            <div>
+              <h2>DDGP Plus</h2>
+              <div>Análise morfológica (UD) + Dicionário Grego–Português</div>
+            </div>
+        </div>
+        <hr/>
+        """,
+        unsafe_allow_html=True
+    )
+    # ============================
+    # ABA WIC
+    # ============================
+    with gr.Tab("🧩 Análise morfológica (WIC — UD)"):
+        wic_in = gr.Textbox(
+            label="Cole uma frase curta contendo o vocábulo",
+            placeholder="τὸ προκείμενον ἵνα μὴ μεῖζον ἡμῖν"
+        )
+        wic_btn = gr.Button("Analisar")
+        wic_msg = gr.Markdown()
+        wic_out = gr.Dataframe(
+            headers=["Token", "Lema", "UPOS", "Feições"],
+            interactive=False
+        )
+        wic_btn.click(
+            wic_ud,
+            inputs=wic_in,
+            outputs=[wic_msg, wic_out]
+        )
+    # ============================
+    # ABA DDGP
+    # ============================
+    with gr.Tab("📘 DDGP"):
+        ddgp_in = gr.Textbox(
+            label="Forma ou lema em grego",
+            placeholder="λέγω, πάθος, παθ"
+        )
+        ddgp_btn = gr.Button("Consultar")
+        ddgp_out = gr.Markdown()
+        ddgp_btn.click(
+            ddgp_lookup,
+            inputs=ddgp_in,
+            outputs=ddgp_out
+        )
+    # RODAPÉ
+    gr.Markdown(
+        """
+        <hr/>
+        <small>
+        DDGP Plus — Projeto Letras Clássicas Digitais (FCLAr/UNESP).<br/>
+        Conteúdo lexicográfico licenciado sob CC BY–NC–ND 4.0.
+        </small>
+        """,
+        unsafe_allow_html=True
+    )
+demo.launch()