# app.py — DDGP Plus (Gradio)
# WIC (Morph_Raw / UD) + DDGP
# -*- coding: utf-8 -*-
"""Gradio front end for DDGP Plus.

Two tabs are exposed:

* WIC — tokenises a short Ancient Greek sentence with a Stanza pipeline and
  shows token / lemma / UPOS / morphological features in a table.
* DDGP — looks a Greek form or lemma up in the JSON indexes shipped under
  ``ddgp/data`` and renders the matching dictionary entries.
"""

import json
import logging
import os
import re
import unicodedata

import gradio as gr
import stanza

logger = logging.getLogger(__name__)

# ============================================================
# STANZA / UD (Morph_Raw)
# ============================================================

# Processors requested from Stanza. Morphological features (``feats``) are
# produced by the ``pos`` processor; the previous list also named "morph",
# which is not a processor name in Stanza's documented processor list.
UD_PROCESSORS = "tokenize,pos,lemma"

ud_nlp = None          # lazily created stanza.Pipeline (see morph_ud_analyze)
UD_AVAILABLE = False   # True once the pipeline has been built successfully


def morph_ud_analyze(sentence: str):
    """Analyse *sentence* with the Ancient Greek ("grc") Stanza pipeline.

    The pipeline is downloaded and built on the first call only and then
    cached in the module-level ``ud_nlp``; ``UD_AVAILABLE`` records whether
    that initialisation succeeded.

    Args:
        sentence: Text to analyse.

    Returns:
        A list with one dict per word (keys ``token``, ``lema``, ``upos``,
        ``feats``), or ``None`` when *sentence* is empty/blank or the
        pipeline could not be initialised.
    """
    global ud_nlp, UD_AVAILABLE

    if not sentence or not sentence.strip():
        return None

    # Initialise the UD pipeline ONLY on the first call.
    if ud_nlp is None:
        try:
            stanza.download("grc", processors=UD_PROCESSORS, verbose=False)
            ud_nlp = stanza.Pipeline(
                lang="grc",
                processors=UD_PROCESSORS,
                tokenize_no_ssplit=True,
                use_gpu=False,
            )
        except Exception:
            # Best effort: the UI reports a friendly message when None is
            # returned, but the cause must not vanish silently.
            logger.exception("Could not initialise the Stanza 'grc' pipeline")
            UD_AVAILABLE = False
            return None
        UD_AVAILABLE = True

    # The analysis itself.
    doc = ud_nlp(sentence)
    return [
        {
            "token": word.text,
            "lema": word.lemma,
            "upos": word.upos,
            "feats": word.feats,
        }
        for sent in doc.sentences
        for word in sent.words
    ]


# ============================================================
# DDGP — UTILITIES
# ============================================================

# Characters dropped by simplify() in addition to digits and combining marks.
_SIMPLIFY_DROP = str.maketrans("", "", ".-/ ")


def normalize(text):
    """Return *text* NFC-normalised and stripped; ``None`` becomes ``""``."""
    return unicodedata.normalize("NFC", (text or "")).strip()


def simplify(text):
    """Reduce *text* to the bare lookup key used by the DDGP indexes.

    Combining marks (diacritics) and digits are removed, then ``.``, ``-``,
    ``/`` and spaces are dropped and the result is lower-cased.
    """
    decomposed = unicodedata.normalize("NFD", normalize(text))
    kept = "".join(
        ch
        for ch in decomposed
        if not unicodedata.combining(ch) and not ch.isdigit()
    )
    return kept.translate(_SIMPLIFY_DROP).lower()


# ============================================================
# DDGP — DATA LOADING
# ============================================================

# ============================================================
# PATHS (HF-safe)
# ============================================================

# Data is resolved relative to the current working directory.
BASE_DIR = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "ddgp", "data")

if not os.path.isdir(DATA_DIR):
    raise RuntimeError(
        f"Diretório de dados do DDGP não encontrado: {DATA_DIR}\n"
        f"Conteúdo de {BASE_DIR}: {os.listdir(BASE_DIR)}"
    )


def load_json(path):
    """Load and return the UTF-8 JSON document stored at *path*.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Arquivo não encontrado: {path}")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Lookup tables, loaded once at import time. As used below: DDGP_ENTRY maps
# str(entry id) -> entry dict; DDGP_INDEX_LEMAS maps key -> entry id;
# DDGP_INDEX_FORMAS maps simplified form -> sequence of entry ids;
# DDGP_FORMA_TO_LEMA maps simplified form -> lemma candidate.
DDGP_ENTRY = load_json(os.path.join(DATA_DIR, "ddgp3x_entry.json"))
DDGP_INDEX_LEMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_lemas.json"))
DDGP_INDEX_FORMAS = load_json(
    os.path.join(DATA_DIR, "ddgp_index_formas_final.json")
)
DDGP_FORMA_TO_LEMA = load_json(os.path.join(DATA_DIR, "ddgp_forma_to_lema.json"))

# Maximum number of entry ids taken from a direct form hit.
MAX_FORM_HITS = 10

# ============================================================
# DDGP — FORMATTING
# ============================================================


def format_pdesc(pdesc: str) -> str:
    """Format an entry description (``pdesc``) for display.

    Line endings are unified, every ``♦`` followed by whitespace starts a
    new paragraph with a bold marker, and line breaks are then replaced by
    an HTML break.
    """
    if not pdesc:
        return ""
    p = pdesc.replace("\r\n", "\n").replace("\r", "\n")
    p = re.sub(r"♦\s+", "\n\n**♦** ", p)
    # NOTE(review): the replacement text was not recoverable from this copy
    # of the file (a raw line break sat inside the literal, which is not
    # valid Python). "<br>" is reconstructed from the fact that the result
    # is shown in an HTML component — confirm against the deployed version.
    p = p.replace("\n", "<br>")
    return p


def find_entry_ids_for_lemma_candidate(cand: str):
    """Return entry ids whose lemma key equals or starts with *cand*.

    *cand* is simplified first. An exact key hit comes first, followed by
    every id whose simplified key has the candidate as a prefix, each id
    appearing once.

    Returns:
        A list of entry ids; empty when *cand* is empty or simplifies to an
        empty string (an empty prefix would otherwise match every lemma).
    """
    if not cand:
        return []

    base = simplify(cand)
    if not base:
        return []

    results = []
    seen = set()

    if base in DDGP_INDEX_LEMAS:
        eid = DDGP_INDEX_LEMAS[base]
        results.append(eid)
        seen.add(eid)

    for key, eid in DDGP_INDEX_LEMAS.items():
        if eid not in seen and simplify(key).startswith(base):
            results.append(eid)
            seen.add(eid)

    return results


# ============================================================
# DDGP — LOOKUP FUNCTION (GRADIO)
# ============================================================


def strip_greek_diacritics(s: str) -> str:
    """Return *s* with all combining marks removed, re-composed to NFC."""
    decomposed = unicodedata.normalize("NFD", s)
    bare = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize("NFC", bare)


def _render_entry(ent) -> str:
    """Render one DDGP entry as a heading plus its formatted description."""
    return f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"


def ddgp_lookup(query: str) -> str:
    """Look *query* up in the DDGP and return the rendered entries.

    Resolution order:
      1. direct hit in the form index (at most ``MAX_FORM_HITS`` ids);
      2. form -> lemma mapping, then exact/prefix lemma match;
      3. plain prefix match against the lemma index keys.

    Returns:
        The rendered entries joined by horizontal rules, or a warning /
        "not found" message.
    """
    if not query or not query.strip():
        return "⚠️ Digite uma forma ou lema em grego."

    # Normalise (remove diacritics).
    palavra = strip_greek_diacritics(query.strip())
    simp_form = simplify(palavra)

    # A query made only of digits/punctuation simplifies to "". Without
    # this guard the prefix fallback below would match every lemma and
    # return the whole dictionary.
    if not simp_form:
        return "⚠️ Digite uma forma ou lema em grego."

    results = []

    # 1) Direct lookup by form.
    for i in DDGP_INDEX_FORMAS.get(simp_form, [])[:MAX_FORM_HITS]:
        ent = DDGP_ENTRY.get(str(i))
        if ent:
            results.append(_render_entry(ent))

    # 2) Fallback: form -> lemma.
    if not results and simp_form in DDGP_FORMA_TO_LEMA:
        cand = DDGP_FORMA_TO_LEMA[simp_form]
        for eid in find_entry_ids_for_lemma_candidate(cand):
            ent = DDGP_ENTRY.get(str(eid))
            if ent:
                results.append(_render_entry(ent))

    # 3) FINAL fallback: lemma prefix (same as the Streamlit version).
    if not results:
        seen = set()
        for key, eid in DDGP_INDEX_LEMAS.items():
            if key.startswith(simp_form) and eid not in seen:
                seen.add(eid)
                ent = DDGP_ENTRY.get(str(eid))
                if ent:
                    results.append(_render_entry(ent))

    if not results:
        return "❌ Nenhuma entrada do DDGP encontrada."

    return "\n\n---\n\n".join(results)


# ============================================================
# WIC — GRADIO FUNCTION
# ============================================================


def wic_ud(sentence: str):
    """Gradio callback for the WIC tab.

    Returns:
        A ``(message, table)`` pair: on success the message is ``None`` and
        the table holds one ``[token, lemma, upos, feats]`` row per word;
        otherwise the message explains the problem and the table is ``None``.
    """
    if not sentence or not sentence.strip():
        return ("Cole uma frase curta em grego antigo.", None)

    results = morph_ud_analyze(sentence)
    if not results:
        return ("Não foi possível analisar a frase com o pipeline UD.", None)

    table = [
        [tok["token"], tok["lema"], tok["upos"], tok["feats"]]
        for tok in results
    ]
    return None, table


# ============================================================
# GRADIO INTERFACE
# ============================================================

# A single Blocks context: the file previously opened `gr.Blocks()` and then
# immediately a second, nested `gr.Blocks(css=...)` bound to the same name.
with gr.Blocks(
    css="""
    #ddgp-logo {
        max-width: 60px;
        height: auto;
    }
    """
) as demo:

    # LOGO + TITLE
    # NOTE(review): the HTML markup of this header (including the element
    # carrying id "ddgp-logo") is missing from this copy of the file; only
    # the text content is reproduced here — restore the tags from the
    # deployed version.
    gr.HTML(
        """

DDGP Plus

Análise morfológica (UD) + Dicionário Grego–Português

"""
    )

    # ============================
    # WIC TAB
    # ============================
    with gr.Tab("🧩 Análise morfológica (WIC — UD)"):
        wic_in = gr.Textbox(
            label="Cole uma frase curta contendo o vocábulo",
            placeholder="σαφέστερον δʼ ἂν μάθοις οὕτω.",
        )
        wic_btn = gr.Button("Analisar")
        wic_msg = gr.Markdown()
        wic_out = gr.Dataframe(
            headers=["Token", "Lema", "UPOS", "Feições"],
            interactive=False,
        )

        wic_btn.click(
            wic_ud,
            inputs=wic_in,
            outputs=[wic_msg, wic_out],
        )

    # ============================
    # DDGP TAB
    # ============================
    with gr.Tab("📘 DDGP"):
        ddgp_in = gr.Textbox(
            label=(
                "Digite o lema em grego politônico (ex.: λέγω). "
                "Nesta versão (HF), não há suporte a transliteração latina "
                "nem buscas parciais."
            ),
            placeholder="λέγω, πάθος, παθ",
        )
        ddgp_btn = gr.Button("Consultar")
        # NOTE(review): ddgp_lookup emits Markdown syntax ("###", "**", "---")
        # but the output component is HTML — confirm this is intended.
        ddgp_out = gr.HTML()

        ddgp_btn.click(
            ddgp_lookup,
            inputs=ddgp_in,
            outputs=ddgp_out,
        )

    # FOOTER
    # NOTE(review): as with the header, the markup of this footer appears to
    # be missing from this copy of the file.
    gr.HTML(
        """
DDGP Plus — Projeto Letras Clássicas Digitais (FCLAr/UNESP).
Conteúdo lexicográfico licenciado sob CC BY–NC–ND 4.0.
""",
    )

demo.launch()