Spaces:
Sleeping
Sleeping
| # app.py — DDGP Plus (Gradio) | |
| # WIC (Morph_Raw / UD) + DDGP | |
| # -*- coding: utf-8 -*- | |
| import os | |
| import json | |
| import unicodedata | |
| import re | |
| import gradio as gr | |
| # ============================================================ | |
| # STANZA / UD (Morph_Raw) | |
| # ============================================================ | |
| import stanza | |
# Lazily-built Stanza pipeline shared by every call to morph_ud_analyze().
ud_nlp = None
# True once the pipeline has been built successfully; False otherwise.
UD_AVAILABLE = False


def morph_ud_analyze(sentence: str):
    """Analyse *sentence* with the Stanza "grc" pipeline.

    The pipeline is downloaded and constructed on the first call that
    receives a non-blank sentence, then cached in the module-level
    ``ud_nlp``. ``UD_AVAILABLE`` records whether construction succeeded.

    Args:
        sentence: Text to analyse.

    Returns:
        A list with one dict per word, with the keys ``"token"``,
        ``"lema"``, ``"upos"`` and ``"feats"``; or ``None`` when
        *sentence* is empty/blank or the pipeline could not be initialised.
    """
    global ud_nlp, UD_AVAILABLE

    if not sentence or not sentence.strip():
        return None

    # Initialise the UD pipeline ONLY on the first real call.
    if ud_nlp is None:
        # NOTE(review): confirm that "morph" is a processor name Stanza
        # recognises; kept unchanged here to preserve existing behaviour.
        processors = "tokenize,pos,lemma,morph"
        try:
            stanza.download(
                "grc",
                processors=processors,
                verbose=False,
            )
            ud_nlp = stanza.Pipeline(
                lang="grc",
                processors=processors,
                tokenize_no_ssplit=True,
                use_gpu=False,
            )
        except Exception:
            # Still best-effort (the caller just gets None), but record the
            # traceback so a failed download/model load can be diagnosed.
            import logging

            logging.getLogger(__name__).exception(
                "Failed to initialise the Stanza 'grc' pipeline"
            )
            UD_AVAILABLE = False
            return None
        UD_AVAILABLE = True

    # The analysis itself.
    doc = ud_nlp(sentence)
    return [
        {
            "token": w.text,
            "lema": w.lemma,
            "upos": w.upos,
            "feats": w.feats,
        }
        for sent in doc.sentences
        for w in sent.words
    ]
| # ============================================================ | |
| # DDGP — UTILITÁRIOS | |
| # ============================================================ | |
def normalize(text):
    """Return *text* NFC-normalised with surrounding whitespace removed.

    Any falsy input (``None``, ``""``) yields the empty string.
    """
    raw = text if text else ""
    composed = unicodedata.normalize("NFC", raw)
    return composed.strip()
def simplify(text):
    """Reduce *text* to a bare lowercase lookup key.

    After ``normalize()``, the text is decomposed (NFD) and every combining
    mark, digit, ``.``, ``-``, ``/`` and space character is dropped; what
    remains is lower-cased.
    """
    decomposed = unicodedata.normalize("NFD", normalize(text))
    kept = [
        ch
        for ch in decomposed
        if not unicodedata.combining(ch)
        and not ch.isdigit()
        and ch not in ".-/ "
    ]
    return "".join(kept).lower()
| # ============================================================ | |
| # DDGP — CARREGAMENTO DE DADOS | |
| # ============================================================ | |
| # ============================================================ | |
| # CAMINHOS (HF-safe) | |
| # ============================================================ | |
# Data files are located relative to the process working directory.
BASE_DIR = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "ddgp", "data")

# Fail at import time when the data directory is missing; the message lists
# what BASE_DIR actually contains to help diagnose the deployment layout.
if not os.path.exists(DATA_DIR):
    _missing_data_dir_message = (
        f"Diretório de dados do DDGP não encontrado: {DATA_DIR}\n"
        f"Conteúdo de {BASE_DIR}: {os.listdir(BASE_DIR)}"
    )
    raise RuntimeError(_missing_data_dir_message)
def load_json(path):
    """Parse the UTF-8 JSON file at *path* and return the decoded object.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Arquivo não encontrado: {path}")
# Load the four DDGP JSON data files from DATA_DIR, in the order listed.
(
    DDGP_ENTRY,
    DDGP_INDEX_LEMAS,
    DDGP_INDEX_FORMAS,
    DDGP_FORMA_TO_LEMA,
) = (
    load_json(os.path.join(DATA_DIR, filename))
    for filename in (
        "ddgp3x_entry.json",
        "ddgp_index_lemas.json",
        "ddgp_index_formas_final.json",
        "ddgp_forma_to_lema.json",
    )
)
| # ============================================================ | |
| # DDGP — FORMATAÇÃO | |
| # ============================================================ | |
def format_pdesc(pdesc: str) -> str:
    """Convert a raw entry description into display markup.

    Line endings are unified to ``\\n``; each ``♦`` followed by whitespace
    becomes a bold ``**♦** `` preceded by a blank line; finally every newline
    is turned into ``<br/>``. Falsy input returns ``""``.
    """
    if pdesc:
        unified = pdesc.replace("\r\n", "\n").replace("\r", "\n")
        marked = re.sub(r"♦\s+", "\n\n**♦** ", unified)
        return marked.replace("\n", "<br/>")
    return ""
def find_entry_ids_for_lemma_candidate(cand: str):
    """Return the entry ids in DDGP_INDEX_LEMAS that match a lemma candidate.

    The candidate is reduced with ``simplify()``. An exact key match (if any)
    comes first, followed by every entry whose simplified key starts with the
    simplified candidate, in index order and without duplicates.

    Args:
        cand: Lemma candidate.

    Returns:
        A list of entry ids; empty when *cand* is falsy or simplifies to
        nothing.
    """
    if not cand:
        return []

    base = simplify(cand)
    if not base:
        # Every string starts with "", so an empty base would "match" the
        # whole index. A candidate made only of digits/punctuation/diacritics
        # carries no searchable content.
        return []

    results = []
    seen = set()

    # Exact match takes priority over prefix matches.
    if base in DDGP_INDEX_LEMAS:
        eid = DDGP_INDEX_LEMAS[base]
        results.append(eid)
        seen.add(eid)

    # Prefix matches on the simplified keys.
    for k, eid in DDGP_INDEX_LEMAS.items():
        if eid not in seen and simplify(k).startswith(base):
            results.append(eid)
            seen.add(eid)

    return results
| # ============================================================ | |
| # DDGP — FUNÇÃO DE LOOKUP (GRADIO) | |
| # ============================================================ | |
| import unicodedata | |
def strip_greek_diacritics(s: str) -> str:
    """Return *s* without combining marks, recomposed to NFC."""
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFD", s)
        if unicodedata.combining(ch) == 0
    ]
    return unicodedata.normalize("NFC", "".join(base_chars))
def _render_ddgp_entry(ent) -> str:
    """Format one DDGP entry as a Markdown heading plus its description."""
    return f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"


def ddgp_lookup(query: str) -> str:
    """Look up a Greek form or lemma in the DDGP and return Markdown text.

    Lookup strategy, stopping at the first step that yields entries:
      1. direct match of the simplified query in DDGP_INDEX_FORMAS
         (at most the first 10 ids);
      2. form -> lemma via DDGP_FORMA_TO_LEMA, then
         ``find_entry_ids_for_lemma_candidate()``;
      3. every key of DDGP_INDEX_LEMAS that starts with the simplified query.

    Args:
        query: Text typed by the user.

    Returns:
        The matching entries joined by horizontal rules, or a user-facing
        warning/"not found" message.
    """
    if not query or not query.strip():
        return "⚠️ Digite uma forma ou lema em grego."

    # Normalise (remove diacritics) and reduce to the index key format.
    palavra = strip_greek_diacritics(query.strip())
    simp_form = simplify(palavra)
    if not simp_form:
        # The query held only digits/punctuation/diacritics. Without this
        # guard the prefix fallback below would match every lemma, because
        # every key starts with "".
        return "⚠️ Digite uma forma ou lema em grego."

    results = []

    # 1) Direct lookup by form.
    if simp_form in DDGP_INDEX_FORMAS:
        for i in DDGP_INDEX_FORMAS[simp_form][:10]:
            ent = DDGP_ENTRY.get(str(i))
            if ent:
                results.append(_render_ddgp_entry(ent))

    # 2) Fallback: form -> lemma.
    if not results and simp_form in DDGP_FORMA_TO_LEMA:
        cand = DDGP_FORMA_TO_LEMA[simp_form]
        for eid in find_entry_ids_for_lemma_candidate(cand):
            ent = DDGP_ENTRY.get(str(eid))
            if ent:
                results.append(_render_ddgp_entry(ent))

    # 3) Final fallback: lemma prefix.
    if not results:
        seen = set()
        for k, eid in DDGP_INDEX_LEMAS.items():
            if k.startswith(simp_form) and eid not in seen:
                seen.add(eid)
                ent = DDGP_ENTRY.get(str(eid))
                if ent:
                    results.append(_render_ddgp_entry(ent))

    if not results:
        return "❌ Nenhuma entrada do DDGP encontrada."

    return "\n\n---\n\n".join(results)
| # ============================================================ | |
| # WIC — FUNÇÃO GRADIO | |
| # ============================================================ | |
def wic_ud(sentence: str):
    """Gradio handler: analyse *sentence* and return ``(message, table)``.

    Exactly one element of the pair is non-``None``: a message when the
    input is blank or the analysis yields nothing, otherwise a list of
    ``[token, lema, upos, feats]`` rows.
    """
    if not (sentence and sentence.strip()):
        return "Cole uma frase curta em grego antigo.", None

    analysis = morph_ud_analyze(sentence)
    if not analysis:
        return "Não foi possível analisar a frase com o pipeline UD.", None

    columns = ("token", "lema", "upos", "feats")
    rows = [[tok[col] for col in columns] for tok in analysis]
    return None, rows
| # ============================================================ | |
| # INTERFACE GRADIO | |
| # ============================================================ | |
# Gradio UI: a header, one tab for the UD morphological analysis (wic_ud),
# one tab for the dictionary lookup (ddgp_lookup), and a footer.
# A single Blocks context is used; the former empty outer
# `with gr.Blocks() as demo:` only wrapped this one and was redundant.
with gr.Blocks(
    css="""
#ddgp-logo {
max-width: 60px;
height: auto;
}
"""
) as demo:
    # Logo + title
    gr.HTML(
        """
<div style="display:flex; align-items:center; gap:16px;">
<img id="ddgp-logo"
src="https://raw.githubusercontent.com/aniseferreira/DDGP_Plus/main/ddgp/logo.png">
<div>
<h2>DDGP Plus</h2>
<div>Análise morfológica (UD) + Dicionário Grego–Português</div>
</div>
</div>
<hr/>
"""
    )

    # ============================
    # WIC tab
    # ============================
    with gr.Tab("🧩 Análise morfológica (WIC — UD)"):
        wic_in = gr.Textbox(
            label="Cole uma frase curta contendo o vocábulo",
            placeholder="σαφέστερον δʼ ἂν μάθοις οὕτω."
        )
        wic_btn = gr.Button("Analisar")
        wic_msg = gr.Markdown()
        wic_out = gr.Dataframe(
            headers=["Token", "Lema", "UPOS", "Feições"],
            interactive=False
        )
        wic_btn.click(
            wic_ud,
            inputs=wic_in,
            outputs=[wic_msg, wic_out]
        )

    # ============================
    # DDGP tab
    # ============================
    with gr.Tab("📘 DDGP"):
        ddgp_in = gr.Textbox(
            label="""Digite o lema em grego politônico (ex.: λέγω).
Nesta versão (HF), não há suporte a transliteração latina nem buscas parciais.""",
            placeholder="λέγω, πάθος, παθ"
        )
        ddgp_btn = gr.Button("Consultar")
        # ddgp_lookup returns Markdown ("### " headings, "**♦**", "---"
        # separators), so it is shown in a Markdown component; an HTML
        # component would display that markup as literal text.
        ddgp_out = gr.Markdown()
        ddgp_btn.click(
            ddgp_lookup,
            inputs=ddgp_in,
            outputs=ddgp_out
        )

    # Footer
    gr.HTML(
        """
<hr/>
<small>
DDGP Plus — Projeto Letras Clássicas Digitais (FCLAr/UNESP).<br/>
Conteúdo lexicográfico licenciado sob CC BY–NC–ND 4.0.
</small>
""",
    )

demo.launch()