# app.py — DDGP Plus (Gradio)
# WIC (Morph_Raw / UD) + DDGP
# -*- coding: utf-8 -*-
import os
import json
import unicodedata
import re
import gradio as gr
# ============================================================
# STANZA / UD (Morph_Raw)
# ============================================================
import stanza
# Lazy-initialized Stanza pipeline for Ancient Greek (grc); built on first use.
ud_nlp = None
UD_AVAILABLE = False


def morph_ud_analyze(sentence: str):
    """Run UD morphological analysis on *sentence* with Stanza (lang="grc").

    Returns a list of dicts with keys "token", "lema", "upos", "feats",
    or None when the sentence is empty or the pipeline cannot be loaded.
    """
    global ud_nlp, UD_AVAILABLE
    if not sentence or not sentence.strip():
        return None
    # Initialize the UD pipeline only on the first call (download is slow).
    if ud_nlp is None:
        try:
            # BUGFIX: "morph" is not a valid Stanza processor name — morphological
            # features (word.feats) are produced by the "pos" processor.  With the
            # original "tokenize,pos,lemma,morph", Pipeline() raised, the except
            # swallowed it, and this function silently returned None forever.
            stanza.download(
                "grc",
                processors="tokenize,pos,lemma",
                verbose=False
            )
            ud_nlp = stanza.Pipeline(
                lang="grc",
                processors="tokenize,pos,lemma",
                tokenize_no_ssplit=True,
                use_gpu=False
            )
            UD_AVAILABLE = True
        except Exception:
            # Best-effort: mark UD as unavailable instead of crashing the UI.
            UD_AVAILABLE = False
            return None
    # Actual analysis.
    doc = ud_nlp(sentence)
    results = []
    for sent in doc.sentences:
        for w in sent.words:
            results.append({
                "token": w.text,
                "lema": w.lemma,
                "upos": w.upos,
                "feats": w.feats
            })
    return results
# ============================================================
# DDGP — UTILITÁRIOS
# ============================================================
def normalize(text):
    """Return *text* NFC-normalized and stripped (empty string for None)."""
    return unicodedata.normalize("NFC", (text or "")).strip()


# Characters deleted from simplified index keys (one C-level pass).
_PUNCT_DELETE = str.maketrans("", "", "./- ")


def simplify(text):
    """Reduce *text* to a lowercase lookup key.

    Drops combining diacritics (via NFD decomposition), digits, and the
    characters '.', '-', '/' and spaces, then lowercases the result.
    """
    decomposed = unicodedata.normalize("NFD", normalize(text))
    kept = [
        ch for ch in decomposed
        if not unicodedata.combining(ch) and not ch.isdigit()
    ]
    return "".join(kept).translate(_PUNCT_DELETE).lower()
# ============================================================
# DDGP — CARREGAMENTO DE DADOS
# ============================================================
# ============================================================
# CAMINHOS (HF-safe)
# ============================================================
# Resolve data paths relative to the current working directory so the app also
# works on hosted deployments (per the "HF-safe" header above).
BASE_DIR = os.path.abspath(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "ddgp", "data")
# Fail fast at import time; include a listing of BASE_DIR in the error to ease
# debugging of a mis-packaged deployment.
if not os.path.exists(DATA_DIR):
    raise RuntimeError(
        f"Diretório de dados do DDGP não encontrado: {DATA_DIR}\n"
        f"Conteúdo de {BASE_DIR}: {os.listdir(BASE_DIR)}"
    )
def load_json(path):
    """Read a UTF-8 encoded JSON file and return the parsed object.

    Raises FileNotFoundError (message includes the path) when absent.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Arquivo não encontrado: {path}")
# DDGP datasets, loaded once at import time.  Based on how they are used below:
#   DDGP_ENTRY          entry-id (str) -> entry dict with "gword" and "pdesc"
#   DDGP_INDEX_LEMAS    lemma key -> entry id (keys appear pre-simplified —
#                       they are probed with simplify() output; confirm)
#   DDGP_INDEX_FORMAS   simplified form -> list of entry ids
#   DDGP_FORMA_TO_LEMA  simplified form -> lemma string
DDGP_ENTRY = load_json(os.path.join(DATA_DIR, "ddgp3x_entry.json"))
DDGP_INDEX_LEMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_lemas.json"))
DDGP_INDEX_FORMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_formas_final.json"))
DDGP_FORMA_TO_LEMA = load_json(os.path.join(DATA_DIR, "ddgp_forma_to_lema.json"))
# ============================================================
# DDGP — FORMATAÇÃO
# ============================================================
def format_pdesc(pdesc: str) -> str:
    """Format a raw DDGP entry description as Markdown.

    Normalizes line endings, turns each "♦" sense marker into a bolded
    paragraph break, and converts remaining newlines into Markdown hard
    line breaks (two trailing spaces before the newline).
    """
    if not pdesc:
        return ""
    text = pdesc.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"♦\s+", "\n\n**♦** ", text)
    # BUGFIX: the original line was an unterminated string literal
    # (`p.replace("\n", "` split across two source lines — a SyntaxError).
    # Given the Markdown output, the intent was a hard line break:
    # "  \n" renders as <br> in Markdown.
    text = text.replace("\n", "  \n")
    return text
def find_entry_ids_for_lemma_candidate(cand: str):
    """Return DDGP entry ids whose lemma key matches *cand*.

    The exact (simplified) match comes first, followed by prefix matches
    in index order; ids are de-duplicated.  Empty input yields [].
    """
    if not cand:
        return []
    key = simplify(cand)
    matched = []
    seen_ids = set()

    def _add(entry_id):
        # Preserve first-seen order while de-duplicating.
        if entry_id not in seen_ids:
            seen_ids.add(entry_id)
            matched.append(entry_id)

    if key in DDGP_INDEX_LEMAS:
        _add(DDGP_INDEX_LEMAS[key])
    for lemma_key, entry_id in DDGP_INDEX_LEMAS.items():
        if simplify(lemma_key).startswith(key):
            _add(entry_id)
    return matched
# ============================================================
# DDGP — FUNÇÃO DE LOOKUP (GRADIO)
# ============================================================
import unicodedata
def strip_greek_diacritics(s: str) -> str:
    """Return *s* with all combining diacritical marks removed.

    Decomposes to NFD, drops combining code points (accents, breathings,
    iota subscripts, ...), and recomposes to NFC.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize("NFC", "".join(base_chars))
def _render_entry(ent: dict) -> str:
    """Render one DDGP entry as a Markdown section (headword + description)."""
    return f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"


def ddgp_lookup(query: str) -> str:
    """Look up *query* (a Greek form or lemma) in the DDGP dictionary.

    Lookup order:
      1. direct hit in the form index (at most 10 entries);
      2. form -> lemma mapping, then lemma-index lookup;
      3. final fallback: lemma keys that start with the simplified query.

    Returns Markdown with one "---"-separated section per entry, or a
    warning/error message for empty input / no match.
    """
    if not query or not query.strip():
        return "⚠️ Digite uma forma ou lema em grego."
    # Normalize: strip diacritics, then reduce to the simplified index key.
    palavra = strip_greek_diacritics(query.strip())
    simp_form = simplify(palavra)
    results = []

    # 1) direct lookup by form (cap at 10 entries, as before)
    found_entries = []
    if simp_form in DDGP_INDEX_FORMAS:
        for entry_id in DDGP_INDEX_FORMAS[simp_form][:10]:
            ent = DDGP_ENTRY.get(str(entry_id))
            if ent:
                found_entries.append(ent)

    if found_entries:
        # 2) hits by form: render them
        # (rendering was previously triplicated inline; now via _render_entry)
        results.extend(_render_entry(ent) for ent in found_entries)
    else:
        # 3) fallback: form -> lemma -> entries
        lemma_candidates = []
        if simp_form in DDGP_FORMA_TO_LEMA:
            lemma_candidates.append(DDGP_FORMA_TO_LEMA[simp_form])
        for cand in lemma_candidates:
            for eid in find_entry_ids_for_lemma_candidate(cand):
                ent = DDGP_ENTRY.get(str(eid))
                if ent:
                    results.append(_render_entry(ent))

    # 4) FINAL fallback: lemma-key prefix scan (mirrors the Streamlit app).
    if not results:
        seen = set()
        for lemma_key, eid in DDGP_INDEX_LEMAS.items():
            if lemma_key.startswith(simp_form) and eid not in seen:
                ent = DDGP_ENTRY.get(str(eid))
                if ent:
                    results.append(_render_entry(ent))
                seen.add(eid)

    if not results:
        return "❌ Nenhuma entrada do DDGP encontrada."
    return "\n\n---\n\n".join(results)
# ============================================================
# WIC — FUNÇÃO GRADIO
# ============================================================
def wic_ud(sentence: str):
    """Gradio handler: analyze *sentence* with the UD pipeline.

    Returns a (message, table) pair: a status string and no table on
    failure, or (None, rows) where each row is [token, lemma, upos, feats].
    """
    if not sentence or not sentence.strip():
        return ("Cole uma frase curta em grego antigo.", None)

    analysis = morph_ud_analyze(sentence)
    if not analysis:
        return ("Não foi possível analisar a frase com o pipeline UD.", None)

    rows = [
        [item["token"], item["lema"], item["upos"], item["feats"]]
        for item in analysis
    ]
    return None, rows
# ============================================================
# INTERFACE GRADIO
# ============================================================
with gr.Blocks() as demo:
with gr.Blocks(
css="""
#ddgp-logo {
max-width: 60px;
height: auto;
}
"""
) as demo:
# LOGO + TÍTULO
gr.HTML(
"""