File size: 9,191 Bytes
8652fbd
 
 
 
 
 
 
 
 
 
 
0c84bde
8652fbd
 
 
 
 
 
 
 
 
 
 
00304f9
 
 
8652fbd
 
00304f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8652fbd
 
 
 
 
 
 
 
 
 
 
00304f9
8652fbd
 
 
00304f9
8652fbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cccdb99
 
 
 
 
 
8652fbd
 
cccdb99
 
 
 
 
 
8652fbd
 
cccdb99
 
8652fbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8db035
 
0c84bde
 
 
 
 
8652fbd
c8db035
8652fbd
 
 
74d5e1c
c8db035
8652fbd
 
74d5e1c
8652fbd
 
74d5e1c
8652fbd
 
 
 
 
 
74d5e1c
8652fbd
 
 
 
 
74d5e1c
8652fbd
74d5e1c
8652fbd
 
 
 
 
 
 
 
 
 
 
 
74d5e1c
 
d43cf96
 
 
 
 
 
 
 
 
 
 
 
c8db035
74d5e1c
 
 
 
 
8652fbd
 
 
 
 
 
 
 
 
 
 
82b45a1
8652fbd
 
 
 
 
 
 
 
 
82b45a1
 
8652fbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb1589e
2cbbc06
612bcc2
 
 
2cbbc06
 
 
 
8652fbd
 
c3eef34
8652fbd
 
 
 
 
 
 
 
 
 
bb1589e
c3eef34
8652fbd
 
 
 
 
 
 
3ab365e
8652fbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df75d60
 
 
8652fbd
 
 
8929df4
8652fbd
 
 
 
 
 
 
 
8929df4
8652fbd
 
 
 
 
 
 
8929df4
8652fbd
 
2cbbc06
d7a652e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# app.py — DDGP Plus (Gradio)
# WIC (Morph_Raw / UD) + DDGP
# -*- coding: utf-8 -*-

import os
import json
import unicodedata
import re

import gradio as gr


# ============================================================
# STANZA / UD (Morph_Raw)
# ============================================================

import stanza

# Stanza UD pipeline for Ancient Greek; created lazily on the first call
# to morph_ud_analyze() so app startup stays fast.
ud_nlp = None
# Set to True once the pipeline initialized successfully, False if the
# download/setup failed (the UD tab then reports an error).
UD_AVAILABLE = False


def morph_ud_analyze(sentence: str):
    """Analyze *sentence* with the Stanza UD pipeline for Ancient Greek.

    The model is downloaded and the pipeline built lazily, on the first
    call only.

    Returns:
        A list of dicts with keys "token", "lema", "upos", "feats"
        (one per word), or None when the input is empty/blank or the
        pipeline could not be initialized.
    """
    global ud_nlp, UD_AVAILABLE

    if not sentence or not sentence.strip():
        return None

    # Initialize UD only on the first call (the model download is slow).
    if ud_nlp is None:
        try:
            # BUG FIX: "morph" is not a Stanza processor name — morphological
            # features (FEATS) are produced by the "pos" processor.  The old
            # list made Pipeline() raise, so UD_AVAILABLE never became True.
            processors = "tokenize,pos,lemma"

            stanza.download(
                "grc",
                processors=processors,
                verbose=False
            )

            ud_nlp = stanza.Pipeline(
                lang="grc",
                processors=processors,
                tokenize_no_ssplit=True,
                use_gpu=False
            )

            UD_AVAILABLE = True

        except Exception:
            # Best-effort: keep the app alive; the caller shows a message.
            UD_AVAILABLE = False
            return None

    # The analysis itself.
    doc = ud_nlp(sentence)
    results = []

    for sent in doc.sentences:
        for w in sent.words:
            results.append({
                "token": w.text,
                "lema": w.lemma,
                "upos": w.upos,
                "feats": w.feats
            })

    return results



# ============================================================
# DDGP — UTILITÁRIOS
# ============================================================

def normalize(text):
    """NFC-normalize *text* (None becomes "") and strip outer whitespace."""
    return unicodedata.normalize("NFC", text or "").strip()


def simplify(text):
    """Reduce *text* to a bare lowercase lookup key.

    Decomposes to NFD and drops combining marks (accents, breathings),
    digits, and the separators ".", "-", "/" and spaces used in
    dictionary headwords.
    """
    decomposed = unicodedata.normalize("NFD", normalize(text))
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            continue
        if ch.isdigit() or ch in ".-/ ":
            continue
        kept.append(ch)
    return "".join(kept).lower()


# ============================================================
# DDGP — CARREGAMENTO DE DADOS
# ============================================================

# ============================================================
# PATHS (HF-safe)
# ============================================================

# Resolve everything relative to the working directory so the app runs
# unchanged on Hugging Face Spaces, where the repo root is the cwd.
BASE_DIR = os.path.abspath(os.getcwd())

# The DDGP JSON data files live under ddgp/data in the repository.
DATA_DIR = os.path.join(BASE_DIR, "ddgp", "data")

# Fail fast — every lookup below depends on these files.  The directory
# listing in the message helps diagnose a bad deployment layout.
if not os.path.exists(DATA_DIR):
    raise RuntimeError(
        f"Diretório de dados do DDGP não encontrado: {DATA_DIR}\n"
        f"Conteúdo de {BASE_DIR}: {os.listdir(BASE_DIR)}"
    )


def load_json(path):
    """Read the UTF-8 JSON file at *path* and return the parsed object.

    Raises FileNotFoundError with an explicit (Portuguese) message when
    the file is missing, so startup failures name the exact data file.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Arquivo não encontrado: {path}")
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)


# Entry bodies, keyed by entry id as a string (looked up via str(eid)).
DDGP_ENTRY = load_json(os.path.join(DATA_DIR, "ddgp3x_entry.json"))
# Lemma key -> entry id; queried with simplify() output below.
DDGP_INDEX_LEMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_lemas.json"))
# Simplified form -> list of entry ids.
DDGP_INDEX_FORMAS = load_json(os.path.join(DATA_DIR, "ddgp_index_formas_final.json"))
# Simplified form -> lemma candidate (fallback when the form index misses).
DDGP_FORMA_TO_LEMA = load_json(os.path.join(DATA_DIR, "ddgp_forma_to_lema.json"))


# ============================================================
# DDGP — FORMATAÇÃO
# ============================================================

def format_pdesc(pdesc: str) -> str:
    """Format a raw DDGP entry description for display.

    Unifies line endings, sets each "♦" sense marker off in its own
    bolded paragraph, and converts newlines to HTML <br/> tags.
    """
    if not pdesc:
        return ""
    unified = pdesc.replace("\r\n", "\n").replace("\r", "\n")
    marked = re.sub(r"♦\s+", "\n\n**♦** ", unified)
    return marked.replace("\n", "<br/>")


def find_entry_ids_for_lemma_candidate(cand: str):
    """Return DDGP entry ids whose lemma key matches or extends *cand*.

    An exact hit on the simplified candidate comes first; prefix matches
    over the (simplified) lemma index follow in index order.  Each id is
    returned at most once.  Empty/None input yields [].
    """
    if not cand:
        return []

    base = simplify(cand)
    ordered = []
    seen = set()

    def _add(entry_id):
        # Deduplicate while preserving first-seen order.
        if entry_id not in seen:
            seen.add(entry_id)
            ordered.append(entry_id)

    if base in DDGP_INDEX_LEMAS:
        _add(DDGP_INDEX_LEMAS[base])

    for key, entry_id in DDGP_INDEX_LEMAS.items():
        if simplify(key).startswith(base):
            _add(entry_id)

    return ordered


# ============================================================
# DDGP — FUNÇÃO DE LOOKUP (GRADIO)
# ============================================================

import unicodedata

def strip_greek_diacritics(s: str) -> str:
    """Return *s* with every combining mark (accents, breathings) removed."""
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFD", s)
        if not unicodedata.combining(ch)
    ]
    return unicodedata.normalize("NFC", "".join(base_chars))

def _render_ddgp_entry(ent) -> str:
    # One Markdown section per entry: headword as a heading, then the
    # formatted description.
    return f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"


def ddgp_lookup(query: str) -> str:
    """Look up *query* (a Greek form or lemma) in the DDGP data.

    Lookup strategy, in order:
      1. direct hit in the form index (capped at 10 entries);
      2. form -> lemma mapping, then lemma-index search;
      3. final fallback: lemma-prefix scan (mirrors the Streamlit app).

    Returns a Markdown string with the matching entries separated by
    horizontal rules, or a warning / not-found message.
    """
    if not query or not query.strip():
        return "⚠️ Digite uma forma ou lema em grego."

    # Normalize: strip diacritics, then reduce to the simplified key.
    palavra = strip_greek_diacritics(query.strip())
    simp_form = simplify(palavra)

    results = []

    # 1) direct lookup by form.
    if simp_form in DDGP_INDEX_FORMAS:
        for i in DDGP_INDEX_FORMAS[simp_form][:10]:
            ent = DDGP_ENTRY.get(str(i))
            if ent:
                results.append(_render_ddgp_entry(ent))

    if not results:
        # 2) fallback: map the form to its lemma, then search the lemma index.
        lemma_candidates = []
        if simp_form in DDGP_FORMA_TO_LEMA:
            lemma_candidates.append(DDGP_FORMA_TO_LEMA[simp_form])

        for cand in lemma_candidates:
            for eid in find_entry_ids_for_lemma_candidate(cand):
                ent = DDGP_ENTRY.get(str(eid))
                if ent:
                    results.append(_render_ddgp_entry(ent))

        # 3) FINAL fallback: prefix match over the lemma index.
        if not results:
            seen = set()
            for k, eid in DDGP_INDEX_LEMAS.items():
                if k.startswith(simp_form) and eid not in seen:
                    ent = DDGP_ENTRY.get(str(eid))
                    if ent:
                        results.append(_render_ddgp_entry(ent))
                        seen.add(eid)

    if not results:
        return "❌ Nenhuma entrada do DDGP encontrada."

    return "\n\n---\n\n".join(results)

# ============================================================
# WIC — FUNÇÃO GRADIO
# ============================================================

def wic_ud(sentence: str):
    """Gradio handler for the WIC (UD morphology) tab.

    Returns a (message, table) pair: on failure the message is set and
    the table is None; on success the message is None and the table has
    one [token, lemma, upos, feats] row per analyzed word.
    """
    if not sentence or not sentence.strip():
        return "Cole uma frase curta em grego antigo.", None

    analysis = morph_ud_analyze(sentence)

    if not analysis:
        return "Não foi possível analisar a frase com o pipeline UD.", None

    rows = [
        [tok["token"], tok["lema"], tok["upos"], tok["feats"]]
        for tok in analysis
    ]
    return None, rows


# ============================================================
# GRADIO INTERFACE
# ============================================================
# BUG FIX: the original nested two `with gr.Blocks() as demo:` contexts.
# The tabs were attached to the outer Blocks, but `demo` ended up bound
# to the inner one, so `demo.launch()` served a page without the tabs.
# A single Blocks context (carrying the css) builds everything in one app.
with gr.Blocks(
    css="""
    #ddgp-logo {
        max-width: 60px;
        height: auto;
    }
    """
) as demo:

    # LOGO + TITLE
    gr.HTML(
        """
        <div style="display:flex; align-items:center; gap:16px;">
            <img id="ddgp-logo"
                 src="https://raw.githubusercontent.com/aniseferreira/DDGP_Plus/main/ddgp/logo.png">
            <div>
              <h2>DDGP Plus</h2>
              <div>Análise morfológica (UD) + Dicionário Grego–Português</div>
            </div>
        </div>
        <hr/>
        """
    )

    # ============================
    # WIC TAB
    # ============================
    with gr.Tab("🧩 Análise morfológica (WIC — UD)"):
        wic_in = gr.Textbox(
            label="Cole uma frase curta contendo o vocábulo",
            placeholder="σαφέστερον δʼ ἂν μάθοις οὕτω."
        )
        wic_btn = gr.Button("Analisar")
        wic_msg = gr.Markdown()
        wic_out = gr.Dataframe(
            headers=["Token", "Lema", "UPOS", "Feições"],
            interactive=False
        )

        wic_btn.click(
            wic_ud,
            inputs=wic_in,
            outputs=[wic_msg, wic_out]
        )

    # ============================
    # DDGP TAB
    # ============================
    with gr.Tab("📘 DDGP"):
        ddgp_in = gr.Textbox(
            label="""Digite o lema em grego politônico (ex.: λέγω).
Nesta versão (HF), não há suporte a transliteração latina nem buscas parciais.""",
            placeholder="λέγω, πάθος, παθ"
        )
        ddgp_btn = gr.Button("Consultar")
        # BUG FIX: ddgp_lookup returns Markdown ("### ...", "**...**");
        # gr.HTML displayed the raw markup, gr.Markdown renders it.
        ddgp_out = gr.Markdown()

        ddgp_btn.click(
            ddgp_lookup,
            inputs=ddgp_in,
            outputs=ddgp_out
        )

    # FOOTER
    gr.HTML(
        """
        <hr/>
        <small>
        DDGP Plus — Projeto Letras Clássicas Digitais (FCLAr/UNESP).<br/>
        Conteúdo lexicográfico licenciado sob CC BY–NC–ND 4.0.
        </small>
        """
    )

demo.launch()