AniseF committed on
Commit
8652fbd
·
verified ·
1 Parent(s): f8a7ed5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +292 -0
app.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — DDGP Plus (Gradio)
2
+ # WIC (Morph_Raw / UD) + DDGP
3
+ # -*- coding: utf-8 -*-
4
+
5
+ import os
6
+ import json
7
+ import unicodedata
8
+ import re
9
+
10
+ import gradio as gr
11
+
12
+ # ============================================================
13
+ # STANZA / UD (Morph_Raw)
14
+ # ============================================================
15
+
16
+ import stanza
17
+
18
# Processors requested from Stanza. Morphological features (``feats``) are
# produced by the "pos" processor; "morph" is not among Stanza's documented
# processor names, so it is not requested here.
_UD_PROCESSORS = "tokenize,pos,lemma"

# Download the Ancient Greek ("grc") model once at start-up (HF-friendly).
# Best effort: a failure here must not stop the app, because the model may
# already be cached; the pipeline construction below is the real check.
try:
    stanza.download(
        "grc",
        processors=_UD_PROCESSORS,
        verbose=False,
    )
except Exception as exc:
    print(f"[DDGP Plus] Stanza model download failed: {exc!r}")

# Module-level handles read by morph_ud_analyze() and wic_ud().
ud_nlp = None
UD_AVAILABLE = False

try:
    ud_nlp = stanza.Pipeline(
        lang="grc",
        processors=_UD_PROCESSORS,
        tokenize_no_ssplit=True,
        use_gpu=False,
    )
except Exception as exc:
    # Keep the app usable without UD analysis, but say why it is missing.
    ud_nlp = None
    UD_AVAILABLE = False
    print(f"[DDGP Plus] Stanza pipeline unavailable: {exc!r}")
else:
    UD_AVAILABLE = True
41
+
42
+
43
def morph_ud_analyze(sentence: str):
    """Analyze a sentence with the UD pipeline (Morph_Raw).

    Returns ``None`` when the pipeline is unavailable or the sentence is
    blank. Otherwise returns a list with one dict per word, in document
    order, holding the keys ``token``, ``lema``, ``upos`` and ``feats``.
    """
    if not (UD_AVAILABLE and sentence.strip()):
        return None

    parsed = ud_nlp(sentence)
    return [
        {
            "token": word.text,
            "lema": word.lemma,
            "upos": word.upos,
            "feats": word.feats,
        }
        for ud_sentence in parsed.sentences
        for word in ud_sentence.words
    ]
60
+
61
+
62
+ # ============================================================
63
+ # DDGP — UTILITÁRIOS
64
+ # ============================================================
65
+
66
def normalize(text):
    """Return *text* in Unicode NFC form with surrounding whitespace removed.

    Falsy input (``None``, ``""``) yields the empty string.
    """
    raw = text if text else ""
    composed = unicodedata.normalize("NFC", raw)
    return composed.strip()
68
+
69
+
70
def simplify(text):
    """Reduce *text* to a bare lowercase lookup key.

    The text is normalized, decomposed (NFD), and then stripped of combining
    marks, digits, and the characters ".", "-", "/" and " " before being
    lowercased.
    """
    dropped = {".", "-", "/", " "}
    decomposed = unicodedata.normalize("NFD", normalize(text))
    kept = [
        ch
        for ch in decomposed
        if not (unicodedata.combining(ch) or ch.isdigit() or ch in dropped)
    ]
    return "".join(kept).lower()
77
+
78
+
79
+ # ============================================================
80
+ # DDGP — CARREGAMENTO DE DADOS
81
+ # ============================================================
82
+
83
# Directory containing this file. Made absolute so that the data paths below
# keep working even if ``__file__`` is relative and the working directory
# changes after start-up.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder holding the DDGP JSON tables loaded below.
DATA_DIR = os.path.join(BASE_DIR, "ddgp", "data")
85
+
86
+
87
def load_json(path):
    """Read the UTF-8 encoded JSON file at *path* and return the parsed data."""
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = json.loads(handle.read())
    return payload
90
+
91
+
92
# DDGP tables, loaded once at import time (in the order listed):
#   DDGP_ENTRY         - entries looked up by stringified entry id
#   DDGP_INDEX_LEMAS   - simplified lemma key -> entry id
#   DDGP_INDEX_FORMAS  - simplified form key -> sequence of entry ids
#   DDGP_FORMA_TO_LEMA - simplified form key -> lemma candidate
(
    DDGP_ENTRY,
    DDGP_INDEX_LEMAS,
    DDGP_INDEX_FORMAS,
    DDGP_FORMA_TO_LEMA,
) = (
    load_json(os.path.join(DATA_DIR, filename))
    for filename in (
        "ddgp3x_entry.json",
        "ddgp_index_lemas.json",
        "ddgp_index_formas_final.json",
        "ddgp_forma_to_lema.json",
    )
)
96
+
97
+
98
+ # ============================================================
99
+ # DDGP — FORMATAÇÃO
100
+ # ============================================================
101
+
102
def format_pdesc(pdesc: str) -> str:
    """Prepare a raw DDGP description for display.

    Line endings are unified to LF, every "♦" followed by whitespace becomes
    a blank line plus a bold "♦" marker, and finally each line break is
    rendered as ``<br/>``. Falsy input yields the empty string.
    """
    if pdesc:
        unified = pdesc.replace("\r\n", "\n").replace("\r", "\n")
        marked = re.sub(r"♦\s+", "\n\n**♦** ", unified)
        return "<br/>".join(marked.split("\n"))
    return ""
109
+
110
+
111
def find_entry_ids_for_lemma_candidate(cand: str):
    """Return DDGP entry ids whose lemma key equals or starts with *cand*.

    The candidate is reduced with ``simplify`` first. An exact hit in
    ``DDGP_INDEX_LEMAS`` (if any) comes first, followed by every other entry
    whose simplified key starts with the candidate, in index order and
    without duplicates. An empty candidate, or one that simplifies to
    nothing, yields ``[]``.
    """
    if not cand:
        return []
    base = simplify(cand)
    if not base:
        # Every string starts with "", so an empty key (e.g. a candidate made
        # only of digits or punctuation) would match the entire index.
        return []

    results = []
    # NOTE(review): assumes entry ids are hashable (int/str) - confirm
    # against the JSON index.
    seen = set()

    if base in DDGP_INDEX_LEMAS:
        eid = DDGP_INDEX_LEMAS[base]
        results.append(eid)
        seen.add(eid)

    for k, eid in DDGP_INDEX_LEMAS.items():
        k_simp = simplify(k)
        if k_simp.startswith(base) and eid not in seen:
            results.append(eid)
            seen.add(eid)

    return results
130
+
131
+
132
+ # ============================================================
133
+ # DDGP — FUNÇÃO DE LOOKUP (GRADIO)
134
+ # ============================================================
135
+
136
def _ddgp_entries_for_ids(entry_ids):
    """Resolve entry ids to DDGP entries, skipping ids with no entry."""
    entries = []
    for eid in entry_ids:
        ent = DDGP_ENTRY.get(str(eid))
        if ent:
            entries.append(ent)
    return entries


def _format_ddgp_entry(ent) -> str:
    """Render one DDGP entry as a Markdown section: headword + description."""
    return f"### {ent.get('gword','')}\n{format_pdesc(ent.get('pdesc',''))}"


def ddgp_lookup(query: str) -> str:
    """Look up a Greek form or lemma in the DDGP and return Markdown.

    Lookup order, stopping at the first step that yields entries:
      1. the simplified query as an inflected form (``DDGP_INDEX_FORMAS``),
         limited to the first 10 ids;
      2. the lemma mapped to that form (``DDGP_FORMA_TO_LEMA``), matched
         exactly and by prefix against the lemma index;
      3. the query itself as a lemma or lemma prefix, limited to 10 entries,
         so that lemma and prefix queries work even when the form tables do
         not contain them.

    Returns a user-facing message when the query is blank or nothing is
    found; otherwise the formatted entries separated by horizontal rules.
    """
    if not query or not query.strip():
        return "⚠️ Digite uma forma ou lema em grego."

    max_hits = 10
    palavra = query.strip()
    simp_form = simplify(palavra)

    # 1) lookup by form
    entries = []
    if simp_form in DDGP_INDEX_FORMAS:
        entries = _ddgp_entries_for_ids(DDGP_INDEX_FORMAS[simp_form][:max_hits])

    # 2) form -> lemma
    if not entries and simp_form in DDGP_FORMA_TO_LEMA:
        entries = _ddgp_entries_for_ids(
            find_entry_ids_for_lemma_candidate(DDGP_FORMA_TO_LEMA[simp_form])
        )

    # 3) query as lemma / lemma prefix. Skipped when the simplified query is
    # empty, since an empty prefix would match the whole index. Capped because
    # a short prefix can match a very large number of lemmas.
    if not entries and simp_form:
        entries = _ddgp_entries_for_ids(
            find_entry_ids_for_lemma_candidate(palavra)
        )[:max_hits]

    if not entries:
        return "❌ Nenhuma entrada do DDGP encontrada."

    return "\n\n---\n\n".join(_format_ddgp_entry(ent) for ent in entries)
177
+
178
+
179
+ # ============================================================
180
+ # WIC — FUNÇÃO GRADIO
181
+ # ============================================================
182
+
183
def wic_ud(sentence: str):
    """Gradio callback for the WIC tab: analyze a sentence with the UD pipeline.

    Returns a ``(message, table)`` pair. On success the message is ``None``
    and the table holds one ``[token, lemma, UPOS, feats]`` row per word. In
    every other case (blank input, pipeline unavailable, empty analysis) the
    table is ``None`` and the message explains why.
    """
    if not (sentence and sentence.strip()):
        message = "Cole uma frase curta em grego antigo."
    elif not UD_AVAILABLE:
        message = "⚠️ A análise morfológica em contexto (UD) não está disponível neste ambiente."
    else:
        analysis = morph_ud_analyze(sentence)
        if analysis:
            rows = [
                [tok["token"], tok["lema"], tok["upos"], tok["feats"]]
                for tok in analysis
            ]
            return None, rows
        message = "Não foi possível analisar a frase com o pipeline UD."

    return (message, None)
214
+
215
+
216
+ # ============================================================
217
+ # INTERFACE GRADIO
218
+ # ============================================================
219
+
220
with gr.Blocks(
    css="""
    #ddgp-logo { max-width:120px; }
    """
) as demo:

    # Logo + title. Rendered with gr.HTML because the content is raw HTML;
    # `unsafe_allow_html` is a Streamlit argument, not a gr.Markdown
    # parameter, so it is not passed here.
    gr.HTML(
        """
        <div style="display:flex; align-items:center; gap:16px;">
          <img id="ddgp-logo"
               src="https://raw.githubusercontent.com/aniseferreira/DDGP_Plus/main/ddgp/logo.png">
          <div>
            <h2>DDGP Plus</h2>
            <div>Análise morfológica (UD) + Dicionário Grego–Português</div>
          </div>
        </div>
        <hr/>
        """
    )

    # ============================
    # WIC TAB: morphological analysis in context (UD)
    # ============================
    with gr.Tab("🧩 Análise morfológica (WIC — UD)"):
        wic_in = gr.Textbox(
            label="Cole uma frase curta contendo o vocábulo",
            placeholder="τὸ προκείμενον ἵνα μὴ μεῖζον ἡμῖν",
        )
        wic_btn = gr.Button("Analisar")
        # wic_ud returns (message, table): the message goes to the Markdown
        # component and the rows to the Dataframe.
        wic_msg = gr.Markdown()
        wic_out = gr.Dataframe(
            headers=["Token", "Lema", "UPOS", "Feições"],
            interactive=False,
        )

        wic_btn.click(
            wic_ud,
            inputs=wic_in,
            outputs=[wic_msg, wic_out],
        )

    # ============================
    # DDGP TAB: dictionary lookup
    # ============================
    with gr.Tab("📘 DDGP"):
        ddgp_in = gr.Textbox(
            label="Forma ou lema em grego",
            placeholder="λέγω, πάθος, παθ",
        )
        ddgp_btn = gr.Button("Consultar")
        ddgp_out = gr.Markdown()

        ddgp_btn.click(
            ddgp_lookup,
            inputs=ddgp_in,
            outputs=ddgp_out,
        )

    # Footer (raw HTML, same reasoning as the header).
    gr.HTML(
        """
        <hr/>
        <small>
        DDGP Plus — Projeto Letras Clássicas Digitais (FCLAr/UNESP).<br/>
        Conteúdo lexicográfico licenciado sob CC BY–NC–ND 4.0.
        </small>
        """
    )

demo.launch()