Spaces:

AniseF
/

pln-ddgp-plus

Sleeping

App Files Files Community

pln-ddgp-plus / app.py

AniseF

Update app.py

d80f633 verified 13 days ago

raw

history blame contribute delete

9.98 kB

	import html
	import importlib
	import json
	import os
	import re
	import subprocess
	import sys
	import tempfile
	import unicodedata
	import urllib.request

	import gradio as gr
	import pandas as pd
	import spacy

	from translit import latin_to_basic_grc

	# --- 1. INSTALAÇÃO E CARREGAMENTO ---
	def install_odycy():
	    """Install the OdyCy `grc_odycy_joint_sm` model if it is not importable.

	    The wheel is downloaded from the Hugging Face Hub under a file name
	    that pip accepts, installed with ``--no-deps`` using the running
	    interpreter, and then discarded.

	    Raises:
	        urllib.error.URLError: if the wheel cannot be downloaded.
	        subprocess.CalledProcessError: if ``pip install`` fails.
	    """
	    try:
	        import grc_odycy_joint_sm  # noqa: F401
	        return
	    except ImportError:
	        pass

	    url = "https://huggingface.co/chcaa/grc_odycy_joint_sm/resolve/main/grc_odycy_joint_sm-any-py3-none-any.whl"
	    valid_wheel_name = "grc_odycy_joint_sm-1.0.0-py3-none-any.whl"

	    # A temporary directory guarantees the wheel is removed even when the
	    # download or pip fails, and does not require a writable working dir.
	    with tempfile.TemporaryDirectory() as tmp_dir:
	        wheel_path = os.path.join(tmp_dir, valid_wheel_name)
	        urllib.request.urlretrieve(url, wheel_path)
	        subprocess.check_call(
	            [sys.executable, "-m", "pip", "install", wheel_path, "--no-deps"]
	        )

	    # The package was installed after the interpreter started; flush the
	    # import system's caches so a later `import` in this process sees it.
	    importlib.invalidate_caches()

	# Make sure the OdyCy model is installed before trying to load it.
	install_odycy()

	try:
	    import grc_odycy_joint_sm
	    nlp = grc_odycy_joint_sm.load()
	except Exception as exc:
	    # Fall back to a blank Ancient Greek pipeline, but say why, and do
	    # not swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
	    print(
	        f"Aviso: modelo OdyCy indisponível ({exc!r}); usando spacy.blank('grc').",
	        file=sys.stderr,
	    )
	    nlp = spacy.blank("grc")

	# Universal POS tag -> Portuguese label shown in the results table.
	# Tags missing from this table are displayed unchanged by the caller.
	TAG_MAP = {
	    'NOUN': 'Substantivo',
	    'VERB': 'Verbo',
	    'ADJ': 'Adjetivo',
	    'DET': 'Artigo/Det.',
	    'PRON': 'Pronome',
	    'ADV': 'Advérbio',
	    'ADP': 'Preposição',
	    'CCONJ': 'Conjunção',
	    'SCONJ': 'Conjunção Sub.',
	    'PART': 'Partícula',
	    'PROPN': 'Nome Próprio',
	    'PUNCT': 'Pontuação',
	    'AUX': 'Auxiliar/Cópula',
	    'NUM': 'Numeral',
	}

	def load_json(path):
	    """Return the parsed JSON content of `path`, or `{}` if it does not exist.

	    The file is read as UTF-8. Malformed JSON is not handled here and
	    propagates as the exception raised by `json.load`.
	    """
	    if not os.path.exists(path):
	        return {}
	    with open(path, 'r', encoding='utf-8') as handle:
	        content = json.load(handle)
	    return content

	# Dictionary data, loaded once at import time. Each name is bound to an
	# empty dict when its file is missing (see load_json).
	INDEX_LEMAS, INDEX_FORMAS, FORMA_TO_LEMA, ENTRIES, ABREV = (
	    load_json(nome_arquivo)
	    for nome_arquivo in (
	        'ddgp_index_lemas.json',
	        'ddgp_index_formas_final.json',
	        'ddgp_forma_to_lema.json',
	        'ddgp3x_entry.json',
	        'abrev.json',
	    )
	)

	# Concatenate whichever stylesheets are present, each followed by a newline.
	css_content = ""
	for f_css in ('style.css', 'style_map.css'):
	    if not os.path.exists(f_css):
	        continue
	    with open(f_css, 'r', encoding='utf-8') as f:
	        css_content = f"{css_content}{f.read()}\n"

	# --- 2. FUNÇÕES DE APOIO E ORDENAÇÃO ---

	def normalizar_grego(texto):
	    """Return `texto` lower-cased, stripped of combining marks and trimmed.

	    The text is decomposed (NFD), every character with a non-zero
	    combining class is dropped, and the remainder is recomposed (NFC).
	    A falsy input yields the empty string.
	    """
	    if not texto:
	        return ""
	    decomposto = unicodedata.normalize('NFD', texto.lower())
	    letras_base = [c for c in decomposto if unicodedata.combining(c) == 0]
	    recomposto = unicodedata.normalize('NFC', "".join(letras_base))
	    return recomposto.strip()

	def ordem_grega(lema):
	    """Return a sort key that orders `lema` by the Greek alphabet.

	    The lemma is normalised with `normalizar_grego`, then each character
	    is mapped to its 1-based position in the alphabet. Final sigma shares
	    the position of sigma; any other character maps to 99.
	    """
	    posicoes = {
	        letra: indice
	        for indice, letra in enumerate('αβγδεζηθικλμνξοπρστυφχψω', start=1)
	    }
	    posicoes['ς'] = posicoes['σ']
	    return [posicoes.get(letra, 99) for letra in normalizar_grego(lema)]

	def aplicar_abreviaturas_seguro(texto, abrevs=None):
	    """Wrap every known abbreviation found in `texto` in a tooltip `<span>`.

	    An abbreviation is matched only when it starts at a word boundary and
	    is followed by whitespace, one of ``. , ; :`` or the end of the text.
	    Longer abbreviations take precedence over shorter ones.

	    Args:
	        texto: HTML/text of a dictionary entry. Falsy input yields "".
	        abrevs: optional mapping ``abbreviation -> {"descricao", "categoria"}``;
	            defaults to the module-level ``ABREV`` table.

	    Returns:
	        `texto` with each match replaced by
	        ``<span class="..." title="...">abbreviation</span>``; the class is
	        ``autor-sc`` when the category is ``autor`` and ``abrev`` otherwise.
	    """
	    if not texto:
	        return ""
	    tabela = ABREV if abrevs is None else abrevs
	    chaves = sorted((ab for ab in tabela if ab), key=len, reverse=True)
	    if not chaves:
	        return texto

	    # One combined pattern applied in a single pass: text inserted for one
	    # abbreviation (span markup, title) is never re-scanned for another.
	    alternativas = "|".join(re.escape(ab) for ab in chaves)
	    padrao = re.compile(r'\b(?:' + alternativas + r')(?=\s|[.,;:]|$)')

	    def _envolver(match):
	        ab = match.group(0)
	        info = tabela[ab]
	        # Escape the description: it lands inside a double-quoted attribute.
	        desc = html.escape(str(info.get('descricao', '')), quote=True)
	        classe_css = "autor-sc" if info.get('categoria', '') == 'autor' else "abrev"
	        return f'<span class="{classe_css}" title="{desc}">{ab}</span>'

	    # A callable replacement keeps backslashes in the data from being
	    # interpreted as regex template escapes.
	    return padrao.sub(_envolver, texto)

	def format_entry_html(entry_id):
	    """Render the dictionary entry `entry_id` as an HTML result box.

	    The id is converted to a string and looked up in `ENTRIES`; `None` is
	    returned when there is no such entry. In the description,
	    abbreviations are wrapped by `aplicar_abreviaturas_seguro` and every
	    span enclosed in 〈 〉 is wrapped in an `etimo` span.
	    """
	    verbete = ENTRIES.get(str(entry_id))
	    if not verbete:
	        return None

	    gword = verbete.get('gword', '')
	    descricao_marcada = aplicar_abreviaturas_seguro(verbete.get('pdesc', ''))
	    pdesc = re.sub(r'〈(.*?)〉', r'<span class="etimo">〈\1〉</span>', descricao_marcada)

	    return f"""
	<div class="result-box" style="text-transform: none !important; font-variant: normal !important;">
	<div style="color: #1a4d8f; font-size: 1.3em; font-weight: bold; margin-bottom: 6px; text-transform: none !important; font-variant: normal !important;">
	{gword}
	</div>
	<div style="line-height: 1.6; text-transform: none !important; font-variant: normal !important;">
	{pdesc}
	</div>
	</div>
	"""

	# --- 3. CONSULTA E ANÁLISE ---

	def consultar_ddgp(termo):
	    """Return the HTML of the dictionary entries that match `termo` exactly.

	    If the term contains any ASCII letter it is first transliterated with
	    `latin_to_basic_grc`. The normalised term is then looked up in
	    `INDEX_LEMAS` as-is and with the numeric suffixes 1, 2 and 3.
	    Prefix matches are deliberately not attempted. Returns "" when the
	    term is empty or nothing is found.
	    """
	    if not termo:
	        return ""

	    # Transliterate when the input contains Latin letters.
	    if any(c.isalpha() and ord(c) < 128 for c in termo):
	        termo = latin_to_basic_grc(termo)

	    termo_norm = normalizar_grego(termo)

	    # Strict search: the exact key plus its numbered homographs only.
	    tentativas = [termo_norm, *(f"{termo_norm}{i}" for i in (1, 2, 3))]
	    ids = [INDEX_LEMAS[chave] for chave in tentativas if chave in INDEX_LEMAS]
	    if not ids:
	        return ""

	    blocos = (format_entry_html(eid) for eid in sorted(set(ids)))
	    return "".join(bloco for bloco in blocos if bloco)

	def analisar_texto(texto):
	    """Analyse `texto` with the spaCy pipeline and build the outputs for the UI.

	    Each token is recorded with its text, lemma, part of speech (mapped
	    through `TAG_MAP`) and a compact morphology string. Each distinct
	    normalised lemma of a non-punctuation token is looked up once with
	    `consultar_ddgp`; the entries found are concatenated in Greek
	    alphabetical order below a fixed hint box.

	    Args:
	        texto: the text to analyse. Empty, `None` or whitespace-only input
	            yields the empty result without running the pipeline.

	    Returns:
	        An 8-tuple: (first 100 rows of the token table, path of the full
	        CSV export, token count, type count, lemma count, lemma/token
	        ratio, type/token ratio, lexicon HTML). Counts and ratios are
	        strings; the ratios have two decimals.
	    """
	    if not texto or not texto.strip():
	        return None, None, "0", "0", "0", "0", "0", ""

	    doc = nlp(texto)
	    dados = []
	    lemas_unicos_processados = set()
	    verbetes_dict = {}
	    rotulos_morfologicos = (
	        "Case=", "Gender=", "Number=", "VerbForm=",
	        "Person=", "Tense=", "Mood=", "Voice=",
	    )
	    classes_ignoradas = ('PUNCT', 'SYM', 'SPACE')

	    for token in doc:
	        l_orig = token.lemma_
	        l_norm = normalizar_grego(l_orig)
	        pos_pt = TAG_MAP.get(token.pos_, token.pos_)

	        # Keep only the feature values, dropping the "Feature=" labels.
	        morph_info = str(token.morph)
	        for rotulo in rotulos_morfologicos:
	            morph_info = morph_info.replace(rotulo, "")
	        if not morph_info:
	            morph_info = "-"

	        # Query the dictionary only once per normalised lemma.
	        if token.pos_ not in classes_ignoradas and l_norm not in lemas_unicos_processados:
	            lemas_unicos_processados.add(l_norm)
	            res_html = consultar_ddgp(l_norm)
	            if res_html:
	                verbetes_dict[l_orig] = res_html

	        dados.append({
	            'Palavra': token.text,
	            'Lema': l_orig,
	            'Classe': pos_pt,
	            'Morfologia': morph_info
	        })

	    # Header carrying the hint for lemmas that were not found automatically.
	    aviso_html = """
	<div style="background-color: #f8f9fa; padding: 10px; border-left: 4px solid #1a4d8f; margin-bottom: 15px; font-size: 0.9em; color: #555;">
	💡 <b>Dica:</b> Caso um lema não apareça automaticamente abaixo, utilize a aba "Busca direta no DDGP" para consultá-lo manualmente.
	</div>
	"""

	    lexico_html = aviso_html + "".join(
	        verbetes_dict[lema_ord]
	        for lema_ord in sorted(verbetes_dict, key=ordem_grega)
	    )

	    # Explicit columns keep the frame well-formed even with no rows.
	    df = pd.DataFrame(dados, columns=['Palavra', 'Lema', 'Classe', 'Morfologia'])
	    tokens_df = df[~df['Classe'].isin(['Pontuação', 'PUNCT', 'SYM', 'SPACE'])]
	    n_tokens = len(tokens_df)
	    n_types = tokens_df['Palavra'].str.lower().nunique()
	    n_lemas = tokens_df['Lema'].nunique()

	    ttr = (n_types / n_tokens) if n_tokens > 0 else 0
	    ltr = (n_lemas / n_tokens) if n_tokens > 0 else 0

	    # Write each export into its own temporary directory: a fixed file in
	    # the working directory would be overwritten by concurrent requests.
	    csv_path = os.path.join(tempfile.mkdtemp(prefix="ddgp_"), "analise_filologica.csv")
	    df.to_csv(csv_path, index=False)

	    return (df.head(100), csv_path,
	            str(n_tokens), str(n_types), str(n_lemas),
	            f"{ltr:.2f}", f"{ttr:.2f}", lexico_html)

	# --- 4. INTERFACE ---
	# Build the Gradio interface; the stylesheet text collected in
	# `css_content` is passed to the Blocks container.
	with gr.Blocks(css=css_content, title="DDGP + OdyCy") as demo:
	# Header: logo column and title column.
	with gr.Row():
	with gr.Column(scale=1, min_width=100):
	gr.HTML("""
	<div style="display: flex; align-items: center; justify-content: flex-start; height: 80px;">
	<img src="https://raw.githubusercontent.com/aniseferreira/DDGP_Plus/main/ddgp/logo.png" style="height: 80px;">
	</div>
	""")
	with gr.Column(scale=4):
	gr.Markdown("# Estação Filológica DDGP & OdyCy")
	gr.Markdown("## DDGP Plus: Análise lexical e consulta ao Dicionário Digital Grego-Português.")

	# Tab 1: text input and the outputs produced by analisar_texto.
	with gr.Tab("📝 Análise lexical"):
	txt = gr.Textbox(label="Texto em Grego Antigo", lines=6, placeholder="Insira o texto aqui sem aspas...Δειναὶ γὰρ αἱ γυναῖκες εὑρίσκειν τέχνας.")
	btn = gr.Button("🚀 Executar Análise", variant="primary")

	# Summary counters (token, type and lemma counts plus the two ratios).
	with gr.Row():
	t1 = gr.Label(label="Tokens (Total)")
	t2 = gr.Label(label="Types (Formas Únicas)")
	t3 = gr.Label(label="Lemas (Entradas)")
	t4 = gr.Label(label="LTR (Lema-Token)")
	t5 = gr.Label(label="TTR (Type-Token)")

	# Token table and CSV download, next to the contextual lexicon.
	with gr.Row():
	with gr.Column(scale=2):
	out_t = gr.Dataframe(label="Formas")
	out_f = gr.File(label="Exportar CSV")
	with gr.Column(scale=1):
	gr.Markdown("### 📖 Léxico Contextual")
	out_l = gr.HTML()

	# Tab 2: direct dictionary lookup handled by consultar_ddgp.
	with gr.Tab("🔍 Busca direta no DDGP"):
	in_b = gr.Textbox(label="Busca direta no DDGP", placeholder="(ex. λόγος, logos)")
	btn_b = gr.Button("Consultar Base")
	out_b = gr.HTML()

	# Footer with credits and licence.
	gr.Markdown(f"""
	---
	DDGP Plus — Analisador Morfológico e Dicionário Digital de Grego–Português 2026 v.1
	Baseado originalmente no Dicionário Grego-Português e diretamente no Dicionário Digital Grego–Português (DDGP e DGP - ver créditos em [hipatia.fclar.unesp.br](http://hipatia.fclar.unesp.br))
	Projetos Abertos em Letras Clássicas Digitais. Responsável: Anise D'Orange Ferreira.
	Desenvolvimento técnico e programação assistida por Gemini (Google AI).
	Sob licença CC BY-NC-SA 4.0.
	""")

	# The output order must match the 8-tuple returned by analisar_texto.
	btn.click(analisar_texto, inputs=txt,
	outputs=[out_t, out_f, t1, t2, t3, t4, t5, out_l],
	api_name="analisar")

	# Direct lookup: the returned HTML is shown in out_b.
	btn_b.click(consultar_ddgp, inputs=in_b, outputs=out_b,
	api_name="consultar")

	# Page-load handler. This goes at the end of the `with gr.Blocks(...)`
	# block, before the call to `demo.launch()`.
	def carregar_e_analisar(request: gr.Request):
	    """Return the `text` query parameter of `request`, or "" when absent."""
	    return request.query_params.get("text", "")

	# When the Space loads:
	# 1. Fill the text box with the value returned by carregar_e_analisar.
	# 2. Then run analisar_texto on the text box content (an empty text box
	#    yields the empty result).
	demo.load(
	    fn=carregar_e_analisar,
	    inputs=None,
	    outputs=txt,
	).then(
	    fn=analisar_texto,
	    inputs=txt,
	    outputs=[out_t, out_f, t1, t2, t3, t4, t5, out_l],
	)

	demo.launch()