# trendtopics / app.py
# cmbtest's picture
# update
# f7efaf2 verified
import gradio as gr
import pandas as pd
import requests
import os
import time
from collections import Counter
from io import StringIO
# =====================================================
# CONFIGURATION
# =====================================================
# Model queried for token classification (NER).
MODEL_ID = "dslim/bert-base-NER"

# Requests sent through router.huggingface.co must name the inference
# provider and use the "/models/" prefix; the bare "<router>/<model id>" path
# does not address a model endpoint.
API_URL = f"https://router.huggingface.co/hf-inference/models/{MODEL_ID}"

# The access token is read from the environment and never hard-coded.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    # Fail at import time so a missing secret is reported immediately instead
    # of surfacing later as an authorization error on every request.
    raise RuntimeError("HF_API_TOKEN não encontrado. Configure em Settings → Secrets.")

# Headers attached to every inference request.
HEADERS = {
    "Authorization": f"Bearer {HF_API_TOKEN}",
    "Content-Type": "application/json",
}
# =====================================================
# MAIN FUNCTION
# =====================================================
def _message_frame(term, frequency="", hot=""):
    """Return a one-row result table that carries a status or error message."""
    return pd.DataFrame(
        [[term, frequency, hot]],
        columns=["Term", "Frequency", "Hot"],
    )


def _find_abstract_column(df):
    """Return the label of the abstract column in *df*, or None if absent.

    An exact "abstract" header wins. Otherwise the first header equal to
    "abstract" after trimming whitespace and ignoring case is used, so pasted
    headers such as "Abstract" or "date, abstract" are accepted too.
    """
    if "abstract" in df.columns:
        return "abstract"
    for label in df.columns:
        if str(label).strip().lower() == "abstract":
            return label
    return None


def _extract_terms(result):
    """Return the normalized entity strings contained in one API response.

    Only list responses are read; each dict entry contributes its "word"
    value, trimmed and lower-cased, when it is longer than two characters.
    """
    if not isinstance(result, list):
        return []
    terms = []
    for ent in result:
        if not isinstance(ent, dict):
            # Skip entries that are not entity dicts instead of raising.
            continue
        term = str(ent.get("word") or "").strip().lower()
        if len(term) > 2:
            terms.append(term)
    return terms


def analyze_abstracts(csv_text):
    """Count entities recognized in the abstracts of a CSV and flag hot ones.

    Args:
        csv_text: CSV content as text. It must contain an "abstract" column
            (header matched ignoring case and surrounding whitespace).

    Returns:
        A DataFrame with columns "Term", "Frequency" and "Hot", ordered by
        descending frequency, where "Hot" is True for terms whose frequency
        is at or above the 90th percentile of all frequencies. When the CSV
        cannot be parsed, the column is missing, or no term was collected, a
        single-row DataFrame with the same columns carries the message.
    """
    # ---------- Read CSV ----------
    try:
        df = pd.read_csv(StringIO(csv_text))
    except Exception as e:
        # UI boundary: any parsing failure is reported in the table rather
        # than raised to the caller.
        return _message_frame("Erro ao ler CSV", str(e))

    column = _find_abstract_column(df)
    if column is None:
        return _message_frame("Erro", "Coluna 'abstract' não encontrada")

    all_terms = []

    # ---------- Process abstracts ----------
    for abstract in df[column].dropna():
        # Cells may hold non-string values (e.g. numbers); convert them so the
        # payload is always JSON-serializable, and skip blank cells to avoid
        # a pointless API call.
        text = str(abstract).strip()
        if not text:
            continue

        payload = {
            "inputs": text,
            "parameters": {"aggregation_strategy": "simple"},
            "options": {"wait_for_model": True},
        }

        try:
            response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network failures and non-JSON bodies: log and move on so one bad
            # request does not abort the whole batch.
            print("Erro na requisição:", e)
            continue

        # The API reports problems (e.g. model still loading) as a dict with
        # an "error" key.
        if isinstance(result, dict) and "error" in result:
            print("Aviso da API:", result)
            continue

        all_terms.extend(_extract_terms(result))

    if not all_terms:
        return _message_frame("Nenhum termo médico encontrado (verifique logs da API)")

    # ---------- Frequency ----------
    # most_common() yields (term, count) pairs already ordered by descending
    # count, with ties kept in first-seen order, so the table is deterministic.
    df_terms = pd.DataFrame(
        Counter(all_terms).most_common(),
        columns=["Term", "Frequency"],
    )

    # ---------- Hot terms ----------
    threshold = df_terms["Frequency"].quantile(0.90)
    df_terms["Hot"] = df_terms["Frequency"] >= threshold
    return df_terms
# =====================================================
# INTERFACE
# =====================================================
# Gradio UI: a multi-line textbox receives the pasted CSV, analyze_abstracts
# processes it, and the returned table is rendered as a dataframe.
# NOTE(review): the description mentions PubMedBERT, while MODEL_ID is set to
# "dslim/bert-base-NER" — confirm which model is intended.
interface = gr.Interface(
    title="Medical Hot Terms Analyzer",
    description="Extrai termos médicos de abstracts usando PubMedBERT (NER) e identifica hot terms (top 10%).",
    fn=analyze_abstracts,
    inputs=gr.Textbox(
        placeholder="Cole aqui seu CSV com colunas: date,abstract",
        lines=18,
    ),
    outputs=gr.Dataframe(),
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()