Spaces:
Sleeping
Sleeping
File size: 3,417 Bytes
8fa16c7 d14acb6 0bcb2f9 8fa16c7 d14acb6 0bcb2f9 d14acb6 b221f79 f7efaf2 d14acb6 0bcb2f9 d14acb6 8fa16c7 d14acb6 8fa16c7 d14acb6 8fa16c7 d14acb6 8fa16c7 0bcb2f9 d14acb6 0bcb2f9 5825cdf 0bcb2f9 5825cdf d14acb6 8fa16c7 0bcb2f9 8fa16c7 d14acb6 0bcb2f9 8fa16c7 d14acb6 0bcb2f9 5825cdf d14acb6 8fa16c7 0bcb2f9 8fa16c7 d14acb6 0bcb2f9 d14acb6 8fa16c7 d14acb6 8fa16c7 d14acb6 8fa16c7 0bcb2f9 d14acb6 8fa16c7 d14acb6 8fa16c7 d14acb6 0bcb2f9 d14acb6 8fa16c7 d14acb6 0bcb2f9 d14acb6 8fa16c7 d14acb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import gradio as gr
import pandas as pd
import requests
import os
import time
from collections import Counter
from io import StringIO
# =====================================================
# CONFIGURATION
# =====================================================
MODEL_ID = "dslim/bert-base-NER"
# The HF Inference router serves models under /hf-inference/models/<model-id>;
# the bare f"https://router.huggingface.co/{MODEL_ID}" path is not a valid
# endpoint and returns 404.
API_URL = f"https://router.huggingface.co/hf-inference/models/{MODEL_ID}"
# Token must be supplied via the Space secret / environment variable.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if not HF_API_TOKEN:
    raise RuntimeError("HF_API_TOKEN não encontrado. Configure em Settings → Secrets.")
HEADERS = {
    "Authorization": f"Bearer {HF_API_TOKEN}",
    "Content-Type": "application/json",
}
# =====================================================
# MAIN FUNCTION
# =====================================================
def _single_row(term, frequency):
    """Build a one-row result DataFrame used for error/empty-result messages."""
    return pd.DataFrame(
        [[term, frequency, ""]],
        columns=["Term", "Frequency", "Hot"]
    )


def _fetch_entities(abstract):
    """Call the hosted NER model for a single abstract.

    Returns the parsed JSON response: a list of entity dicts on success,
    or a dict containing an "error" key while the model is still loading.
    Network and JSON-decoding errors propagate to the caller.
    """
    payload = {
        "inputs": abstract,
        "parameters": {"aggregation_strategy": "simple"},
        "options": {"wait_for_model": True},
    }
    response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
    return response.json()


def analyze_abstracts(csv_text):
    """Extract named-entity terms from the 'abstract' column of a pasted CSV.

    Parameters
    ----------
    csv_text : str
        Raw CSV text; must contain an 'abstract' column.

    Returns
    -------
    pandas.DataFrame
        Columns ["Term", "Frequency", "Hot"], sorted by descending
        frequency; "Hot" is True for terms at or above the 90th
        percentile of frequency. On error a one-row DataFrame carrying
        the message is returned instead of raising, so the Gradio UI
        always renders a table.
    """
    # ---------- Read CSV ----------
    try:
        df = pd.read_csv(StringIO(csv_text))
    except Exception as e:
        return _single_row("Erro ao ler CSV", str(e))
    if "abstract" not in df.columns:
        return _single_row("Erro", "Coluna 'abstract' não encontrada")

    # ---------- Process abstracts ----------
    all_terms = []
    for abstract in df["abstract"].dropna():
        try:
            result = _fetch_entities(abstract)
        except Exception as e:
            print("Erro na requisição:", e)
            continue
        # Model still loading or API-side error: log and skip this abstract.
        if isinstance(result, dict) and "error" in result:
            print("Aviso da API:", result)
            continue
        # Success: a list of entity dicts.
        if isinstance(result, list):
            for ent in result:
                term = ent.get("word", "").strip().lower()
                # Drop very short fragments (sub-word pieces, punctuation).
                if len(term) > 2:
                    all_terms.append(term)

    if not all_terms:
        return _single_row("Nenhum termo médico encontrado (verifique logs da API)", "")

    # ---------- Frequency ----------
    term_counts = Counter(all_terms)
    df_terms = (
        pd.DataFrame(term_counts.items(), columns=["Term", "Frequency"])
        .sort_values(by="Frequency", ascending=False)
        .reset_index(drop=True)
    )

    # ---------- Hot terms ----------
    # A term is "hot" when its frequency reaches the 90th percentile.
    threshold = df_terms["Frequency"].quantile(0.90)
    df_terms["Hot"] = df_terms["Frequency"] >= threshold
    return df_terms
# =====================================================
# INTERFACE
# =====================================================
# Simple Gradio UI: paste raw CSV text, get back the term-frequency table.
# NOTE: description previously claimed "PubMedBERT", but MODEL_ID is
# dslim/bert-base-NER (general-purpose BERT NER) — corrected to match.
interface = gr.Interface(
    fn=analyze_abstracts,
    inputs=gr.Textbox(
        lines=18,
        placeholder="Cole aqui seu CSV com colunas: date,abstract"
    ),
    outputs=gr.Dataframe(),
    title="Medical Hot Terms Analyzer",
    description=(
        "Extrai termos de abstracts usando BERT NER (dslim/bert-base-NER) "
        "e identifica hot terms (top 10%)."
    ),
)

if __name__ == "__main__":
    interface.launch()