Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| import os | |
| import time | |
| from collections import Counter | |
| from io import StringIO | |
# =====================================================
# CONFIGURATION
# =====================================================
# Token-classification (NER) model queried through the Hugging Face router.
MODEL_ID = "dslim/bert-base-NER"

# The hf-inference provider is addressed as /hf-inference/models/<model id>
# on the router host (per the Inference Providers documentation); the bare
# /<model id> path used previously is not a documented inference route.
API_URL = f"https://router.huggingface.co/hf-inference/models/{MODEL_ID}"

# Access token read from the environment. Surrounding whitespace (e.g. a
# trailing newline pasted into the secret) is removed so it cannot corrupt
# the Authorization header, and a blank value is treated as missing.
HF_API_TOKEN = (os.getenv("HF_API_TOKEN") or "").strip()
if not HF_API_TOKEN:
    raise RuntimeError("HF_API_TOKEN não encontrado. Configure em Settings → Secrets.")

# Headers sent with every inference request.
HEADERS = {
    "Authorization": f"Bearer {HF_API_TOKEN}",
    "Content-Type": "application/json",
}
# =====================================================
# MAIN FUNCTION
# =====================================================
def _error_frame(term, detail=""):
    """Build the one-row Term/Frequency/Hot table used to report a problem."""
    return pd.DataFrame(
        [[term, detail, ""]],
        columns=["Term", "Frequency", "Hot"],
    )


def _terms_from_entities(entities):
    """Return the normalized ``word`` of each entity that is longer than 2 chars."""
    terms = []
    for ent in entities:
        # Skip malformed items instead of aborting the whole abstract.
        if not isinstance(ent, dict):
            continue
        term = str(ent.get("word") or "").strip().lower()
        if len(term) > 2:
            terms.append(term)
    return terms


def analyze_abstracts(csv_text):
    """Count the entities the NER endpoint finds in a CSV's ``abstract`` column.

    Each non-empty abstract is posted to ``API_URL``; the ``word`` of every
    returned entity is stripped, lowercased and kept when longer than two
    characters. Terms are then counted across all abstracts.

    Args:
        csv_text: CSV content as text. It must contain an ``abstract``
            column; the header is matched ignoring case and surrounding
            whitespace when no exact ``abstract`` column exists.

    Returns:
        A DataFrame with columns ``Term``, ``Frequency`` and ``Hot``, ordered
        by descending frequency (ties keep first-seen order). ``Hot`` is True
        for terms whose frequency is at or above the 90th percentile of all
        term frequencies. When the CSV cannot be parsed, the column is
        missing, or no term was extracted, a single-row DataFrame with the
        same columns carries the message instead.
    """
    # ---------- Read CSV ----------
    try:
        df = pd.read_csv(StringIO(csv_text))
    except Exception as e:
        # UI boundary: any parsing failure is reported in the output table
        # rather than raised.
        return _error_frame("Erro ao ler CSV", str(e))

    # Prefer the exact column name; otherwise tolerate headers such as
    # "date, Abstract" that carry stray whitespace or different casing.
    if "abstract" in df.columns:
        abstract_col = "abstract"
    else:
        abstract_col = next(
            (col for col in df.columns if str(col).strip().lower() == "abstract"),
            None,
        )
    if abstract_col is None:
        return _error_frame("Erro", "Coluna 'abstract' não encontrada")

    all_terms = []

    # ---------- Process abstracts ----------
    for abstract in df[abstract_col].dropna():
        # Cells parsed as numbers are not JSON-serializable as-is, and blank
        # cells are not worth a network round trip.
        text = str(abstract).strip()
        if not text:
            continue

        payload = {
            "inputs": text,
            "parameters": {
                "aggregation_strategy": "simple",
            },
            "options": {
                "wait_for_model": True,
            },
        }

        try:
            response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
            result = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network failure or a non-JSON body: log and move on to the
            # next abstract (best effort).
            print("Erro na requisição:", e)
            continue

        # The API reports problems (e.g. model still loading) as a dict
        # with an "error" key.
        if isinstance(result, dict) and "error" in result:
            print("Aviso da API:", result)
            continue

        # Successful responses are a list of entity dicts.
        if isinstance(result, list):
            all_terms.extend(_terms_from_entities(result))

    if not all_terms:
        return _error_frame("Nenhum termo médico encontrado (verifique logs da API)")

    # ---------- Frequency ----------
    # most_common() is already sorted by descending count and keeps
    # first-seen order for ties, so the table order is deterministic.
    df_terms = pd.DataFrame(
        Counter(all_terms).most_common(),
        columns=["Term", "Frequency"],
    )

    # ---------- Hot terms ----------
    threshold = df_terms["Frequency"].quantile(0.90)
    df_terms["Hot"] = df_terms["Frequency"] >= threshold
    return df_terms
# =====================================================
# INTERFACE
# =====================================================
# Gradio UI: a multi-line textbox receives the pasted CSV text and the
# resulting term table is rendered as a dataframe.
interface = gr.Interface(
    fn=analyze_abstracts,
    inputs=gr.Textbox(
        lines=18,
        placeholder="Cole aqui seu CSV com colunas: date,abstract",
    ),
    outputs=gr.Dataframe(),
    title="Medical Hot Terms Analyzer",
    # The description names the model actually configured in MODEL_ID; it
    # previously advertised PubMedBERT, which this app does not call.
    description=(
        f"Extrai termos médicos de abstracts usando {MODEL_ID} (NER) "
        "e identifica hot terms (top 10%)."
    ),
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()