Spaces:

asilvamaia
/

Ident_br

Sleeping

File size: 4,235 Bytes

fa24fb3

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from urllib.parse import urlparse
import pandas as pd
import io

# --- SETTINGS ---
# ⚠️ Replace with your own Hugging Face "user/model" repository id.
# This id is what load_model() passes to the `from_pretrained` calls.
MODEL_ID = "asilvamaia/ident_br" 

# Page configuration: browser tab title and icon.
# NOTE(review): Streamlit documents set_page_config as needing to run before
# other st.* commands — confirm before moving this call further down.
st.set_page_config(page_title="Validador de Domínios .BR", page_icon="🇧🇷")

# --- FUNÇÃO DE LIMPEZA (V11) ---
def limpar_entrada(texto: str) -> str:
    """Normalize one raw input line down to a bare host name.

    The value is converted to ``str``, stripped and lower-cased. The scheme,
    path/query, port and a leading ``www.`` are then removed.

    Args:
        texto: Raw entry, e.g. ``"https://www.exemplo.com.br:8080/x"``.

    Returns:
        ``""`` for blank input; the normalized text unchanged when it
        contains ``"@"`` (so the caller can reject e-mail addresses); the
        normalized text when ``urlparse`` cannot parse it; otherwise the
        cleaned host name.
    """
    texto = str(texto).strip().lower()
    if not texto:
        return ""
    if "@" in texto:
        # Keep e-mail addresses intact so the caller can reject them.
        return texto

    # Only an explicit "://" marks a scheme. Testing for the substring "http"
    # misclassifies hosts such as "httpbin.com.br/path": no scheme would be
    # prepended, urlparse would find no netloc, and the path would survive.
    texto_temp = texto if "://" in texto else "http://" + texto

    try:
        parsed = urlparse(texto_temp)
    except ValueError:
        # urlparse raises ValueError on malformed hosts (e.g. an unbalanced
        # "[" from a broken IPv6 literal); fall back to the normalized text.
        return texto

    # Fall back to the whole text when no network location was recognised.
    dominio_limpo = parsed.netloc or texto

    # Drop the port (everything from the first ":" on).
    dominio_limpo = dominio_limpo.partition(":")[0]

    # Drop a leading "www." label.
    if dominio_limpo.startswith("www."):
        dominio_limpo = dominio_limpo[4:]

    return dominio_limpo

# --- MODEL LOADING (CACHED) ---
# The cache keeps the model from being reloaded on every click/rerun, which
# keeps the app fast.
@st.cache_resource
def _load_model_cached():
    """Load the tokenizer and classifier for MODEL_ID (cached by Streamlit).

    Errors are deliberately NOT caught here: if this function returned
    ``(None, None)`` on failure, that failure value would itself be cached
    and a transient error would stick until the process restarts.

    Returns:
        A ``(tokenizer, model)`` tuple with the model on CPU in eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    # The free Space tier runs on CPU.
    model.to("cpu")
    model.eval()
    return tokenizer, model


def load_model():
    """Return ``(tokenizer, model)``, or ``(None, None)`` if loading fails.

    Delegates to the cached loader and reports any failure in the UI via
    ``st.error``. Because the error handling lives outside the cached
    function, a failed attempt is retried on the next rerun instead of
    being served from the cache.
    """
    try:
        return _load_model_cached()
    except Exception as e:
        st.error(f"Erro ao carregar o modelo: {e}")
        return None, None

tokenizer, model = load_model()

# --- INTERFACE ---
st.title("🇧🇷 Validador de Domínios .BR com IA")
st.write("Faça upload de uma lista suja (.txt) e a IA extrairá apenas os domínios .br válidos.")

uploaded_file = st.file_uploader("Escolha um arquivo .txt", type="txt")

if uploaded_file is not None and tokenizer is not None:
    # Read the uploaded file. "utf-8-sig" drops a leading byte-order mark so
    # it does not end up glued to the first entry, and errors="replace" keeps
    # a file with stray non-UTF-8 bytes from crashing the app.
    conteudo = uploaded_file.getvalue().decode("utf-8-sig", errors="replace")
    linhas = io.StringIO(conteudo).readlines()

    st.info(f"Arquivo carregado com {len(linhas)} linhas. Processando...")

    if st.button("Iniciar Limpeza"):
        validos = []
        rejeitados = []

        # Progress bar
        progress_bar = st.progress(0)

        # Entries are classified one at a time (batch size 1) so the progress
        # bar can be updated while running on CPU.
        for i, linha in enumerate(linhas):
            original = linha.strip()
            limpo = limpar_entrada(original)

            # Blank lines are skipped and counted in neither list.
            if not limpo:
                continue

            # Inference
            inputs = tokenizer(limpo, return_tensors="pt", truncation=True, max_length=128)
            with torch.no_grad():
                outputs = model(**inputs)
                pred = torch.argmax(outputs.logits, dim=1).item()

            # Validation rules: the model must predict class 1, and the
            # cleaned text must contain a dot and no "@".
            if pred == 1 and "." in limpo and "@" not in limpo:
                validos.append(limpo)
            else:
                rejeitados.append(original)

            # Refresh the bar only every 10 items so the UI does not stall.
            if i % 10 == 0:
                progress_bar.progress((i + 1) / len(linhas))

        progress_bar.progress(100)

        # --- RESULTS ---
        st.success("Processamento Concluído!")

        col1, col2 = st.columns(2)
        with col1:
            st.metric("✅ Aprovados", len(validos))
        with col2:
            st.metric("🔴 Rejeitados", len(rejeitados))

        # --- DOWNLOAD ---
        # Remove duplicates and sort for a stable output file.
        validos_unicos = sorted(set(validos))
        res_text = "\n".join(validos_unicos)

        st.download_button(
            label="⬇️ Baixar Lista Limpa (.txt)",
            data=res_text,
            file_name="dominios_limpos.txt",
            mime="text/plain"
        )

        # Optional: show a sample of the approved entries.
        with st.expander("Ver amostra dos aprovados"):
            st.write(validos_unicos[:20])