{
"cells": [
{
"cell_type": "markdown",
"id": "71c10e88",
"metadata": {},
"source": [
"### Instalar dependências"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "15f83305",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: skops==0.13.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from -r requirements.txt (line 1)) (0.13.0)\n",
"Requirement already satisfied: pandas==2.3.2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from -r requirements.txt (line 2)) (2.3.2)\n",
"Requirement already satisfied: scikit-learn==1.7.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from -r requirements.txt (line 3)) (1.7.1)\n",
"Requirement already satisfied: spacy==3.8.7 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from -r requirements.txt (line 4)) (3.8.7)\n",
"Requirement already satisfied: joblib in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from -r requirements.txt (line 5)) (1.5.2)\n",
"Requirement already satisfied: numpy>=1.25.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from skops==0.13.0->-r requirements.txt (line 1)) (2.3.4)\n",
"Requirement already satisfied: packaging>=17.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from skops==0.13.0->-r requirements.txt (line 1)) (25.0)\n",
"Requirement already satisfied: prettytable>=3.9 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from skops==0.13.0->-r requirements.txt (line 1)) (3.17.0)\n",
"Requirement already satisfied: scipy>=1.10.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from skops==0.13.0->-r requirements.txt (line 1)) (1.16.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pandas==2.3.2->-r requirements.txt (line 2)) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pandas==2.3.2->-r requirements.txt (line 2)) (2025.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pandas==2.3.2->-r requirements.txt (line 2)) (2025.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from scikit-learn==1.7.1->-r requirements.txt (line 3)) (3.6.0)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (1.0.15)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (2.0.13)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (3.0.11)\n",
"Requirement already satisfied: thinc<8.4.0,>=8.3.4 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (8.3.9)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (1.1.3)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (2.5.1)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (2.0.10)\n",
"Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (0.4.3)\n",
"Requirement already satisfied: typer<1.0.0,>=0.3.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (0.20.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (4.67.1)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (2.32.5)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (2.12.4)\n",
"Requirement already satisfied: jinja2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (3.1.6)\n",
"Requirement already satisfied: setuptools in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (80.9.0)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from spacy==3.8.7->-r requirements.txt (line 4)) (3.5.0)\n",
"Requirement already satisfied: language-data>=1.2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from langcodes<4.0.0,>=3.2.0->spacy==3.8.7->-r requirements.txt (line 4)) (1.3.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy==3.8.7->-r requirements.txt (line 4)) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.41.5 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy==3.8.7->-r requirements.txt (line 4)) (2.41.5)\n",
"Requirement already satisfied: typing-extensions>=4.14.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy==3.8.7->-r requirements.txt (line 4)) (4.15.0)\n",
"Requirement already satisfied: typing-inspection>=0.4.2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy==3.8.7->-r requirements.txt (line 4)) (0.4.2)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.8.7->-r requirements.txt (line 4)) (3.4.4)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.8.7->-r requirements.txt (line 4)) (3.11)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.8.7->-r requirements.txt (line 4)) (2.5.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.8.7->-r requirements.txt (line 4)) (2025.11.12)\n",
"Requirement already satisfied: blis<1.4.0,>=1.3.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from thinc<8.4.0,>=8.3.4->spacy==3.8.7->-r requirements.txt (line 4)) (1.3.2)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from thinc<8.4.0,>=8.3.4->spacy==3.8.7->-r requirements.txt (line 4)) (0.1.5)\n",
"Requirement already satisfied: click>=8.0.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from typer<1.0.0,>=0.3.0->spacy==3.8.7->-r requirements.txt (line 4)) (8.3.1)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from typer<1.0.0,>=0.3.0->spacy==3.8.7->-r requirements.txt (line 4)) (1.5.4)\n",
"Requirement already satisfied: rich>=10.11.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from typer<1.0.0,>=0.3.0->spacy==3.8.7->-r requirements.txt (line 4)) (14.2.0)\n",
"Requirement already satisfied: typer-slim<1.0.0,>=0.3.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from weasel<0.5.0,>=0.1.0->spacy==3.8.7->-r requirements.txt (line 4)) (0.20.0)\n",
"Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from weasel<0.5.0,>=0.1.0->spacy==3.8.7->-r requirements.txt (line 4)) (0.23.0)\n",
"Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from weasel<0.5.0,>=0.1.0->spacy==3.8.7->-r requirements.txt (line 4)) (7.5.0)\n",
"Requirement already satisfied: wrapt in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy==3.8.7->-r requirements.txt (line 4)) (2.0.1)\n",
"Requirement already satisfied: marisa-trie>=1.1.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy==3.8.7->-r requirements.txt (line 4)) (1.3.1)\n",
"Requirement already satisfied: wcwidth in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from prettytable>=3.9->skops==0.13.0->-r requirements.txt (line 1)) (0.2.14)\n",
"Requirement already satisfied: six>=1.5 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas==2.3.2->-r requirements.txt (line 2)) (1.17.0)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy==3.8.7->-r requirements.txt (line 4)) (4.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy==3.8.7->-r requirements.txt (line 4)) (2.19.2)\n",
"Requirement already satisfied: mdurl~=0.1 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy==3.8.7->-r requirements.txt (line 4)) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /home/garth/Projetos/sugestor-de-emails/.venv/lib/python3.13/site-packages (from jinja2->spacy==3.8.7->-r requirements.txt (line 4)) (3.0.3)\n",
"Collecting pt-core-news-sm==3.8.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.0/13.0 MB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25h\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('pt_core_news_sm')\n"
]
}
],
"source": [
"# %pip targets the active kernel's environment; !pip may resolve to a different interpreter\n",
"%pip install -r requirements.txt\n",
"!python -m spacy download pt_core_news_sm"
]
},
{
"cell_type": "markdown",
"id": "e1c638ad",
"metadata": {},
"source": [
"### Importar bibliotecas"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a28d353f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# scikit-learn pieces used to build and evaluate the classification pipeline\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import FeatureUnion, Pipeline\n",
"from sklearn.preprocessing import FunctionTransformer\n"
]
},
{
"cell_type": "markdown",
"id": "3ff61ebf",
"metadata": {},
"source": [
"### Dados de exemplo\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9b233d4a",
"metadata": {},
"outputs": [],
"source": [
"# Load the labeled dataset: one email text per row plus a produtivo/improdutivo label\n",
"df = pd.read_csv(\"emails_produtivo_improdutivo.csv\")\n",
"texts = df[\"email\"].values\n",
"labels = df[\"label\"].values"
]
},
{
"cell_type": "markdown",
"id": "2a2ca241",
"metadata": {},
"source": [
"### Separar treino e teste\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e258a336",
"metadata": {},
"outputs": [],
"source": [
"# stratify keeps the produtivo/improdutivo proportions identical in train and test\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    texts, labels, test_size=0.25, random_state=42, stratify=labels\n",
")"
]
},
{
"cell_type": "markdown",
"id": "98187e81",
"metadata": {},
"source": [
"### Criar funções de operações de texto"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "57d103ea",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"import re\n",
"\n",
"language_model = \"pt_core_news_sm\"\n",
"nlp = spacy.load(language_model)\n",
"\n",
"def lemmatize_text(text: str) -> str:\n",
"    \"\"\"Lemmatize a text, dropping punctuation, whitespace and stopwords.\"\"\"\n",
"    doc = nlp(text.lower())\n",
"    return \" \".join([\n",
"        token.lemma_ for token in doc\n",
"        if not token.is_punct and not token.is_space and token.text not in nlp.Defaults.stop_words\n",
"    ])\n",
"\n",
"def extrair_flags(texts: list[str]) -> list[dict[str, int]]:\n",
"    \"\"\"Extract binary heuristic flags from each text (consumed by DictVectorizer).\"\"\"\n",
"    out = []\n",
"    for txt in texts:\n",
"        t = txt.lower()\n",
"        flags = {\n",
"            \"has_time\": int(bool(re.search(r\"\\b\\d{1,2}:\\d{2}\\b\", t))),\n",
"            \"has_date\": int(bool(re.search(r\"\\b\\d{1,2}/\\d{1,2}/\\d{2,4}\\b\", t))),\n",
"            \"has_ticket\": int(bool(re.search(r\"(inc|req|chamado|#\\d{4,})\", t))),\n",
"            \"len_lt_60\": int(len(t) < 60),\n",
"            # fixed: 'segu[e|ir]' was a character class matching single chars, not the intended alternation\n",
"            \"has_action_word\": int(bool(re.search(r\"\\b(pauta|entrega|deadline|anexo|segue|seguir|agendar|reuni[aã]o|suporte|sla)\\b\", t))),\n",
"            # fixed: stray '!*$' anchored 'obrigado/obrigada' to end-of-string, making it effectively unmatchable\n",
"            \"has_social_word\": int(bool(re.search(r\"\\b(parab[eé]ns|felicidades|obrigad[oa]|confraterniza|happy hour|churrasco)\\b\", t)))\n",
"        }\n",
"        out.append(flags)\n",
"    return out"
]
},
{
"cell_type": "markdown",
"id": "ac17b9fd",
"metadata": {},
"source": [
"### Criar pipeline TF-IDF + Classificador \n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "851cd1c0",
"metadata": {},
"outputs": [],
"source": [
"# Text branch: lemmatized TF-IDF over unigrams and bigrams\n",
"vectorizer = TfidfVectorizer(\n",
"    preprocessor=lemmatize_text,\n",
"    lowercase=False,\n",
"    ngram_range=(1, 2),\n",
"    min_df=3,\n",
"    max_df=0.95,\n",
"    sublinear_tf=True,\n",
"    norm='l2'\n",
")\n",
"\n",
"# Combine TF-IDF features with the hand-crafted binary flags\n",
"features = FeatureUnion([\n",
"    (\"tfidf\", vectorizer),\n",
"    (\"flags\", Pipeline([\n",
"        (\"fx\", FunctionTransformer(extrair_flags, validate=False)),\n",
"        (\"dv\", DictVectorizer(sparse=True))\n",
"    ]))\n",
"])\n",
"\n",
"# class_weight='balanced' compensates for any label imbalance in the training set\n",
"pipeline = Pipeline([\n",
"    (\"feats\", features),\n",
"    (\"clf\", LogisticRegression(class_weight='balanced', max_iter=2000))\n",
"])"
]
},
{
"cell_type": "markdown",
"id": "b6eb8af1",
"metadata": {},
"source": [
"### Treinar modelo\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7782ae07",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
Pipeline(steps=[('feats',\n",
" FeatureUnion(transformer_list=[('tfidf',\n",
" TfidfVectorizer(lowercase=False,\n",
" max_df=0.95,\n",
" min_df=3,\n",
" ngram_range=(1,\n",
" 2),\n",
" preprocessor=<function lemmatize_text at 0x7f3ae7c1a7a0>,\n",
" sublinear_tf=True)),\n",
" ('flags',\n",
" Pipeline(steps=[('fx',\n",
" FunctionTransformer(func=<function extrair_flags at 0x7f3ae7b923e0>)),\n",
" ('dv',\n",
" DictVectorizer())]))])),\n",
" ('clf',\n",
" LogisticRegression(class_weight='balanced', max_iter=2000))]) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. \n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" steps \n",
" [('feats', ...), ('clf', ...)] \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" transform_input \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" memory \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" False \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" transformer_list \n",
" [('tfidf', ...), ('flags', ...)] \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" n_jobs \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" transformer_weights \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose_feature_names_out \n",
" True \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" input \n",
" 'content' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" encoding \n",
" 'utf-8' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" decode_error \n",
" 'strict' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" strip_accents \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" lowercase \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" preprocessor \n",
" <function lem...x7f3ae7c1a7a0> \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" tokenizer \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" analyzer \n",
" 'word' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" stop_words \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" token_pattern \n",
" '(?u)\\\\b\\\\w\\\\w+\\\\b' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" ngram_range \n",
" (1, ...) \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" max_df \n",
" 0.95 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" min_df \n",
" 3 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" max_features \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" vocabulary \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" binary \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" dtype \n",
" <class 'numpy.float64'> \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" norm \n",
" 'l2' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" use_idf \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" smooth_idf \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sublinear_tf \n",
" True \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
extrair_flags
FunctionTransformer
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" func \n",
" <function ext...x7f3ae7b923e0> \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" inverse_func \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" validate \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" accept_sparse \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" check_inverse \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" feature_names_out \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" kw_args \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" inv_kw_args \n",
" None \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" dtype \n",
" <class 'numpy.float64'> \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" separator \n",
" '=' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sparse \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" sort \n",
" True \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n",
" Parameters \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" penalty \n",
" 'l2' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" dual \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" tol \n",
" 0.0001 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" C \n",
" 1.0 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" fit_intercept \n",
" True \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" intercept_scaling \n",
" 1 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" class_weight \n",
" 'balanced' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" random_state \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" solver \n",
" 'lbfgs' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" max_iter \n",
" 2000 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" multi_class \n",
" 'deprecated' \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" verbose \n",
" 0 \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" warm_start \n",
" False \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" n_jobs \n",
" None \n",
" \n",
" \n",
"\n",
" \n",
" \n",
" l1_ratio \n",
" None \n",
" \n",
" \n",
" \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Pipeline(steps=[('feats',\n",
" FeatureUnion(transformer_list=[('tfidf',\n",
" TfidfVectorizer(lowercase=False,\n",
" max_df=0.95,\n",
" min_df=3,\n",
" ngram_range=(1,\n",
" 2),\n",
" preprocessor=,\n",
" sublinear_tf=True)),\n",
" ('flags',\n",
" Pipeline(steps=[('fx',\n",
" FunctionTransformer(func=)),\n",
" ('dv',\n",
" DictVectorizer())]))])),\n",
" ('clf',\n",
" LogisticRegression(class_weight='balanced', max_iter=2000))])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the full pipeline (feature extraction + classifier) on the training split\n",
"pipeline.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"id": "944db9d1",
"metadata": {},
"source": [
"### Avaliar modelo\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "20cb21eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" improdutivo 0.99 1.00 1.00 259\n",
" produtivo 1.00 0.99 1.00 241\n",
"\n",
" accuracy 1.00 500\n",
" macro avg 1.00 1.00 1.00 500\n",
"weighted avg 1.00 1.00 1.00 500\n",
"\n"
]
}
],
"source": [
"# Per-class precision/recall/F1 on the held-out test split\n",
"y_pred = pipeline.predict(X_test)\n",
"print(classification_report(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"id": "0855a435",
"metadata": {},
"source": [
"### Salvar modelo para formato `.skops`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97bbc77b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Modelo salvo em 'email_classifier_pt.skops' ás 2025-11-16T16:56:16\n"
]
}
],
"source": [
"from datetime import datetime, timezone\n",
"\n",
"from skops.io import dump\n",
"\n",
"# ISO 8601 timestamp in UTC, with the 'Z' suffix the comment originally promised\n",
"agora_utc_iso = datetime.now(timezone.utc).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
"\n",
"# Save in skops format, as the section title and message state.\n",
"# (Previously joblib.dump wrote a .joblib file while the message claimed .skops.)\n",
"model_path = \"email_classifier_pt.skops\"\n",
"dump(pipeline, model_path)\n",
"\n",
"print(f\"Modelo salvo em '{model_path}' às {agora_utc_iso}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}