Mario Faúndez Vidal committed on
Commit ·
f1f6f45
1
Parent(s): b197211
Update project configuration and remove requirements.in
Browse files
- .gitignore +2 -0
- Makefile +75 -17
- hate_speech_bert_bert_mlp_in_tensorflow.ipynb +0 -0
- hate_speech_run.ipynb +122 -123
- requirements.in +0 -3
- uv.lock +0 -0
.gitignore
CHANGED
|
@@ -128,3 +128,5 @@ dmypy.json
|
|
| 128 |
# Pyre type checker
|
| 129 |
.pyre/
|
| 130 |
flagged/
|
|
|
|
|
|
|
|
|
| 128 |
# Pyre type checker
|
| 129 |
.pyre/
|
| 130 |
flagged/
|
| 131 |
+
|
| 132 |
+
.ruff_cache/
|
Makefile
CHANGED
|
@@ -1,24 +1,82 @@
|
|
| 1 |
-
|
| 2 |
-
export PATH := ./venv/bin:$(PATH)
|
| 3 |
-
.PHONY: help
|
| 4 |
-
help: ## This help.
|
| 5 |
-
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " \033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
.DEFAULT_GOAL := help
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
touch requirements.in ;\
|
| 17 |
-
pip-compile --output-file requirements.txt requirements.in;\
|
| 18 |
-
pip install -r requirements.txt
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
clean:
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: help install dev clean test format lint status run notebook docker-build docker-run
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
IMAGE_NAME := classify-text-with-bert-hate-speech
|
| 4 |
+
IMAGE_TAG := local
|
| 5 |
+
|
| 6 |
+
# Default target - show help
|
| 7 |
.DEFAULT_GOAL := help
|
| 8 |
|
| 9 |
+
.ONESHELL:
|
| 10 |
+
|
| 11 |
+
help:
|
| 12 |
+
@echo "Project - Available Commands:"
|
| 13 |
+
@echo ""
|
| 14 |
+
@echo " make install - Install production dependencies with uv"
|
| 15 |
+
@echo " make dev - Install development dependencies with uv"
|
| 16 |
+
@echo " make run - Run the application (app.py)"
|
| 17 |
+
@echo " make notebook - Launch jupyter notebook/lab"
|
| 18 |
+
@echo " make test - Run tests (requires pytest)"
|
| 19 |
+
@echo " make format - Format code (ruff or black if available)"
|
| 20 |
+
@echo " make lint - Run linter (ruff if available)"
|
| 21 |
+
@echo " make clean - Remove venv and build/test artifacts"
|
| 22 |
+
@echo " make status - Show python version and installed packages summary"
|
| 23 |
+
@echo " make docker-build- Build a local docker image"
|
| 24 |
+
@echo " make docker-run - Run the docker image locally"
|
| 25 |
+
@echo ""
|
| 26 |
+
|
| 27 |
+
install:
|
| 28 |
+
@echo "📦 Installing production dependencies with uv..."
|
| 29 |
+
uv sync --no-dev
|
| 30 |
+
@echo "✅ Production dependencies installed successfully!"
|
| 31 |
+
|
| 32 |
+
dev:
|
| 33 |
+
@echo "📦 Installing development dependencies with uv..."
|
| 34 |
+
uv sync --dev
|
| 35 |
+
@echo "✅ Development dependencies installed successfully!"
|
| 36 |
+
|
| 37 |
+
run:
|
| 38 |
+
@echo "Running app.py..."
|
| 39 |
+
uv run python app.py
|
| 40 |
|
| 41 |
+
notebook:
|
| 42 |
+
@echo "Launching Jupyter Notebook (or lab if available)..."
|
| 43 |
+
uv run jupyter lab || uv run jupyter notebook
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
+
test:
|
| 46 |
+
@echo "Running tests with pytest..."
|
| 47 |
+
uv run pytest -q tests || true
|
| 48 |
+
|
| 49 |
+
format:
|
| 50 |
+
@echo "Formatting code with ruff..."
|
| 51 |
+
uv run ruff format . || true
|
| 52 |
+
-uv run ruff check --fix . || true
|
| 53 |
+
|
| 54 |
+
lint:
|
| 55 |
+
@echo "Running linter with ruff..."
|
| 56 |
+
uv run ruff check .
|
| 57 |
+
|
| 58 |
+
status:
|
| 59 |
+
@uv run python --version
|
| 60 |
+
@echo "Installed packages:"
|
| 61 |
+
@uv pip list
|
| 62 |
|
| 63 |
clean:
|
| 64 |
+
@echo "🧹 Cleaning up build and Python artifacts..."
|
| 65 |
+
-rm -rf .venv build/ dist/ *.egg-info .eggs
|
| 66 |
+
-find . -type d -name "__pycache__" -exec rm -rf {} +
|
| 67 |
+
-find . -type f -name "*.py[co]" -delete
|
| 68 |
+
-find . -type f -name ".coverage" -delete
|
| 69 |
+
-find . -type d -name ".pytest_cache" -exec rm -rf {} +
|
| 70 |
+
-find . -type d -name ".ruff_cache" -exec rm -rf {} +
|
| 71 |
+
-find . -type d -name ".mypy_cache" -exec rm -rf {} +
|
| 72 |
+
-find . -type d -name "htmlcov" -exec rm -rf {} +
|
| 73 |
+
-find . -type f -name ".DS_Store" -delete
|
| 74 |
+
@echo "✅ Cleanup completed!"
|
| 75 |
+
|
| 76 |
+
docker-build:
|
| 77 |
+
@echo "Building docker image ${IMAGE_NAME}:${IMAGE_TAG} (requires Docker)"
|
| 78 |
+
@docker build -t ${IMAGE_NAME}:${IMAGE_TAG} . || true
|
| 79 |
+
|
| 80 |
+
docker-run:
|
| 81 |
+
@echo "Running docker image ${IMAGE_NAME}:${IMAGE_TAG} (port 7860 forwarded)"
|
| 82 |
+
@docker run --rm -p 7860:7860 ${IMAGE_NAME}:${IMAGE_TAG}
|
hate_speech_bert_bert_mlp_in_tensorflow.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
hate_speech_run.ipynb
CHANGED
|
@@ -1,134 +1,133 @@
|
|
| 1 |
{
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
"
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
},
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
"\n",
|
| 36 |
-
"In this notebook, I am going to use a pretrained BERT to compute vector-space representations of a hate speech dataset to feed two different downstream architectures (CNN and MLP).\n",
|
| 37 |
-
"\n",
|
| 38 |
-
"Sentiment Analysis\n",
|
| 39 |
-
"\n",
|
| 40 |
-
"This notebook trains a sentiment analysis model to classify the [Hate Speech and Offensive Language Dataset]( https://www.kaggle.com/mrmorj/hate-speech-and-offensive-language-dataset) tweets in three classes:\n",
|
| 41 |
-
" \n",
|
| 42 |
-
"* 0 - hate speech \n",
|
| 43 |
-
"* 1 - offensive language \n",
|
| 44 |
-
"* 2 - neither as positive or negative"
|
| 45 |
-
],
|
| 46 |
-
"metadata": {
|
| 47 |
-
"id": "Jh_WkIs1iJDs"
|
| 48 |
-
}
|
| 49 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
{
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
"
|
| 57 |
-
"
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
{
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
"import tensorflow_text as text\n",
|
| 69 |
-
"import numpy as np\n",
|
| 70 |
-
"np.set_printoptions(suppress=True)\n",
|
| 71 |
-
"\n",
|
| 72 |
-
"\n",
|
| 73 |
-
"# Carga el modelo\n",
|
| 74 |
-
"with tf.keras.utils.custom_object_scope({'AdamWeightDecay': AdamWeightDecay, 'WarmUp': WarmUp}):\n",
|
| 75 |
-
" classifier_model = tf.keras.models.load_model('classifier_model.h5', \n",
|
| 76 |
-
" custom_objects={'KerasLayer': hub.KerasLayer})"
|
| 77 |
-
],
|
| 78 |
-
"metadata": {
|
| 79 |
-
"colab": {
|
| 80 |
-
"base_uri": "https://localhost:8080/"
|
| 81 |
-
},
|
| 82 |
-
"id": "EQlIdYjKYAnn",
|
| 83 |
-
"outputId": "efb1f87f-8b45-4201-ac14-2abdc74b8cfd"
|
| 84 |
-
},
|
| 85 |
-
"execution_count": null,
|
| 86 |
-
"outputs": [
|
| 87 |
-
{
|
| 88 |
-
"output_type": "stream",
|
| 89 |
-
"name": "stderr",
|
| 90 |
-
"text": [
|
| 91 |
-
"WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n",
|
| 92 |
-
"WARNING:tensorflow:From /usr/local/lib/python3.9/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n",
|
| 93 |
-
"Instructions for updating:\n",
|
| 94 |
-
"Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n",
|
| 95 |
-
"WARNING:tensorflow:Error in loading the saved optimizer state. As a result, your model is starting with a freshly initialized optimizer.\n"
|
| 96 |
-
]
|
| 97 |
-
}
|
| 98 |
-
]
|
| 99 |
},
|
| 100 |
{
|
| 101 |
-
|
| 102 |
-
"
|
| 103 |
-
|
| 104 |
-
],
|
| 105 |
-
"metadata": {
|
| 106 |
-
"colab": {
|
| 107 |
-
"base_uri": "https://localhost:8080/"
|
| 108 |
-
},
|
| 109 |
-
"id": "6Ma3P-7iYEbA",
|
| 110 |
-
"outputId": "2e6fbc37-6ab8-4035-c706-f8d6d3d8c7ba"
|
| 111 |
-
},
|
| 112 |
-
"execution_count": null,
|
| 113 |
-
"outputs": [
|
| 114 |
-
{
|
| 115 |
-
"output_type": "stream",
|
| 116 |
-
"name": "stdout",
|
| 117 |
-
"text": [
|
| 118 |
-
"1/1 [==============================] - 1s 1s/step\n"
|
| 119 |
-
]
|
| 120 |
-
},
|
| 121 |
-
{
|
| 122 |
-
"output_type": "execute_result",
|
| 123 |
-
"data": {
|
| 124 |
-
"text/plain": [
|
| 125 |
-
"array([[0.99998355, 0.00001638, 0.00000017]], dtype=float32)"
|
| 126 |
-
]
|
| 127 |
-
},
|
| 128 |
-
"metadata": {},
|
| 129 |
-
"execution_count": 4
|
| 130 |
-
}
|
| 131 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
}
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "Jh_WkIs1iJDs"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"# Implementation of text classification with BERT\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"Still Working on it.\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"This notebook is based on this TensorFlow tutorial: [Classify text with BERT](https://www.tensorflow.org/tutorials/text/classify_text_with_bert)\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"BERT [(article link)](https://arxiv.org/abs/1810.04805) and other Transformer encoder architectures have been wildly successful on a variety of tasks in NLP (natural language processing). They compute vector-space representations of natural language that are suitable for use in deep learning models.\n",
|
| 17 |
+
"\n",
|
| 18 |
+
":\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"Source: http://www.d2l.ai/chapter_natural-language-processing-pretraining/index.html\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"BERT models are usually pre-trained on a large corpus of text, then fine-tuned for specific tasks.\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"In this notebook, I am going to use a pretrained BERT to compute vector-space representations of a hate speech dataset to feed two different downstream architectures (CNN and MLP).\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"Sentiment Analysis\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"This notebook trains a sentiment analysis model to classify the [Hate Speech and Offensive Language Dataset]( https://www.kaggle.com/mrmorj/hate-speech-and-offensive-language-dataset) tweets in three classes:\n",
|
| 29 |
+
" \n",
|
| 30 |
+
"* 0 - hate speech \n",
|
| 31 |
+
"* 1 - offensive language \n",
|
| 32 |
+
"* 2 - neither as positive or negative"
|
| 33 |
+
]
|
| 34 |
},
|
| 35 |
+
{
|
| 36 |
+
"cell_type": "code",
|
| 37 |
+
"execution_count": null,
|
| 38 |
+
"metadata": {
|
| 39 |
+
"id": "Ltc_HOzjX87s"
|
| 40 |
+
},
|
| 41 |
+
"outputs": [],
|
| 42 |
+
"source": [
|
| 43 |
+
"!pip install -q tensorflow-text > /dev/null\n",
|
| 44 |
+
"!pip install -q tf-models-official > /dev/null"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"cell_type": "code",
|
| 49 |
+
"execution_count": null,
|
| 50 |
+
"metadata": {
|
| 51 |
+
"colab": {
|
| 52 |
+
"base_uri": "https://localhost:8080/"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
},
|
| 54 |
+
"id": "EQlIdYjKYAnn",
|
| 55 |
+
"outputId": "efb1f87f-8b45-4201-ac14-2abdc74b8cfd"
|
| 56 |
+
},
|
| 57 |
+
"outputs": [
|
| 58 |
{
|
| 59 |
+
"name": "stderr",
|
| 60 |
+
"output_type": "stream",
|
| 61 |
+
"text": [
|
| 62 |
+
"WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n",
|
| 63 |
+
"WARNING:tensorflow:From /usr/local/lib/python3.9/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n",
|
| 64 |
+
"Instructions for updating:\n",
|
| 65 |
+
"Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n",
|
| 66 |
+
"WARNING:tensorflow:Error in loading the saved optimizer state. As a result, your model is starting with a freshly initialized optimizer.\n"
|
| 67 |
+
]
|
| 68 |
+
}
|
| 69 |
+
],
|
| 70 |
+
"source": [
|
| 71 |
+
"import numpy as np\n",
|
| 72 |
+
"import tensorflow as tf\n",
|
| 73 |
+
"import tensorflow_hub as hub\n",
|
| 74 |
+
"from official.nlp.optimization import AdamWeightDecay, WarmUp\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"np.set_printoptions(suppress=True)\n",
|
| 77 |
+
"\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# Carga el modelo\n",
|
| 80 |
+
"with tf.keras.utils.custom_object_scope({\"AdamWeightDecay\": AdamWeightDecay, \"WarmUp\": WarmUp}):\n",
|
| 81 |
+
" classifier_model = tf.keras.models.load_model(\"classifier_model.h5\", custom_objects={\"KerasLayer\": hub.KerasLayer})"
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "code",
|
| 86 |
+
"execution_count": null,
|
| 87 |
+
"metadata": {
|
| 88 |
+
"colab": {
|
| 89 |
+
"base_uri": "https://localhost:8080/"
|
| 90 |
},
|
| 91 |
+
"id": "6Ma3P-7iYEbA",
|
| 92 |
+
"outputId": "2e6fbc37-6ab8-4035-c706-f8d6d3d8c7ba"
|
| 93 |
+
},
|
| 94 |
+
"outputs": [
|
| 95 |
{
|
| 96 |
+
"name": "stdout",
|
| 97 |
+
"output_type": "stream",
|
| 98 |
+
"text": [
|
| 99 |
+
"1/1 [==============================] - 1s 1s/step\n"
|
| 100 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
},
|
| 102 |
{
|
| 103 |
+
"data": {
|
| 104 |
+
"text/plain": [
|
| 105 |
+
"array([[0.99998355, 0.00001638, 0.00000017]], dtype=float32)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
]
|
| 107 |
+
},
|
| 108 |
+
"execution_count": 4,
|
| 109 |
+
"metadata": {},
|
| 110 |
+
"output_type": "execute_result"
|
| 111 |
}
|
| 112 |
+
],
|
| 113 |
+
"source": [
|
| 114 |
+
"classifier_model.predict([\"LEETSSS GOOO Get those ... outta here!!!!!!\"])"
|
| 115 |
+
]
|
| 116 |
+
}
|
| 117 |
+
],
|
| 118 |
+
"metadata": {
|
| 119 |
+
"colab": {
|
| 120 |
+
"provenance": []
|
| 121 |
+
},
|
| 122 |
+
"gpuClass": "standard",
|
| 123 |
+
"kernelspec": {
|
| 124 |
+
"display_name": "Python 3",
|
| 125 |
+
"name": "python3"
|
| 126 |
+
},
|
| 127 |
+
"language_info": {
|
| 128 |
+
"name": "python"
|
| 129 |
+
}
|
| 130 |
+
},
|
| 131 |
+
"nbformat": 4,
|
| 132 |
+
"nbformat_minor": 0
|
| 133 |
}
|
requirements.in
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
gradio
|
| 2 |
-
tensorflow-text
|
| 3 |
-
tf-models-official
|
|
|
|
|
|
|
|
|
|
|
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|