Ubuntu committed on
Commit ·
58d9159
0
Parent(s):
Re-adiciona model.safetensors via LFS
Browse files- .gitattributes +36 -0
- .gitignore +96 -0
- Dockerfile +3 -0
- README.md +7 -0
- config.json +19 -0
- docker-compose.yml +45 -0
- generation_config.json +5 -0
- model.safetensors +3 -0
- nginx.conf +41 -0
- pyproject.toml +47 -0
- setup.py +18 -0
- special_tokens_map.json +5 -0
- src/__init__.py +0 -0
- src/dataset/__init__.py +0 -0
- src/dataset/fine_tuning.py +160 -0
- src/dataset/pre_train.py +106 -0
- src/logger/__init__.py +0 -0
- src/logger/logger.py +144 -0
- src/pre-training.py +111 -0
- src/tokenizer/__init__.py +0 -0
- src/tokenizer/tests.py +77 -0
- src/tokenizer/tokens-bpe-36k.json +0 -0
- src/tokenizer/trainer.py +77 -0
- src/training.py +203 -0
- src/tynerox/__init__.py +3 -0
- src/tynerox/modeling.py +449 -0
- src/visualizations/sample.html +723 -0
- tokenizer.json +0 -0
- tokenizer_config.json +91 -0
.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
model.safetensors filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte‑compiled / build artifacts
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
src/checkpoints/
|
| 16 |
+
mlflow/
|
| 17 |
+
postgres-temp/
|
| 18 |
+
eggs/
|
| 19 |
+
.eggs/
|
| 20 |
+
lib/
|
| 21 |
+
lib64/
|
| 22 |
+
parts/
|
| 23 |
+
sdist/
|
| 24 |
+
var/
|
| 25 |
+
*.egg-info/
|
| 26 |
+
.installed.cfg
|
| 27 |
+
*.egg
|
| 28 |
+
|
| 29 |
+
# Installer logs
|
| 30 |
+
pip-log.txt
|
| 31 |
+
pip-delete-this-directory.txt
|
| 32 |
+
|
| 33 |
+
# PyInstaller
|
| 34 |
+
# Usually these files are written by a python script from a template
|
| 35 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 36 |
+
*.manifest
|
| 37 |
+
*.spec
|
| 38 |
+
*.pth
|
| 39 |
+
|
| 40 |
+
# Unit test / coverage reports
|
| 41 |
+
htmlcov/
|
| 42 |
+
.tox/
|
| 43 |
+
.nox/
|
| 44 |
+
.coverage
|
| 45 |
+
.coverage.*
|
| 46 |
+
.cache
|
| 47 |
+
nosetests.xml
|
| 48 |
+
coverage.xml
|
| 49 |
+
*.cover
|
| 50 |
+
*.py,cover
|
| 51 |
+
.hypothesis/
|
| 52 |
+
|
| 53 |
+
# IDEs and editors
|
| 54 |
+
.idea/
|
| 55 |
+
*.iml
|
| 56 |
+
.vscode/
|
| 57 |
+
*.sublime-project
|
| 58 |
+
*.sublime-workspace
|
| 59 |
+
|
| 60 |
+
# Docker
|
| 61 |
+
docker-compose.override.yml
|
| 62 |
+
.docker/
|
| 63 |
+
|
| 64 |
+
# Environment / virtualenv
|
| 65 |
+
.env
|
| 66 |
+
.venv/
|
| 67 |
+
env/
|
| 68 |
+
venv/
|
| 69 |
+
ENV/
|
| 70 |
+
env.bak/
|
| 71 |
+
venv.bak/
|
| 72 |
+
|
| 73 |
+
# Poetry
|
| 74 |
+
poetry.lock
|
| 75 |
+
poetry.toml
|
| 76 |
+
|
| 77 |
+
# Pyproject (se configurar como público; se privado, remova)
|
| 78 |
+
# pyproject.toml
|
| 79 |
+
|
| 80 |
+
# Lock files
|
| 81 |
+
Pipfile.lock
|
| 82 |
+
|
| 83 |
+
# Runtime data
|
| 84 |
+
*.pid
|
| 85 |
+
*.seed
|
| 86 |
+
*.log
|
| 87 |
+
|
| 88 |
+
# Jupyter Notebook
|
| 89 |
+
.ipynb_checkpoints
|
| 90 |
+
|
| 91 |
+
# VS Code settings
|
| 92 |
+
.vscode/
|
| 93 |
+
|
| 94 |
+
# OS files
|
| 95 |
+
.DS_Store
|
| 96 |
+
Thumbs.db
|
Dockerfile
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM ghcr.io/mlflow/mlflow:v2.21.3
|
| 2 |
+
|
| 3 |
+
RUN pip install boto3
|
README.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
license: apache-2.0
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
base_model:
|
| 6 |
+
- bobboyms/tynerox
|
| 7 |
+
---
|
config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"TyneRoxModel"
|
| 4 |
+
],
|
| 5 |
+
"causal": true,
|
| 6 |
+
"d_model": 1024,
|
| 7 |
+
"dropout": 0.1,
|
| 8 |
+
"layer_norm_eps": 1e-05,
|
| 9 |
+
"max_position_embeddings": 2048,
|
| 10 |
+
"model_type": "tynerox",
|
| 11 |
+
"num_attention_heads": 16,
|
| 12 |
+
"num_hidden_layers": 12,
|
| 13 |
+
"pad_token_id": 1,
|
| 14 |
+
"tie_word_embeddings": false,
|
| 15 |
+
"torch_dtype": "float32",
|
| 16 |
+
"transformers_version": "4.51.3",
|
| 17 |
+
"vocab_size": 36010,
|
| 18 |
+
"window_size": 512
|
| 19 |
+
}
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3'
|
| 2 |
+
services:
|
| 3 |
+
mlflow:
|
| 4 |
+
# image: ghcr.io/mlflow/mlflow:v2.21.3
|
| 5 |
+
# image: ghcr.io/mlflow/mlflow:v3.0.0rc0
|
| 6 |
+
build: .
|
| 7 |
+
ports:
|
| 8 |
+
- "5000:5000"
|
| 9 |
+
environment:
|
| 10 |
+
- MLFLOW_ARTIFACT_ROOT=s3://1hh-mlflow/artifacts
|
| 11 |
+
- MLFLOW_TRACKING_URI=http://mlflow:5000
|
| 12 |
+
volumes:
|
| 13 |
+
- ./mlflow:/mlflow
|
| 14 |
+
command: mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri sqlite:///mlflow.db --default-artifact-root s3://1hh-mlflow/artifacts
|
| 15 |
+
# s3://1hh-mlflow/artifacts/
|
| 16 |
+
postgres:
|
| 17 |
+
image: postgres:14
|
| 18 |
+
environment:
|
| 19 |
+
- POSTGRES_USER=mlflow
|
| 20 |
+
- POSTGRES_PASSWORD=mlflow
|
| 21 |
+
- POSTGRES_DB=mlflowdb
|
| 22 |
+
volumes:
|
| 23 |
+
- ./postgres-temp:/var/lib/postgresql/temp
|
| 24 |
+
|
| 25 |
+
# minio:
|
| 26 |
+
# image: minio/minio:latest
|
| 27 |
+
# ports:
|
| 28 |
+
# - "9000:9000"
|
| 29 |
+
# environment:
|
| 30 |
+
# - MINIO_ROOT_USER=minioadmin
|
| 31 |
+
# - MINIO_ROOT_PASSWORD=minioadmin
|
| 32 |
+
# volumes:
|
| 33 |
+
# - ./minio-temp:/temp
|
| 34 |
+
# command: server /temp --console-address ":9001"
|
| 35 |
+
|
| 36 |
+
nginx:
|
| 37 |
+
image: nginx:latest
|
| 38 |
+
ports:
|
| 39 |
+
- "80:80"
|
| 40 |
+
volumes:
|
| 41 |
+
# - ./nginx.conf:/etc/nginx/nginx.conf:ro
|
| 42 |
+
- ./nginx.conf:/etc/nginx/conf.d/default.conf:ro
|
| 43 |
+
depends_on:
|
| 44 |
+
- mlflow
|
| 45 |
+
# - minio
|
generation_config.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"pad_token_id": 1,
|
| 4 |
+
"transformers_version": "4.51.3"
|
| 5 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2dc6c386af412163c51f18f97152117040b6464f9e64159ef464d50471ceda1c
|
| 3 |
+
size 1101168184
|
nginx.conf
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# mlflow_proxy.conf
|
| 2 |
+
|
| 3 |
+
upstream mlflow {
|
| 4 |
+
server mlflow:5000;
|
| 5 |
+
}
|
| 6 |
+
|
| 7 |
+
# upstream minio {
|
| 8 |
+
# server minio:9000;
|
| 9 |
+
# }
|
| 10 |
+
|
| 11 |
+
server {
|
| 12 |
+
listen 80;
|
| 13 |
+
server_name _; # Opcional: escuta em qualquer nome de host
|
| 14 |
+
|
| 15 |
+
# Logs específicos para este server block (ajuda na depuração)
|
| 16 |
+
access_log /var/log/nginx/mlflow_access.log;
|
| 17 |
+
error_log /var/log/nginx/mlflow_error.log debug; # Use 'debug' para mais detalhes
|
| 18 |
+
|
| 19 |
+
location / {
|
| 20 |
+
proxy_pass http://mlflow;
|
| 21 |
+
proxy_set_header Host $host;
|
| 22 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 23 |
+
# Headers úteis para proxys reversos
|
| 24 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 25 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 26 |
+
|
| 27 |
+
# Opcional: Aumentar timeouts se houver problemas de conexão
|
| 28 |
+
# proxy_connect_timeout 60s;
|
| 29 |
+
# proxy_read_timeout 60s;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
# location /minio {
|
| 33 |
+
# # Atenção: MinIO pode precisar de reescrita de URL ou configuração específica
|
| 34 |
+
# # dependendo de como ele lida com subpastas.
|
| 35 |
+
# proxy_pass http://minio;
|
| 36 |
+
# proxy_set_header Host $host; # MinIO pode precisar do host correto
|
| 37 |
+
# proxy_set_header X-Real-IP $remote_addr;
|
| 38 |
+
# proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 39 |
+
# proxy_set_header X-Forwarded-Proto $scheme;
|
| 40 |
+
# }
|
| 41 |
+
}
|
pyproject.toml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "tynerox"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = ""
|
| 5 |
+
authors = [
|
| 6 |
+
{name = "Thiago L. Rodrigues"}
|
| 7 |
+
]
|
| 8 |
+
readme = "README.md"
|
| 9 |
+
requires-python = ">=3.9,<4.0"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"torch (>=2.6.0,<3.0.0)",
|
| 12 |
+
"transformers[torch] (>=4.50.3,<5.0.0)",
|
| 13 |
+
"python-dotenv (>=1.1.0,<2.0.0)",
|
| 14 |
+
"tavily-python (>=0.5.3,<0.6.0)",
|
| 15 |
+
"langchain-community (>=0.3.20,<0.4.0)",
|
| 16 |
+
"pydantic (>=2.11.1,<3.0.0)",
|
| 17 |
+
"pandas (>=2.2.3,<3.0.0)",
|
| 18 |
+
"openai-agents (>=0.0.7,<0.0.8)",
|
| 19 |
+
"datasets (>=3.5.0,<4.0.0)",
|
| 20 |
+
"mlflow (>=2.21.3,<3.0.0)",
|
| 21 |
+
"beautifulsoup4 (>=4.13.3,<5.0.0)",
|
| 22 |
+
"packaging (>=24.2,<25.0)",
|
| 23 |
+
"boto3 (>=1.37.37,<2.0.0)",
|
| 24 |
+
"flash-attn (>=2.7.4.post1,<3.0.0)",
|
| 25 |
+
]
|
| 26 |
+
|
| 27 |
+
[tool.poetry]
|
| 28 |
+
name = "tynerox"
|
| 29 |
+
version = "0.1.0"
|
| 30 |
+
packages = [
|
| 31 |
+
{ include = "tynerox", from = "src" }
|
| 32 |
+
]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
[build-system]
|
| 36 |
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
| 37 |
+
build-backend = "poetry.core.masonry.api"
|
| 38 |
+
|
| 39 |
+
[tool.poetry.group.dev.dependencies]
|
| 40 |
+
pytest = "^8.3.5"
|
| 41 |
+
|
| 42 |
+
[tool.pytest.ini_options]
|
| 43 |
+
minversion = "6.0"
|
| 44 |
+
addopts = "-ra -q"
|
| 45 |
+
testpaths = ["src/tests"]
|
| 46 |
+
python_files = ["test_*.py"]
|
| 47 |
+
norecursedirs = ["postgres-data"]
|
setup.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup, find_packages
|
| 2 |
+
|
| 3 |
+
# Packaging entry point for the `tynerox` distribution (src-layout).
setup(
    name="tynerox",
    version="0.1.0",
    description="TyneRox: custom rotary-transformer causal LM",
    author="Thiago Luiz Rodrigues",
    # NOTE(review): placeholder email and repository URL below — fill in
    # real values before publishing the package.
    author_email="<EMAIL>",
    url="https://github.com/seu-usuario/tynerox",
    license="Apache-2.0",
    # Package code lives under src/; find_packages scans that directory.
    packages=find_packages("src"),
    package_dir={"": "src"},
    # Runtime dependencies; kept in sync with pyproject.toml — TODO confirm
    # the two lists do not drift (pyproject declares many more packages).
    install_requires=[
        "torch>=2.6.0,<3.0.0",
        "transformers[torch]>=4.50.3,<5.0.0",
        "flash-attn>=2.7.4.post1,<3.0.0",
    ],
)
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"eos_token": "<|endoftext|>",
|
| 3 |
+
"pad_token": "<|endoftext|>",
|
| 4 |
+
"unk_token": "[UNK]"
|
| 5 |
+
}
|
src/__init__.py
ADDED
|
File without changes
|
src/dataset/__init__.py
ADDED
|
File without changes
|
src/dataset/fine_tuning.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Any, Optional
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import DataLoader
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from transformers import AutoTokenizer
|
| 6 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 7 |
+
from functools import partial
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def tokenize_function(examples: Dict[str, Any], tokenizer: Any) -> Dict[str, List[int]]:
    """Render the tokenizer's chat template over a batch and return token ids.

    Args:
        examples: Batch dict holding the conversations under the
            "messages" key.
        tokenizer: Tokenizer instance; must expose ``apply_chat_template``
            (and therefore carry a ``chat_template``).

    Returns:
        Dict mapping "input_ids" to the template's tokenized output.
    """
    token_ids = tokenizer.apply_chat_template(
        examples["messages"],
        tokenize=True,
        add_generation_prompt=True,
    )
    return {"input_ids": token_ids}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def custom_collate_fn(
    batch: List[Dict[str, List[int]]],
    pad_token_id: int = 29797,
    ignore_index: int = -100,
    allowed_max_length: Optional[int] = None,
    device: str = "cpu",
) -> Dict[str, torch.Tensor]:
    """Collate a batch of token-id samples into (input, label) tensors.

    Steps:
      * pads the sequences to the longest one in the batch,
      * builds (input, label) pairs by shifting one position,
      * replaces every PAD in the labels *after* the first one with
        ``ignore_index`` (-100), so the loss sees exactly one PAD/EOS.
    """
    # One trailing PAD is appended to every sample so the shifted labels
    # always end on a PAD token.
    tensors = [torch.tensor(sample["input_ids"] + [pad_token_id]) for sample in batch]

    # Right-pad everything to the batch maximum.
    stacked = pad_sequence(tensors, batch_first=True, padding_value=pad_token_id)

    # Shift by one position; clone so edits to labels never alias inputs.
    input_ids = stacked[:, :-1].clone()   # never contains -100
    labels = stacked[:, 1:].clone()       # edited in place below

    # Mask out everything after the first PAD of each label row.
    is_pad = labels == pad_token_id
    if is_pad.any():
        # Position of the first PAD in each row.
        keeps_first = is_pad.float().cumsum(dim=1).eq(1) & is_pad
        # Every PAD except the first one becomes ignore_index.
        labels[is_pad & ~keeps_first] = ignore_index

    # Optional truncation to a fixed maximum length.
    if allowed_max_length is not None:
        input_ids = input_ids[:, :allowed_max_length]
        labels = labels[:, :allowed_max_length]

    return {
        "input_ids": input_ids.to(device),
        "labels": labels.to(device),
    }
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def create_data_loader_fine_tuning(
    tokenizer: Any,
    batch_size: int,
    path_folder: str,
    split: str = "train",
    pad_token_id: int = 0,
    ignore_index: int = -100,
    allowed_max_length: Optional[int] = None,
    device: str = "cpu"
) -> DataLoader:
    """
    Build the fine-tuning DataLoader from a tokenized dataset.

    Loads the dataset, applies chat-template tokenization, and returns a
    DataLoader that uses custom_collate_fn to assemble batches.

    Args:
        tokenizer (Any): Pretrained tokenizer supporting chat templates.
            Its ``chat_template`` attribute is overwritten by this function.
        batch_size (int): Number of samples per batch.
        path_folder (str): Dataset path or identifier.
        split (str): Dataset split to use (e.g. "train" or "test").
        pad_token_id (int): Token id used for padding.
            NOTE(review): defaults to 0 here while custom_collate_fn
            defaults to 29797 — confirm which value matches the tokenizer.
        ignore_index (int): Value ignored by the loss function.
        allowed_max_length (Optional[int]): If set, truncate sequences to
            this maximum length.
        device (str): Device tensors are moved to ("cpu" or "cuda").

    Returns:
        DataLoader: Instance ready for fine-tuning.
    """
    # Define the chat template and attach it to the tokenizer.
    chat_template = """
    {% for message in messages %}
    {% if message['role'] == 'user' %}
    {{ '<|user_start|>' + message['content'] + '<|user_end|>' + '\n'}}
    {% elif message['role'] == 'assistant' %}
    {{ '<|assistant_start|>' + message['content'] + '<|assistant_end|>' + '\n' }}
    {% endif %}
    {% endfor %}
    """
    tokenizer.chat_template = chat_template

    # Load the dataset.
    # NOTE(review): force_redownload bypasses the local cache on every
    # call — confirm this is intentional (it re-fetches the data each run).
    raw_dataset = load_dataset(path=path_folder, split=split, download_mode="force_redownload")

    # Tokenize with the chat template defined above.
    tokenized_dataset = raw_dataset.map(
        lambda examples: tokenize_function(examples, tokenizer),
        batched=True,
        remove_columns=raw_dataset.column_names,
        desc="Tokenizando dataset_files"
    )

    # Bind the collate function to the requested padding/truncation options.
    collate = partial(
        custom_collate_fn,
        pad_token_id=pad_token_id,
        ignore_index=ignore_index,
        allowed_max_length=allowed_max_length,
        device=device
    )

    print("Criando DataLoader...")
    return DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
        collate_fn=collate
    )
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
if __name__ == "__main__":
    # Smoke test: build a loader and pull a single batch to check shapes.
    # Load the pretrained tokenizer (Portuguese BERT).
    tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

    # Build the DataLoader for the "conversational" dataset's test split.
    loader = create_data_loader_fine_tuning(
        tokenizer=tokenizer,
        batch_size=100,
        path_folder="conversational",
        split="test"
    )

    # Pull one batch and print the resulting tensor shapes.
    batch = next(iter(loader))
    print(batch["input_ids"].shape, batch["labels"].shape)
|
src/dataset/pre_train.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any, Dict, List, Optional, Union
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from torch.utils.data import DataLoader
|
| 6 |
+
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
|
| 7 |
+
from datasets import load_dataset, Dataset, DatasetDict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def tokenize_function(
    examples: Dict[str, List[str]],
    tokenizer: PreTrainedTokenizer
) -> Dict[str, List[List[int]]]:
    """Tokenize a batch of raw texts with no truncation or padding.

    Only the ``input_ids`` column is kept; attention masks and any other
    tokenizer outputs are discarded.
    """
    encoded = tokenizer(examples["text"], truncation=False, padding=False)
    return {"input_ids": encoded["input_ids"]}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def pack_documents(
    examples: Dict[str, List[List[int]]],
    max_length: int,
    eos_token_id: Optional[int] = None
) -> Dict[str, List[List[int]]]:
    """Document packing: return fixed-size input blocks of ``max_length``.

    All documents in the batch are concatenated into one token stream
    (with an EOS token between consecutive documents when provided); the
    stream is chopped into blocks of ``max_length + 1`` and the trailing
    token of each block — the one that would feed shifted labels — is
    dropped, yielding inputs of exactly ``max_length`` tokens.
    """
    # Flatten the whole batch into a single token stream, inserting the
    # separator between documents but not after the last one.
    stream: List[int] = []
    for idx, doc in enumerate(examples["input_ids"]):
        if idx and eos_token_id is not None:
            stream.append(eos_token_id)
        stream.extend(doc)

    # Each block carries one extra token reserved for the label shift.
    block_size = max_length + 1
    usable = (len(stream) // block_size) * block_size
    if usable == 0:
        # Not enough tokens for even one block.
        return {"input_ids": []}

    # Chop into fixed blocks and drop the trailing label token of each.
    inputs = [
        stream[start:start + block_size][:-1]
        for start in range(0, usable, block_size)
    ]

    # Guard against empty blocks (cannot occur for max_length >= 1,
    # kept for parity with the original behavior).
    return {"input_ids": [blk for blk in inputs if blk]}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def create_train_dataloader(
    folder_path: str,
    tokenizer: PreTrainedTokenizerFast,
    batch_size: int = 4,
    max_length: int = 512,
    drop_last: bool = True,
    num_workers: int = 5
) -> Optional[DataLoader]:
    """
    Load text data from folder_path, tokenize it, apply input-only
    document packing, and return a DataLoader yielding input_ids batches.

    Args:
        folder_path: Dataset path/identifier passed to ``load_dataset``.
        tokenizer: Fast tokenizer used for encoding; its ``eos_token_id``
            separates packed documents.
        batch_size: Samples per DataLoader batch.
        max_length: Fixed length of every packed input block.
        drop_last: Whether to drop the final incomplete batch.
        num_workers: DataLoader worker processes.

    Returns:
        DataLoader over fixed-length ``input_ids`` tensors.
    """
    raw_dataset = load_dataset(folder_path, split="train", streaming=False)
    print(f"Dataset bruto carregado: {raw_dataset}")

    # 1) Tokenization (no truncation/padding; packing handles lengths).
    # NOTE(review): num_proc=20 is hard-coded — confirm the host has
    # enough cores, otherwise map() may oversubscribe.
    tokenized = raw_dataset.map(
        lambda ex: tokenize_function(ex, tokenizer),
        batched=True,
        batch_size=1000,
        num_proc=20,
        remove_columns=raw_dataset.column_names,
    )
    print(f"Dataset tokenizado: {tokenized}")

    # 2) Document packing without labels (inputs only; the trainer is
    #    expected to derive labels by shifting).
    packed = tokenized.map(
        lambda ex: pack_documents(
            ex,
            max_length=max_length,
            eos_token_id=tokenizer.eos_token_id
        ),
        batched=True,
        batch_size=10000,
        num_proc=20,
    )

    # 3) Expose the packed dataset as PyTorch tensors.
    packed.set_format(type="torch", columns=["input_ids"])

    print("Criando DataLoader...")
    return DataLoader(
        packed,
        batch_size=batch_size,
        drop_last=drop_last,
        num_workers=num_workers,
    )
|
src/logger/__init__.py
ADDED
|
File without changes
|
src/logger/logger.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Optional
|
| 2 |
+
import os
|
| 3 |
+
from zoneinfo import ZoneInfo
|
| 4 |
+
|
| 5 |
+
import mlflow
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
from datetime import datetime, date
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TrainerLogger:
|
| 13 |
+
def __init__(
|
| 14 |
+
self,
|
| 15 |
+
tracking_uri: str,
|
| 16 |
+
experiment: str,
|
| 17 |
+
total_params: int,
|
| 18 |
+
model_name: str = None,
|
| 19 |
+
run_name: str = None,
|
| 20 |
+
tags: Dict[str, str] = None,
|
| 21 |
+
):
|
| 22 |
+
mlflow.set_tracking_uri(tracking_uri)
|
| 23 |
+
mlflow.set_experiment(experiment)
|
| 24 |
+
|
| 25 |
+
# Ativar autologging para PyTorch
|
| 26 |
+
mlflow.pytorch.autolog(log_models=True) # Desativamos log automático de modelos para controle manual
|
| 27 |
+
|
| 28 |
+
# Iniciar run com contexto
|
| 29 |
+
self.run = mlflow.start_run(run_name=run_name)
|
| 30 |
+
self.run_id = self.run.info.run_id
|
| 31 |
+
self.experiment = experiment
|
| 32 |
+
self.model_name = model_name
|
| 33 |
+
self.total_params = total_params
|
| 34 |
+
|
| 35 |
+
# Registrar tags para melhor organização
|
| 36 |
+
default_tags = {"model_type": self.model_name}
|
| 37 |
+
if tags:
|
| 38 |
+
default_tags.update(tags)
|
| 39 |
+
mlflow.set_tags(default_tags)
|
| 40 |
+
|
| 41 |
+
# Registrar parâmetros
|
| 42 |
+
base_params = {"model_name": self.model_name, "total_params": self.total_params}
|
| 43 |
+
self.log_parameters(base_params)
|
| 44 |
+
|
| 45 |
+
def log_parameters(self, parameters: dict):
|
| 46 |
+
mlflow.log_params(parameters) # Mais eficiente que log_param individual
|
| 47 |
+
|
| 48 |
+
def log_metrics(self, metrics: dict, step: Optional[int] = None):
|
| 49 |
+
mlflow.log_metrics(metrics, step)
|
| 50 |
+
|
| 51 |
+
def log_checkpoint_table(self, current_lr:float, loss:float, perplexity: float, last_batch:int) -> None:
|
| 52 |
+
"""
|
| 53 |
+
Log a checkpoint record (month, day, hour, perplexity) to MLflow as a table artifact.
|
| 54 |
+
Perplexity is rounded to 4 decimal places.
|
| 55 |
+
|
| 56 |
+
Parameters
|
| 57 |
+
----------
|
| 58 |
+
perplexity : float
|
| 59 |
+
The perplexity metric to log (rounded to 4 decimal places).
|
| 60 |
+
:param current_lr:
|
| 61 |
+
:param loss:
|
| 62 |
+
:param perplexity:
|
| 63 |
+
:param last_batch:
|
| 64 |
+
"""
|
| 65 |
+
# Define artifact directory and ensure it exists
|
| 66 |
+
artifact_dir = f"checkpoint_table/model"
|
| 67 |
+
os.makedirs(artifact_dir, exist_ok=True)
|
| 68 |
+
|
| 69 |
+
# Capture current timestamp
|
| 70 |
+
now = datetime.now(ZoneInfo("America/Sao_Paulo"))
|
| 71 |
+
record = {
|
| 72 |
+
"month": now.month,
|
| 73 |
+
"day": now.day,
|
| 74 |
+
"hour": f"{now.hour:02d}:{now.minute:02d}",
|
| 75 |
+
"last_batch": last_batch,
|
| 76 |
+
"current_lr": round(current_lr, 7),
|
| 77 |
+
"perplexity": round(perplexity, 4),
|
| 78 |
+
"loss": round(loss, 4),
|
| 79 |
+
|
| 80 |
+
}
|
| 81 |
+
df_record = pd.DataFrame([record])
|
| 82 |
+
|
| 83 |
+
# Define artifact file path (relative POSIX path)
|
| 84 |
+
artifact_file = f"{artifact_dir}/checkpoint_table.json"
|
| 85 |
+
|
| 86 |
+
# Log the table to MLflow Tracking
|
| 87 |
+
mlflow.log_table(
|
| 88 |
+
data=df_record,
|
| 89 |
+
artifact_file=artifact_file
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
def checkpoint_model(self, model: nn.Module):
|
| 93 |
+
# Criar diretório local para checkpoint
|
| 94 |
+
step = 1
|
| 95 |
+
checkpoint_dir = f"checkpoints/model_{step}"
|
| 96 |
+
os.makedirs(checkpoint_dir, exist_ok=True)
|
| 97 |
+
|
| 98 |
+
# Salvar estado do modelo localmente
|
| 99 |
+
checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
|
| 100 |
+
torch.save(model.state_dict(), checkpoint_path)
|
| 101 |
+
|
| 102 |
+
# Registrar artefato no MLflow
|
| 103 |
+
mlflow.log_artifact(checkpoint_path, f"model_checkpoints/epoch_{step}")
|
| 104 |
+
|
| 105 |
+
input_example = torch.zeros(1, 128, dtype=torch.long) # Ajuste as dimensões conforme seu modelo
|
| 106 |
+
# input_example_numpy = input_example.cpu().numpy()
|
| 107 |
+
|
| 108 |
+
# Registrar modelo no registro de modelos MLflow
|
| 109 |
+
if self.model_name:
|
| 110 |
+
registered_model_name = f"{self.model_name}"
|
| 111 |
+
mlflow.pytorch.log_model(
|
| 112 |
+
pytorch_model=model,
|
| 113 |
+
artifact_path=f"models/epoch_{step}",
|
| 114 |
+
registered_model_name=registered_model_name,
|
| 115 |
+
pip_requirements=["torch>=1.9.0"],
|
| 116 |
+
code_paths=["tynerox/"], # Inclui código-fonte relevante
|
| 117 |
+
# input_example=input_example_numpy, # Exemplo de entrada
|
| 118 |
+
signature=None # Adicione assinatura do modelo se possível
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
table_dict = {
|
| 122 |
+
"entrada": ["Pergunta A", "Pergunta B"],
|
| 123 |
+
"saida": ["Resposta A", "Resposta B"],
|
| 124 |
+
"nota": [0.75, 0.40],
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
def log_html(self, html: str, step: Optional[int] = None):
    """Write *html* to a local file and log it as an MLflow artifact.

    Args:
        html: full HTML document to persist.
        step: accepted for API symmetry with ``log_metrics``; currently
            unused — every call overwrites the same artifact file.
    """
    file_path = "visualizations/sample.html"  # was a placeholder f-string
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # Explicit encoding so the artifact is UTF-8 regardless of locale.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    mlflow.log_artifact(file_path)
|
| 136 |
+
def finish(self):
    """Close the active MLflow run."""
    mlflow.end_run()
|
| 140 |
+
def __enter__(self):
    """Support use as a context manager; yields the logger itself."""
    return self
|
| 143 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """End the MLflow run when the ``with`` block exits (even on error)."""
    self.finish()
|
src/pre-training.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math

import torch
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast, get_cosine_schedule_with_warmup

from training import PreTrainer
from tynerox.modeling import TyneRoxModel, TyneRoxConfig
from dataset.pre_train import create_train_dataloader

if __name__ == "__main__":

    # 1 - Load the trained BPE tokenizer and wrap it in the HF fast API.
    tokenizer = Tokenizer.from_file("tokenizer/tokens-bpe-36k.json")
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="<|endoftext|>",
        eos_token="<|endoftext|>",
    )
    tokenizer.save_pretrained("../")

    # 2 - Model configuration and instantiation.
    config = TyneRoxConfig(
        vocab_size=tokenizer.vocab_size,
        pad_token_id=tokenizer.pad_token_id,
    )
    model = TyneRoxModel(config)
    # NOTE(review): hard-coded CUDA — fails fast on CPU-only machines,
    # which appears intentional for this training script.
    model.to("cuda")

    # 3 - Training dataloader (streaming corpus from the HF Hub).
    folder_path = "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens"
    dataloader = create_train_dataloader(
        folder_path,
        tokenizer,
        batch_size=20,
        max_length=1024,
        drop_last=True,
        num_workers=10,
    )

    # 4 - Optimizer; the model is compiled first so the optimizer binds to
    #     the final parameter objects.
    model = torch.compile(model)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.000461,  # initial LR; adjust (e.g. 3e-4) as needed
        weight_decay=0.1,
    )

    # 5 - Warmup configuration (fraction of the total step budget).
    epochs = 1
    warmup_ratio = 0.05
    num_training_steps = len(dataloader) * epochs
    num_warmup_steps = math.floor(num_training_steps * warmup_ratio)

    # 6 - Cosine LR scheduler with linear warmup.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # Prompts used to eyeball generation quality during training.
    sample_prompts = [
        "Olá, como vai você? ",
        "Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que ",
        "Não, respondeu; na verdade, estou com medo ",
        "O resultado representa uma desaceleração ",
        "No vídeo, é possível ver ",
        "Essa receita de torta de frango ",
        "Durante o primeiro mandato ",
        "Os donos de cães "
    ]

    logger_config = {
        "tracking_uri": "http://127.0.0.1:5000",
        "experiment": "Pre training LLM",
        "model_name": "Pre training LLM (Long Context)"
    }

    trainer = PreTrainer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        tokenizer=tokenizer,
        train_loader=dataloader,
        test_loader=None,
        logger_config=logger_config,
        use_amp=True
    )

    trainer.train(num_epochs=epochs, sample_prompts=sample_prompts)

    # 7 - Save the model config/weights for the Hugging Face upload.
    # NOTE(review): `model` is the torch.compile wrapper here — confirm that
    # save_pretrained resolves to the underlying module (state-dict keys may
    # otherwise carry an `_orig_mod.` prefix).
    model.save_pretrained("../")
+
|
src/tokenizer/__init__.py
ADDED
|
File without changes
|
src/tokenizer/tests.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quality metrics for the trained BPE tokenizer.

Streams the corpus, tokenizes it in batches and reports OOV rate,
fragmentation (tokens per word), vocabulary coverage and type-token ratio.
"""


def batch_iterator(stream, bs):
    """Group the "text" field of *stream* samples into lists of up to *bs* items.

    The final partial batch is flushed so no sample is lost.
    """
    buf = []
    for ex in stream:
        buf.append(ex["text"])
        if len(buf) == bs:
            yield buf
            buf = []
    if buf:  # flush the trailing partial batch
        yield buf


if __name__ == "__main__":
    # Heavy third-party imports live inside the entry point so importing this
    # module (e.g. to reuse batch_iterator) stays side-effect free.
    from datasets import load_dataset
    from tokenizers import Tokenizer

    # Load the dataset stream and the tokenizer under test.
    dataset_stream = load_dataset(
        "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
        split="train",
        streaming=True
    )
    tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
    unk_id = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()

    print("Tamanho do vocabulário:", vocab_size)
    enc = tokenizer.encode("Apostas combinadas: Fantástico exibe mensagens exclusivas da investigação contra Bruno Henrique, do Flamengo")
    print(tokenizer.decode(enc.ids, skip_special_tokens=True))

    # Running counters over the whole stream.
    total_tokens = 0
    total_words = 0
    unk_tokens = 0
    seen_ids = set()

    batch_size = 512
    batch_counter = 0

    for texts in batch_iterator(dataset_stream, batch_size):
        # Tokenize the whole batch at once (C-level loop).
        encs = tokenizer.encode_batch(texts)

        total_words += sum(len(t.split()) for t in texts)

        for enc in encs:
            total_tokens += len(enc.ids)
            unk_tokens += enc.ids.count(unk_id)
            seen_ids.update(enc.ids)

        # Partial report every 100 batches (guarded against empty batches).
        if batch_counter % 100 == 0 and total_tokens and total_words:
            oov_rate = unk_tokens / total_tokens * 100
            frag = total_tokens / total_words
            coverage = len(seen_ids) / vocab_size * 100
            ttr = len(seen_ids) / total_tokens
            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} t/palavra | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")
        batch_counter += 1

    # Final report; guard divisions against an empty stream.
    if total_tokens and total_words:
        oov_rate = unk_tokens / total_tokens * 100
        frag = total_tokens / total_words
        coverage = len(seen_ids) / vocab_size * 100
        ttr = len(seen_ids) / total_tokens

        print("\n=== Métricas Finais ===")
        print(f"Total de tokens: {total_tokens}")
        print(f"Total de palavras: {total_words}")
        print(f"OOV rate: {oov_rate:.3f}%")
        print(f"Fragmentação: {frag:.3f} tokens/palavra")
        print(f"Voc. coverage: {coverage:.2f}% do vocabulário usado")
        print(f"Type–Token Ratio: {ttr:.4f}")
    else:
        print("Nenhum token processado.")
src/tokenizer/tokens-bpe-36k.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/tokenizer/trainer.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Train the 36k-vocabulary BPE tokenizer from a streaming corpus."""

import time

# Reserved special tokens. Index order matters: "[UNK]" must stay first so
# BPE(unk_token="[UNK]") resolves to id 0.
special_tokens = [
    "[UNK]", "<|endoftext|>",
    "<|user_start|>", "<|user_end|>",
    "<|assistant_start|>", "<|assistant_end|>",
    "<|think_start|>", "<|think_end|>",
    "<|command_start|>", "<|command_end|>",
]

# Name of the column that contains the text.
coluna_texto = "text"


def get_training_corpus_streaming(stream=None, text_column=None):
    """Yield the raw training texts for the tokenizer, with progress logging.

    Args:
        stream: iterable of dict-like samples. When None, the corpus is
            loaded lazily in streaming mode from the Hugging Face Hub —
            unlike the original top-level load, importing this module no
            longer triggers network I/O.
        text_column: field holding the text; defaults to ``coluna_texto``.
    """
    if text_column is None:
        text_column = coluna_texto
    if stream is None:
        from datasets import load_dataset
        stream = load_dataset(
            "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
            split="train",
            streaming=True,
        )

    count = 0
    start_time = time.time()
    print("Iniciando iteração sobre o dataset streaming para o tokenizador...")
    for sample in stream:
        # Only well-formed samples with a string text field are used.
        if sample and text_column in sample and isinstance(sample[text_column], str):
            yield sample[text_column]
            count += 1
            if count % 10000 == 0:  # progress log every 10000 samples
                elapsed = time.time() - start_time
                print(f"  Processadas {count} amostras para o tokenizador em {elapsed:.2f} segundos...")
        else:
            print(f"Aviso: Pulando amostra inválida ou sem coluna '{text_column}': {sample}")
    end_time = time.time()
    print(
        f"Iteração completa. Total de {count} amostras fornecidas ao tokenizador em {end_time - start_time:.2f} segundos.")


if __name__ == "__main__":
    # Third-party imports are local to the entry point; see note above.
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    from tokenizers.pre_tokenizers import ByteLevel
    from tokenizers.normalizers import NFC
    from tokenizers.decoders import ByteLevel as ByteLevelDecoder

    print("Inicializando o tokenizador BPE...")
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    tokenizer.normalizer = NFC()
    tokenizer.decoder = ByteLevelDecoder(add_prefix_space=True)

    trainer = BpeTrainer(
        vocab_size=36000 + len(special_tokens),
        min_frequency=7,       # merges must occur at least 7 times
        limit_alphabet=1300,   # cap the initial character alphabet
        show_progress=True,
        special_tokens=special_tokens,
    )

    print("Iniciando o treinamento do tokenizador a partir do stream...")
    start_train_time = time.time()
    tokenizer.train_from_iterator(
        get_training_corpus_streaming(),
        trainer=trainer
    )
    end_train_time = time.time()
    print(f"Treinamento do tokenizador concluído em {end_train_time - start_train_time:.2f} segundos!")

    save_path = "tokens-bpe-36k.json"
    tokenizer.save(save_path, pretty=True)  # was a duplicated literal
    print(f"Tokenizador salvo em {save_path}")
src/training.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import time
|
| 3 |
+
from typing import Any, Optional, Dict, List
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from logger.logger import TrainerLogger
|
| 10 |
+
from torch.utils.data import DataLoader
|
| 11 |
+
from transformers import PreTrainedModel
|
| 12 |
+
|
| 13 |
+
# Pick the compute device once at import time: prefer the GPU when present.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 16 |
+
|
| 17 |
+
class BaseTrainer:
    """Generic training loop for causal language models.

    Holds the model/optimizer/scheduler/dataloaders, runs the epoch loop
    with optional mixed precision (AMP), and reports metrics, generated
    text samples and checkpoints through a ``TrainerLogger`` (MLflow).
    """

    def __init__(
        self,
        model: PreTrainedModel,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler._LRScheduler,
        tokenizer: Any,
        train_loader: DataLoader,
        test_loader: Optional[DataLoader] = None,
        logger_config: Optional[Dict[str, Any]] = None,
        use_amp: bool = True,
    ):
        """
        Args:
            model: HF-style causal LM (accepts ``labels`` and returns ``.loss``).
            optimizer: configured optimizer; stepped once per batch.
            scheduler: LR scheduler; also stepped once per batch.
            tokenizer: used for sample generation and label validation.
            train_loader: yields batches containing an ``input_ids`` tensor.
            test_loader: currently unused; kept for future evaluation support.
            logger_config: required dict with ``tracking_uri``, ``experiment``
                and ``model_name`` keys.
            use_amp: enable float16 autocast + GradScaler on CUDA.

        Raises:
            ValueError: if ``logger_config`` is missing.
        """
        if logger_config is None:
            # The original dereferenced logger_config unconditionally and
            # died with a TypeError; fail fast with a clear message instead.
            raise ValueError(
                "logger_config must provide 'tracking_uri', 'experiment' and 'model_name'"
            )

        self.model = model.to(device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.tokenizer = tokenizer
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.use_amp = use_amp
        self.scaler = torch.amp.GradScaler('cuda') if use_amp else None
        self.train_step = 0  # logging step counter (incremented every 100 batches)
        self._best_perplexity = float('inf')
        self._epochs_no_improve = 0

        total_params = sum(p.numel() for p in model.parameters())
        self.logger = TrainerLogger(
            tracking_uri=logger_config["tracking_uri"],
            experiment=logger_config["experiment"],
            run_name=logger_config["model_name"],
            model_name=logger_config["model_name"],
            total_params=total_params,
            tags={"version": "1.0", "environment": "development"},
        )

    def _generate_sample(self, sample_prompts: Optional[List[str]] = None) -> str:
        """Generate a continuation for each prompt; return them as HTML snippets.

        The model is put in eval mode for generation and restored to train
        mode afterwards. Generation errors are captured per prompt so a
        failed sample never aborts training.
        """
        # Avoid the original mutable default argument ([]).
        sample_prompts = sample_prompts or []
        self.model.eval()
        samples_html = ""
        for prompt in sample_prompts:
            try:
                inputs = self.tokenizer(prompt, return_tensors="pt")
                input_ids = inputs.input_ids.to(self.model.device)

                with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
                    generated_ids = self.model.generate(
                        input_ids=input_ids,
                        max_length=100,           # total length (prompt + continuation)
                        num_beams=5,              # beam search width
                        do_sample=True,           # sampling instead of pure greedy
                        top_k=50,                 # restrict sampling to the top-50 tokens
                        top_p=0.95,               # nucleus sampling (cumulative p <= 0.95)
                        temperature=0.7,          # "creativity" control
                        repetition_penalty=1.2,   # penalize exact repetitions
                        use_cache=True,           # reuse past_key_values
                        eos_token_id=self.tokenizer.eos_token_id,
                        pad_token_id=self.tokenizer.pad_token_id,
                    )

                generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            except Exception as e:
                # Surface the failure in the HTML report instead of crashing.
                generated_text = f"Erro: {e}"
            samples_html += f"<h4><b>prompt:</b> {prompt}</h4><p><b>Resposta:</b> {generated_text}</p>"
        self.model.train()
        return samples_html

    def _calc_loss_batch(self, inputs: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy loss for a batch of ``input_ids``.

        The KV cache is disabled during training; labels are the inputs
        themselves (standard causal-LM shifting happens inside the model).

        Raises:
            ValueError: if any id is outside the vocabulary (and not -100).
            RuntimeError: if the logits contain NaN/Inf.
        """
        ignore_idx = -100
        # Validate ids up front: every token must be a vocab id or ignore_idx.
        valid = ((inputs >= 0) | (inputs == ignore_idx)) & (inputs < self.tokenizer.vocab_size)
        if not valid.all():
            # raise (not assert) so the check survives `python -O`
            raise ValueError(
                f"Há labels inválidos: min={inputs.min().item()}, max={inputs.max().item()}"
            )

        inputs = inputs.to(device)
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            outputs = self.model(
                input_ids=inputs,
                labels=inputs,
                use_cache=False,   # no KV cache during training
                return_dict=True,  # guarantee .loss / .logits access
            )
        loss = outputs.loss
        if torch.isnan(outputs.logits).any() or torch.isinf(outputs.logits).any():
            raise RuntimeError("Logits inválidos detectados")
        return loss

    def _train_epoch(self, epoch: int, sample_prompts: Optional[List[str]] = None) -> List[float]:
        """Run one full pass over ``train_loader``; return the per-batch losses."""
        if sample_prompts is None:
            sample_prompts = []

        self.model.train()
        losses: List[float] = []
        size_dataset = len(self.train_loader)
        pbar = tqdm(
            self.train_loader,
            total=size_dataset,
            desc=f"Epoch {epoch + 1}",
            unit="batch",
            leave=False,
        )

        for i, batch in enumerate(pbar):
            start_time = time.time()
            self.optimizer.zero_grad()
            loss = self._calc_loss_batch(batch['input_ids'])
            losses.append(loss.item())

            if self.use_amp:
                # Scaled backward, then unscale so clipping sees true gradients.
                self.scaler.scale(loss).backward()
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.scaler.step(self.optimizer)
                self.scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()

            self.scheduler.step()  # per-batch LR schedule
            perplexity = math.exp(loss.item())
            current_lr = self.optimizer.param_groups[0].get('lr', 0.0)
            elapsed_time = time.time() - start_time

            pbar.set_postfix({
                "loss": f"{loss.item():.4f}",
                "perplexity": f"{perplexity:.4f}",
                "lr": f"{current_lr:.2e}",
                "elapsed_time": f"{elapsed_time:.2f}s",
            })

            # Metric logging every 100 batches.
            if (i + 1) % 100 == 0:
                self.train_step += 1
                avg_loss = sum(losses[-100:]) / 100
                self.logger.log_metrics(
                    {
                        "train_loss": avg_loss,
                        "train_perplexity": math.exp(avg_loss),
                        "lr": current_lr,
                    },
                    step=self.train_step,
                )

            # Text-sample generation every 500 batches.
            if (i + 1) % 500 == 0:
                samples_html = self._generate_sample(sample_prompts)
                self.logger.log_html(
                    f"<html><head><meta charset='utf-8'></head><body>{samples_html}</body></html>",
                    step=self.train_step,
                )

            # Checkpoint every 1000 batches.
            if (i + 1) % 1000 == 0:
                avg_loss = sum(losses[-1000:]) / 1000
                self.logger.log_checkpoint_table(current_lr, avg_loss, math.exp(avg_loss), i + 1)
                self.logger.checkpoint_model(self.model)
                self.model.save_pretrained("../")

        return losses

    def train(self, num_epochs: int = 500, sample_prompts: Optional[List[str]] = None):
        """Train for ``num_epochs`` epochs, logging the mean loss per epoch."""
        for epoch in range(num_epochs):
            train_losses = self._train_epoch(epoch, sample_prompts)
            # Guard against an empty dataloader (drop_last on a tiny dataset).
            mean_train_loss = sum(train_losses) / max(len(train_losses), 1)
            self.logger.log_metrics(
                {"mean_train_loss": mean_train_loss},
                step=epoch,
            )
            print(f"Epoch {epoch + 1} | Train Loss: {mean_train_loss:.4f}")

        self.logger.finish()
        print("Treinamento concluído!")
|
| 196 |
+
|
| 197 |
+
# Example use for fine-tuning:
class TuningTrainer(BaseTrainer):
    """Fine-tuning trainer; currently identical to BaseTrainer."""
|
| 201 |
+
# Example use for pre-training:
class PreTrainer(BaseTrainer):
    """Pre-training trainer; currently identical to BaseTrainer."""
|
src/tynerox/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Public surface of the tynerox package.
from .modeling import TyneRoxModel, TyneRoxConfig

__all__ = ["TyneRoxConfig", "TyneRoxModel"]
|
src/tynerox/modeling.py
ADDED
|
@@ -0,0 +1,449 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from flash_attn.flash_attn_interface import flash_attn_func
|
| 6 |
+
from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
|
| 7 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 8 |
+
from typing import Optional, Literal, Union, Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class PositionalEncoding(nn.Module):
    """Adds positional information to token embeddings.

    Supports the classic additive sinusoidal encoding and a rotary variant
    applied directly to the embedding's two feature halves.
    """
    def __init__(
        self,
        embed_dim: int,
        context_length: int,
        dropout: float = 0.1,
        encoding_type: Literal['sinusoidal', 'rotary'] = 'rotary',
    ):
        super().__init__()
        if embed_dim <= 0 or context_length <= 0:
            raise ValueError("embed_dim and context_length must be positive integers")
        if not 0 <= dropout < 1:
            raise ValueError("dropout must be between 0 and 1")

        self.dropout = nn.Dropout(dropout)
        self.encoding_type = encoding_type.lower()
        self.max_seq_len = context_length
        self.embed_dim = embed_dim

        if self.encoding_type == 'sinusoidal':
            # Precompute the full [1, T_max, D] table once.
            table = self._create_sinusoidal_embeddings(context_length, embed_dim)
            self.register_buffer('pe', table.unsqueeze(0), persistent=True)
        elif self.encoding_type == 'rotary':
            if embed_dim % 2 != 0:
                raise ValueError("embed_dim must be even for rotary encoding")
            # One inverse frequency per feature pair (D/2 entries).
            inv_freq = 1.0 / (10000 ** (torch.arange(0, embed_dim, 2).float() / embed_dim))
            self.register_buffer('inv_freq', inv_freq, persistent=True)
        else:
            raise ValueError("Unsupported encoding_type: 'sinusoidal' or 'rotary'")

    def _create_sinusoidal_embeddings(self, max_seq_len: int, dim: int) -> torch.Tensor:
        """Build the [max_seq_len, dim] sin/cos table from 'Attention Is All You Need'."""
        pos = torch.arange(max_seq_len).unsqueeze(1).float()
        freq = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
        table = torch.zeros(max_seq_len, dim)
        table[:, 0::2] = torch.sin(pos * freq)
        table[:, 1::2] = torch.cos(pos * freq)
        return table

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the positional encoding to ``x`` of shape [B, T, D], then dropout."""
        seq_len = x.size(1)
        if self.encoding_type == 'sinusoidal':
            x = x + self.pe[:, :seq_len, :]
        else:
            # Rotary path: angles are the outer product of positions and
            # inverse frequencies — identical to einsum('i,j->ij', ...).
            positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
            angles = torch.outer(positions, self.inv_freq)  # [T, D/2]
            x = self.apply_rotary(x, angles)
        return self.dropout(x)

    @staticmethod
    def apply_rotary(x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
        """Rotate the two feature halves of ``x`` by the angles in ``emb``.

        NOTE(review): this rotates the full embedding (not per-head q/k as in
        standard RoPE) and uses the conjugate sign convention; position 0 is
        the identity either way.
        """
        first, second = x.chunk(2, dim=-1)   # each [B, T, D/2]
        sin = emb.sin().unsqueeze(0)         # [1, T, D/2]
        cos = emb.cos().unsqueeze(0)         # [1, T, D/2]
        out_first = first * cos + second * sin
        out_second = second * cos - first * sin
        return torch.cat([out_first, out_second], dim=-1)  # [B, T, D]
|
| 78 |
+
|
| 79 |
+
class PositionalEmbedding(nn.Module):
    """Token embedding scaled by sqrt(D), followed by positional encoding."""
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        context_length: int,
        dropout: float = 0.05,
        encoding_type: Literal['sinusoidal', 'rotary'] = 'rotary'
    ):
        super().__init__()
        if vocab_size <= 0 or embed_dim <= 0 or context_length <= 0:
            raise ValueError("vocab_size, embed_dim, context_length must be > 0")

        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        # Scale embeddings by sqrt(D) — the Transformer convention.
        self.scale = math.sqrt(embed_dim)
        self.pos_encoding = PositionalEncoding(
            embed_dim=embed_dim,
            context_length=context_length,
            dropout=dropout,
            encoding_type=encoding_type
        )

    def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """Map [B, T] token ids to positionally-encoded [B, T, D] features."""
        scaled = self.token_embedding(input_ids) * self.scale
        return self.pos_encoding(scaled)
|
| 109 |
+
|
| 110 |
+
def get_alibi_slopes(n_heads: int) -> torch.Tensor:
    """Return the per-head ALiBi slopes for ``n_heads`` attention heads.

    For a power-of-two head count the slopes are the geometric series with
    ratio 2^(-8/n). Otherwise the series for the nearest lower power of two
    is extended with every other slope of the doubled count, per the ALiBi
    paper's interpolation scheme.
    """
    def _geometric(count: int) -> torch.Tensor:
        ratio = 2 ** (-8.0 / count)
        return torch.tensor([ratio ** (i + 1) for i in range(count)])

    if math.log2(n_heads).is_integer():
        return _geometric(n_heads)

    nearest_pow2 = 2 ** math.floor(math.log2(n_heads))
    head = _geometric(nearest_pow2)
    tail = _geometric(2 * nearest_pow2)[::2][: n_heads - nearest_pow2]
    return torch.cat([head, tail], dim=0)
|
| 121 |
+
# -----------------------------------------------------------------------------
|
| 122 |
+
# Feed-Forward
|
| 123 |
+
# -----------------------------------------------------------------------------
|
| 124 |
+
|
| 125 |
+
class FeedForward(nn.Module):
    """SwiGLU-style feed-forward block.

    ``fc1`` expands to ``emb_dim * multiplier`` features which are split in
    half: one half is the "up" path, the other is gated through SiLU, so
    ``fc2`` consumes ``hidden_dim // 2`` features back down to ``emb_dim``.
    """
    def __init__(self, emb_dim: int, hidden_dim_multiplier: int = 4):
        super().__init__()
        hidden_dim = emb_dim * hidden_dim_multiplier
        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim // 2, emb_dim)
        self.activation = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        up, gate = self.fc1(x).chunk(2, dim=-1)
        return self.fc2(up * self.activation(gate))
|
| 138 |
+
# -----------------------------------------------------------------------------
|
| 139 |
+
# Attention-Free Transformer (AFT) Simple
|
| 140 |
+
# -----------------------------------------------------------------------------
|
| 141 |
+
|
| 142 |
+
class AFTSimple(nn.Module):
    """Attention-Free Transformer (AFT-simple) layer with an optional KV cache.

    Instead of pairwise attention scores, keys are softmax-normalized over
    the time dimension and used to pool the values into a running context,
    which is then gated element-wise by an activation of the queries.
    """
    def __init__(
        self,
        embed_dim: int,
        activation=torch.sigmoid,
        causal: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.causal = causal
        self.activation = activation

        # Fused projection producing Q, K and V in one matmul (no bias).
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim, bias=False)
        self.project = nn.Linear(embed_dim, embed_dim)

    def forward(
        self,
        x: torch.Tensor,
        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run AFT-simple over the new tokens in ``x``.

        Args:
            x: input of shape [B, T_new, D].
            past_key_values: optional (K, V) cache from earlier calls,
                each of shape [B, T_past, D].

        Returns:
            Tuple of the output [B, T_new, D] and the updated (K, V)
            cache covering all tokens seen so far.

        Raises:
            ValueError: if the last dimension of ``x`` is not ``embed_dim``.
        """
        batch, new_len, feat_dim = x.shape
        if feat_dim != self.embed_dim:
            raise ValueError(f"Input dim ({feat_dim}) != embed_dim ({self.embed_dim})")

        queries, keys_new, values_new = self.qkv(x).chunk(3, dim=-1)  # each [B, T_new, D]

        # Extend the cache with this chunk's keys/values.
        if past_key_values is None:
            keys, values = keys_new, values_new
        else:
            cached_k, cached_v = past_key_values
            keys = torch.cat([cached_k, keys_new], dim=1)     # [B, T_all, D]
            values = torch.cat([cached_v, values_new], dim=1)

        # Softmax over time turns the keys into per-position pooling weights.
        pooled = F.softmax(keys, dim=1) * values              # [B, T_all, D]

        if self.causal:
            # Prefix sum so position t only aggregates tokens <= t.
            context = torch.cumsum(pooled, dim=1)             # [B, T_all, D]
        else:
            summed = pooled.sum(dim=1, keepdim=True)          # [B, 1, D]
            context = summed.expand(-1, keys.size(1), -1)     # [B, T_all, D]

        # Keep only the context rows that belong to the new tokens.
        fresh_context = context[:, -new_len:, :]              # [B, T_new, D]
        gated = self.activation(queries) * fresh_context      # [B, T_new, D]
        output = self.project(gated)                          # [B, T_new, D]

        return output, (keys, values)
|
| 197 |
+
# -----------------------------------------------------------------------------
|
| 198 |
+
# Flash Attention with ALiBi and KV-cache
|
| 199 |
+
# -----------------------------------------------------------------------------
|
| 200 |
+
|
| 201 |
+
class FlashAttention(nn.Module):
    """Multi-head attention via the external ``flash_attn_func`` kernel, with
    ALiBi positional bias, an optional causal sliding window, and an
    incremental key/value cache for autoregressive decoding.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        window_size: int,
        causal: bool = True,
        qkv_bias: bool = False,
    ):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.causal = causal
        self.window_size = window_size

        # Fused projection producing queries, keys and values in one matmul.
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim, bias=qkv_bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)

        # precompute ALiBi slopes
        # Registered as a buffer so the slopes follow the module through
        # .to(device) / state_dict without being treated as parameters.
        self.register_buffer('alibi', get_alibi_slopes(num_heads))

    def forward(
        self,
        x: torch.Tensor,
        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Attend over the cached prefix plus the new tokens ``x``.

        Returns the attention output for the new tokens only, plus the
        updated (k, v) cache covering all tokens seen so far.
        """
        # x: [B, T_new, D]
        B, T_new, _ = x.size()
        # NOTE(review): this view groups the fused projection output per head
        # as [q_h | k_h | v_h] chunks of size head_dim within each head slice,
        # rather than the conventional full-width [Q | K | V] split. Because
        # the projection weights are learned, the layout is internally
        # consistent, but confirm it matches any checkpoint-conversion code.
        qkv = self.qkv(x).view(B, T_new, self.num_heads, 3 * self.head_dim)
        q, k_new, v_new = torch.chunk(qkv, 3, dim=-1)  # each [B, T_new, H, Dh]

        # Append the new keys/values to the cache along the time dimension.
        if past_key_values is not None:
            k_past, v_past = past_key_values
            k = torch.cat([k_past, k_new], dim=1)  # [B, T_all, H, Dh]
            v = torch.cat([v_past, v_new], dim=1)
        else:
            k, v = k_new, v_new

        # window_size follows the flash-attn (left, right) convention:
        # causal mode looks back at most `window_size` tokens (window-1 plus
        # the current position) and never forward; non-causal uses (-1, -1),
        # i.e. unrestricted attention.
        attn_out = flash_attn_func(
            q, k, v,
            softmax_scale=1.0 / math.sqrt(self.head_dim),
            causal=self.causal,
            window_size=(self.window_size - 1, 0) if self.causal else (-1, -1),
            alibi_slopes=self.alibi,
            return_attn_probs=False,
        )
        # attn_out: [B, T_new, H, Dh]
        out = attn_out.contiguous().view(B, T_new, -1)  # [B, T_new, D]
        y = self.out_proj(out)  # [B, T_new, D]

        return y, (k, v)
|
| 253 |
+
|
| 254 |
+
# -----------------------------------------------------------------------------
|
| 255 |
+
# Transformer Blocks and Model
|
| 256 |
+
# -----------------------------------------------------------------------------
|
| 257 |
+
|
| 258 |
+
class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block.

    The attention sublayer is either the global AFT-simple layer
    (``att_global=True``) or windowed FlashAttention; it is followed by two
    stacked gated feed-forward layers. Both sublayers use residual
    connections with dropout on the sublayer output.
    """

    def __init__(self, config, att_global: bool = True):
        super().__init__()
        if att_global:
            self.attn = AFTSimple(embed_dim=config.d_model, causal=config.causal)
        else:
            self.attn = FlashAttention(
                embed_dim=config.d_model,
                num_heads=config.num_attention_heads,
                window_size=config.window_size,
                causal=config.causal,
                qkv_bias=True,
            )
        self.ff = nn.Sequential(
            FeedForward(config.d_model),
            FeedForward(config.d_model),
        )
        self.ln1 = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
        self.ln2 = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
        self.drop = nn.Dropout(config.dropout)

    def forward(
        self,
        x: torch.Tensor,
        past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Apply attention and feed-forward sublayers; returns (output, cache)."""
        # Attention sublayer with residual connection (pre-norm).
        attn_out, present = self.attn(self.ln1(x), past_key_values=past_key_values)
        x = x + self.drop(attn_out)

        # Feed-forward sublayer with residual connection (pre-norm).
        x = x + self.drop(self.ff(self.ln2(x)))

        return x, present
|
| 297 |
+
|
| 298 |
+
class ResidualBlocks(nn.Module):
    """Stack of TransformerBlock layers followed by a final LayerNorm.

    Every third layer (1-indexed) uses the global AFT attention; the other
    layers use local (windowed) FlashAttention.
    """

    def __init__(self, config):
        super().__init__()
        # alternate local/global: every 3rd layer global
        self.layers = nn.ModuleList(
            TransformerBlock(config, att_global=((idx + 1) % 3 == 0))
            for idx in range(config.num_hidden_layers)
        )
        self.final_ln = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None
    ) -> Tuple[torch.Tensor, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]:
        """Run all layers, threading one KV cache entry per layer.

        Returns the normalized hidden states and a tuple with each layer's
        updated cache.
        """
        presents = []
        for idx, block in enumerate(self.layers):
            cache = past_key_values[idx] if past_key_values is not None else None
            x, present = block(x, past_key_values=cache)
            presents.append(present)
        return self.final_ln(x), tuple(presents)
|
| 321 |
+
|
| 322 |
+
# -----------------------------------------------------------------------------
|
| 323 |
+
# Configuration and Model
|
| 324 |
+
# -----------------------------------------------------------------------------
|
| 325 |
+
|
| 326 |
+
class TyneRoxConfig(PretrainedConfig):
    """Hyper-parameter container for TyneRox models.

    Constructor arguments keep their original names, but ``context_length``
    is stored under the standard HF attribute ``max_position_embeddings``
    and ``num_heads`` under ``num_attention_heads``.
    """

    model_type = "tynerox"

    def __init__(
        self,
        vocab_size: int = 30522,
        context_length: int = 2048,
        d_model: int = 1024,
        num_heads: int = 16,
        window_size: int = 512,
        num_hidden_layers: int = 12,
        causal: bool = True,
        dropout: float = 0.1,
        layer_norm_eps: float = 1e-5,
        tie_word_embeddings: bool = False,
        pad_token_id: int = 0,
        **kwargs
    ):
        super().__init__(**kwargs)
        # Vocabulary and sequence geometry.
        self.vocab_size = vocab_size
        self.max_position_embeddings = context_length
        # Transformer dimensions.
        self.d_model = d_model
        self.num_attention_heads = num_heads
        self.window_size = window_size
        self.num_hidden_layers = num_hidden_layers
        # Behavior switches and regularization.
        self.causal = causal
        self.dropout = dropout
        self.layer_norm_eps = layer_norm_eps
        self.tie_word_embeddings = tie_word_embeddings
        self.pad_token_id = pad_token_id
|
| 356 |
+
|
| 357 |
+
class TyneRoxModel(PreTrainedModel, GenerationMixin):
    """Causal language model: rotary positional embeddings, the
    ResidualBlocks transformer stack, and a linear LM head, wired into the
    Hugging Face ``PreTrainedModel``/``GenerationMixin`` machinery so it
    supports ``generate()`` with KV caching and beam search.
    """

    config_class = TyneRoxConfig

    def __init__(self, config: TyneRoxConfig):
        super().__init__(config)
        # Token + rotary positional embedding (defined elsewhere in this package).
        self.embed = PositionalEmbedding(
            config.vocab_size,
            config.d_model,
            config.max_position_embeddings,
            dropout=config.dropout,
            encoding_type='rotary'
        )
        self.transformer = ResidualBlocks(config)
        # Output projection to vocabulary logits; bias-free. Tying with the
        # input embedding is governed by config.tie_word_embeddings via the
        # HF machinery — presumably applied in post_init(); confirm.
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # HF hook: weight initialization and optional weight tying.
        self.post_init()

    def get_input_embeddings(self):
        """Return the input token-embedding layer (HF resize/tie support)."""
        return self.embed.token_embedding

    def set_input_embeddings(self, value):
        """Replace the input token-embedding layer (HF resize/tie support)."""
        self.embed.token_embedding = value

    def get_output_embeddings(self):
        """Return the LM head (HF weight-tying support)."""
        return self.lm_head

    def set_output_embeddings(self, value):
        """Replace the LM head (HF weight-tying support)."""
        self.lm_head = value

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: bool = True,
        return_dict: bool = True,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model.

        Args:
            input_ids: token ids [B, T]; during cached generation T is 1.
            past_key_values: per-layer (K, V) tensors from a previous call.
            labels: optional target ids for the next-token loss; positions
                with value -100 are ignored.
            use_cache: include the updated KV cache in the output.
            return_dict: return a ``CausalLMOutputWithPast`` rather than a tuple.

        Returns:
            ``CausalLMOutputWithPast`` (or tuple if ``return_dict=False``)
            holding the optional loss, logits [B, T, V], and updated cache.
        """
        # 1) Embeddings
        x = self.embed(input_ids)  # [B, T, D]

        # 2) Transformer blocks with KV-cache
        x, new_past = self.transformer(x, past_key_values=past_key_values)

        # 3) Project to vocabulary logits
        logits = self.lm_head(x)  # [B, T, V]

        # 4) Compute loss if labels provided
        loss = None
        if labels is not None:
            # Shift so the logits at position t are scored against the token
            # at t+1 (standard causal-LM objective, HF -100 ignore convention).
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        # 5) Return standardized output
        if not return_dict:
            output = (logits, new_past) if use_cache else (logits,)
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=new_past if use_cache else None,
        )

    def _reorder_cache(
        self,
        past_key_values: Tuple[Tuple[torch.Tensor, torch.Tensor], ...],
        beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """Reorder the KV cache to follow beam-search hypothesis reshuffling."""
        reordered = []
        for k, v in past_key_values:
            # both tensors carry the batch dimension at dim 0
            reordered.append((k.index_select(0, beam_idx),
                              v.index_select(0, beam_idx)))
        return tuple(reordered)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        **kwargs
    ) -> dict:
        """Assemble the model inputs for one decoding step of ``generate()``."""
        # at generation time, only feed in the last token
        if past_key_values is not None:
            input_ids = input_ids[:, -1:].contiguous()
        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
        }
|
src/visualizations/sample.html
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<html><head><meta charset='utf-8'></head><body><h4><b>prompt:</b> Olá, como vai você? </h4><p><b>Resposta:</b> Olá, como vai você? .
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
</p><h4><b>prompt:</b> Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que </h4><p><b>Resposta:</b> Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que .
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
</p><h4><b>prompt:</b> Não, respondeu; na verdade, estou com medo </h4><p><b>Resposta:</b> Não, respondeu; na verdade, estou com medo .
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
</p><h4><b>prompt:</b> O resultado representa uma desaceleração </h4><p><b>Resposta:</b> O resultado representa uma desaceleração .
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
</p><h4><b>prompt:</b> No vídeo, é possível ver </h4><p><b>Resposta:</b> No vídeo, é possível ver .
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
</p><h4><b>prompt:</b> Essa receita de torta de frango </h4><p><b>Resposta:</b> Essa receita de torta de frango .
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
</p><h4><b>prompt:</b> Durante o primeiro mandato </h4><p><b>Resposta:</b> Durante o primeiro mandato .
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
</p><h4><b>prompt:</b> Os donos de cães </h4><p><b>Resposta:</b> Os donos de cães .
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
|
| 670 |
+
|
| 671 |
+
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
|
| 691 |
+
|
| 692 |
+
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
|
| 700 |
+
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
|
| 705 |
+
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
|
| 720 |
+
|
| 721 |
+
|
| 722 |
+
|
| 723 |
+
</p></body></html>
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[UNK]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<|endoftext|>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<|user_start|>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<|user_end|>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "<|assistant_start|>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
},
|
| 43 |
+
"5": {
|
| 44 |
+
"content": "<|assistant_end|>",
|
| 45 |
+
"lstrip": false,
|
| 46 |
+
"normalized": false,
|
| 47 |
+
"rstrip": false,
|
| 48 |
+
"single_word": false,
|
| 49 |
+
"special": true
|
| 50 |
+
},
|
| 51 |
+
"6": {
|
| 52 |
+
"content": "<|think_start|>",
|
| 53 |
+
"lstrip": false,
|
| 54 |
+
"normalized": false,
|
| 55 |
+
"rstrip": false,
|
| 56 |
+
"single_word": false,
|
| 57 |
+
"special": true
|
| 58 |
+
},
|
| 59 |
+
"7": {
|
| 60 |
+
"content": "<|think_end|>",
|
| 61 |
+
"lstrip": false,
|
| 62 |
+
"normalized": false,
|
| 63 |
+
"rstrip": false,
|
| 64 |
+
"single_word": false,
|
| 65 |
+
"special": true
|
| 66 |
+
},
|
| 67 |
+
"8": {
|
| 68 |
+
"content": "<|command_start|>",
|
| 69 |
+
"lstrip": false,
|
| 70 |
+
"normalized": false,
|
| 71 |
+
"rstrip": false,
|
| 72 |
+
"single_word": false,
|
| 73 |
+
"special": true
|
| 74 |
+
},
|
| 75 |
+
"9": {
|
| 76 |
+
"content": "<|command_end|>",
|
| 77 |
+
"lstrip": false,
|
| 78 |
+
"normalized": false,
|
| 79 |
+
"rstrip": false,
|
| 80 |
+
"single_word": false,
|
| 81 |
+
"special": true
|
| 82 |
+
}
|
| 83 |
+
},
|
| 84 |
+
"clean_up_tokenization_spaces": false,
|
| 85 |
+
"eos_token": "<|endoftext|>",
|
| 86 |
+
"extra_special_tokens": {},
|
| 87 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 88 |
+
"pad_token": "<|endoftext|>",
|
| 89 |
+
"tokenizer_class": "PreTrainedTokenizer",
|
| 90 |
+
"unk_token": "[UNK]"
|
| 91 |
+
}
|