Ubuntu committed
Commit 58d9159 · 0 Parent(s):

Re-adds model.safetensors via LFS
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,96 @@
+ # Byte-compiled / build artifacts
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ src/checkpoints/
+ mlflow/
+ postgres-temp/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+ *.pth
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+
+ # IDEs and editors
+ .idea/
+ *.iml
+ .vscode/
+ *.sublime-project
+ *.sublime-workspace
+
+ # Docker
+ docker-compose.override.yml
+ .docker/
+
+ # Environment / virtualenv
+ .env
+ .venv/
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Poetry
+ poetry.lock
+ poetry.toml
+
+ # Pyproject (if set up as public; if private, remove)
+ # pyproject.toml
+
+ # Lock files
+ Pipfile.lock
+
+ # Runtime data
+ *.pid
+ *.seed
+ *.log
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # VS Code settings
+ .vscode/
+
+ # OS files
+ .DS_Store
+ Thumbs.db
Dockerfile ADDED
@@ -0,0 +1,3 @@
+ FROM ghcr.io/mlflow/mlflow:v2.21.3
+
+ RUN pip install boto3
README.md ADDED
@@ -0,0 +1,7 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ pipeline_tag: text-generation
+ base_model:
+ - bobboyms/tynerox
+ ---
config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "architectures": [
+     "TyneRoxModel"
+   ],
+   "causal": true,
+   "d_model": 1024,
+   "dropout": 0.1,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "tynerox",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "vocab_size": 36010,
+   "window_size": 512
+ }
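
These fields line up with the TyneRoxConfig defaults in src/tynerox/modeling.py (context_length is serialized as max_position_embeddings, num_heads as num_attention_heads). A minimal loading sketch, assuming the tynerox package from this repo is importable and the files sit in the current directory:

    from tynerox import TyneRoxConfig, TyneRoxModel

    config = TyneRoxConfig.from_pretrained(".")  # reads this config.json
    model = TyneRoxModel.from_pretrained(".")    # loads model.safetensors
    print(model.config.d_model, model.config.window_size)  # 1024 512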
docker-compose.yml ADDED
@@ -0,0 +1,45 @@
+ version: '3'
+ services:
+   mlflow:
+     # image: ghcr.io/mlflow/mlflow:v2.21.3
+     # image: ghcr.io/mlflow/mlflow:v3.0.0rc0
+     build: .
+     ports:
+       - "5000:5000"
+     environment:
+       - MLFLOW_ARTIFACT_ROOT=s3://1hh-mlflow/artifacts
+       - MLFLOW_TRACKING_URI=http://mlflow:5000
+     volumes:
+       - ./mlflow:/mlflow
+     command: mlflow server --host 0.0.0.0 --port 5000 --backend-store-uri sqlite:///mlflow.db --default-artifact-root s3://1hh-mlflow/artifacts
+     # s3://1hh-mlflow/artifacts/
+   postgres:
+     image: postgres:14
+     environment:
+       - POSTGRES_USER=mlflow
+       - POSTGRES_PASSWORD=mlflow
+       - POSTGRES_DB=mlflowdb
+     volumes:
+       - ./postgres-temp:/var/lib/postgresql/temp
+
+   # minio:
+   #   image: minio/minio:latest
+   #   ports:
+   #     - "9000:9000"
+   #   environment:
+   #     - MINIO_ROOT_USER=minioadmin
+   #     - MINIO_ROOT_PASSWORD=minioadmin
+   #   volumes:
+   #     - ./minio-temp:/temp
+   #   command: server /temp --console-address ":9001"
+
+   nginx:
+     image: nginx:latest
+     ports:
+       - "80:80"
+     volumes:
+       # - ./nginx.conf:/etc/nginx/nginx.conf:ro
+       - ./nginx.conf:/etc/nginx/conf.d/default.conf:ro
+     depends_on:
+       - mlflow
+       # - minio
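
This stack exposes the tracking server directly on port 5000 and behind nginx on port 80. A minimal client-side sketch (hedged: the experiment name is the one used in src/pre-training.py, the metric values are made up, and S3 credentials are assumed to come from the environment):

    import mlflow

    mlflow.set_tracking_uri("http://localhost:5000")  # or port 80 through nginx
    mlflow.set_experiment("Pre training LLM")

    with mlflow.start_run(run_name="smoke-test"):
        mlflow.log_param("d_model", 1024)
        mlflow.log_metric("train_loss", 3.21, step=1)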
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "_from_model_config": true,
+   "pad_token_id": 1,
+   "transformers_version": "4.51.3"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dc6c386af412163c51f18f97152117040b6464f9e64159ef464d50471ceda1c
+ size 1101168184
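
Since the commit's whole point is re-adding the weights through LFS, it can be worth checking that a clone actually materialized the 1.1 GB payload instead of this 3-line pointer. A sketch using only the standard library (the path assumes the repo root as working directory):

    import hashlib, os

    EXPECTED_OID = "2dc6c386af412163c51f18f97152117040b6464f9e64159ef464d50471ceda1c"
    EXPECTED_SIZE = 1101168184

    path = "model.safetensors"
    assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch: run `git lfs pull`?"

    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            h.update(chunk)
    assert h.hexdigest() == EXPECTED_OID, "content does not match the LFS pointer oid"
    print("model.safetensors matches the LFS pointer")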
nginx.conf ADDED
@@ -0,0 +1,41 @@
+ # mlflow_proxy.conf
+
+ upstream mlflow {
+     server mlflow:5000;
+ }
+
+ # upstream minio {
+ #     server minio:9000;
+ # }
+
+ server {
+     listen 80;
+     server_name _;  # Optional: listens on any host name
+
+     # Logs specific to this server block (helps with debugging)
+     access_log /var/log/nginx/mlflow_access.log;
+     error_log /var/log/nginx/mlflow_error.log debug;  # Use 'debug' for more detail
+
+     location / {
+         proxy_pass http://mlflow;
+         proxy_set_header Host $host;
+         proxy_set_header X-Real-IP $remote_addr;
+         # Headers useful for reverse proxies
+         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+         proxy_set_header X-Forwarded-Proto $scheme;
+
+         # Optional: increase the timeouts if there are connection problems
+         # proxy_connect_timeout 60s;
+         # proxy_read_timeout 60s;
+     }
+
+     # location /minio {
+     #     # Note: MinIO may need URL rewriting or specific configuration
+     #     # depending on how it handles subpaths.
+     #     proxy_pass http://minio;
+     #     proxy_set_header Host $host;  # MinIO may need the correct host
+     #     proxy_set_header X-Real-IP $remote_addr;
+     #     proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+     #     proxy_set_header X-Forwarded-Proto $scheme;
+     # }
+ }
pyproject.toml ADDED
@@ -0,0 +1,47 @@
+ [project]
+ name = "tynerox"
+ version = "0.1.0"
+ description = ""
+ authors = [
+     {name = "Thiago L. Rodrigues"}
+ ]
+ readme = "README.md"
+ requires-python = ">=3.9,<4.0"
+ dependencies = [
+     "torch (>=2.6.0,<3.0.0)",
+     "transformers[torch] (>=4.50.3,<5.0.0)",
+     "python-dotenv (>=1.1.0,<2.0.0)",
+     "tavily-python (>=0.5.3,<0.6.0)",
+     "langchain-community (>=0.3.20,<0.4.0)",
+     "pydantic (>=2.11.1,<3.0.0)",
+     "pandas (>=2.2.3,<3.0.0)",
+     "openai-agents (>=0.0.7,<0.0.8)",
+     "datasets (>=3.5.0,<4.0.0)",
+     "mlflow (>=2.21.3,<3.0.0)",
+     "beautifulsoup4 (>=4.13.3,<5.0.0)",
+     "packaging (>=24.2,<25.0)",
+     "boto3 (>=1.37.37,<2.0.0)",
+     "flash-attn (>=2.7.4.post1,<3.0.0)",
+ ]
+
+ [tool.poetry]
+ name = "tynerox"
+ version = "0.1.0"
+ packages = [
+     { include = "tynerox", from = "src" }
+ ]
+
+ [build-system]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = "^8.3.5"
+
+ [tool.pytest.ini_options]
+ minversion = "6.0"
+ addopts = "-ra -q"
+ testpaths = ["src/tests"]
+ python_files = ["test_*.py"]
+ norecursedirs = ["postgres-data"]
setup.py ADDED
@@ -0,0 +1,18 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name="tynerox",
+     version="0.1.0",
+     description="TyneRox: custom rotary-transformer causal LM",
+     author="Thiago Luiz Rodrigues",
+     author_email="<EMAIL>",
+     url="https://github.com/seu-usuario/tynerox",
+     license="Apache-2.0",
+     packages=find_packages("src"),
+     package_dir={"": "src"},
+     install_requires=[
+         "torch>=2.6.0,<3.0.0",
+         "transformers[torch]>=4.50.3,<5.0.0",
+         "flash-attn>=2.7.4.post1,<3.0.0",
+     ],
+ )
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "[UNK]"
+ }
src/__init__.py ADDED
File without changes
src/dataset/__init__.py ADDED
File without changes
src/dataset/fine_tuning.py ADDED
@@ -0,0 +1,160 @@
+ from typing import List, Dict, Any, Optional
+ import torch
+ from torch.utils.data import DataLoader
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ from torch.nn.utils.rnn import pad_sequence
+ from functools import partial
+
+
+ def tokenize_function(examples: Dict[str, Any], tokenizer: Any) -> Dict[str, List[int]]:
+     """
+     Applies the tokenizer's chat template and produces the token ids.
+
+     Args:
+         examples (Dict[str, Any]): Dictionary containing the list of messages under the "messages" key.
+         tokenizer (Any): Tokenizer instance, which must expose a 'chat_template' property.
+
+     Returns:
+         Dict[str, List[int]]: Dictionary with the generated token ids.
+     """
+     full_text = tokenizer.apply_chat_template(
+         examples["messages"],
+         tokenize=True,
+         add_generation_prompt=True
+     )
+     return {"input_ids": full_text}
+
+
+ def custom_collate_fn(
+     batch: List[Dict[str, List[int]]],
+     pad_token_id: int = 29797,
+     ignore_index: int = -100,
+     allowed_max_length: Optional[int] = None,
+     device: str = "cpu",
+ ) -> Dict[str, torch.Tensor]:
+     """
+     • Pads the sequences
+     • Builds (input, label) pairs by shifting one position
+     • Applies `ignore_index` (-100) ONLY to the labels after the first PAD
+     """
+
+     # 1) List → tensor, with one trailing PAD appended
+     seqs = [torch.tensor(s["input_ids"] + [pad_token_id]) for s in batch]
+
+     # 2) Pad up to the longest sequence in the batch
+     padded = pad_sequence(seqs, batch_first=True, padding_value=pad_token_id)
+
+     # 3) Shift by one position and CLONE to break memory sharing
+     input_ids = padded[:, :-1].clone()  # ← will never contain -100
+     labels = padded[:, 1:].clone()      # ← edited below
+
+     # 4) Set -100 after the first PAD of each sequence
+     pad_mask = (labels == pad_token_id)
+     if pad_mask.any():
+         # index of the first PAD occurrence in each row
+         first_pad_pos = pad_mask.float().cumsum(1).eq(1) & pad_mask
+         # everything after the first PAD receives -100
+         mask_after_first_pad = pad_mask & ~first_pad_pos
+         labels[mask_after_first_pad] = ignore_index
+
+     # 5) Truncate if requested
+     if allowed_max_length is not None:
+         input_ids = input_ids[:, :allowed_max_length]
+         labels = labels[:, :allowed_max_length]
+
+     return {
+         "input_ids": input_ids.to(device),
+         "labels": labels.to(device),
+     }
+
+
+ def create_data_loader_fine_tuning(
+     tokenizer: Any,
+     batch_size: int,
+     path_folder: str,
+     split: str = "train",
+     pad_token_id: int = 0,
+     ignore_index: int = -100,
+     allowed_max_length: Optional[int] = None,
+     device: str = "cpu"
+ ) -> DataLoader:
+     """
+     Creates the DataLoader for fine-tuning from a tokenized dataset.
+
+     This function loads the dataset, applies tokenization using a chat
+     template, and returns a DataLoader that uses custom_collate_fn for
+     proper batch processing.
+
+     Args:
+         tokenizer (Any): Pretrained tokenizer that supports chat templates.
+         batch_size (int): Number of samples per batch.
+         path_folder (str): Path or identifier of the dataset.
+         split (str): Dataset split to use (for example, "train" or "test").
+         pad_token_id (int): Token id used for padding.
+         ignore_index (int): Value ignored by the loss function.
+         allowed_max_length (Optional[int]): If set, truncates sequences to this maximum length.
+         device (str): Device the tensors are sent to ("cpu" or "cuda").
+
+     Returns:
+         DataLoader: DataLoader instance ready for fine-tuning.
+     """
+     # Defines the chat template and assigns it to the tokenizer.
+     chat_template = """
+     {% for message in messages %}
+     {% if message['role'] == 'user' %}
+     {{ '<|user_start|>' + message['content'] + '<|user_end|>' + '\n'}}
+     {% elif message['role'] == 'assistant' %}
+     {{ '<|assistant_start|>' + message['content'] + '<|assistant_end|>' + '\n' }}
+     {% endif %}
+     {% endfor %}
+     """
+     tokenizer.chat_template = chat_template
+
+     # Loads the dataset.
+     raw_dataset = load_dataset(path=path_folder, split=split, download_mode="force_redownload")
+
+     # Applies tokenization using the function defined above.
+     tokenized_dataset = raw_dataset.map(
+         lambda examples: tokenize_function(examples, tokenizer),
+         batched=True,
+         remove_columns=raw_dataset.column_names,
+         desc="Tokenizing dataset"
+     )
+
+     # Configures collate_fn with the desired parameters.
+     collate = partial(
+         custom_collate_fn,
+         pad_token_id=pad_token_id,
+         ignore_index=ignore_index,
+         allowed_max_length=allowed_max_length,
+         device=device
+     )
+
+     print("Creating DataLoader...")
+     return DataLoader(
+         tokenized_dataset,
+         batch_size=batch_size,
+         shuffle=False,
+         drop_last=False,
+         num_workers=0,
+         collate_fn=collate
+     )
+
+
+ if __name__ == "__main__":
+     # Loads the pretrained tokenizer.
+     tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
+
+     # Creates the DataLoader for the test split of the "conversational" dataset.
+     loader = create_data_loader_fine_tuning(
+         tokenizer=tokenizer,
+         batch_size=100,
+         path_folder="conversational",
+         split="test"
+     )
+
+     # Tests extracting one batch.
+     batch = next(iter(loader))
+     print(batch["input_ids"].shape, batch["labels"].shape)
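
A tiny worked example of the shift-and-mask logic in custom_collate_fn (the values are made up; pad_token_id=0 for readability, and the function above is assumed to be in scope):

    batch = [{"input_ids": [5, 6, 7]}, {"input_ids": [5, 6]}]
    out = custom_collate_fn(batch, pad_token_id=0)
    # After appending one PAD and padding, padded = [[5, 6, 7, 0], [5, 6, 0, 0]].
    # input_ids drops the last column, labels drop the first; the first PAD in
    # each label row is kept (so the model learns to stop), later PADs become -100.
    print(out["input_ids"].tolist())  # [[5, 6, 7], [5, 6, 0]]
    print(out["labels"].tolist())     # [[6, 7, 0], [6, 0, -100]]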
src/dataset/pre_train.py ADDED
@@ -0,0 +1,106 @@
+ import os
+ from typing import Any, Dict, List, Optional, Union
+
+ import torch
+ from torch.utils.data import DataLoader
+ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+ from datasets import load_dataset, Dataset, DatasetDict
+
+
+ def tokenize_function(
+     examples: Dict[str, List[str]],
+     tokenizer: PreTrainedTokenizer
+ ) -> Dict[str, List[List[int]]]:
+     """
+     Tokenizes the examples without applying truncation or padding.
+
+     Returns only the input_ids.
+     """
+     tokenized_output = tokenizer(examples["text"], truncation=False, padding=False)
+     return {"input_ids": tokenized_output["input_ids"]}
+
+
+ def pack_documents(
+     examples: Dict[str, List[List[int]]],
+     max_length: int,
+     eos_token_id: Optional[int] = None
+ ) -> Dict[str, List[List[int]]]:
+     """
+     Applies document packing and returns only fixed-size inputs (max_length),
+     discarding the extra last token that would be used for labels.
+     """
+     # Concatenate the tokens of the whole batch
+     concatenated: List[int] = []
+     separator = [eos_token_id] if eos_token_id is not None else []
+     first = True
+     for doc in examples["input_ids"]:
+         if not first and separator:
+             concatenated.extend(separator)
+         concatenated.extend(doc)
+         first = False
+
+     block_size = max_length + 1
+     total_len = (len(concatenated) // block_size) * block_size
+     if total_len == 0:
+         return {"input_ids": []}
+
+     concatenated = concatenated[:total_len]
+     # Split into blocks of block_size and drop the last token of each block
+     blocks = [
+         concatenated[i : i + block_size]
+         for i in range(0, total_len, block_size)
+     ]
+     inputs = [blk[:-1] for blk in blocks]
+
+     # Filter out any empty block
+     inputs = [inp for inp in inputs if len(inp) > 0]
+     return {"input_ids": inputs}
+
+
+ def create_train_dataloader(
+     folder_path: str,
+     tokenizer: PreTrainedTokenizerFast,
+     batch_size: int = 4,
+     max_length: int = 512,
+     drop_last: bool = True,
+     num_workers: int = 5
+ ) -> Optional[DataLoader]:
+     """
+     Loads .txt data from folder_path, tokenizes it, applies inputs-only
+     packing, and returns a DataLoader that yields batches of input_ids.
+     """
+     raw_dataset = load_dataset(folder_path, split="train", streaming=False)
+     print(f"Raw dataset loaded: {raw_dataset}")
+
+     # 1) Tokenization
+     tokenized = raw_dataset.map(
+         lambda ex: tokenize_function(ex, tokenizer),
+         batched=True,
+         batch_size=1000,
+         num_proc=20,
+         remove_columns=raw_dataset.column_names,
+     )
+     print(f"Tokenized dataset: {tokenized}")
+
+     # 2) Document packing without labels
+     packed = tokenized.map(
+         lambda ex: pack_documents(
+             ex,
+             max_length=max_length,
+             eos_token_id=tokenizer.eos_token_id
+         ),
+         batched=True,
+         batch_size=10000,
+         num_proc=20,
+     )
+
+     # 3) Configure for PyTorch
+     packed.set_format(type="torch", columns=["input_ids"])
+
+     print("Creating DataLoader...")
+     return DataLoader(
+         packed,
+         batch_size=batch_size,
+         drop_last=drop_last,
+         num_workers=num_workers,
+     )
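
A quick worked example of the packing arithmetic in pack_documents (the token ids and eos_token_id=99 are made up): with max_length=4 the block size is 5; two documents of 7 and 6 tokens plus one separator give 14 concatenated tokens, so total_len = (14 // 5) * 5 = 10 and the remaining tail is dropped.

    docs = {"input_ids": [[1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13]]}
    out = pack_documents(docs, max_length=4, eos_token_id=99)
    # concatenated = [1..7, 99, 8, 9] after truncation → blocks [1,2,3,4,5]
    # and [6,7,99,8,9]; the last token of each block is discarded.
    print(out["input_ids"])  # [[1, 2, 3, 4], [6, 7, 99, 8]]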
src/logger/__init__.py ADDED
File without changes
src/logger/logger.py ADDED
@@ -0,0 +1,144 @@
+ from typing import Dict, Optional
+ import os
+ from zoneinfo import ZoneInfo
+
+ import mlflow
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ from datetime import datetime, date
+
+
+ class TrainerLogger:
+     def __init__(
+         self,
+         tracking_uri: str,
+         experiment: str,
+         total_params: int,
+         model_name: Optional[str] = None,
+         run_name: Optional[str] = None,
+         tags: Optional[Dict[str, str]] = None,
+     ):
+         mlflow.set_tracking_uri(tracking_uri)
+         mlflow.set_experiment(experiment)
+
+         # Enable autologging for PyTorch (log_models=True also logs models automatically)
+         mlflow.pytorch.autolog(log_models=True)
+
+         # Start the run
+         self.run = mlflow.start_run(run_name=run_name)
+         self.run_id = self.run.info.run_id
+         self.experiment = experiment
+         self.model_name = model_name
+         self.total_params = total_params
+
+         # Register tags for better organization
+         default_tags = {"model_type": self.model_name}
+         if tags:
+             default_tags.update(tags)
+         mlflow.set_tags(default_tags)
+
+         # Register parameters
+         base_params = {"model_name": self.model_name, "total_params": self.total_params}
+         self.log_parameters(base_params)
+
+     def log_parameters(self, parameters: dict):
+         mlflow.log_params(parameters)  # More efficient than individual log_param calls
+
+     def log_metrics(self, metrics: dict, step: Optional[int] = None):
+         mlflow.log_metrics(metrics, step)
+
+     def log_checkpoint_table(self, current_lr: float, loss: float, perplexity: float, last_batch: int) -> None:
+         """
+         Log a checkpoint record (month, day, hour, last batch, learning rate,
+         perplexity, loss) to MLflow as a table artifact. Perplexity and loss
+         are rounded to 4 decimal places.
+         """
+         # Define artifact directory and ensure it exists
+         artifact_dir = "checkpoint_table/model"
+         os.makedirs(artifact_dir, exist_ok=True)
+
+         # Capture current timestamp
+         now = datetime.now(ZoneInfo("America/Sao_Paulo"))
+         record = {
+             "month": now.month,
+             "day": now.day,
+             "hour": f"{now.hour:02d}:{now.minute:02d}",
+             "last_batch": last_batch,
+             "current_lr": round(current_lr, 7),
+             "perplexity": round(perplexity, 4),
+             "loss": round(loss, 4),
+         }
+         df_record = pd.DataFrame([record])
+
+         # Define artifact file path (relative POSIX path)
+         artifact_file = f"{artifact_dir}/checkpoint_table.json"
+
+         # Log the table to MLflow Tracking
+         mlflow.log_table(
+             data=df_record,
+             artifact_file=artifact_file
+         )
+
+     def checkpoint_model(self, model: nn.Module):
+         # Create a local directory for the checkpoint
+         step = 1
+         checkpoint_dir = f"checkpoints/model_{step}"
+         os.makedirs(checkpoint_dir, exist_ok=True)
+
+         # Save the model state locally
+         checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
+         torch.save(model.state_dict(), checkpoint_path)
+
+         # Log the artifact to MLflow
+         mlflow.log_artifact(checkpoint_path, f"model_checkpoints/epoch_{step}")
+
+         input_example = torch.zeros(1, 128, dtype=torch.long)  # Adjust the dimensions to your model
+         # input_example_numpy = input_example.cpu().numpy()
+
+         # Register the model in the MLflow model registry
+         if self.model_name:
+             registered_model_name = self.model_name
+             mlflow.pytorch.log_model(
+                 pytorch_model=model,
+                 artifact_path=f"models/epoch_{step}",
+                 registered_model_name=registered_model_name,
+                 pip_requirements=["torch>=1.9.0"],
+                 code_paths=["tynerox/"],  # Includes the relevant source code
+                 # input_example=input_example_numpy,  # Input example
+                 signature=None  # Add a model signature if possible
+             )
+
+     # Sample table data (currently an unused class attribute)
+     table_dict = {
+         "entrada": ["Pergunta A", "Pergunta B"],
+         "saida": ["Resposta A", "Resposta B"],
+         "nota": [0.75, 0.40],
+     }
+
+     def log_html(self, html: str, step: Optional[int] = None):
+         file_path = "visualizations/sample.html"
+         os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+         with open(file_path, "w") as f:
+             f.write(html)
+
+         mlflow.log_artifact(file_path)
+
+     def finish(self):
+         """Ends the MLflow run"""
+         mlflow.end_run()
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.finish()
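
Because the class implements __enter__/__exit__, it can be used as a context manager so the run is always closed. A minimal sketch (the tracking URI and experiment name are the ones used elsewhere in this commit; the remaining values are placeholders):

    with TrainerLogger(
        tracking_uri="http://127.0.0.1:5000",
        experiment="Pre training LLM",
        total_params=250_000_000,  # placeholder parameter count
        model_name="tynerox-dev",  # hypothetical name
        run_name="tynerox-dev",
    ) as logger:
        logger.log_parameters({"lr": 4.61e-4, "batch_size": 20})
        logger.log_metrics({"train_loss": 3.2, "train_perplexity": 24.5}, step=1)
    # __exit__ calls finish(), which ends the MLflow run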
src/pre-training.py ADDED
@@ -0,0 +1,111 @@
+ import math
+
+ import torch
+ from tokenizers import Tokenizer
+ from transformers import PreTrainedTokenizerFast, get_cosine_schedule_with_warmup
+
+ from training import PreTrainer
+ from tynerox.modeling import TyneRoxModel, TyneRoxConfig
+ from dataset.pre_train import create_train_dataloader
+
+ if __name__ == "__main__":
+
+     # 1 - Load the tokenizer
+     tokenizer = Tokenizer.from_file("tokenizer/tokens-bpe-36k.json")
+     tokenizer = PreTrainedTokenizerFast(
+         tokenizer_object=tokenizer,
+         unk_token="[UNK]",
+         pad_token="<|endoftext|>",
+         eos_token="<|endoftext|>",
+     )
+
+     tokenizer.save_pretrained("../")
+
+     # 2 - Initialize the configuration and the model
+     config = TyneRoxConfig(
+         vocab_size=tokenizer.vocab_size,
+         pad_token_id=tokenizer.pad_token_id,
+     )
+
+     model = TyneRoxModel(config)
+     model.to("cuda")
+
+     # 3 - Load the training dataset
+     folder_path = "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens"
+     dataloader = create_train_dataloader(
+         folder_path,
+         tokenizer,
+         batch_size=20,
+         max_length=1024,
+         drop_last=True,
+         num_workers=10
+     )
+
+     # 4 - Create the optimizer
+     model = torch.compile(model)
+     optimizer = torch.optim.AdamW(
+         model.parameters(),
+         lr=0.000461,  # Keep the initial LR or adjust slightly (e.g. 3e-4)
+         weight_decay=0.1
+     )
+
+     # 5 - Configure the warmup
+     epochs = 1
+     batch_size = 40  # note: unused; the DataLoader above was built with batch_size=20
+     size_dataset = 2_883_231  # note: unused; len(dataloader) is used instead
+     warmup_ratio = 0.05
+
+     num_training_steps = len(dataloader) * epochs
+     num_warmup_steps = math.floor(num_training_steps * warmup_ratio)
+
+     # 6 - Scheduler
+     scheduler = get_cosine_schedule_with_warmup(
+         optimizer,
+         num_warmup_steps=num_warmup_steps,
+         num_training_steps=num_training_steps,
+     )
+
+     sample_prompts = [
+         "Olá, como vai você? ",
+         "Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que ",
+         "Não, respondeu; na verdade, estou com medo ",
+         "O resultado representa uma desaceleração ",
+         "No vídeo, é possível ver ",
+         "Essa receita de torta de frango ",
+         "Durante o primeiro mandato ",
+         "Os donos de cães "
+     ]
+
+     logger_config = {
+         "tracking_uri": "http://127.0.0.1:5000",
+         "experiment": "Pre training LLM",
+         "model_name": "Pre training LLM (Long Context)"
+     }
+
+     trainer = PreTrainer(
+         model=model,
+         optimizer=optimizer,
+         scheduler=scheduler,
+         tokenizer=tokenizer,
+         train_loader=dataloader,
+         test_loader=None,
+         logger_config=logger_config,
+         use_amp=True
+     )
+
+     trainer.train(num_epochs=epochs, sample_prompts=sample_prompts)
+
+     # 7 - Save the model configuration for upload to the Hugging Face Hub
+     model.save_pretrained("../")
src/tokenizer/__init__.py ADDED
File without changes
src/tokenizer/tests.py ADDED
@@ -0,0 +1,77 @@
+ from datasets import load_dataset
+ from tokenizers import Tokenizer
+
+ if __name__ == "__main__":
+     # Load the dataset stream and the tokenizer
+     dataset_stream = load_dataset(
+         "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
+         split="train",
+         streaming=True
+     )
+     tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
+     encode = tokenizer.encode
+     unk_id = tokenizer.token_to_id("[UNK]")
+     vocab_size = tokenizer.get_vocab_size()
+
+     print("Vocabulary size:", tokenizer.get_vocab_size())
+     enc = tokenizer.encode("Apostas combinadas: Fantástico exibe mensagens exclusivas da investigação contra Bruno Henrique, do Flamengo")
+     print(tokenizer.decode(enc.ids, skip_special_tokens=True))
+
+     # Counters
+     total_tokens = 0
+     total_words = 0
+     unk_tokens = 0
+     seen_ids = set()
+
+     batch_size = 512
+     batch_counter = 0
+
+     def batch_iterator(stream, bs):
+         buf = []
+         for ex in stream:
+             buf.append(ex["text"])
+             if len(buf) == bs:
+                 yield buf
+                 buf = []
+         if buf:
+             yield buf
+
+     for texts in batch_iterator(dataset_stream, batch_size):
+         # tokenize the batch
+         encs = tokenizer.encode_batch(texts)
+
+         # count words and tokens in the batch
+         words_in_batch = sum(len(t.split()) for t in texts)
+         total_words += words_in_batch
+
+         for enc in encs:
+             total_tokens += len(enc.ids)
+             unk_tokens += enc.ids.count(unk_id)
+             seen_ids.update(enc.ids)
+
+         # partial report every 100 batches
+         if batch_counter % 100 == 0:
+             oov_rate = unk_tokens / total_tokens * 100
+             frag = total_tokens / total_words
+             coverage = len(seen_ids) / vocab_size * 100
+             ttr = len(seen_ids) / total_tokens
+             print(f"[Batch {batch_counter:04d}] "
+                   f"OOV: {oov_rate:.3f}% | "
+                   f"Frag: {frag:.3f} tok/word | "
+                   f"Coverage: {coverage:.2f}% | "
+                   f"TTR: {ttr:.4f}")
+         batch_counter += 1
+
+     # final result
+     oov_rate = unk_tokens / total_tokens * 100
+     frag = total_tokens / total_words
+     coverage = len(seen_ids) / vocab_size * 100
+     ttr = len(seen_ids) / total_tokens
+
+     print("\n=== Final Metrics ===")
+     print(f"Total tokens: {total_tokens}")
+     print(f"Total words: {total_words}")
+     print(f"OOV rate: {oov_rate:.3f}%")
+     print(f"Fragmentation: {frag:.3f} tokens/word")
+     print(f"Vocab coverage: {coverage:.2f}% of the vocabulary used")
+     print(f"Type–Token Ratio: {ttr:.4f}")
src/tokenizer/tokens-bpe-36k.json ADDED
The diff for this file is too large to render. See raw diff
 
src/tokenizer/trainer.py ADDED
@@ -0,0 +1,77 @@
+ from datasets import load_dataset
+ from tokenizers import Tokenizer
+ from tokenizers.models import BPE
+ from tokenizers.trainers import BpeTrainer
+ from tokenizers.pre_tokenizers import Whitespace, ByteLevel
+ import time  # To measure elapsed time
+ from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents, NFC
+ from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+
+ # 1. Load the dataset in streaming mode
+ dataset_stream = load_dataset("bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens", split="train", streaming=True)
+
+ print("Dataset loaded in streaming mode:")
+ print(dataset_stream)
+
+ # Name of the column containing the text
+ coluna_texto = "text"
+
+
+ # 2. Create the generator for tokenizer training
+ # This function iterates over the streaming dataset and yields the text
+ def get_training_corpus_streaming():
+     count = 0
+     start_time = time.time()
+     print("Starting iteration over the streaming dataset for the tokenizer...")
+     for sample in dataset_stream:
+         # Make sure the sample is not None and the column exists
+         if sample and coluna_texto in sample and isinstance(sample[coluna_texto], str):
+             yield sample[coluna_texto]
+             count += 1
+             if count % 10000 == 0:  # Log every 10000 samples
+                 elapsed = time.time() - start_time
+                 print(f"  Processed {count} samples for the tokenizer in {elapsed:.2f} seconds...")
+         else:  # Optional: log invalid/skipped samples
+             print(f"Warning: skipping invalid sample or one missing the column '{coluna_texto}': {sample}")
+     end_time = time.time()
+     print(
+         f"Iteration complete. {count} samples fed to the tokenizer in {end_time - start_time:.2f} seconds.")
+
+ special_tokens = [
+     "[UNK]", "<|endoftext|>",
+     "<|user_start|>", "<|user_end|>",
+     "<|assistant_start|>", "<|assistant_end|>",
+     "<|think_start|>", "<|think_end|>",
+     "<|command_start|>", "<|command_end|>",
+ ]
+
+ if __name__ == "__main__":
+     print("Initializing the BPE tokenizer...")
+     # tokenizer.pre_tokenizer = Whitespace()
+     tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+     tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
+     tokenizer.normalizer = NFC()
+     tokenizer.decoder = ByteLevelDecoder(add_prefix_space=True)
+
+     # More aggressive merges here
+     trainer = BpeTrainer(
+         vocab_size=36000 + len(special_tokens),
+         min_frequency=7,
+         limit_alphabet=1300,
+         # continuing_subword_prefix="##",
+         # end_of_word_suffix="</w>",  # would lower the minimum frequency to 1
+         show_progress=True,  # progress bar
+         special_tokens=special_tokens,
+     )
+
+     print("Starting tokenizer training from the stream...")
+     start_train_time = time.time()
+     tokenizer.train_from_iterator(
+         get_training_corpus_streaming(),
+         trainer=trainer
+     )
+     end_train_time = time.time()
+     print(f"Tokenizer training finished in {end_train_time - start_train_time:.2f} seconds!")
+
+     save_path = "tokens-bpe-36k.json"
+     tokenizer.save(save_path, pretty=True)
+     print(f"Tokenizer saved to {save_path}")
src/training.py ADDED
@@ -0,0 +1,203 @@
+ import math
+ import time
+ from typing import Any, Optional, Dict, List
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from tqdm import tqdm
+ from logger.logger import TrainerLogger
+ from torch.utils.data import DataLoader
+ from transformers import PreTrainedModel
+
+ # Device configuration
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class BaseTrainer:
+     def __init__(
+         self,
+         model: PreTrainedModel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler._LRScheduler,
+         tokenizer: Any,
+         train_loader: DataLoader,
+         test_loader: Optional[DataLoader] = None,
+         logger_config: Optional[Dict[str, Any]] = None,
+         use_amp: bool = True,
+     ):
+         self.model = model.to(device)
+         self.optimizer = optimizer
+         self.scheduler = scheduler
+         self.tokenizer = tokenizer
+         self.train_loader = train_loader
+         self.test_loader = test_loader
+         self.use_amp = use_amp
+         self.scaler = torch.amp.GradScaler('cuda') if use_amp else None
+         self.train_step = 0
+         self._best_perplexity = float('inf')
+         self._epochs_no_improve = 0
+
+         total_params = sum(p.numel() for p in model.parameters())
+         self.logger = TrainerLogger(
+             tracking_uri=logger_config["tracking_uri"],
+             experiment=logger_config["experiment"],
+             run_name=logger_config["model_name"],
+             model_name=logger_config["model_name"],
+             total_params=total_params,
+             tags={"version": "1.0", "environment": "development"},
+         )
+
+     def _generate_sample(self, sample_prompts: Optional[List[str]] = None):
+         sample_prompts = sample_prompts or []  # avoid a mutable default argument
+         self.model.eval()
+         samples_html = ""
+         for prompt in sample_prompts:
+             try:
+                 # sample_text = generate_text_sample(self.model, self.tokenizer, prompt)
+                 inputs = self.tokenizer(prompt, return_tensors="pt")
+                 input_ids = inputs.input_ids.to(self.model.device)
+
+                 # Generate text
+                 with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
+                     generated_ids = self.model.generate(
+                         input_ids=input_ids,
+                         max_length=100,  # total length (prompt + continuation)
+                         num_beams=5,  # number of hypotheses in beam search
+                         do_sample=True,  # enables sampling (instead of pure greedy decoding)
+                         top_k=50,  # restricts sampling to the top-50 tokens
+                         top_p=0.95,  # nucleus sampling (cumulative p ≤ 0.95)
+                         temperature=0.7,  # "creativity" control
+                         repetition_penalty=1.2,  # penalizes exact repetitions
+                         use_cache=True,  # reuses past_key_values (default)
+                         eos_token_id=self.tokenizer.eos_token_id,
+                         pad_token_id=self.tokenizer.pad_token_id,
+                     )
+
+                 # Decode to string
+                 generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+             except Exception as e:
+                 generated_text = f"Error: {e}"
+             samples_html += f"<h4><b>prompt:</b> {prompt}</h4><p><b>Resposta:</b> {generated_text}</p>"
+         self.model.train()
+         return samples_html
+
+     def _calc_loss_batch(self, inputs: torch.Tensor) -> torch.Tensor:
+         """
+         Computes only the cross-entropy for a batch of input_ids,
+         disabling the key/value cache during training.
+         """
+         ignore_idx = -100
+         # validate that all tokens are in the vocabulary or are ignore tokens
+         valid = ((inputs >= 0) | (inputs == ignore_idx)) & (inputs < self.tokenizer.vocab_size)
+         assert valid.all(), f"Invalid labels found: min={inputs.min().item()}, max={inputs.max().item()}"
+
+         inputs = inputs.to(device)
+         with torch.autocast(device_type="cuda", dtype=torch.float16):
+             outputs = self.model(
+                 input_ids=inputs,
+                 labels=inputs,
+                 use_cache=False,  # disables the KV cache during training
+                 return_dict=True  # guarantees access via .loss and .logits
+             )
+         loss = outputs.loss
+         logits = outputs.logits
+         if torch.isnan(logits).any() or torch.isinf(logits).any():
+             raise RuntimeError("Invalid logits detected")
+         return loss
+
+     def _train_epoch(self, epoch: int, sample_prompts: Optional[List[str]] = None) -> List[float]:
+         if sample_prompts is None:
+             sample_prompts = []
+
+         self.model.train()
+         losses = []
+         size_dataset = len(self.train_loader)
+         pbar = tqdm(
+             self.train_loader,
+             total=size_dataset,
+             desc=f"Epoch {epoch + 1}",
+             unit="batch",
+             leave=False,
+         )
+
+         for i, batch in enumerate(pbar):
+             start_time = time.time()
+             self.optimizer.zero_grad()
+             loss = self._calc_loss_batch(batch['input_ids'])
+             losses.append(loss.item())
+
+             if self.use_amp:
+                 self.scaler.scale(loss).backward()
+                 self.scaler.unscale_(self.optimizer)
+                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+                 self.scaler.step(self.optimizer)
+                 self.scaler.update()
+             else:
+                 loss.backward()
+                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+                 self.optimizer.step()
+
+             self.scheduler.step()
+             perplexity = math.exp(loss.item())
+             current_lr = self.optimizer.param_groups[0].get('lr', 0.0)
+             elapsed_time = time.time() - start_time
+
+             pbar.set_postfix({
+                 "loss": f"{loss.item():.4f}",
+                 "perplexity": f"{perplexity:.4f}",
+                 "lr": f"{current_lr:.2e}",
+                 "elapsed_time": f"{elapsed_time:.2f}s",
+             })
+
+             # Log every 100 batches
+             if (i + 1) % 100 == 0:
+                 self.train_step += 1
+                 avg_loss = sum(losses[-100:]) / 100
+                 avg_perplexity = math.exp(sum(losses[-100:]) / 100)
+                 self.logger.log_metrics(
+                     {
+                         "train_loss": avg_loss,
+                         "train_perplexity": avg_perplexity,
+                         "lr": current_lr,
+                     },
+                     step=self.train_step,
+                 )
+
+             # Generate samples every 500 batches
+             if (i + 1) % 500 == 0:
+                 samples_html = self._generate_sample(sample_prompts)
+                 self.logger.log_html(f"<html><head><meta charset='utf-8'></head><body>{samples_html}</body></html>",
+                                      step=self.train_step)
+
+             # Checkpoint every 1000 batches
+             if (i + 1) % 1000 == 0:
+                 avg_loss = sum(losses[-1000:]) / 1000
+                 avg_perplexity = math.exp(sum(losses[-1000:]) / 1000)
+                 self.logger.log_checkpoint_table(current_lr, avg_loss, avg_perplexity, i + 1)
+                 self.logger.checkpoint_model(self.model)
+                 self.model.save_pretrained("../")
+
+         return losses
+
+     def train(self, num_epochs: int = 500, sample_prompts: Optional[List[str]] = None):
+         for epoch in range(num_epochs):
+             train_losses = self._train_epoch(epoch, sample_prompts)
+             mean_train_loss = sum(train_losses) / len(train_losses)
+             self.logger.log_metrics(
+                 {"mean_train_loss": mean_train_loss},
+                 step=epoch,
+             )
+             print(f"Epoch {epoch + 1} | Train Loss: {mean_train_loss:.4f}")
+
+         self.logger.finish()
+         print("Training finished!")
+
+
+ # Entry point for fine-tuning:
+ class TuningTrainer(BaseTrainer):
+     pass
+
+ # Entry point for pre-training:
+ class PreTrainer(BaseTrainer):
+     pass
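
The fine-tuning entry point mirrors src/pre-training.py; a hedged sketch wiring the pieces from this commit together (the learning rate, warmup steps, batch size, and the experiment/model names are placeholders, and the tynerox package is assumed importable):

    import torch
    from transformers import AutoTokenizer, get_cosine_schedule_with_warmup

    from training import TuningTrainer
    from dataset.fine_tuning import create_data_loader_fine_tuning
    from tynerox.modeling import TyneRoxModel

    model = TyneRoxModel.from_pretrained("bobboyms/tynerox")
    tokenizer = AutoTokenizer.from_pretrained("bobboyms/tynerox")

    loader = create_data_loader_fine_tuning(
        tokenizer=tokenizer, batch_size=8, path_folder="conversational",
        pad_token_id=tokenizer.pad_token_id,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    scheduler = get_cosine_schedule_with_warmup(optimizer, 100, len(loader))

    trainer = TuningTrainer(
        model=model, optimizer=optimizer, scheduler=scheduler, tokenizer=tokenizer,
        train_loader=loader,
        logger_config={"tracking_uri": "http://127.0.0.1:5000",
                       "experiment": "Fine tuning LLM",  # hypothetical name
                       "model_name": "tynerox-sft"},     # hypothetical name
    )
    trainer.train(num_epochs=1)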
src/tynerox/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .modeling import TyneRoxModel, TyneRoxConfig
+
+ __all__ = ["TyneRoxConfig", "TyneRoxModel"]
src/tynerox/modeling.py ADDED
@@ -0,0 +1,449 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from flash_attn.flash_attn_interface import flash_attn_func
6
+ from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
7
+ from transformers.modeling_outputs import CausalLMOutputWithPast
8
+ from typing import Optional, Literal, Union, Tuple
9
+
10
+
11
+ class PositionalEncoding(nn.Module):
12
+ """
13
+ Implements positional encoding (sinusoidal or rotary).
14
+ """
15
+ def __init__(
16
+ self,
17
+ embed_dim: int,
18
+ context_length: int,
19
+ dropout: float = 0.1,
20
+ encoding_type: Literal['sinusoidal', 'rotary'] = 'rotary',
21
+ ):
22
+ super().__init__()
23
+ if embed_dim <= 0 or context_length <= 0:
24
+ raise ValueError("embed_dim and context_length must be positive integers")
25
+ if not 0 <= dropout < 1:
26
+ raise ValueError("dropout must be between 0 and 1")
27
+
28
+ self.dropout = nn.Dropout(dropout)
29
+ self.encoding_type = encoding_type.lower()
30
+ self.max_seq_len = context_length
31
+ self.embed_dim = embed_dim
32
+
33
+ if self.encoding_type == 'sinusoidal':
34
+ pe = self._create_sinusoidal_embeddings(context_length, embed_dim)
35
+ self.register_buffer('pe', pe.unsqueeze(0), persistent=True)
36
+ elif self.encoding_type == 'rotary':
37
+ if embed_dim % 2 != 0:
38
+ raise ValueError("embed_dim must be even for rotary encoding")
39
+ # inv_freq of size D/2
40
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, embed_dim, 2).float() / embed_dim))
41
+ self.register_buffer('inv_freq', inv_freq, persistent=True)
42
+ else:
43
+ raise ValueError("Unsupported encoding_type: 'sinusoidal' or 'rotary'")
44
+
45
+ def _create_sinusoidal_embeddings(self, max_seq_len: int, dim: int) -> torch.Tensor:
46
+ position = torch.arange(max_seq_len).unsqueeze(1).float()
47
+ div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
48
+ pe = torch.zeros(max_seq_len, dim)
49
+ pe[:, 0::2] = torch.sin(position * div_term)
50
+ pe[:, 1::2] = torch.cos(position * div_term)
51
+ return pe
52
+
53
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
54
+ # x shape: [B, T, D]
55
+ if self.encoding_type == 'sinusoidal':
56
+ seq_len = x.size(1)
57
+ x = x + self.pe[:, :seq_len, :]
58
+ else:
59
+ # rotary: split even/odd dims and apply rotary
60
+ seq_len = x.size(1)
61
+ positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
62
+ # freqs of shape [T, D/2]
63
+ freqs = torch.einsum('i , j -> i j', positions, self.inv_freq)
64
+ x = self.apply_rotary(x, freqs)
65
+ return self.dropout(x)
66
+
67
+ @staticmethod
68
+ def apply_rotary(x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
69
+ # x: [B, T, D], emb: [T, D/2]
70
+ x1, x2 = x.chunk(2, dim=-1) # each [B, T, D/2]
71
+ emb_sin = emb.sin()[None, :, :] # [1, T, D/2]
72
+ emb_cos = emb.cos()[None, :, :] # [1, T, D/2]
73
+ # apply rotary
74
+ rotated1 = x1 * emb_cos + x2 * emb_sin
75
+ rotated2 = x2 * emb_cos - x1 * emb_sin
76
+ return torch.cat([rotated1, rotated2], dim=-1) # [B, T, D]
77
+
78
+
79
+ class PositionalEmbedding(nn.Module):
80
+ """
81
+ Combines token embedding with positional encoding.
82
+ """
83
+ def __init__(
84
+ self,
85
+ vocab_size: int,
86
+ embed_dim: int,
87
+ context_length: int,
88
+ dropout: float = 0.05,
89
+ encoding_type: Literal['sinusoidal', 'rotary'] = 'rotary'
90
+ ):
91
+ super().__init__()
92
+ if vocab_size <= 0 or embed_dim <= 0 or context_length <= 0:
93
+ raise ValueError("vocab_size, embed_dim, context_length must be > 0")
94
+
95
+ self.token_embedding = nn.Embedding(vocab_size, embed_dim)
96
+ self.scale = math.sqrt(embed_dim)
97
+ self.pos_encoding = PositionalEncoding(
98
+ embed_dim=embed_dim,
99
+ context_length=context_length,
100
+ dropout=dropout,
101
+ encoding_type=encoding_type
102
+ )
103
+
104
+ def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
105
+ # input_ids: [B, T]
106
+ x = self.token_embedding(input_ids) * self.scale # [B, T, D]
107
+ return self.pos_encoding(x)
108
+
109
+
110
+ def get_alibi_slopes(n_heads: int) -> torch.Tensor:
111
+ def _get_slopes(n):
112
+ base = 2 ** (-8.0 / n)
113
+ return torch.tensor([base ** (i + 1) for i in range(n)])
114
+ if math.log2(n_heads).is_integer():
115
+ return _get_slopes(n_heads)
116
+ m = 2 ** math.floor(math.log2(n_heads))
117
+ slopes = _get_slopes(m)
118
+ extra = _get_slopes(2 * m)[::2][: n_heads - m]
119
+ return torch.cat([slopes, extra], dim=0)
120
+
121
+ # -----------------------------------------------------------------------------
122
+ # Feed-Forward
123
+ # -----------------------------------------------------------------------------
124
+
125
+ class FeedForward(nn.Module):
126
+ def __init__(self, emb_dim: int, hidden_dim_multiplier: int = 4):
127
+ super().__init__()
128
+ hidden_dim = emb_dim * hidden_dim_multiplier
129
+ self.fc1 = nn.Linear(emb_dim, hidden_dim)
130
+ self.fc2 = nn.Linear(hidden_dim // 2, emb_dim)
131
+ self.activation = nn.SiLU()
132
+
133
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
134
+ x_fc1 = self.fc1(x)
135
+ x_up, x_gate = x_fc1.chunk(2, dim=-1)
136
+ return self.fc2(x_up * self.activation(x_gate))
137
+
138
+ # -----------------------------------------------------------------------------
139
+ # Attention-Free Transformer (AFT) Simple
140
+ # -----------------------------------------------------------------------------
141
+
142
+ class AFTSimple(nn.Module):
143
+ def __init__(
144
+ self,
145
+ embed_dim: int,
146
+ activation=torch.sigmoid,
147
+ causal: bool = True,
148
+ ):
149
+ super().__init__()
150
+ self.embed_dim = embed_dim
151
+ self.causal = causal
152
+ self.activation = activation
153
+
154
+ self.qkv = nn.Linear(embed_dim, 3 * embed_dim, bias=False)
155
+ self.project = nn.Linear(embed_dim, embed_dim)
156
+
157
+ def forward(
158
+ self,
159
+ x: torch.Tensor,
160
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
161
+ ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
162
+ # x: [B, T_new, D]
163
+ B, T_new, D = x.shape
164
+ if D != self.embed_dim:
165
+ raise ValueError(f"Input dim ({D}) != embed_dim ({self.embed_dim})")
166
+
167
+ qkv = self.qkv(x) # [B, T_new, 3*D]
168
+ Q, K_new, V_new = qkv.chunk(3, dim=-1) # each [B, T_new, D]
169
+
170
+ # concatenate past if provided
171
+ if past_key_values is not None:
172
+ K_past, V_past = past_key_values
173
+ K = torch.cat([K_past, K_new], dim=1) # [B, T_all, D]
174
+ V = torch.cat([V_past, V_new], dim=1)
175
+ else:
176
+ K, V = K_new, V_new
177
+
178
+ # compute attention-free aggregate
179
+ softmax_k = F.softmax(K, dim=1) # [B, T_all, D]
180
+ weighted_v = softmax_k * V # [B, T_all, D]
181
+
182
+ if self.causal:
183
+ context = torch.cumsum(weighted_v, dim=1) # [B, T_all, D]
184
+ else:
185
+ total = weighted_v.sum(dim=1, keepdim=True) # [B, 1, D]
186
+ context = total.expand(-1, K.size(1), -1) # [B, T_all, D]
187
+
188
+ # slice only the new positions
189
+ context_new = context[:, -T_new:, :] # [B, T_new, D]
190
+ gate = self.activation(Q) # [B, T_new, D]
191
+ Y = gate * context_new # [B, T_new, D]
192
+ Y = self.project(Y) # [B, T_new, D]
193
+
194
+ # return output and updated cache
195
+ return Y, (K, V)
196
+
197
+ # -----------------------------------------------------------------------------
198
+ # Flash Attention with ALiBi and KV-cache
199
+ # -----------------------------------------------------------------------------
200
+
201
+ class FlashAttention(nn.Module):
202
+ def __init__(
203
+ self,
204
+ embed_dim: int,
205
+ num_heads: int,
206
+ window_size: int,
207
+ causal: bool = True,
208
+ qkv_bias: bool = False,
209
+ ):
210
+ super().__init__()
211
+ assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
212
+ self.num_heads = num_heads
213
+ self.head_dim = embed_dim // num_heads
214
+ self.causal = causal
215
+ self.window_size = window_size
216
+
217
+ self.qkv = nn.Linear(embed_dim, 3 * embed_dim, bias=qkv_bias)
218
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
219
+
220
+ # precompute ALiBi slopes
221
+ self.register_buffer('alibi', get_alibi_slopes(num_heads))
222
+
223
+ def forward(
224
+ self,
225
+ x: torch.Tensor,
226
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
227
+ ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
228
+ # x: [B, T_new, D]
229
+ B, T_new, _ = x.size()
230
+ qkv = self.qkv(x).view(B, T_new, self.num_heads, 3 * self.head_dim)
231
+ q, k_new, v_new = torch.chunk(qkv, 3, dim=-1) # each [B, T_new, H, Dh]
232
+
233
+ if past_key_values is not None:
234
+ k_past, v_past = past_key_values
235
+ k = torch.cat([k_past, k_new], dim=1) # [B, T_all, H, Dh]
236
+ v = torch.cat([v_past, v_new], dim=1)
237
+ else:
238
+ k, v = k_new, v_new
239
+
240
+ attn_out = flash_attn_func(
241
+ q, k, v,
242
+ softmax_scale=1.0 / math.sqrt(self.head_dim),
243
+ causal=self.causal,
244
+ window_size=(self.window_size - 1, 0) if self.causal else (-1, -1),
245
+ alibi_slopes=self.alibi,
246
+ return_attn_probs=False,
247
+ )
248
+ # attn_out: [B, T_new, H, Dh]
249
+ out = attn_out.contiguous().view(B, T_new, -1) # [B, T_new, D]
250
+ y = self.out_proj(out) # [B, T_new, D]
251
+
252
+ return y, (k, v)
253
+
254
+ # -----------------------------------------------------------------------------
255
+ # Transformer Blocks and Model
256
+ # -----------------------------------------------------------------------------
257
+
258
+ class TransformerBlock(nn.Module):
259
+ def __init__(self, config, att_global: bool = True):
260
+ super().__init__()
261
+ if att_global:
262
+ self.attn = AFTSimple(embed_dim=config.d_model, causal=config.causal)
263
+ else:
264
+ self.attn = FlashAttention(
265
+ embed_dim=config.d_model,
266
+ num_heads=config.num_attention_heads,
267
+ window_size=config.window_size,
268
+ causal=config.causal,
269
+ qkv_bias=True,
270
+ )
271
+ self.ff = nn.Sequential(
272
+ FeedForward(config.d_model),
273
+ FeedForward(config.d_model),
274
+ )
275
+ self.ln1 = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
276
+ self.ln2 = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
277
+ self.drop = nn.Dropout(config.dropout)
278
+
279
+ def forward(
280
+ self,
281
+ x: torch.Tensor,
282
+ past_key_values: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
283
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
284
+ # Attention + residual
285
+ residual = x
286
+ x = self.ln1(x)
287
+ y, present = self.attn(x, past_key_values=past_key_values)
288
+ x = self.drop(y) + residual
289
+
290
+ # Feed-forward + residual
291
+ residual = x
292
+ x = self.ln2(x)
293
+ x = self.ff(x)
294
+ x = self.drop(x) + residual
295
+
296
+ return x, present
297
+
298
+ class ResidualBlocks(nn.Module):
299
+ def __init__(self, config):
300
+ super().__init__()
301
+ blocks = []
302
+ for i in range(config.num_hidden_layers):
303
+ # alternate local/global: every 3rd layer global
304
+ att_global = ((i + 1) % 3 == 0)
305
+ blocks.append(TransformerBlock(config, att_global=att_global))
306
+ self.layers = nn.ModuleList(blocks)
307
+ self.final_ln = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
308
+
309
+ def forward(
310
+ self,
311
+ x: torch.Tensor,
312
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor,torch.Tensor], ...]] = None
313
+ ) -> Tuple[torch.Tensor, Tuple[Tuple[torch.Tensor,torch.Tensor], ...]]:
314
+ new_past = []
315
+ for i, layer in enumerate(self.layers):
316
+ pkv = None if past_key_values is None else past_key_values[i]
317
+ x, present = layer(x, past_key_values=pkv)
318
+ new_past.append(present)
319
+ x = self.final_ln(x)
320
+ return x, tuple(new_past)
321
+
322
+ # -----------------------------------------------------------------------------
323
+ # Configuration and Model
324
+ # -----------------------------------------------------------------------------
325
+
326
+ class TyneRoxConfig(PretrainedConfig):
327
+ model_type = "tynerox"
328
+
329
+ def __init__(
330
+ self,
331
+ vocab_size: int = 30522,
332
+ context_length: int = 2048,
333
+ d_model: int = 1024,
334
+ num_heads: int = 16,
335
+ window_size: int = 512,
336
+ num_hidden_layers: int = 12,
337
+ causal: bool = True,
338
+ dropout: float = 0.1,
339
+ layer_norm_eps: float = 1e-5,
340
+ tie_word_embeddings: bool = False,
341
+ pad_token_id:int = 0,
342
+ **kwargs
343
+ ):
344
+ super().__init__(**kwargs)
345
+ self.vocab_size = vocab_size
346
+ self.max_position_embeddings = context_length
347
+ self.d_model = d_model
348
+ self.num_attention_heads = num_heads
349
+ self.window_size = window_size
350
+ self.num_hidden_layers = num_hidden_layers
351
+ self.causal = causal
352
+ self.dropout = dropout
353
+ self.layer_norm_eps = layer_norm_eps
354
+ self.tie_word_embeddings = tie_word_embeddings
355
+ self.pad_token_id = pad_token_id
356
+
+ class TyneRoxModel(PreTrainedModel, GenerationMixin):
+     config_class = TyneRoxConfig
+
+     def __init__(self, config: TyneRoxConfig):
+         super().__init__(config)
+         self.embed = PositionalEmbedding(
+             config.vocab_size,
+             config.d_model,
+             config.max_position_embeddings,
+             dropout=config.dropout,
+             encoding_type='rotary'
+         )
+         self.transformer = ResidualBlocks(config)
+         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.embed.token_embedding
+
+     def set_input_embeddings(self, value):
+         self.embed.token_embedding = value
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, value):
+         self.lm_head = value
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: bool = True,
+         return_dict: bool = True,
+         **kwargs
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         # 1) Embeddings
+         x = self.embed(input_ids)  # [B, T, D]
+
+         # 2) Transformer blocks with KV-cache
+         x, new_past = self.transformer(x, past_key_values=past_key_values)
+
+         # 3) Project to vocabulary logits
+         logits = self.lm_head(x)  # [B, T, V]
+
+         # 4) Compute loss if labels provided
+         loss = None
+         if labels is not None:
+             shift_logits = logits[:, :-1, :].contiguous()
+             shift_labels = labels[:, 1:].contiguous()
+             loss = F.cross_entropy(
+                 shift_logits.view(-1, shift_logits.size(-1)),
+                 shift_labels.view(-1),
+                 ignore_index=-100,
+             )
+
+         # 5) Return standardized output
+         if not return_dict:
+             output = (logits, new_past) if use_cache else (logits,)
+             return ((loss,) + output) if loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=new_past if use_cache else None,
+         )
+
+     def _reorder_cache(
+         self,
+         past_key_values: Tuple[Tuple[torch.Tensor, torch.Tensor], ...],
+         beam_idx: torch.Tensor
+     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+         reordered = []
+         for k, v in past_key_values:
+             # both tensors have the batch dimension at dim 0
+             reordered.append((k.index_select(0, beam_idx),
+                               v.index_select(0, beam_idx)))
+         return tuple(reordered)
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids: torch.LongTensor,
+         past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+         **kwargs
+     ) -> dict:
+         # at generation time, only feed in the last token
+         if past_key_values is not None:
+             input_ids = input_ids[:, -1:].contiguous()
+         return {
+             "input_ids": input_ids,
+             "past_key_values": past_key_values,
+         }
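With GenerationMixin mixed in and prepare_inputs_for_generation plus _reorder_cache defined, the model supports the standard cached-decoding flow. A minimal usage sketch, assuming the classes above are in scope; weights here are random, so the logits are noise:

    import torch

    cfg = TyneRoxConfig(num_hidden_layers=6, d_model=256, num_heads=4)
    model = TyneRoxModel(cfg).eval()

    input_ids = torch.randint(0, cfg.vocab_size, (1, 8))
    next_token = torch.randint(0, cfg.vocab_size, (1, 1))
    with torch.no_grad():
        out = model(input_ids)                             # full prefix pass
        step = model(next_token,                           # one incremental step,
                     past_key_values=out.past_key_values)  # reusing the KV-cache
    print(out.logits.shape)   # (1, 8, cfg.vocab_size)

    # The HF generation API should also work:
    # generated = model.generate(input_ids, max_new_tokens=20)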
src/visualizations/sample.html ADDED
@@ -0,0 +1,723 @@
+ <html><head><meta charset='utf-8'></head><body><h4><b>prompt:</b> Olá, como vai você? </h4><p><b>Resposta:</b> Olá, como vai você? .
+ </p><h4><b>prompt:</b> Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que </h4><p><b>Resposta:</b> Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que .
+ </p><h4><b>prompt:</b> Não, respondeu; na verdade, estou com medo </h4><p><b>Resposta:</b> Não, respondeu; na verdade, estou com medo .
+ </p><h4><b>prompt:</b> O resultado representa uma desaceleração </h4><p><b>Resposta:</b> O resultado representa uma desaceleração .
+ </p><h4><b>prompt:</b> No vídeo, é possível ver </h4><p><b>Resposta:</b> No vídeo, é possível ver .
+ </p><h4><b>prompt:</b> Essa receita de torta de frango </h4><p><b>Resposta:</b> Essa receita de torta de frango .
+ </p><h4><b>prompt:</b> Durante o primeiro mandato </h4><p><b>Resposta:</b> Durante o primeiro mandato .
+ </p><h4><b>prompt:</b> Os donos de cães </h4><p><b>Resposta:</b> Os donos de cães .
+ </p></body></html>
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,91 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|user_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<|user_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<|assistant_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<|assistant_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<|think_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<|think_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<|command_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<|command_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "PreTrainedTokenizer",
+   "unk_token": "[UNK]"
+ }
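The tokenizer is shipped as a raw tokenizer.json plus this config, with <|endoftext|> serving as both EOS and PAD and dedicated chat/thinking/command markers registered as special tokens. A loading sketch, assuming both files sit in the working directory (the path is a placeholder):

    from transformers import PreTrainedTokenizerFast

    tok = PreTrainedTokenizerFast(
        tokenizer_file="tokenizer.json",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        unk_token="[UNK]",
    )
    ids = tok("Olá, como vai você?")["input_ids"]
    print(tok.decode(ids))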