Spaces:
Build error
Build error
feat(main): init train and dvc
Browse files- .dvc/.gitignore +3 -0
- .dvc/config +9 -0
- .dvcignore +3 -0
- .idea/jsonSchemas.xml +25 -0
- data/models/.gitignore +1 -0
- data/raw/.gitignore +1 -0
- data/raw/arxivData.json.dvc +4 -0
- dvc.lock +24 -0
- dvc.yaml +16 -0
- poetry.lock +0 -0
- pyproject.toml +8 -0
- shad_mlops_transformers/__init__.py +0 -0
- shad_mlops_transformers/config.py +17 -0
- shad_mlops_transformers/main.py +0 -0
- shad_mlops_transformers/model.py +56 -8
- shad_mlops_transformers/trainer.py +103 -0
- shad_mlops_transformers/ui.py +0 -0
.dvc/.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/config.local
|
| 2 |
+
/tmp
|
| 3 |
+
/cache
|
.dvc/config
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[core]
|
| 2 |
+
remote = ya-s3
|
| 3 |
+
autostage = true
|
| 4 |
+
['remote "ya-s3"']
|
| 5 |
+
url = s3://shad-ml-2-hw-5/dvc
|
| 6 |
+
endpointurl = https://storage.yandexcloud.net
|
| 7 |
+
|
| 8 |
+
[cache]
|
| 9 |
+
type = reflink,hardlink,symlink,copy
|
.dvcignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Add patterns of files dvc should ignore, which could improve
|
| 2 |
+
# the performance. Learn more at
|
| 3 |
+
# https://dvc.org/doc/user-guide/dvcignore
|
.idea/jsonSchemas.xml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="JsonSchemaMappingsProjectConfiguration">
|
| 4 |
+
<state>
|
| 5 |
+
<map>
|
| 6 |
+
<entry key="dvc.yaml">
|
| 7 |
+
<value>
|
| 8 |
+
<SchemaInfo>
|
| 9 |
+
<option name="name" value="dvc.yaml" />
|
| 10 |
+
<option name="relativePathToSchema" value="https://raw.githubusercontent.com/iterative/dvcyaml-schema/master/schema.json" />
|
| 11 |
+
<option name="applicationDefined" value="true" />
|
| 12 |
+
<option name="patterns">
|
| 13 |
+
<list>
|
| 14 |
+
<Item>
|
| 15 |
+
<option name="path" value="dvc.yaml" />
|
| 16 |
+
</Item>
|
| 17 |
+
</list>
|
| 18 |
+
</option>
|
| 19 |
+
</SchemaInfo>
|
| 20 |
+
</value>
|
| 21 |
+
</entry>
|
| 22 |
+
</map>
|
| 23 |
+
</state>
|
| 24 |
+
</component>
|
| 25 |
+
</project>
|
data/models/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/model.torch
|
data/raw/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/arxivData.json
|
data/raw/arxivData.json.dvc
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
outs:
|
| 2 |
+
- md5: a314e2f4eab544a46e6f95802ecde647
|
| 3 |
+
size: 72422946
|
| 4 |
+
path: arxivData.json
|
dvc.lock
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
schema: '2.0'
|
| 2 |
+
stages:
|
| 3 |
+
train:
|
| 4 |
+
cmd: poetry run train
|
| 5 |
+
deps:
|
| 6 |
+
- path: data/raw/arxivData.json
|
| 7 |
+
md5: a314e2f4eab544a46e6f95802ecde647
|
| 8 |
+
size: 72422946
|
| 9 |
+
- path: shad_mlops_transformers/model.py
|
| 10 |
+
md5: 9b932a6cb0cb46fc7c656e7c80c442e0
|
| 11 |
+
size: 2008
|
| 12 |
+
isexec: true
|
| 13 |
+
- path: shad_mlops_transformers/trainer.py
|
| 14 |
+
md5: 61acf28399fadfd2495dc48242c594ba
|
| 15 |
+
size: 3650
|
| 16 |
+
params:
|
| 17 |
+
shad_mlops_transformers/config.py:
|
| 18 |
+
Config.batch_size: 32
|
| 19 |
+
Config.random_seed: 42
|
| 20 |
+
Config.test_size: 0.2
|
| 21 |
+
outs:
|
| 22 |
+
- path: data/models/model.torch
|
| 23 |
+
md5: f110836b7b7585efdbfcb8ab7d5df76c
|
| 24 |
+
size: 438187413
|
dvc.yaml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
stages:
|
| 2 |
+
train:
|
| 3 |
+
cmd: poetry run train
|
| 4 |
+
deps:
|
| 5 |
+
- shad_mlops_transformers/trainer.py
|
| 6 |
+
- shad_mlops_transformers/model.py
|
| 7 |
+
- data/raw/arxivData.json
|
| 8 |
+
params:
|
| 9 |
+
- shad_mlops_transformers/config.py:
|
| 10 |
+
- Config.batch_size
|
| 11 |
+
- Config.random_seed
|
| 12 |
+
- Config.test_size
|
| 13 |
+
outs:
|
| 14 |
+
# NOTE должно совпадать с конфигом
|
| 15 |
+
- data/models/model.torch
|
| 16 |
+
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -12,6 +12,11 @@ python = "^3.10"
|
|
| 12 |
streamlit = "^1.21.0"
|
| 13 |
torch = "^1.13"
|
| 14 |
transformers = "^4.27.4"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
[tool.poetry.group.dev.dependencies]
|
|
@@ -20,6 +25,9 @@ black = "^23.3.0"
|
|
| 20 |
docformatter = "^1.6.0"
|
| 21 |
isort = "^5.12.0"
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
[build-system]
|
| 24 |
requires = ["poetry-core"]
|
| 25 |
build-backend = "poetry.core.masonry.api"
|
|
|
|
| 12 |
streamlit = "^1.21.0"
|
| 13 |
torch = "^1.13"
|
| 14 |
transformers = "^4.27.4"
|
| 15 |
+
pydantic = "^1.10.7"
|
| 16 |
+
scikit-learn = "^1.2.2"
|
| 17 |
+
numpy = "^1.24.2"
|
| 18 |
+
loguru = "^0.7.0"
|
| 19 |
+
dvc = {version = "^2.54.0", extras = ["s3"]}
|
| 20 |
|
| 21 |
|
| 22 |
[tool.poetry.group.dev.dependencies]
|
|
|
|
| 25 |
docformatter = "^1.6.0"
|
| 26 |
isort = "^5.12.0"
|
| 27 |
|
| 28 |
+
[tool.poetry.scripts]
|
| 29 |
+
train = "shad_mlops_transformers.trainer:main"
|
| 30 |
+
|
| 31 |
[build-system]
|
| 32 |
requires = ["poetry-core"]
|
| 33 |
build-backend = "poetry.core.masonry.api"
|
shad_mlops_transformers/__init__.py
CHANGED
|
File without changes
|
shad_mlops_transformers/config.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path

from pydantic import BaseSettings

# Directory that contains this package (shad_mlops_transformers/).
basedir = Path(__file__).parent


class Config(BaseSettings):
    """Project settings.

    Inherits from pydantic ``BaseSettings`` so every field can be
    overridden via an environment variable of the same name.
    """

    # Data layout: <repo>/data/{raw,models}.
    data_dir: Path = basedir.parent / "data"
    raw_data_dir: Path = data_dir / "raw"
    # Training hyper-parameters tracked as DVC params (see dvc.yaml).
    batch_size: int = 32
    random_seed: int = 42
    test_size: float = 0.2
    # NOTE: must match the `outs` entry in dvc.yaml.
    weights_path: Path = data_dir / "models" / "model.torch"


# Module-level singleton used across the project.
config = Config()
|
shad_mlops_transformers/main.py
CHANGED
|
File without changes
|
shad_mlops_transformers/model.py
CHANGED
|
@@ -1,8 +1,56 @@
|
|
| 1 |
-
from
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import OrderedDict
from pathlib import Path

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

from shad_mlops_transformers.config import config


class DocumentClassifier(nn.Module):
    """BERT encoder with a small trainable classification head.

    The pretrained encoder is frozen (its forward pass runs under
    ``torch.no_grad``); only the linear head is exposed through
    ``trainable_params`` for the optimizer.
    """

    def __init__(self, n_classes: int = 2):
        super().__init__()
        self.model_name = "bert-base-uncased"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.encoder = AutoModel.from_pretrained(self.model_name)
        self.n_classes = n_classes
        self.model = nn.Sequential(
            OrderedDict(
                [
                    ("fc", nn.Linear(in_features=self.encoder.pooler.dense.out_features, out_features=n_classes)),
                    # BUG FIX: nn.Softmax() without `dim` uses deprecated
                    # implicit-dimension behaviour; normalize over the class axis.
                    # NOTE(review): the trainer feeds these probabilities into
                    # CrossEntropyLoss, which expects raw logits — consider
                    # dropping the softmax from the head; confirm with callers.
                    ("sm", nn.Softmax(dim=-1)),
                ]
            )
        )
        # Only the head is optimized; the encoder stays frozen.
        self.trainable_params = self.model.parameters()

    def forward(self, text):
        """Return class probabilities of shape (batch, n_classes) for raw text."""
        tok_info = self.tokenize(text)
        with torch.no_grad():  # encoder is frozen — no gradients needed
            embeddings = self.encoder(**tok_info)["pooler_output"]
        return self.model(embeddings)

    def tokenize(self, x: str) -> dict:
        """Tokenize raw text into padded/truncated PyTorch tensors."""
        return self.tokenizer(x, padding=True, truncation=True, return_tensors="pt")

    def from_file(self, path: Path = config.weights_path) -> "DocumentClassifier":
        """Load weights saved by the trainer and return self (fluent API)."""
        self.load_state_dict(torch.load(path))
        return self


if __name__ == "__main__":
    data = ["This article describes machine learning"]
    model = DocumentClassifier(n_classes=61).from_file()
    model(data)
|
shad_mlops_transformers/trainer.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from sklearn.model_selection import train_test_split
|
| 8 |
+
from torch.utils.data import DataLoader, Dataset
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
from shad_mlops_transformers.config import config
|
| 12 |
+
from shad_mlops_transformers.model import DocumentClassifier
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ArxivDataset(Dataset):
    """In-memory dataset of arXiv abstracts labelled with a single tag.

    Only tags whose family prefix is whitelisted ("math", "cs") are kept,
    and only the first whitelisted tag of each paper becomes its label.
    """

    def __init__(self, raw_data: list[dict]):
        """Read and store the whole dataset once."""
        import ast  # local import: only needed to parse the "tag" field

        logger.info("reading data")
        self.x = []
        self.y = []
        whitelist_labels = ["math", "cs"]
        next_class_id = 0
        self.class_mapper = {}
        # NOTE(review): class ids are assigned in encounter order, so two
        # datasets built from different splits may map the same tag to
        # different ids — confirm train/val share one mapping before
        # evaluating on val.
        for item in raw_data:
            tmp_y = []
            # The "tag" field is a Python dict/list literal stored as a string.
            # BUG FIX: parse it with ast.literal_eval instead of
            # eval(s.replace("'", '"')) — the quote replacement corrupted any
            # value containing an apostrophe, and eval is unsafe on data.
            for tag_desc in ast.literal_eval(item["tag"]):
                real_tag: str = tag_desc["term"]
                # Keep only whitelisted tag families.
                if not any(real_tag.startswith(x) for x in whitelist_labels):
                    continue
                if real_tag not in self.class_mapper:
                    self.class_mapper[real_tag] = next_class_id
                    next_class_id += 1
                tmp_y.append(self.class_mapper[real_tag])
                # Only the first matching tag is used as the label.
                break
            # Keep the paper only if it had at least one whitelisted tag.
            if len(tmp_y):
                self.x.append(item["summary"])
                self.y.append(tmp_y[0])
        self.classes = sorted(self.class_mapper.keys())
        logger.info("[Done] reading data")

    def __getitem__(self, i):
        """Return the (text, label) pair at index *i*."""
        return self.x[i], self.y[i]

    def __len__(self):
        """Number of retained papers."""
        return len(self.x)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def make_train_val():
    """Split the raw arXiv dump into train/validation record lists."""
    raw_path = config.raw_data_dir / "arxivData.json"
    with open(raw_path, "r") as f:
        records = json.load(f)
    return train_test_split(
        records,
        test_size=config.test_size,
        shuffle=True,
        random_state=config.random_seed,
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def train_model(model: DocumentClassifier, optimizer: torch.optim.Optimizer, loader: DataLoader, criterion):
    """Run the (truncated) training loop.

    Returns the model, the optimizer and the mean batch loss.
    """
    model.train()
    batch_losses = []
    for text, true_label in tqdm(loader):
        optimizer.zero_grad()
        prediction = model(text)
        batch_loss = criterion(prediction, true_label)

        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
        # NOTE(review): stops after the first batch — presumably a smoke-run
        # shortcut; confirm before relying on this for real training.
        break

    return model, optimizer, np.mean(batch_losses)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def collator(x):
    """Collate function that unwraps a single-sample batch."""
    first, *_ = x
    return first
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def save_model(model: DocumentClassifier):
    """Persist model weights to the configured ``weights_path``."""
    target = config.weights_path
    # Make sure data/models/ exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), target)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def main():
    """Entry point for ``poetry run train``: build loaders, train, save weights."""
    train, val = make_train_val()
    dataset_train = ArxivDataset(train)
    dataset_val = ArxivDataset(val)
    loader_train = DataLoader(dataset_train, batch_size=config.batch_size, shuffle=True, drop_last=True)
    # NOTE(review): the validation loader is built but never consumed here.
    loader_val = DataLoader(dataset_val, batch_size=config.batch_size, shuffle=True, drop_last=True)

    model = DocumentClassifier(n_classes=len(dataset_train.classes))
    optimizer = torch.optim.Adam(model.trainable_params)
    criterion = nn.CrossEntropyLoss()
    train_model(model=model, optimizer=optimizer, loader=loader_train, criterion=criterion)
    save_model(model)


if __name__ == "__main__":
    main()
|
shad_mlops_transformers/ui.py
CHANGED
|
File without changes
|