astart01 committed on
Commit 99a17a1 · verified · 1 Parent(s): efdbaa5

Upload 9 files

Files changed (10)
  1. .gitattributes +1 -0
  2. bert.py +37 -0
  3. config.json +34 -0
  4. labeled(1).csv +0 -0
  5. model.safetensors +3 -0
  6. ru2.ipynb +557 -0
  7. special_tokens_map.json +37 -0
  8. tokenizer.json +0 -0
  9. tokenizer_config.json +64 -0
  10. vocab.txt +0 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
bert.py ADDED
@@ -0,0 +1,37 @@
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import streamlit as st
+ import torch
+
+
+ MODEL_PATH = "rubert-finetuned"
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+ model.eval()
+
+ # === Streamlit UI ===
+ st.set_page_config(page_title="Оценка токсичности", layout="centered")
+ st.title("💬 Оценка токсичности текста")
+
+ text = st.text_area("Введите сообщение", "Ты ужасный человек!")
+ submit = st.button("Проверить токсичность")
+
+ if submit and text.strip():
+     # Tokenization
+     inputs = tokenizer(text, return_tensors="pt", truncation=True)
+
+     # Prediction
+     with torch.no_grad():
+         outputs = model(**inputs)
+         logits = outputs.logits
+         score = torch.sigmoid(logits).item()  # toxicity score
+
+     # Output
+     st.subheader("Результат:")
+     st.write(f"**Степень токсичности:** `{score:.3f}`")
+
+     if score > 0.8:
+         st.error("⚠️ Высокая токсичность!")
+     elif score > 0.4:
+         st.warning("⚠️ Средняя токсичность")
+     else:
+         st.success("✅ Низкая токсичность")
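The app scores one logit with a sigmoid, so the same logic works outside Streamlit. A minimal command-line sketch, assuming the fine-tuned checkpoint sits in `rubert-finetuned/` next to the script (the `toxicity` helper name is ours, not part of the repo):

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

MODEL_PATH = "rubert-finetuned"  # same local checkpoint bert.py loads
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()

def toxicity(text: str) -> float:
    # One-logit head trained with BCE, so sigmoid maps it to a 0..1 score
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.sigmoid(logits).item()

if __name__ == "__main__":
    for sample in ["Ты ужасный человек!", "Спасибо за помощь!"]:
        print(f"{toxicity(sample):.3f}  {sample}")
```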
config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "_name_or_path": "cointegrated/rubert-tiny2",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "emb_size": 312,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 312,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 600,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 2048,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 3,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.36.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 83828
+ }
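Note that `id2label` carries a single entry, so `num_labels` resolves to 1: the head emits one raw logit per input, which is why both `bert.py` and the notebook apply a sigmoid rather than a softmax over classes. A quick sanity check, assuming the exported checkpoint directory above:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("rubert-finetuned")
print(config.num_labels)    # 1 -> single-logit head, scored with a sigmoid
print(config.problem_type)  # "single_label_classification" (loss is overridden in the notebook)
```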
labeled(1).csv ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ee8832f961188a19c6bd6e55845433ca1ddd79483ebb461cb69ef64c16fa182
+ size 116782884
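Only this Git LFS pointer is versioned; the ~117 MB payload lives in LFS storage. After an LFS-enabled clone, the download can be checked against the pointer's oid with nothing but the standard library (a sketch):

```python
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    # Stream the file so the full ~117 MB never sits in memory at once
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# Expected: 1ee8832f961188a19c6bd6e55845433ca1ddd79483ebb461cb69ef64c16fa182
print(sha256_of("model.safetensors"))
```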
ru2.ipynb ADDED
@@ -0,0 +1,557 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 27,
+    "id": "09d75e68",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "d:\\ds\\ds-phase-2-master\\myenv\\Lib\\site-packages\\huggingface_hub\\file_download.py:896: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+       "  warnings.warn(\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "3a5777df3bb04084bb8717df64fdb6d5",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map:   0%|          | 0/11529 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "f3a855aaeb844c9ba8ab8d2d141f4ea7",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map:   0%|          | 0/2883 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+      ]
+     }
+    ],
+    "source": [
+     "import os\n",
+     "\n",
+     "import pandas as pd\n",
+     "import torch\n",
+     "from datasets import Dataset\n",
+     "from transformers import (\n",
+     "    AutoTokenizer,\n",
+     "    AutoModelForSequenceClassification,\n",
+     "    DataCollatorWithPadding,\n",
+     "    TrainingArguments,\n",
+     "    Trainer\n",
+     ")\n",
+     "\n",
+     "# === 1. Load and transform the dataset ===\n",
+     "df = pd.read_csv(r\"D:\\ds\\ds-phase-2-master\\Rubert\\labeled(1).csv\").dropna()\n",
+     "df = df[[\"comment\", \"toxic\"]].rename(columns={\"comment\": \"text\", \"toxic\": \"label\"})\n",
+     "df[\"label\"] = df[\"label\"].astype(int)\n",
+     "\n",
+     "ds = Dataset.from_pandas(df)\n",
+     "ds = ds.train_test_split(test_size=0.2, seed=42)\n",
+     "\n",
+     "# === 2. Tokenization ===\n",
+     "model_name = \"cointegrated/rubert-tiny2\"\n",
+     "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+     "\n",
+     "def tokenize(example):\n",
+     "    return tokenizer(example[\"text\"], truncation=True)\n",
+     "\n",
+     "tokenized_ds = ds.map(tokenize, batched=True)\n",
+     "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
+     "\n",
+     "# === 3. Load the model ===\n",
+     "model = AutoModelForSequenceClassification.from_pretrained(\n",
+     "    model_name,\n",
+     "    num_labels=1,\n",
+     "    problem_type=\"single_label_classification\"\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "id": "c6a96f56",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import evaluate\n",
+     "import numpy as np\n",
+     "\n",
+     "accuracy = evaluate.load(\"accuracy\")\n",
+     "\n",
+     "def compute_metrics(eval_pred):\n",
+     "    logits, labels = eval_pred\n",
+     "    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()  # binarization\n",
+     "    return accuracy.compute(predictions=preds, references=labels)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 29,
+    "id": "ed9a53f6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# === 4. Custom Trainer with float labels ===\n",
+     "class CustomTrainer(Trainer):\n",
+     "    def compute_loss(self, model, inputs, return_outputs=False):\n",
+     "        labels = inputs.pop(\"labels\").float().squeeze()\n",
+     "        inputs = {k: v for k, v in inputs.items()}\n",
+     "        outputs = model(**inputs)\n",
+     "        logits = outputs.logits.squeeze()\n",
+     "\n",
+     "        # Shape check\n",
+     "        assert logits.shape == labels.shape, f\"Shape mismatch: logits {logits.shape}, labels {labels.shape}\"\n",
+     "\n",
+     "        loss_fn = torch.nn.BCEWithLogitsLoss()\n",
+     "        loss = loss_fn(logits, labels)\n",
+     "\n",
+     "        return (loss, outputs) if return_outputs else loss\n",
+     "model = model.to(\"cpu\")\n",
+     "\n",
+     "# === 5. Training settings ===\n",
+     "training_args = TrainingArguments(\n",
+     "    output_dir=\"./rubert-finetuned\",\n",
+     "    evaluation_strategy=\"epoch\",\n",
+     "    save_strategy=\"epoch\",\n",
+     "    learning_rate=2e-5,\n",
+     "    per_device_train_batch_size=8,\n",
+     "    per_device_eval_batch_size=8,\n",
+     "    num_train_epochs=10,\n",
+     "    weight_decay=0.01,\n",
+     "    save_total_limit=1,\n",
+     "    load_best_model_at_end=True,\n",
+     "    logging_dir=\"./logs\",)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 30,
+    "id": "fa0b11a9",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "6394c2f19b0b4730af7de82fdd4b9c8d",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/14420 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'loss': 0.41, 'learning_rate': 1.9306518723994454e-05, 'epoch': 0.35}\n",
+       "{'loss': 0.2648, 'learning_rate': 1.8613037447988906e-05, 'epoch': 0.69}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "feda23f3072d484fa6c0a9bf0a625c7d",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Checkpoint destination directory ./rubert-finetuned\\checkpoint-1442 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n"
+      ]
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.23574143648147583, 'eval_accuracy': 0.9219562955254943, 'eval_runtime': 9.7843, 'eval_samples_per_second': 294.655, 'eval_steps_per_second': 36.896, 'epoch': 1.0}\n",
+       "{'loss': 0.2889, 'learning_rate': 1.791955617198336e-05, 'epoch': 1.04}\n",
+       "{'loss': 0.2153, 'learning_rate': 1.7226074895977807e-05, 'epoch': 1.39}\n",
+       "{'loss': 0.2467, 'learning_rate': 1.6532593619972263e-05, 'epoch': 1.73}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "475198dc77b04764bc26b06630c99993",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.2591148912906647, 'eval_accuracy': 0.9223031564342699, 'eval_runtime': 9.2707, 'eval_samples_per_second': 310.978, 'eval_steps_per_second': 38.94, 'epoch': 2.0}\n",
+       "{'loss': 0.2281, 'learning_rate': 1.5839112343966715e-05, 'epoch': 2.08}\n",
+       "{'loss': 0.1865, 'learning_rate': 1.5145631067961166e-05, 'epoch': 2.43}\n",
+       "{'loss': 0.1844, 'learning_rate': 1.4452149791955618e-05, 'epoch': 2.77}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "06f7aff6306f4c16a4356b590ff94327",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.26540955901145935, 'eval_accuracy': 0.9288935137010059, 'eval_runtime': 9.2567, 'eval_samples_per_second': 311.45, 'eval_steps_per_second': 38.999, 'epoch': 3.0}\n",
+       "{'loss': 0.1857, 'learning_rate': 1.375866851595007e-05, 'epoch': 3.12}\n",
+       "{'loss': 0.1412, 'learning_rate': 1.3065187239944523e-05, 'epoch': 3.47}\n",
+       "{'loss': 0.1687, 'learning_rate': 1.2371705963938973e-05, 'epoch': 3.81}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "8cde784edb3848739c66b7262d335a8e",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.3157169818878174, 'eval_accuracy': 0.9219562955254943, 'eval_runtime': 9.2591, 'eval_samples_per_second': 311.368, 'eval_steps_per_second': 38.988, 'epoch': 4.0}\n",
+       "{'loss': 0.1426, 'learning_rate': 1.1678224687933426e-05, 'epoch': 4.16}\n",
+       "{'loss': 0.1228, 'learning_rate': 1.0984743411927878e-05, 'epoch': 4.51}\n",
+       "{'loss': 0.1185, 'learning_rate': 1.029126213592233e-05, 'epoch': 4.85}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "009d58cef57d4fbeb14ce06b6e2c9c00",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.31897348165512085, 'eval_accuracy': 0.9257717655220257, 'eval_runtime': 9.2918, 'eval_samples_per_second': 310.274, 'eval_steps_per_second': 38.851, 'epoch': 5.0}\n",
+       "{'loss': 0.1039, 'learning_rate': 9.597780859916783e-06, 'epoch': 5.2}\n",
+       "{'loss': 0.1146, 'learning_rate': 8.904299583911235e-06, 'epoch': 5.55}\n",
+       "{'loss': 0.0903, 'learning_rate': 8.210818307905687e-06, 'epoch': 5.89}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "9e64d3f3da6a4db697a30b93fa80f944",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.3766789138317108, 'eval_accuracy': 0.9236906000693722, 'eval_runtime': 9.5138, 'eval_samples_per_second': 303.034, 'eval_steps_per_second': 37.945, 'epoch': 6.0}\n",
+       "{'loss': 0.0931, 'learning_rate': 7.5173370319001396e-06, 'epoch': 6.24}\n",
+       "{'loss': 0.0809, 'learning_rate': 6.823855755894592e-06, 'epoch': 6.59}\n",
+       "{'loss': 0.0645, 'learning_rate': 6.130374479889043e-06, 'epoch': 6.93}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "46b7197e07b642d78e508adf22d06ad4",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.3949281573295593, 'eval_accuracy': 0.9219562955254943, 'eval_runtime': 9.9838, 'eval_samples_per_second': 288.767, 'eval_steps_per_second': 36.158, 'epoch': 7.0}\n",
+       "{'loss': 0.09, 'learning_rate': 5.436893203883496e-06, 'epoch': 7.28}\n",
+       "{'loss': 0.0623, 'learning_rate': 4.743411927877948e-06, 'epoch': 7.63}\n",
+       "{'loss': 0.0651, 'learning_rate': 4.049930651872399e-06, 'epoch': 7.98}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "b36046deee6d49c999568ec571721a86",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.4220237731933594, 'eval_accuracy': 0.920568851890392, 'eval_runtime': 9.8029, 'eval_samples_per_second': 294.096, 'eval_steps_per_second': 36.826, 'epoch': 8.0}\n",
+       "{'loss': 0.0466, 'learning_rate': 3.356449375866852e-06, 'epoch': 8.32}\n",
+       "{'loss': 0.0724, 'learning_rate': 2.662968099861304e-06, 'epoch': 8.67}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "38d29c2f867b421e8242b58df331dbe2",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.4382440149784088, 'eval_accuracy': 0.9181408255289629, 'eval_runtime': 9.9388, 'eval_samples_per_second': 290.076, 'eval_steps_per_second': 36.322, 'epoch': 9.0}\n",
+       "{'loss': 0.069, 'learning_rate': 1.969486823855756e-06, 'epoch': 9.02}\n",
+       "{'loss': 0.0614, 'learning_rate': 1.276005547850208e-06, 'epoch': 9.36}\n",
+       "{'loss': 0.0588, 'learning_rate': 5.825242718446603e-07, 'epoch': 9.71}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "6a8d5ddb3c724b7ca59613d8cecf2c09",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "{'eval_loss': 0.4383704662322998, 'eval_accuracy': 0.9171002428026361, 'eval_runtime': 9.3759, 'eval_samples_per_second': 307.491, 'eval_steps_per_second': 38.503, 'epoch': 10.0}\n",
+       "{'train_runtime': 422.8343, 'train_samples_per_second': 272.66, 'train_steps_per_second': 34.103, 'train_loss': 0.13939501686995634, 'epoch': 10.0}\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+        "('rubert-finetuned\\\\tokenizer_config.json',\n",
+        " 'rubert-finetuned\\\\special_tokens_map.json',\n",
+        " 'rubert-finetuned\\\\vocab.txt',\n",
+        " 'rubert-finetuned\\\\added_tokens.json',\n",
+        " 'rubert-finetuned\\\\tokenizer.json')"
+       ]
+      },
+      "execution_count": 30,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "# === 6. Trainer ===\n",
+     "trainer = CustomTrainer(\n",
+     "    model=model,\n",
+     "    args=training_args,\n",
+     "    train_dataset=tokenized_ds[\"train\"],\n",
+     "    eval_dataset=tokenized_ds[\"test\"],\n",
+     "    tokenizer=tokenizer,\n",
+     "    data_collator=data_collator,\n",
+     "    compute_metrics=compute_metrics,\n",
+     ")\n",
+     "\n",
+     "# === 7. Training ===\n",
+     "trainer.train()\n",
+     "\n",
+     "# === 8. Saving ===\n",
+     "model.save_pretrained(\"rubert-finetuned\")\n",
+     "tokenizer.save_pretrained(\"rubert-finetuned\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 31,
+    "id": "e28f42e7",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "4c469848135a496f9a0d690c367b274f",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "  0%|          | 0/361 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Accuracy: 0.9220\n"
+      ]
+     }
+    ],
+    "source": [
+     "metrics = trainer.evaluate()\n",
+     "print(f\"Accuracy: {metrics['eval_accuracy']:.4f}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "ccdb2304",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Токсичность: 0.968\n"
+      ]
+     }
+    ],
+    "source": [
+     "import torch\n",
+     "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
+     "\n",
+     "\n",
+     "model_path = \"rubert-finetuned\"\n",
+     "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
+     "model = AutoModelForSequenceClassification.from_pretrained(model_path)\n",
+     "model = model.to(\"cuda\")  # or \"cpu\" if preferred\n",
+     "\n",
+     "def predict_toxicity(text: str) -> float:\n",
+     "    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True).to(\"cuda\")\n",
+     "    with torch.no_grad():\n",
+     "        outputs = model(**inputs)\n",
+     "        logits = outputs.logits\n",
+     "        prob = torch.sigmoid(logits).item()\n",
+     "    return prob\n",
+     "\n",
+     "text = \"Ты говно\"\n",
+     "score = predict_toxicity(text)\n",
+     "print(f\"Токсичность: {score:.3f}\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "myenv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.3"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
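The notebook's key trick is the `CustomTrainer`: with `num_labels=1`, the stock single-label path would push one logit through `CrossEntropyLoss` and learn nothing, so `compute_loss` is overridden to use `BCEWithLogitsLoss` against float labels, and `compute_metrics` binarizes the sigmoid at 0.5. The core of that step in isolation, with toy tensors standing in for a real batch:

```python
import torch

# Toy stand-ins for one batch: (batch, 1) logits from the one-unit head
# and 0/1 toxicity labels, as handled by the notebook's CustomTrainer.
logits = torch.randn(8, 1)
labels = torch.randint(0, 2, (8,))

# Training: BCE-with-logits over squeezed logits and float labels
loss = torch.nn.BCEWithLogitsLoss()(logits.squeeze(), labels.float())

# Evaluation: sigmoid, then threshold at 0.5, as in compute_metrics
preds = (torch.sigmoid(logits.squeeze()) > 0.5).int()
print(loss.item(), preds.tolist())
```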
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "max_length": 512,
+   "model_max_length": 2048,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
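One detail worth flagging: `model_max_length` is 2048 (matching `max_position_embeddings` in config.json), while the stored `max_length` of 512 is, as we understand the tokenizer defaults, not what `tokenizer(text, truncation=True)` falls back to — truncation without an explicit `max_length` caps at `model_max_length`. If the shorter window is intended, it likely needs to be passed per call (a sketch):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("rubert-finetuned")

long_text = "слово " * 5000
enc = tokenizer(long_text, truncation=True, max_length=512)  # explicit 512 cap
print(len(enc["input_ids"]))  # <= 512; without max_length it may run to 2048
```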
vocab.txt ADDED
The diff for this file is too large to render. See raw diff