datasetsANDmodels committed · verified
Commit 61fe8b0 · 1 Parent(s): a644084

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. benmal.csv +3 -0
  3. finetune_mal_ben.py +83 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+benmal.csv filter=lfs diff=lfs merge=lfs -text
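
The added rule routes benmal.csv through the Git LFS filter instead of the regular object database. As a minimal sketch (illustrative, not part of this commit), appending the same rule programmatically would look like the following; running `git lfs track "benmal.csv"` writes an identical line:

# Sketch only: reproduce this commit's .gitattributes change by hand.
rule = "benmal.csv filter=lfs diff=lfs merge=lfs -text\n"
with open(".gitattributes", "a", encoding="utf-8") as attrs:
    attrs.write(rule)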
benmal.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80d9aeeaf97cd5a4d2541fdd29a822e53aa1c17d4dd851cb511dfc43eb0de949
+size 19933794
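
What got committed is the Git LFS pointer stub, not the CSV itself: the roughly 19 MB payload lives in LFS storage, addressed by the sha256 oid above. A minimal sketch of reading such a pointer (the key/value layout is exactly the three lines the diff shows):

# Sketch: parse a Git LFS pointer file like the committed benmal.csv stub.
def parse_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("benmal.csv")
print(pointer["oid"])   # sha256:80d9aeea...
print(pointer["size"])  # 19933794 (about 19 MB)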
finetune_mal_ben.py ADDED
@@ -0,0 +1,83 @@
+# https://huggingface.co/docs/transformers/v4.17.0/en/tasks/sequence_classification
+from transformers import Trainer, TrainingArguments
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import DataCollatorWithPadding
+from datasets import load_dataset
+import numpy as np
+import evaluate
+from huggingface_hub import HfFolder
+
+tokenizer = AutoTokenizer.from_pretrained("roberta-large")
+
+# Both splits point at the same CSV; there is no held-out test set here.
+file_dict = {
+    "train": "benmal.csv",
+    "test": "benmal.csv",
+}
+
+# The CSV has a header row, so skip it and name the columns explicitly.
+dataset = load_dataset(
+    "csv",
+    data_files=file_dict,
+    delimiter=",",
+    column_names=["text", "label"],
+    skiprows=1,
+)
+raw_dataset = dataset.shuffle()
+
+def tokenize(batch):
+    return tokenizer(batch["text"], padding="max_length", truncation=True)
+
+tokenized_dataset = raw_dataset.map(tokenize, batched=True, remove_columns=["text"])
+
+# Binary benign/malicious head on top of roberta-large.
+model_id = "roberta-large"
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_id, num_labels=2, ignore_mismatched_sizes=True
+)
+
+metric = evaluate.load("f1")
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return metric.compute(predictions=predictions, references=labels, average="weighted")
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+repository_id = "azadeh1972/bm"
+
+training_args = TrainingArguments(
+    output_dir=repository_id,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    learning_rate=2e-5,
+    num_train_epochs=10,
+    # torch_compile=True,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    save_total_limit=2,
+    load_best_model_at_end=True,
+    # metric_for_best_model="f1",
+    # report_to="tensorboard",
+    push_to_hub=True,
+    hub_strategy="every_save",
+    hub_model_id=repository_id,
+    hub_token=HfFolder.get_token(),
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset["train"],
+    eval_dataset=tokenized_dataset["test"],  # was "train"; both splits map to the same file here
+    # compute_metrics=compute_metrics,
+    # tokenizer=tokenizer,
+    # data_collator=data_collator,
+)
+
+# Only relevant if torch_compile is enabled above: fall back to eager mode on errors.
+import torch._dynamo
+torch._dynamo.config.suppress_errors = True
+
+trainer.train()
+tokenizer.save_pretrained(repository_id)
+trainer.create_model_card()
+trainer.push_to_hub()
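
Once trainer.push_to_hub() completes, the fine-tuned classifier can be loaded straight from the azadeh1972/bm repo. A minimal usage sketch, assuming the push succeeded and the repo is accessible; note the script never sets id2label, so which default label corresponds to "malicious" is an assumption to verify against the training CSV:

from transformers import pipeline

# Load the checkpoint the script pushed (assumes access to azadeh1972/bm).
classifier = pipeline("text-classification", model="azadeh1972/bm")

# Outputs use the default LABEL_0 / LABEL_1 names, since the script never
# maps ids to "benign" / "malicious" explicitly.
print(classifier("some input text to classify"))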