DARKWICK committed on
Commit
a9f62ed
·
verified ·
1 Parent(s): 11d4128

Upload 3 files

Browse files
Files changed (3) hide show
  1. requirements.txt +54 -0
  2. run.py +5 -0
  3. train.py +104 -0
requirements.txt ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ autopep8==2.0.2
2
+ blinker==1.6.2
3
+ certifi==2022.12.7
4
+ charset-normalizer==3.1.0
5
+ click==8.1.3
6
+ cmake==3.26.3
7
+ filelock==3.12.0
8
+ Flask==2.3.1
9
+ Flask-Cors==3.0.10
10
+ fsspec==2023.4.0
11
+ huggingface-hub==0.14.1
12
+ idna==3.4
13
+ importlib-metadata==6.6.0
14
+ itsdangerous==2.1.2
15
+ Jinja2==3.1.2
16
+ lit==16.0.2
17
+ MarkupSafe==2.1.2
18
+ mpmath==1.3.0
19
+ networkx==3.1
20
+ numpy==1.24.3
21
+ nvidia-cublas-cu11==11.10.3.66
22
+ nvidia-cuda-cupti-cu11==11.7.101
23
+ nvidia-cuda-nvrtc-cu11==11.7.99
24
+ nvidia-cuda-runtime-cu11==11.7.99
25
+ nvidia-cudnn-cu11==8.5.0.96
26
+ nvidia-cufft-cu11==10.9.0.58
27
+ nvidia-curand-cu11==10.2.10.91
28
+ nvidia-cusolver-cu11==11.4.0.1
29
+ nvidia-cusparse-cu11==11.7.4.91
30
+ nvidia-nccl-cu11==2.14.3
31
+ nvidia-nvtx-cu11==11.7.91
32
+ packaging==23.1
33
+ pycodestyle==2.10.0
34
+ PyYAML==6.0
35
+ regex==2023.3.23
36
+ requests==2.28.2
37
+ six==1.16.0
38
+ sympy==1.11.1
39
+ tokenizers==0.13.3
40
+ tomli==2.0.1
41
+ torch==2.0.0
42
+ tqdm==4.65.0
43
+ transformers==4.28.1
44
+ triton==2.0.0
45
+ typing_extensions==4.5.0
46
+ urllib3==1.26.15
47
+ Werkzeug==2.3.0
48
+ zipp==3.15.0
49
+ # Added for HF training and Spaces
50
+ gradio==3.38.0
51
+ datasets==2.13.1
52
+ accelerate==0.20.3
53
+ scikit-learn==1.2.2
54
+
run.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Local entry point that starts the toxic-comment Flask application."""
import os

from toxic_comment.src.app import app

if __name__ == '__main__':
    # Flask's debug mode enables the interactive Werkzeug debugger, which lets
    # anyone who can reach the server execute code. Keep it off unless it is
    # explicitly requested through the environment (FLASK_DEBUG=1).
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)
5
+
train.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simple training script for a text toxicity classifier.
2
+
3
+ Usage examples:
4
+ - Train from a CSV: python train.py --dataset_csv data/toxic_train.csv --text_col text --label_col label --output_dir ./outputs
5
+ - Push to Hub: python train.py --dataset_csv data/toxic_train.csv --output_dir ./outputs --push_to_hub --hub_model_id your-username/toxic-detector
6
+
7
+ Expects a CSV with columns: text, label (0/1) for single-label classification. For multi-label, adjust the preprocessing.
8
+ """
9
+ import argparse
10
+ from pathlib import Path
11
+ from datasets import load_dataset, Dataset
12
+ from transformers import (
13
+ AutoTokenizer,
14
+ AutoModelForSequenceClassification,
15
+ TrainingArguments,
16
+ Trainer,
17
+ DataCollatorWithPadding,
18
+ )
19
+ import numpy as np
20
+ import evaluate
21
+
22
+
23
def parse_args():
    """Parse the command-line options of the training script.

    Returns:
        argparse.Namespace: Parsed options with the attributes
        ``dataset_csv``, ``text_col``, ``label_col``, ``model_name_or_path``,
        ``output_dir``, ``push_to_hub``, ``hub_model_id``,
        ``num_train_epochs`` and ``per_device_train_batch_size``.
    """
    # One (flag, add_argument keyword arguments) entry per option. The order is
    # kept as declared because it determines the order shown by --help.
    option_specs = (
        ("--dataset_csv", {"type": str, "default": None,
                           "help": "Path to CSV dataset with text and label columns"}),
        ("--text_col", {"type": str, "default": "text"}),
        ("--label_col", {"type": str, "default": "label"}),
        ("--model_name_or_path", {"type": str, "default": "distilbert-base-uncased"}),
        ("--output_dir", {"type": str, "default": "./model_output"}),
        ("--push_to_hub", {"action": "store_true"}),
        ("--hub_model_id", {"type": str, "default": None}),
        ("--num_train_epochs", {"type": int, "default": 1}),
        ("--per_device_train_batch_size", {"type": int, "default": 16}),
    )

    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
35
+
36
+
37
def main():
    """Fine-tune a two-class sequence classifier and save (optionally push) it.

    When ``--dataset_csv`` is given, the CSV is loaded and 10% of it is held
    out for evaluation; otherwise a small IMDB subset (2000 train / 500 test
    rows) is used as a demo. The text column is tokenized with the tokenizer
    of ``--model_name_or_path``, the model is trained with ``Trainer`` and
    scored on accuracy after every epoch, and the result is saved to
    ``--output_dir``.

    Raises:
        ValueError: If the text or label column is not present in the dataset.
    """
    args = parse_args()

    if args.dataset_csv:
        raw = load_dataset("csv", data_files={"train": args.dataset_csv})
        # The CSV has no validation split, so hold out 10% for evaluation.
        # A fixed seed keeps the split identical from run to run.
        dataset = raw["train"].train_test_split(test_size=0.1, seed=42)
        text_col, label_col = args.text_col, args.label_col
    else:
        # Small built-in fallback: a tiny subset of imdb for demo (binary sentiment).
        dataset = load_dataset("imdb", split={"train": "train[:2000]", "test": "test[:500]"})
        # imdb ships with 'text' and 'label' columns.
        text_col, label_col = "text", "label"

    columns = dataset["train"].column_names
    missing = [name for name in (text_col, label_col) if name not in columns]
    if missing:
        raise ValueError(
            f"Column(s) {missing} not found in dataset; available columns: {columns}"
        )

    # The training loop picks the target up from a column named "label", so a
    # custom --label_col has to be renamed; otherwise the model gets no labels.
    if label_col != "label":
        if "label" in columns:
            # Drop the unrelated pre-existing column so the rename cannot clash.
            dataset = dataset.remove_columns("label")
        dataset = dataset.rename_column(label_col, "label")

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    def preprocess_function(examples):
        """Tokenize one batch of texts, truncating to the model's max length."""
        return tokenizer(examples[text_col], truncation=True)

    tokenized = dataset.map(preprocess_function, batched=True)

    # Single-label binary classification (labels 0/1).
    num_labels = 2
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_labels=num_labels
    )

    metric_acc = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Turn (logits, labels) into an accuracy score."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return metric_acc.compute(predictions=preds, references=labels)

    # Pads each batch dynamically, since tokenization above does not pad.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        evaluation_strategy="epoch",
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        save_total_limit=2,
        push_to_hub=args.push_to_hub,
        hub_model_id=args.hub_model_id,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized.get("test", None),
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    trainer.save_model()
    # The explicit final push only happens when a target repo id was supplied.
    if args.push_to_hub and args.hub_model_id:
        trainer.push_to_hub()


if __name__ == "__main__":
    main()