yagnik12 committed on
Commit
b7b9e5f
·
verified ·
1 Parent(s): c913035

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +11 -35
train.py CHANGED
@@ -1,61 +1,37 @@
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
3
- import evaluate
4
- from huggingface_hub import login
5
-
6
- # 🔑 Login to Hugging Face (add HF_TOKEN as a secret in your Space settings)
7
  import os
8
- hf_token = os.getenv("HF_TOKEN")
9
- if hf_token:
10
- login(hf_token)
11
 
12
- # 1. Load BiScope dataset
13
  dataset = load_dataset("HanxiGuo/BiScope_Data")
 
 
14
 
15
- # 2. Tokenizer
16
- MODEL = "microsoft/deberta-v3-small"
17
- tokenizer = AutoTokenizer.from_pretrained(MODEL)
18
-
19
- def preprocess(examples):
20
- return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
21
-
22
- encoded_dataset = dataset.map(preprocess, batched=True)
23
-
24
- # 3. Model
25
- model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)
26
 
27
- # 4. Metrics
28
- accuracy = evaluate.load("accuracy")
29
 
30
- def compute_metrics(eval_pred):
31
- logits, labels = eval_pred
32
- predictions = logits.argmax(axis=-1)
33
- return accuracy.compute(predictions=predictions, references=labels)
34
 
35
- # 5. Training args
36
  training_args = TrainingArguments(
37
  output_dir="./results",
38
  evaluation_strategy="epoch",
39
  save_strategy="epoch",
40
- learning_rate=2e-5,
41
  per_device_train_batch_size=16,
42
  per_device_eval_batch_size=16,
43
- num_train_epochs=2,
44
- weight_decay=0.01,
45
  push_to_hub=True,
46
- hub_model_id="yagnik12/AI_Text_Detecter_HanxiGuo_BiScope-Data"
 
47
  )
48
 
49
- # 6. Trainer
50
  trainer = Trainer(
51
  model=model,
52
  args=training_args,
53
- train_dataset=encoded_dataset["train"],
54
- eval_dataset=encoded_dataset["validation"],
55
  tokenizer=tokenizer,
56
- compute_metrics=compute_metrics,
57
  )
58
 
59
- # 7. Train & Push
60
  trainer.train()
61
  trainer.push_to_hub()
 
1
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import os

# Pull the BiScope human/AI text dataset from the Hub.
dataset = load_dataset("HanxiGuo/BiScope_Data")

# Base checkpoint for both the tokenizer and the classifier head below.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
8
 
9
def tokenize(batch, tok=None):
    """Tokenize a batch of examples into fixed-length model inputs.

    Args:
        batch: mapping with a "text" field (list of strings), as provided by
            ``datasets.Dataset.map(..., batched=True)``.
        tok: optional tokenizer callable. Defaults to the module-level
            ``tokenizer`` so existing callers (``dataset.map(tokenize, ...)``)
            are unchanged; injecting one makes the function unit-testable.

    Returns:
        The tokenizer's encoding mapping, truncated and padded to a fixed
        length of 256 tokens.
    """
    if tok is None:
        # Fall back to the tokenizer loaded at module import time.
        tok = tokenizer
    return tok(batch["text"], truncation=True, padding="max_length", max_length=256)
 
 
 
 
 
 
 
 
 
11
 
12
# Apply the tokenizer over the whole dataset in batches.
tokenized = dataset.map(tokenize, batched=True)

# Binary classifier (human vs AI-generated text) on top of the base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm against the pinned version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Push checkpoints to a Hub *model* repo (not the Space itself),
    # authenticating with the HF_TOKEN secret from the environment.
    push_to_hub=True,
    hub_model_id="yagnik12/AI_Text_Detecter_HanxiGuo_BiScope-Data",
    hub_token=os.getenv("HF_TOKEN"),
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)

# Fine-tune, then upload the final model to the Hub repo configured above.
trainer.train()
trainer.push_to_hub()