veryfansome committed on
Commit
0cdb887
·
1 Parent(s): a674fb1

feat: UD is back, LlaMA play

Browse files
dataset_splitter.py CHANGED
@@ -1,7 +1,7 @@
1
  from datasets import DatasetDict, load_from_disk
2
  import argparse
3
 
4
- from dataset_maker import features
5
 
6
  def has_all_valid_labels(exp):
7
  for col, labels in exp.items():
 
1
  from datasets import DatasetDict, load_from_disk
2
  import argparse
3
 
4
+ from openai_dataset_maker import features
5
 
6
  def has_all_valid_labels(exp):
7
  for col, labels in exp.items():
llama_dataset_maker.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, Pipeline, pipeline
3
+ import logging
4
+ import torch
5
+
6
+ from utils import get_torch_device
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class ChatModel(ABC):
12
+
13
+ @abstractmethod
14
+ def generate(self, messages: list[dict[str, str]]) -> dict[str, str]:
15
+ pass
16
+
17
+
18
+ class AdjLabeler:
19
+ def __init__(self, model: ChatModel):
20
+ self.model = model
21
+
22
+ def label_example(self, exp, feature_name):
23
+ messages = [
24
+ {"role": "system",
25
+ "content": "You are a helpful Grammar tutor."},
26
+ {"role": "user",
27
+ "content": "An adjective is a word that describes a noun?"},
28
+ {"role": "assistant",
29
+ "content": "Yes, that's correct! An adjective relates to, modifies, or describes nouns."},
30
+ {"role": "user",
31
+ "content": "Are they always used with nouns?"},
32
+ {"role": "assistant",
33
+ "content": ("No, adjectives often appear directly before nouns (e.g. \"a red apple\") "
34
+ "but they can also follow linking verbs to describe the subject (e.g. \"The sky is blue\"). "
35
+ "Sometimes, adjectives are used as complements in certain constructions or phrases "
36
+ "(e.g. \"the rich\" or \"well-known author\").")},
37
+ {"role": "user",
38
+ "content": "They can have comparative or superlative forms too, right?"},
39
+ {"role": "assistant",
40
+ "content": ("Yes, that's right! The word \"fast\" can take a comparative form as in \"faster\" "
41
+ "or a superlative form as in \"fastest\". Some adjectives don't have comparative or "
42
+ "superlative forms but use the word \"more\" or \"most\" to become comparative or "
43
+ "superlative.")},
44
+ {"role": "user",
45
+ "content": f"How about this example: {exp['tokens']}"},
46
+ ]
47
+
48
+ token_labels = []
49
+ for idx, token in enumerate(exp["tokens"]):
50
+ token_messages = messages.copy()
51
+ token_messages.append({"role": "user",
52
+ "content": f"Is '{token}' at position {idx} an adjective? Answer 'yes' or 'no'."})
53
+ #logger.info(f"token_messages: {token_messages}")
54
+
55
+ assistant_message = self.model.generate(token_messages)
56
+ logger.info(f"{assistant_message} - {token}")
57
+ token_messages.append(assistant_message)
58
+ messages += token_messages
59
+ return token_labels
60
+
61
+
62
+ class LlamaPipeline(ChatModel):
63
+ def __init__(self, model_name: str):
64
+ self.device = get_torch_device()
65
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
66
+ self.pipeline = pipeline(
67
+ "text-generation",
68
+ model=model_name,
69
+ model_kwargs={"torch_dtype": torch.bfloat16},
70
+ device_map="auto",
71
+ )
72
+
73
+ def generate(self, messages, max_new_tokens=1) :
74
+ outputs = self.pipeline(
75
+ messages,
76
+ max_new_tokens=max_new_tokens,
77
+ pad_token_id=self.tokenizer.eos_token_id,
78
+ temperature=0.6,
79
+ top_p=0.9,
80
+ )
81
+ return outputs[0]["generated_text"][-1]
82
+
83
+
84
+ class LlamaModel(ChatModel):
85
+ """
86
+ A wrapper around a Llama model checkpoint using Hugging Face Transformers.
87
+ """
88
+
89
+ def __init__(self, model_name: str):
90
+ torch_device = get_torch_device()
91
+
92
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
93
+ self.model = AutoModelForCausalLM.from_pretrained(
94
+ model_name,
95
+ device_map=str(torch_device),
96
+ torch_dtype=torch.float16,
97
+ )
98
+ self.model.to(torch_device)
99
+ self.model.eval()
100
+
101
+ # Adjust generation parameters as needed
102
+ self.generation_config = GenerationConfig(
103
+ max_new_tokens=1,
104
+ pad_token_id=self.tokenizer.eos_token_id,
105
+ temperature=0.7,
106
+ top_p=0.9,
107
+ do_sample=True,
108
+ )
109
+
110
+ def generate(self, prompt: str) -> str:
111
+ """
112
+ Generate text from the model given a prompt.
113
+ """
114
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
115
+ with torch.no_grad():
116
+ output_ids = self.model.generate(
117
+ **inputs,
118
+ generation_config=self.generation_config
119
+ )
120
+ raw_output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
121
+ return raw_output[len(prompt):]
122
+
123
+
124
+ # ----------------------------------
125
+ # Putting It All Together
126
+ # ----------------------------------
127
+
128
+ if __name__ == "__main__":
129
+ import logging.config
130
+ from utils import default_logging_config
131
+ logging.config.dictConfig(default_logging_config)
132
+
133
+ llama_pipeline = LlamaPipeline(
134
+ model_name="meta-llama/Llama-3.2-3B-Instruct",
135
+ #model_name="meta-llama/Llama-3.1-8B-Instruct",
136
+ )
137
+ adj_labeler = AdjLabeler(llama_pipeline)
138
+
139
+ basic_cases = [
140
+ #{"text": "Joan has a nice dog.",
141
+ # "tokens": ["Joan", "has", "a", "nice", "dog."]},
142
+ #{"text": "Bob is the most agile person I have ever met.",
143
+ # "tokens": ["Bob", "is", "the", "most", "agile", "person", "I", "have", "ever", "met."]},
144
+ #{"text": "He's a total shit head",
145
+ # "tokens": ["He's", "a", "total", "shit", "head"]},
146
+ #{"text": "The old, creaky house stood on the quiet street.",
147
+ # "tokens": ["The", "old,", "creaky", "house", "stood", "on", "the", "quiet", "street."]},
148
+ #{"text": "The sky turned brilliant blue as the sun emerged.",
149
+ # "tokens": ["The", "sky", "turned", "brilliant", "blue", "as", "the", "sun", "emerged."]},
150
+ #{"text": "They admired the well-behaved and enthusiastic children at the party.",
151
+ # "tokens": ["They", "admired", "the", "well-behaved", "and", "enthusiastic", "children", "at", "the",
152
+ # "party."]},
153
+ #{"text": "After dinner, she felt tired and content.",
154
+ # "tokens": ["After", "dinner,", "she", "felt", "tired", "and", "content."]},
155
+ #{"text": "The resourceful team devised a clever plan.",
156
+ # "tokens": ["The", "resourceful", "team", "devised", "a", "clever", "plan."]},
157
+ #{"text": "He handed over the thick book to the eager student.",
158
+ # "tokens": ["He", "handed", "over", "the", "thick", "book", "to", "the", "eager", "student."]},
159
+ #{"text": "We appreciated the delicious, handmade pie from our neighbor.",
160
+ # "tokens": ["We", "appreciated", "the", "delicious,", "handmade", "pie", "from", "our", "neighbor."]},
161
+ #{"text": "In the enchanted forest, sparkling fairies danced under the moonlight.",
162
+ # "tokens": ["In", "the", "enchanted", "forest,", "sparkling", "fairies", "danced", "under", "the", "moonlight."]},
163
+ #{"text": "The stray cats, hungry and dirty, roamed the narrow alley.",
164
+ # "tokens": ["The", "stray", "cats,", "hungry", "and", "dirty,", "roamed", "the", "narrow", "alley."]},
165
+ #{"text": "The challenging puzzle left the determined young boy both frustrated and excited.",
166
+ # "tokens": ["The", "challenging", "puzzle", "left", "the", "determined", "young", "boy", "both", "frustrated",
167
+ # "and", "excited."]},
168
+
169
+ {"text": "Big cars use a lot more gas.",
170
+ "tokens": ["Big", "cars", "use", "a", "lot", "more", "gas."]},
171
+ {"text": "My car is faster than my bicycle.",
172
+ "tokens": ["My", "car", "is", "faster", "than", "my", "bicycle."]},
173
+ #{"text": "This puzzle is more challenging than the one we solved yesterday.",
174
+ # "tokens": ["This", "puzzle", "is", "more", "challenging", "than", "the", "one", "we", "solved", "yesterday."]},
175
+ #{"text": "Among all the students, Lara is the most diligent.",
176
+ # "tokens": ["Among", "all", "the", "students,", "Lara", "is", "the", "most", "diligent."]},
177
+ #{"text": "That building is taller than the one next to it.",
178
+ # "tokens": ["That", "building", "is", "taller", "than", "the", "one", "next", "to", "it."]},
179
+ #{"text": "This book is more interesting than the movie adaptation.",
180
+ # "tokens": ["This", "book", "is", "more", "interesting", "than", "the", "movie", "adaptation."]},
181
+ #{"text": "Of all the fruits, mangoes are the sweetest.",
182
+ # "tokens": ["Of", "all", "the", "fruits,", "mangoes", "are", "the", "sweetest."]},
183
+ #{"text": "His running speed is quicker than anyone else's on the team.",
184
+ # "tokens": ["His", "running", "speed", "is", "quicker", "than", "anyone", "else's", "on", "the", "team."]},
185
+ #{"text": "The exam was easier than I had anticipated.",
186
+ # "tokens": ["The", "exam", "was", "easier", "than", "I", "had", "anticipated."]},
187
+ #{"text": "Among all the flavors, vanilla is the mildest.",
188
+ # "tokens": ["Among", "all", "the", "flavors,", "vanilla", "is", "the", "mildest."]},
189
+ #{"text": "The new smartphone is lighter than the previous version.",
190
+ # "tokens": ["The", "new", "smartphone", "is", "lighter", "than", "the", "previous", "version."]},
191
+ ]
192
+ for case in basic_cases:
193
+ adj_labels = adj_labeler.label_example(case, "adj")
194
+ logger.info(f"\ntokens:\t{case['tokens']}\nadj:\t{adj_labels}")
multi_head_trainer.py CHANGED
@@ -305,7 +305,7 @@ if __name__ == "__main__":
305
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
306
  action="store_true", default=False)
307
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
308
- action="store", default="./final")
309
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
310
  action="store", default=None)
311
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
@@ -392,22 +392,14 @@ if __name__ == "__main__":
392
  # Train the model!
393
  # ------------------------------------------------------------------------------
394
 
395
- """
396
- Current bests:
397
-
398
- deberta-v3-base:
399
- num_train_epochs=3,
400
- learning_rate=5e-5,
401
- per_device_train_batch_size=2,
402
- gradient_accumulation_steps=8,
403
- """
404
-
405
  trainer = MultiHeadTrainer(
406
  ALL_LABELS,
407
  model=multi_head_model,
408
  args=TrainingArguments(
409
  # Evaluate less frequently or keep the same
410
- eval_strategy="epoch",
 
 
411
  num_train_epochs=args.train_epochs,
412
  learning_rate=args.learning_rate,
413
 
@@ -419,10 +411,12 @@ if __name__ == "__main__":
419
  logging_steps=100,
420
 
421
  # Effective batch size = train_batch_size x gradient_accumulation_steps
 
422
  per_device_train_batch_size=args.train_batch_size,
423
  gradient_accumulation_steps=args.accumulation_steps,
424
 
425
- per_device_eval_batch_size=args.eval_batch_size,
 
426
  ),
427
  train_dataset=tokenized_dataset["train"],
428
  eval_dataset=tokenized_dataset["validation"],
 
305
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
306
  action="store_true", default=False)
307
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
308
+ action="store", default="./ud_final")
309
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
310
  action="store", default=None)
311
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
 
392
  # Train the model!
393
  # ------------------------------------------------------------------------------
394
 
 
 
 
 
 
 
 
 
 
 
395
  trainer = MultiHeadTrainer(
396
  ALL_LABELS,
397
  model=multi_head_model,
398
  args=TrainingArguments(
399
  # Evaluate less frequently or keep the same
400
+ eval_strategy="steps",
401
+ save_strategy="steps",
402
+ load_best_model_at_end=True,
403
  num_train_epochs=args.train_epochs,
404
  learning_rate=args.learning_rate,
405
 
 
411
  logging_steps=100,
412
 
413
  # Effective batch size = train_batch_size x gradient_accumulation_steps
414
+ per_device_eval_batch_size=args.eval_batch_size,
415
  per_device_train_batch_size=args.train_batch_size,
416
  gradient_accumulation_steps=args.accumulation_steps,
417
 
418
+ warmup_ratio=0.1,
419
+ weight_decay=0.01,
420
  ),
421
  train_dataset=tokenized_dataset["train"],
422
  eval_dataset=tokenized_dataset["validation"],
multi_predict.py CHANGED
@@ -2,7 +2,7 @@ from transformers import DebertaV2TokenizerFast
2
  import torch
3
 
4
  from multi_head_model import MultiHeadModel
5
- from utils import get_torch_device, sp_tokenize
6
 
7
 
8
  class MultiHeadPredictor:
@@ -24,7 +24,7 @@ class MultiHeadPredictor:
24
 
25
  :return: A dict with {head_name: [predicted_label_for_each_token]} for the tokens in `text`.
26
  """
27
- raw_tokens = sp_tokenize(text)
28
 
29
  # We'll do a single-example batch to replicate training chunk logic.
30
  # is_split_into_words=True => we pass a list of tokens, not a single string.
 
2
  import torch
3
 
4
  from multi_head_model import MultiHeadModel
5
+ from utils import get_torch_device
6
 
7
 
8
  class MultiHeadPredictor:
 
24
 
25
  :return: A dict with {head_name: [predicted_label_for_each_token]} for the tokens in `text`.
26
  """
27
+ raw_tokens = text.split()
28
 
29
  # We'll do a single-example batch to replicate training chunk logic.
30
  # is_split_into_words=True => we pass a list of tokens, not a single string.
dataset_maker.py → openai_dataset_maker.py RENAMED
@@ -8,7 +8,7 @@ import asyncio
8
  import json
9
  import logging
10
 
11
- from utils import default_logging_config, sp_tokenize
12
 
13
  client = AsyncOpenAI()
14
  logger = logging.getLogger(__name__)
@@ -177,7 +177,7 @@ async def classify_with_retry(args, prompt, labels, tokens, retry=10):
177
 
178
 
179
  async def generate_token_labels(args, case):
180
- tokens = sp_tokenize(case)
181
  sorted_cols = list(sorted(features.keys()))
182
  example = {}
183
  for idx, labels in enumerate(list(await asyncio.gather(
 
8
  import json
9
  import logging
10
 
11
+ from utils import default_logging_config
12
 
13
  client = AsyncOpenAI()
14
  logger = logging.getLogger(__name__)
 
177
 
178
 
179
  async def generate_token_labels(args, case):
180
+ tokens = case.split()
181
  sorted_cols = list(sorted(features.keys()))
182
  example = {}
183
  for idx, labels in enumerate(list(await asyncio.gather(
sp.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2676ad813627497b95ce13c8ebe6b3313391c6df4b75909b5d6f68dcdde716b
3
- size 18104223
 
 
 
 
sp.vocab DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3a11823032d025ecd19a1e6bfef167b9a9ef6489d81eff726d4b399a20163ce
3
- size 18715604
 
 
 
 
ud_dataset_maker.py CHANGED
@@ -286,7 +286,7 @@ if __name__ == "__main__":
286
  arg_parser.add_argument("--save", help='Save dataset to disk.',
287
  action="store_true", default=False)
288
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
289
- action="store", default="./training_data")
290
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
291
  action="store", default=None)
292
  args = arg_parser.parse_args()
@@ -352,15 +352,15 @@ if __name__ == "__main__":
352
  final_dataset["test"] = concatenate_datasets(
353
  [
354
  en_ewt_processed["test"],
355
- #en_gum_processed["test"].filter(is_rare_case),
 
356
  ]
357
  )
358
 
359
  final_dataset["train"] = concatenate_datasets(
360
  [
361
  en_ewt_processed["train"],
362
- #en_gum_processed["train"].filter(is_rare_case),
363
- #en_pud_processed["test"].filter(is_rare_case),
364
  ]
365
  )
366
  if args.augment_typos:
@@ -369,11 +369,10 @@ if __name__ == "__main__":
369
  final_dataset["validation"] = concatenate_datasets(
370
  [
371
  en_ewt_processed["validation"],
372
- #en_gum_processed["validation"].filter(is_rare_case),
373
  ]
374
  )
375
  show_examples(final_dataset, args.show)
376
  get_uniq_training_labels(final_dataset)
377
  if args.save:
378
  final_dataset.save_to_disk(args.save_path)
379
-
 
286
  arg_parser.add_argument("--save", help='Save dataset to disk.',
287
  action="store_true", default=False)
288
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
289
+ action="store", default="./ud_training_data")
290
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
291
  action="store", default=None)
292
  args = arg_parser.parse_args()
 
352
  final_dataset["test"] = concatenate_datasets(
353
  [
354
  en_ewt_processed["test"],
355
+ en_gum_processed["test"], #.filter(is_rare_case),
356
+ en_pud_processed["test"], #.filter(is_rare_case),
357
  ]
358
  )
359
 
360
  final_dataset["train"] = concatenate_datasets(
361
  [
362
  en_ewt_processed["train"],
363
+ en_gum_processed["train"], #.filter(is_rare_case),
 
364
  ]
365
  )
366
  if args.augment_typos:
 
369
  final_dataset["validation"] = concatenate_datasets(
370
  [
371
  en_ewt_processed["validation"],
372
+ en_gum_processed["validation"], #.filter(is_rare_case),
373
  ]
374
  )
375
  show_examples(final_dataset, args.show)
376
  get_uniq_training_labels(final_dataset)
377
  if args.save:
378
  final_dataset.save_to_disk(args.save_path)
 
utils/__init__.py CHANGED
@@ -1,36 +1,31 @@
1
  from datasets import DatasetDict
2
  from typing import Optional
3
- import itertools
4
  import logging
5
- import sentencepiece as spm
6
  import torch
7
 
8
  logger = logging.getLogger(__name__)
9
 
10
- sp = spm.SentencePieceProcessor()
11
- sp.LoadFromFile(f"sp.model")
12
-
13
  default_logging_config = {
14
- "version": 1,
15
- "disable_existing_loggers": False,
16
- "formatters": {
17
- "default": {
18
- "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
19
- },
20
  },
21
- "handlers": {
22
- "console": {
23
- "class": "logging.StreamHandler",
24
- "formatter": "default",
25
- },
26
  },
27
- "loggers": {
28
- "": {
29
- "level": "INFO",
30
- "handlers": ["console"],
31
- },
32
  },
33
- }
 
34
 
35
 
36
  def get_torch_device():
@@ -89,7 +84,3 @@ def show_examples(ds: DatasetDict, show_expr: Optional[str]):
89
  logger.info(f"Example {i}:")
90
  for feature in examples_to_show.keys():
91
  logger.info(f" {feature}: {examples_to_show[feature][i]}")
92
-
93
-
94
- def sp_tokenize(text: str):
95
- return list(itertools.chain.from_iterable([s.strip("▁").split("▁") for s in sp.EncodeAsPieces(text)]))
 
1
  from datasets import DatasetDict
2
  from typing import Optional
 
3
  import logging
 
4
  import torch
5
 
6
  logger = logging.getLogger(__name__)
7
 
 
 
 
8
  default_logging_config = {
9
+ "version": 1,
10
+ "disable_existing_loggers": False,
11
+ "formatters": {
12
+ "default": {
13
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 
14
  },
15
+ },
16
+ "handlers": {
17
+ "console": {
18
+ "class": "logging.StreamHandler",
19
+ "formatter": "default",
20
  },
21
+ },
22
+ "loggers": {
23
+ "": {
24
+ "level": "INFO",
25
+ "handlers": ["console"],
26
  },
27
+ },
28
+ }
29
 
30
 
31
  def get_torch_device():
 
84
  logger.info(f"Example {i}:")
85
  for feature in examples_to_show.keys():
86
  logger.info(f" {feature}: {examples_to_show[feature][i]}")