veryfansome committed on
Commit
0cdb887
·
1 Parent(s): a674fb1

feat: UD is back, LlaMA play

Browse files
dataset_splitter.py CHANGED
@@ -1,7 +1,7 @@
1
  from datasets import DatasetDict, load_from_disk
2
  import argparse
3
 
4
- from dataset_maker import features
5
 
6
  def has_all_valid_labels(exp):
7
  for col, labels in exp.items():
 
1
  from datasets import DatasetDict, load_from_disk
2
  import argparse
3
 
4
+ from openai_dataset_maker import features
5
 
6
  def has_all_valid_labels(exp):
7
  for col, labels in exp.items():
llama_dataset_maker.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, Pipeline, pipeline
3
+ import logging
4
+ import torch
5
+
6
+ from utils import get_torch_device
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class ChatModel(ABC):
12
+
13
+ @abstractmethod
14
+ def generate(self, messages: list[dict[str, str]]) -> dict[str, str]:
15
+ pass
16
+
17
+
18
+ class AdjLabeler:
19
+ def __init__(self, model: ChatModel):
20
+ self.model = model
21
+
22
+ def label_example(self, exp, feature_name):
23
+ messages = [
24
+ {"role": "system",
25
+ "content": "You are a helpful Grammar tutor."},
26
+ {"role": "user",
27
+ "content": "An adjective is a word that describes a noun?"},
28
+ {"role": "assistant",
29
+ "content": "Yes, that's correct! An adjective relates to, modifies, or describes nouns."},
30
+ {"role": "user",
31
+ "content": "Are they always used with nouns?"},
32
+ {"role": "assistant",
33
+ "content": ("No, adjectives often appear directly before nouns (e.g. \"a red apple\") "
34
+ "but they can also follow linking verbs to describe the subject (e.g. \"The sky is blue\"). "
35
+ "Sometimes, adjectives are used as complements in certain constructions or phrases "
36
+ "(e.g. \"the rich\" or \"well-known author\").")},
37
+ {"role": "user",
38
+ "content": "They can have comparative or superlative forms too, right?"},
39
+ {"role": "assistant",
40
+ "content": ("Yes, that's right! The word \"fast\" can take a comparative form as in \"faster\" "
41
+ "or a superlative form as in \"fastest\". Some adjectives don't have comparative or "
42
+ "superlative forms but use the word \"more\" or \"most\" to become comparative or "
43
+ "superlative.")},
44
+ {"role": "user",
45
+ "content": f"How about this example: {exp['tokens']}"},
46
+ ]
47
+
48
+ token_labels = []
49
+ for idx, token in enumerate(exp["tokens"]):
50
+ token_messages = messages.copy()
51
+ token_messages.append({"role": "user",
52
+ "content": f"Is '{token}' at position {idx} an adjective? Answer 'yes' or 'no'."})
53
+ #logger.info(f"token_messages: {token_messages}")
54
+
55
+ assistant_message = self.model.generate(token_messages)
56
+ logger.info(f"{assistant_message} - {token}")
57
+ token_messages.append(assistant_message)
58
+ messages += token_messages
59
+ return token_labels
60
+
61
+
62
+ class LlamaPipeline(ChatModel):
63
+ def __init__(self, model_name: str):
64
+ self.device = get_torch_device()
65
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
66
+ self.pipeline = pipeline(
67
+ "text-generation",
68
+ model=model_name,
69
+ model_kwargs={"torch_dtype": torch.bfloat16},
70
+ device_map="auto",
71
+ )
72
+
73
+ def generate(self, messages, max_new_tokens=1) :
74
+ outputs = self.pipeline(
75
+ messages,
76
+ max_new_tokens=max_new_tokens,
77
+ pad_token_id=self.tokenizer.eos_token_id,
78
+ temperature=0.6,
79
+ top_p=0.9,
80
+ )
81
+ return outputs[0]["generated_text"][-1]
82
+
83
+
84
+ class LlamaModel(ChatModel):
85
+ """
86
+ A wrapper around a Llama model checkpoint using Hugging Face Transformers.
87
+ """
88
+
89
+ def __init__(self, model_name: str):
90
+ torch_device = get_torch_device()
91
+
92
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
93
+ self.model = AutoModelForCausalLM.from_pretrained(
94
+ model_name,
95
+ device_map=str(torch_device),
96
+ torch_dtype=torch.float16,
97
+ )
98
+ self.model.to(torch_device)
99
+ self.model.eval()
100
+
101
+ # Adjust generation parameters as needed
102
+ self.generation_config = GenerationConfig(
103
+ max_new_tokens=1,
104
+ pad_token_id=self.tokenizer.eos_token_id,
105
+ temperature=0.7,
106
+ top_p=0.9,
107
+ do_sample=True,
108
+ )
109
+
110
+ def generate(self, prompt: str) -> str:
111
+ """
112
+ Generate text from the model given a prompt.
113
+ """
114
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
115
+ with torch.no_grad():
116
+ output_ids = self.model.generate(
117
+ **inputs,
118
+ generation_config=self.generation_config
119
+ )
120
+ raw_output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
121
+ return raw_output[len(prompt):]
122
+
123
+
124
+ # ----------------------------------
125
+ # Putting It All Together
126
+ # ----------------------------------
127
+
128
+ if __name__ == "__main__":
129
+ import logging.config
130
+ from utils import default_logging_config
131
+ logging.config.dictConfig(default_logging_config)
132
+
133
+ llama_pipeline = LlamaPipeline(
134
+ model_name="meta-llama/Llama-3.2-3B-Instruct",
135
+ #model_name="meta-llama/Llama-3.1-8B-Instruct",
136
+ )
137
+ adj_labeler = AdjLabeler(llama_pipeline)
138
+
139
+ basic_cases = [
140
+ #{"text": "Joan has a nice dog.",
141
+ # "tokens": ["Joan", "has", "a", "nice", "dog."]},
142
+ #{"text": "Bob is the most agile person I have ever met.",
143
+ # "tokens": ["Bob", "is", "the", "most", "agile", "person", "I", "have", "ever", "met."]},
144
+ #{"text": "He's a total shit head",
145
+ # "tokens": ["He's", "a", "total", "shit", "head"]},
146
+ #{"text": "The old, creaky house stood on the quiet street.",
147
+ # "tokens": ["The", "old,", "creaky", "house", "stood", "on", "the", "quiet", "street."]},
148
+ #{"text": "The sky turned brilliant blue as the sun emerged.",
149
+ # "tokens": ["The", "sky", "turned", "brilliant", "blue", "as", "the", "sun", "emerged."]},
150
+ #{"text": "They admired the well-behaved and enthusiastic children at the party.",
151
+ # "tokens": ["They", "admired", "the", "well-behaved", "and", "enthusiastic", "children", "at", "the",
152
+ # "party."]},
153
+ #{"text": "After dinner, she felt tired and content.",
154
+ # "tokens": ["After", "dinner,", "she", "felt", "tired", "and", "content."]},
155
+ #{"text": "The resourceful team devised a clever plan.",
156
+ # "tokens": ["The", "resourceful", "team", "devised", "a", "clever", "plan."]},
157
+ #{"text": "He handed over the thick book to the eager student.",
158
+ # "tokens": ["He", "handed", "over", "the", "thick", "book", "to", "the", "eager", "student."]},
159
+ #{"text": "We appreciated the delicious, handmade pie from our neighbor.",
160
+ # "tokens": ["We", "appreciated", "the", "delicious,", "handmade", "pie", "from", "our", "neighbor."]},
161
+ #{"text": "In the enchanted forest, sparkling fairies danced under the moonlight.",
162
+ # "tokens": ["In", "the", "enchanted", "forest,", "sparkling", "fairies", "danced", "under", "the", "moonlight."]},
163
+ #{"text": "The stray cats, hungry and dirty, roamed the narrow alley.",
164
+ # "tokens": ["The", "stray", "cats,", "hungry", "and", "dirty,", "roamed", "the", "narrow", "alley."]},
165
+ #{"text": "The challenging puzzle left the determined young boy both frustrated and excited.",
166
+ # "tokens": ["The", "challenging", "puzzle", "left", "the", "determined", "young", "boy", "both", "frustrated",
167
+ # "and", "excited."]},
168
+
169
+ {"text": "Big cars use a lot more gas.",
170
+ "tokens": ["Big", "cars", "use", "a", "lot", "more", "gas."]},
171
+ {"text": "My car is faster than my bicycle.",
172
+ "tokens": ["My", "car", "is", "faster", "than", "my", "bicycle."]},
173
+ #{"text": "This puzzle is more challenging than the one we solved yesterday.",
174
+ # "tokens": ["This", "puzzle", "is", "more", "challenging", "than", "the", "one", "we", "solved", "yesterday."]},
175
+ #{"text": "Among all the students, Lara is the most diligent.",
176
+ # "tokens": ["Among", "all", "the", "students,", "Lara", "is", "the", "most", "diligent."]},
177
+ #{"text": "That building is taller than the one next to it.",
178
+ # "tokens": ["That", "building", "is", "taller", "than", "the", "one", "next", "to", "it."]},
179
+ #{"text": "This book is more interesting than the movie adaptation.",
180
+ # "tokens": ["This", "book", "is", "more", "interesting", "than", "the", "movie", "adaptation."]},
181
+ #{"text": "Of all the fruits, mangoes are the sweetest.",
182
+ # "tokens": ["Of", "all", "the", "fruits,", "mangoes", "are", "the", "sweetest."]},
183
+ #{"text": "His running speed is quicker than anyone else's on the team.",
184
+ # "tokens": ["His", "running", "speed", "is", "quicker", "than", "anyone", "else's", "on", "the", "team."]},
185
+ #{"text": "The exam was easier than I had anticipated.",
186
+ # "tokens": ["The", "exam", "was", "easier", "than", "I", "had", "anticipated."]},
187
+ #{"text": "Among all the flavors, vanilla is the mildest.",
188
+ # "tokens": ["Among", "all", "the", "flavors,", "vanilla", "is", "the", "mildest."]},
189
+ #{"text": "The new smartphone is lighter than the previous version.",
190
+ # "tokens": ["The", "new", "smartphone", "is", "lighter", "than", "the", "previous", "version."]},
191
+ ]
192
+ for case in basic_cases:
193
+ adj_labels = adj_labeler.label_example(case, "adj")
194
+ logger.info(f"\ntokens:\t{case['tokens']}\nadj:\t{adj_labels}")
multi_head_trainer.py CHANGED
@@ -305,7 +305,7 @@ if __name__ == "__main__":
305
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
306
  action="store_true", default=False)
307
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
308
- action="store", default="./final")
309
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
310
  action="store", default=None)
311
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
@@ -392,22 +392,14 @@ if __name__ == "__main__":
392
  # Train the model!
393
  # ------------------------------------------------------------------------------
394
 
395
- """
396
- Current bests:
397
-
398
- deberta-v3-base:
399
- num_train_epochs=3,
400
- learning_rate=5e-5,
401
- per_device_train_batch_size=2,
402
- gradient_accumulation_steps=8,
403
- """
404
-
405
  trainer = MultiHeadTrainer(
406
  ALL_LABELS,
407
  model=multi_head_model,
408
  args=TrainingArguments(
409
  # Evaluate less frequently or keep the same
410
- eval_strategy="epoch",
 
 
411
  num_train_epochs=args.train_epochs,
412
  learning_rate=args.learning_rate,
413
 
@@ -419,10 +411,12 @@ if __name__ == "__main__":
419
  logging_steps=100,
420
 
421
  # Effective batch size = train_batch_size x gradient_accumulation_steps
 
422
  per_device_train_batch_size=args.train_batch_size,
423
  gradient_accumulation_steps=args.accumulation_steps,
424
 
425
- per_device_eval_batch_size=args.eval_batch_size,
 
426
  ),
427
  train_dataset=tokenized_dataset["train"],
428
  eval_dataset=tokenized_dataset["validation"],
 
305
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
306
  action="store_true", default=False)
307
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
308
+ action="store", default="./ud_final")
309
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
310
  action="store", default=None)
311
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
 
392
  # Train the model!
393
  # ------------------------------------------------------------------------------
394
 
 
 
 
 
 
 
 
 
 
 
395
  trainer = MultiHeadTrainer(
396
  ALL_LABELS,
397
  model=multi_head_model,
398
  args=TrainingArguments(
399
  # Evaluate less frequently or keep the same
400
+ eval_strategy="steps",
401
+ save_strategy="steps",
402
+ load_best_model_at_end=True,
403
  num_train_epochs=args.train_epochs,
404
  learning_rate=args.learning_rate,
405
 
 
411
  logging_steps=100,
412
 
413
  # Effective batch size = train_batch_size x gradient_accumulation_steps
414
+ per_device_eval_batch_size=args.eval_batch_size,
415
  per_device_train_batch_size=args.train_batch_size,
416
  gradient_accumulation_steps=args.accumulation_steps,
417
 
418
+ warmup_ratio=0.1,
419
+ weight_decay=0.01,
420
  ),
421
  train_dataset=tokenized_dataset["train"],
422
  eval_dataset=tokenized_dataset["validation"],
multi_predict.py CHANGED
@@ -2,7 +2,7 @@ from transformers import DebertaV2TokenizerFast
2
  import torch
3
 
4
  from multi_head_model import MultiHeadModel
5
- from utils import get_torch_device, sp_tokenize
6
 
7
 
8
  class MultiHeadPredictor:
@@ -24,7 +24,7 @@ class MultiHeadPredictor:
24
 
25
  :return: A dict with {head_name: [predicted_label_for_each_token]} for the tokens in `text`.
26
  """
27
- raw_tokens = sp_tokenize(text)
28
 
29
  # We'll do a single-example batch to replicate training chunk logic.
30
  # is_split_into_words=True => we pass a list of tokens, not a single string.
 
2
  import torch
3
 
4
  from multi_head_model import MultiHeadModel
5
+ from utils import get_torch_device
6
 
7
 
8
  class MultiHeadPredictor:
 
24
 
25
  :return: A dict with {head_name: [predicted_label_for_each_token]} for the tokens in `text`.
26
  """
27
+ raw_tokens = text.split()
28
 
29
  # We'll do a single-example batch to replicate training chunk logic.
30
  # is_split_into_words=True => we pass a list of tokens, not a single string.
dataset_maker.py → openai_dataset_maker.py RENAMED
@@ -8,7 +8,7 @@ import asyncio
8
  import json
9
  import logging
10
 
11
- from utils import default_logging_config, sp_tokenize
12
 
13
  client = AsyncOpenAI()
14
  logger = logging.getLogger(__name__)
@@ -177,7 +177,7 @@ async def classify_with_retry(args, prompt, labels, tokens, retry=10):
177
 
178
 
179
  async def generate_token_labels(args, case):
180
- tokens = sp_tokenize(case)
181
  sorted_cols = list(sorted(features.keys()))
182
  example = {}
183
  for idx, labels in enumerate(list(await asyncio.gather(
 
8
  import json
9
  import logging
10
 
11
+ from utils import default_logging_config
12
 
13
  client = AsyncOpenAI()
14
  logger = logging.getLogger(__name__)
 
177
 
178
 
179
  async def generate_token_labels(args, case):
180
+ tokens = case.split()
181
  sorted_cols = list(sorted(features.keys()))
182
  example = {}
183
  for idx, labels in enumerate(list(await asyncio.gather(
sp.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2676ad813627497b95ce13c8ebe6b3313391c6df4b75909b5d6f68dcdde716b
3
- size 18104223
 
 
 
 
sp.vocab DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3a11823032d025ecd19a1e6bfef167b9a9ef6489d81eff726d4b399a20163ce
3
- size 18715604
 
 
 
 
ud_dataset_maker.py CHANGED
@@ -286,7 +286,7 @@ if __name__ == "__main__":
286
  arg_parser.add_argument("--save", help='Save dataset to disk.',
287
  action="store_true", default=False)
288
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
289
- action="store", default="./training_data")
290
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
291
  action="store", default=None)
292
  args = arg_parser.parse_args()
@@ -352,15 +352,15 @@ if __name__ == "__main__":
352
  final_dataset["test"] = concatenate_datasets(
353
  [
354
  en_ewt_processed["test"],
355
- #en_gum_processed["test"].filter(is_rare_case),
 
356
  ]
357
  )
358
 
359
  final_dataset["train"] = concatenate_datasets(
360
  [
361
  en_ewt_processed["train"],
362
- #en_gum_processed["train"].filter(is_rare_case),
363
- #en_pud_processed["test"].filter(is_rare_case),
364
  ]
365
  )
366
  if args.augment_typos:
@@ -369,11 +369,10 @@ if __name__ == "__main__":
369
  final_dataset["validation"] = concatenate_datasets(
370
  [
371
  en_ewt_processed["validation"],
372
- #en_gum_processed["validation"].filter(is_rare_case),
373
  ]
374
  )
375
  show_examples(final_dataset, args.show)
376
  get_uniq_training_labels(final_dataset)
377
  if args.save:
378
  final_dataset.save_to_disk(args.save_path)
379
-
 
286
  arg_parser.add_argument("--save", help='Save dataset to disk.',
287
  action="store_true", default=False)
288
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
289
+ action="store", default="./ud_training_data")
290
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
291
  action="store", default=None)
292
  args = arg_parser.parse_args()
 
352
  final_dataset["test"] = concatenate_datasets(
353
  [
354
  en_ewt_processed["test"],
355
+ en_gum_processed["test"], #.filter(is_rare_case),
356
+ en_pud_processed["test"], #.filter(is_rare_case),
357
  ]
358
  )
359
 
360
  final_dataset["train"] = concatenate_datasets(
361
  [
362
  en_ewt_processed["train"],
363
+ en_gum_processed["train"], #.filter(is_rare_case),
 
364
  ]
365
  )
366
  if args.augment_typos:
 
369
  final_dataset["validation"] = concatenate_datasets(
370
  [
371
  en_ewt_processed["validation"],
372
+ en_gum_processed["validation"], #.filter(is_rare_case),
373
  ]
374
  )
375
  show_examples(final_dataset, args.show)
376
  get_uniq_training_labels(final_dataset)
377
  if args.save:
378
  final_dataset.save_to_disk(args.save_path)
 
utils/__init__.py CHANGED
@@ -1,36 +1,31 @@
1
  from datasets import DatasetDict
2
  from typing import Optional
3
- import itertools
4
  import logging
5
- import sentencepiece as spm
6
  import torch
7
 
8
  logger = logging.getLogger(__name__)
9
 
10
- sp = spm.SentencePieceProcessor()
11
- sp.LoadFromFile(f"sp.model")
12
-
13
  default_logging_config = {
14
- "version": 1,
15
- "disable_existing_loggers": False,
16
- "formatters": {
17
- "default": {
18
- "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
19
- },
20
  },
21
- "handlers": {
22
- "console": {
23
- "class": "logging.StreamHandler",
24
- "formatter": "default",
25
- },
26
  },
27
- "loggers": {
28
- "": {
29
- "level": "INFO",
30
- "handlers": ["console"],
31
- },
32
  },
33
- }
 
34
 
35
 
36
  def get_torch_device():
@@ -89,7 +84,3 @@ def show_examples(ds: DatasetDict, show_expr: Optional[str]):
89
  logger.info(f"Example {i}:")
90
  for feature in examples_to_show.keys():
91
  logger.info(f" {feature}: {examples_to_show[feature][i]}")
92
-
93
-
94
- def sp_tokenize(text: str):
95
- return list(itertools.chain.from_iterable([s.strip("▁").split("▁") for s in sp.EncodeAsPieces(text)]))
 
1
  from datasets import DatasetDict
2
  from typing import Optional
 
3
  import logging
 
4
  import torch
5
 
6
  logger = logging.getLogger(__name__)
7
 
 
 
 
8
  default_logging_config = {
9
+ "version": 1,
10
+ "disable_existing_loggers": False,
11
+ "formatters": {
12
+ "default": {
13
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 
14
  },
15
+ },
16
+ "handlers": {
17
+ "console": {
18
+ "class": "logging.StreamHandler",
19
+ "formatter": "default",
20
  },
21
+ },
22
+ "loggers": {
23
+ "": {
24
+ "level": "INFO",
25
+ "handlers": ["console"],
26
  },
27
+ },
28
+ }
29
 
30
 
31
  def get_torch_device():
 
84
  logger.info(f"Example {i}:")
85
  for feature in examples_to_show.keys():
86
  logger.info(f" {feature}: {examples_to_show[feature][i]}")