Initial commit including model and configuration

Files changed (6) hide show

config.json +20 -0
configuration_stacked.py +22 -0
impresso_langident_wrapper.py +65 -0
modeling_stacked.py +159 -0
push_to_hf.py +157 -0
test.py +21 -0

config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "_name_or_path": "Maslionok/pipeline1",
+  "architectures": [
+    "Floret"
+  ],
+  "num_labels": 3,
+  "model_type": "floret",
+  "custom_pipelines": {
+    "language-detection-gleb": {
+      "impl": "impresso_langident_wrapper.Pipeline_One",
+      "pt": "AutoModelForSequenceClassification"
+    }
+  },
+  "repo_id": "Maslionok/pipeline1",
+  "filename": "LID-40-3-2000000-1-4.bin",
+  "auto_map": {
+    "AutoConfig": "configuration_stacked.ImpressoConfig",
+    "AutoModelForSequenceClassification": "modeling_stacked.ExtendedMultitaskModelForTokenClassification"
+  }
+}

configuration_stacked.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from transformers import PretrainedConfig
+import torch
+class ImpressoConfig(PretrainedConfig):
+    model_type = "floret"
+    def __init__(self, filename="LID-40-3-2000000-1-4.bin", **kwargs):
+        super().__init__(**kwargs)
+        self.filename = filename
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        # Bypass JSON loading and create config directly
+        print(f"Loading ImpressoConfig from {pretrained_model_name_or_path}")
+        config = cls(filename="LID-40-3-2000000-1-4.bin", **kwargs)
+        return config
+# Register the configuration with the transformers library
+ImpressoConfig.register_for_auto_class()

impresso_langident_wrapper.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from transformers import Pipeline
+from transformers.pipelines import PIPELINE_REGISTRY
+import floret
+from huggingface_hub import hf_hub_download
+class Pipeline_One(Pipeline):
+    # def __init__(self, model_path: str):
+    #     """
+    #     Initialize the Floret language detection pipeline
+    #     Args:
+    #         model_path (str): Path to the .bin model file
+    #     """
+    #     super().__init__()
+    #     self.model = floret.FastText.load_model(model_path)
+    # def __init__(self, model_name="floret_model.bin", repo_id="Maslionok/pipeline1", revision="main", **kwargs):
+    #     """
+    #     Initialize the Floret language detection pipeline.
+    #     Args:
+    #         model_name (str): The name of the Floret model file.
+    #         repo_id (str): The Hugging Face repository ID.
+    #         revision (str): The branch/revision to download from.
+    #     """
+    #     super().__init__(**kwargs)
+    #     model_path = hf_hub_download(repo_id=repo_id, filename=model_name, revision=revision)
+    #     self.model = floret.load_model(model_path)
+    # def _sanitize_parameters(self, **kwargs):
+    #     # Add any additional parameter handling if necessary
+    #     return {}, {}, {}
+    def _sanitize_parameters(self, **kwargs):
+        print("000000000")
+        preprocess_kwargs = {}
+        if "text" in kwargs:
+            preprocess_kwargs["text"] = kwargs["text"]
+        return preprocess_kwargs, {}, {}
+    def preprocess(self, text, **kwargs):
+        print("this is preprocessing:")
+        print(text)
+        return text
+    def _forward(self, inputs):
+        model_output = self.model.predict(**inputs, k=1)
+        return model_output
+    def postprocess(self, outputs, **kwargs):
+        return outputs
+# PIPELINE_REGISTRY.register_pipeline(
+#     task="language-detection",
+#     pipeline_class=Pipeline_One,
+#     default={"model": None},
+# )

modeling_stacked.py ADDED Viewed

	@@ -0,0 +1,159 @@

+from transformers import PreTrainedModel, AutoModel, AutoConfig, PretrainedConfig
+import floret, torch
+import os, shutil
+from configuration_stacked import ImpressoConfig
+from transformers.modeling_utils import (
+    get_parameter_device as original_get_parameter_device,
+)
+import torch
+# Import Hugging Face dependencies
+import transformers.modeling_utils
+from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_utils import (
+    get_parameter_device as original_get_parameter_device,
+)
+# Custom get_parameter_device
+def custom_get_parameter_device(module):
+    """
+    Custom get_parameter_device() to handle floret models.
+    Returns 'cpu' for FloretModelWrapper, otherwise uses the original implementation.
+    """
+    # Check if the model is an instance of your FloretModelWrapper
+    if isinstance(module, FloretModelWrapper):
+        print(
+            "Custom get_parameter_device(): Detected FloretModelWrapper. Returning 'cpu'."
+        )
+        return torch.device("cpu")
+    # Otherwise, fall back to Hugging Face's original implementation
+    return original_get_parameter_device(module)
+# Custom device property
+@property
+def custom_device(self) -> torch.device:
+    """
+    Custom device() method to handle floret models.
+    Always returns torch.device('cpu') for FloretModelWrapper.
+    """
+    # Check if the model is an instance of your FloretModelWrapper
+    if isinstance(self, FloretModelWrapper):
+        print(
+            "Custom device(): Detected FloretModelWrapper. Returning torch.device('cpu')."
+        )
+        return torch.device("cpu")
+    # Otherwise, fall back to Hugging Face's original implementation
+    return torch.device("cpu")  # original_device.__get__(self, type(self))
+# Monkey-patch get_parameter_device and device property
+transformers.modeling_utils.get_parameter_device = custom_get_parameter_device
+PreTrainedModel.device = custom_device
+print("Monkey-patch applied: get_parameter_device and device property")
+# logger = logging.getLogger(__name__)
+original_device = PreTrainedModel.device
+def get_info(label_map):
+    num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
+    return num_token_labels_dict
+class FloretModelWrapper:
+    """
+    Wrapper for floret model to make it compatible with Hugging Face pipeline.
+    Mocks the .device attribute and passes predict() unchanged.
+    """
+    def __init__(self, floret_model):
+        self.floret_model = floret_model
+        # Mocking the .device attribute to make Hugging Face happy
+        self.device = torch.device("cpu")  # floret is always on CPU
+    def predict(self, text, k=1):
+        """
+        Pass-through for floret's predict() method.
+        """
+        return self.floret_model.predict(text, k=k)
+class ExtendedMultitaskModelForTokenClassification(PreTrainedModel):
+    config_class = ImpressoConfig
+    # Monkey-patch get_parameter_device
+    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
+        super().__init__(config)
+        self.config = config
+        print("Doest is it even pass through here?")
+        print(
+            f"The config in ExtendedMultitaskModelForTokenClassification is: {self.config}"
+        )
+        # self.model = floret.load_model(self.config.filename)
+    def predict(self, text, k=1):
+        predictions = self.model.predict(text, k)
+        return predictions
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        print("Calling from_pretrained...")
+        # Initialize model with config
+        model = cls(ImpressoConfig())
+        # Load model using floret
+        print(f"---Loading model from: {model.config.filename}")
+        floret_model = floret.load_model(model.config.filename)
+        # Wrap the model to fake .device attribute
+        model.model = FloretModelWrapper(floret_model)
+        print(model.model, "device:", model.model.device)
+        print(f"Model loaded and wrapped from: {model.config.filename}")
+        return model
+    def save_pretrained(self, save_directory, *args, **kwargs):
+        # Ignore Hugging Face-specific arguments
+        max_shard_size = kwargs.pop("max_shard_size", None)
+        safe_serialization = kwargs.pop("safe_serialization", False)
+        # Ensure directory exists
+        os.makedirs(save_directory, exist_ok=True)
+        # Save the model file
+        model_file = os.path.join(save_directory, "LID-40-3-2000000-1-4.bin")
+        shutil.copy(self.config.filename, model_file)
+        # Save the config file
+        config_file = os.path.join(save_directory, "config.json")
+        self.config.save_pretrained(save_directory)
+        print(f"Model saved to: {save_directory}")
+    def get_parameter_device(module):
+        """
+        Custom get_parameter_device() to handle floret models.
+        Returns 'cpu' for floret models, and falls back to the original method otherwise.
+        """
+        # Check if the model is an instance of your FloretModelWrapper
+        if isinstance(module, FloretModelWrapper):
+            print(
+                "Custom get_parameter_device(): Detected FloretModelWrapper. Returning 'cpu'."
+            )
+            return "cpu"
+        # Otherwise, fall back to Hugging Face's original implementation
+        return original_get_parameter_device(module)

push_to_hf.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import os
+import shutil
+import argparse
+from transformers import (
+    AutoTokenizer,
+    AutoConfig,
+    AutoModelForSequenceClassification,
+)
+from huggingface_hub import HfApi, Repository
+from transformers.pipelines import PIPELINE_REGISTRY
+# import json
+from configuration_stacked import ImpressoConfig
+from modeling_stacked import ExtendedMultitaskModelForTokenClassification
+import subprocess
+from impresso_langident_wrapper import Pipeline_One
+def get_latest_checkpoint(checkpoint_dir):
+    checkpoints = [
+        d
+        for d in os.listdir(checkpoint_dir)
+        if os.path.isdir(os.path.join(checkpoint_dir, d))
+        and d.startswith("checkpoint-")
+    ]
+    checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]), reverse=True)
+    return os.path.join(checkpoint_dir, checkpoints[0])
+def get_info(label_map):
+    num_token_labels_dict = {task: len(labels) for task, labels in label_map.items()}
+    return num_token_labels_dict
+def push_model_to_hub(checkpoint_dir, repo_name):
+    # checkpoint_path = get_latest_checkpoint(checkpoint_dir)
+    checkpoint_path = checkpoint_dir
+    config = ImpressoConfig.from_pretrained(checkpoint_path)
+    print(config)
+    config.pretrained_config = ImpressoConfig.from_pretrained(config.filename)
+    config.save_pretrained("floret")
+    config = ImpressoConfig.from_pretrained("floret")
+    PIPELINE_REGISTRY.register_pipeline(
+        "lang-ident",
+        pipeline_class=Pipeline_One,
+        pt_model=ExtendedMultitaskModelForTokenClassification,
+    )
+    # PIPELINE_REGISTRY.register_pipeline(
+    #     "pair-classification",
+    #     pipeline_class=PairClassificationPipeline,
+    #     pt_model=AutoModelForSequenceClassification,
+    #     tf_model=TFAutoModelForSequenceClassification,
+    # )
+    config.custom_pipelines = {
+        "lang-ident": {
+            "impl": "lang_ident.LangIdentPipeline",
+            "pt": ["AutoModelForSequenceClassification"],
+            "tf": [],
+        }
+    }
+    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
+        checkpoint_path, config=config
+    )
+    local_repo_path = "lang-detect"
+    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
+    repo = Repository(local_dir=local_repo_path, clone_from=repo_url)
+    try:
+        # Try to pull the latest changes from the remote repository using subprocess
+        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
+    except subprocess.CalledProcessError as e:
+        # If fast-forward is not possible, reset the local branch to match the remote branch
+        subprocess.run(
+            ["git", "reset", "--hard", "origin/main"],
+            check=True,
+            cwd=local_repo_path,
+        )
+    # Copy all Python files to the local repository directory
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    for filename in os.listdir(current_dir):
+        if filename.endswith(".py") or filename.endswith(".json"):
+            shutil.copy(
+                os.path.join(current_dir, filename),
+                os.path.join(local_repo_path, filename),
+            )
+    ImpressoConfig.register_for_auto_class()
+    AutoConfig.register("floret", ImpressoConfig)
+    AutoModelForSequenceClassification.register(
+        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
+    )
+    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
+        "AutoModelForSequenceClassification"
+    )
+    # model.save_pretrained(local_repo_path)
+    from transformers import AutoModelForTokenClassification, AutoTokenizer
+    from transformers import pipeline
+    # Define the model name to be used for token classification, we use the Impresso NER
+    # that can be found at "https://huggingface.co/impresso-project/ner-stacked-bert-multilingual"
+    MODEL_NAME = "Maslionok/pipeline1"
+    #
+    # # Add, commit and push the changes to the repository
+    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
+    subprocess.run(
+        ["git", "commit", "-m", "Initial commit including model and configuration"],
+        check=True,
+        cwd=local_repo_path,
+    )
+    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)
+    #
+    # Push the model to the hub (this includes the README template)
+    model.push_to_hub(repo_name)
+    lang_pipeline = pipeline(
+        "lang-ident", model=MODEL_NAME, trust_remote_code=True, device="cpu"
+    )
+    lang_pipeline.push_to_hub(MODEL_NAME)
+    sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
+    #
+    print(lang_pipeline(sentence))
+    # lang_pipeline.push_to_hub(MODEL_NAME)
+    print(f"Model and repo pushed to: {repo_url}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        required=True,
+        help="Type of the model (e.g., langident)",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        required=True,
+        help="Language of the model (e.g., multilingual)",
+    )
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        required=True,
+        default="LID-40-3-2000000-1-4.bin",
+        help="Directory containing checkpoint folders",
+    )
+    args = parser.parse_args()
+    repo_name = f"Maslionok/pipleline1"
+    push_model_to_hub(args.checkpoint_dir, repo_name)

test.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import floret
+# Load the pretrained floret model from the working directory and print its labels
+model = floret.load_model("LID-40-3-2000000-1-4.bin")
+print(model.get_labels())
+import torch
+# Check whether the same file can also be read by torch.load; NOTE(review): torch.load may unpickle arbitrary objects, so run this only on a trusted file
+try:
+    model = torch.load("LID-40-3-2000000-1-4.bin")  # rebinds ``model``, replacing the floret model loaded above
+    print("Model successfully loaded as a PyTorch model.")
+    print(model)
+except Exception as e:  # any failure is reported on stdout instead of being raised
+    print("Failed to load as a PyTorch model. Error:", e)