| import os |
| import shutil |
| import argparse |
| from transformers import ( |
| AutoTokenizer, |
| AutoConfig, |
| AutoModelForSequenceClassification, |
| ) |
| from huggingface_hub import HfApi, Repository |
| from transformers.pipelines import PIPELINE_REGISTRY |
|
|
| |
| from configuration_stacked import ImpressoConfig |
| from modeling_stacked import ExtendedMultitaskModelForTokenClassification |
| import subprocess |
| from lang_ident import LangIdentPipeline |
|
|
|
|
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the newest ``checkpoint-<step>`` subdirectory.

    Args:
        checkpoint_dir: Directory containing one or more ``checkpoint-<N>``
            subdirectories produced by a Trainer run.

    Returns:
        Absolute/relative path (joined onto ``checkpoint_dir``) of the
        checkpoint with the highest step number.

    Raises:
        FileNotFoundError: If ``checkpoint_dir`` contains no checkpoint
            subdirectories (the original code raised an opaque IndexError).
    """
    checkpoints = [
        d
        for d in os.listdir(checkpoint_dir)
        if os.path.isdir(os.path.join(checkpoint_dir, d))
        and d.startswith("checkpoint-")
    ]
    if not checkpoints:
        raise FileNotFoundError(
            f"No 'checkpoint-*' directories found in {checkpoint_dir!r}"
        )
    # Highest trailing step number wins; max() avoids sorting the whole list.
    latest = max(checkpoints, key=lambda name: int(name.split("-")[-1]))
    return os.path.join(checkpoint_dir, latest)
|
|
|
|
def get_info(label_map):
    """Map each task name to the number of labels it defines.

    Args:
        label_map: Mapping of task name -> sequence of labels for that task.

    Returns:
        Dict with the same keys, where each value is the label count.
    """
    counts = {}
    for task_name, task_labels in label_map.items():
        counts[task_name] = len(task_labels)
    return counts
|
|
|
|
def push_model_to_hub(checkpoint_dir, repo_name):
    """Package the language-identification model and publish it to the HF Hub.

    Steps, in order (ordering matters — the custom pipeline must be registered
    before the model/pipeline objects are created and pushed):
      1. Load the ImpressoConfig from the checkpoint and embed the underlying
         floret model config (round-tripped through a local ``floret`` dir).
      2. Register the custom ``lang-ident`` pipeline and record it in
         ``config.custom_pipelines`` so remote loading works.
      3. Clone/sync the target Hub repo locally, copy all local ``.py``/``.json``
         support files into it, register the auto classes, and git push.
      4. Push the model weights and a smoke-test pipeline, then run one
         sample sentence through the pipeline.

    Args:
        checkpoint_dir: Path to the checkpoint (passed straight to
            ``from_pretrained``).
        repo_name: Hub repo id to create/update (e.g. ``user/lang-detect``).

    Side effects: network calls to the Hub, git subprocesses, writes to the
    local ``floret`` and ``lang-detect`` directories, stdout prints.
    """
    checkpoint_path = checkpoint_dir
    config = ImpressoConfig.from_pretrained(checkpoint_path)
    print(config)

    # Embed the pretrained floret config, then round-trip through disk so the
    # saved config is exactly what ``from_pretrained`` will reload remotely.
    config.pretrained_config = ImpressoConfig.from_pretrained(config.filename)
    config.save_pretrained("floret")
    config = ImpressoConfig.from_pretrained("floret")

    # Register the custom task before any pipeline/model objects are built.
    PIPELINE_REGISTRY.register_pipeline(
        "lang-ident",
        pipeline_class=LangIdentPipeline,
        pt_model=ExtendedMultitaskModelForTokenClassification,
    )
    # Recorded in the config so ``trust_remote_code=True`` loads find the impl.
    config.custom_pipelines = {
        "lang-ident": {
            "impl": "lang_ident.LangIdentPipeline",
            "pt": ["AutoModelForSequenceClassification"],
            "tf": [],
        }
    }
    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
        checkpoint_path, config=config
    )

    local_repo_path = "lang-detect"
    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
    # Clone (or reuse) the repo locally; the Repository object itself is not
    # needed afterwards — only the checkout it creates.
    Repository(local_dir=local_repo_path, clone_from=repo_url)

    try:
        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
    except subprocess.CalledProcessError:
        # Pull failed (diverged history etc.) — hard-reset to the remote state.
        subprocess.run(
            ["git", "reset", "--hard", "origin/main"],
            check=True,
            cwd=local_repo_path,
        )

    # Ship every local .py/.json support file (pipeline, config, model code)
    # alongside the weights so remote code loading works.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    for filename in os.listdir(current_dir):
        if filename.endswith(".py") or filename.endswith(".json"):
            shutil.copy(
                os.path.join(current_dir, filename),
                os.path.join(local_repo_path, filename),
            )

    # Register the custom config/model with the Auto* machinery so the pushed
    # repo resolves "floret" configs to our classes.
    ImpressoConfig.register_for_auto_class()
    AutoConfig.register("floret", ImpressoConfig)
    AutoModelForSequenceClassification.register(
        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
    )
    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
        "AutoModelForSequenceClassification"
    )

    # Commit and push the copied support files.
    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
    subprocess.run(
        ["git", "commit", "-m", "Initial commit including model and configuration"],
        check=True,
        cwd=local_repo_path,
    )
    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)

    model.push_to_hub(repo_name)

    # Smoke test: reload the just-pushed model through the custom pipeline.
    # Local import — only needed here; use repo_name instead of a hard-coded
    # model id so the function honours its parameter.
    from transformers import pipeline

    lang_pipeline = pipeline(
        "lang-ident", model=repo_name, trust_remote_code=True, device="cpu"
    )
    lang_pipeline.push_to_hub(repo_name)
    sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
    print(lang_pipeline(sentence))
    print(f"Model and repo pushed to: {repo_url}")
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        help="Type of the model (e.g., langident)",
    )
    parser.add_argument(
        "--language",
        type=str,
        required=True,
        help="Language of the model (e.g., multilingual)",
    )
    # NOTE: the previous version combined required=True with a default value;
    # argparse ignores the default for required arguments, so it was dropped.
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        required=True,
        help="Directory containing checkpoint folders",
    )
    args = parser.parse_args()
    # Fixed target repo; --model_type/--language are parsed but not yet used
    # to build the repo name (plain string — no placeholders needed).
    repo_name = "Maslionok/lang-detect"
    push_model_to_hub(args.checkpoint_dir, repo_name)
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|