# lang-detect / push_to_hf.py
# Glebs Vinarskis
# Initial commit including model and configuration
# 26b1bda
import os
import shutil
import argparse
from transformers import (
AutoTokenizer,
AutoConfig,
AutoModelForSequenceClassification,
)
from huggingface_hub import HfApi, Repository
from transformers.pipelines import PIPELINE_REGISTRY
# import json
from configuration_stacked import ImpressoConfig
from modeling_stacked import ExtendedMultitaskModelForTokenClassification
import subprocess
from lang_ident import LangIdentPipeline
def get_latest_checkpoint(checkpoint_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Only immediate sub-directories of ``checkpoint_dir`` whose name starts
    with ``checkpoint-`` and whose last ``-``-separated field parses as an
    integer are considered.

    Args:
        checkpoint_dir: Directory to scan.

    Returns:
        ``os.path.join(checkpoint_dir, name)`` for the matching directory
        with the largest step number.

    Raises:
        FileNotFoundError: If no numbered checkpoint directory is found.
    """
    numbered = []
    for entry in os.listdir(checkpoint_dir):
        if not entry.startswith("checkpoint-"):
            continue
        if not os.path.isdir(os.path.join(checkpoint_dir, entry)):
            continue
        try:
            step = int(entry.split("-")[-1])
        except ValueError:
            # Names such as "checkpoint-best" carry no step number; skip them
            # instead of aborting the whole lookup.
            continue
        numbered.append((step, entry))

    if not numbered:
        raise FileNotFoundError(
            f"No 'checkpoint-<step>' directory found in {checkpoint_dir!r}"
        )

    # max() returns the first maximal element, which keeps the listing order
    # as the tie-breaker when two names parse to the same step.
    _, latest = max(numbered, key=lambda pair: pair[0])
    return os.path.join(checkpoint_dir, latest)
def get_info(label_map):
    """Count the labels of every task.

    Args:
        label_map: Mapping from task name to a sized collection of labels.

    Returns:
        A dict with the same keys as ``label_map`` whose values are the
        number of labels of the corresponding task.
    """
    label_counts = {}
    for task_name, task_labels in label_map.items():
        label_counts[task_name] = len(task_labels)
    return label_counts
def push_model_to_hub(checkpoint_dir, repo_name):
    """Publish a checkpoint, the local code files and the ``lang-ident`` pipeline.

    The function, in order:

    1. loads an ``ImpressoConfig`` from ``checkpoint_dir``, attaches the
       config loaded from ``config.filename`` as ``pretrained_config`` and
       saves/reloads it through the local ``floret`` directory;
    2. registers ``LangIdentPipeline`` under the task name ``lang-ident``
       and records it in ``config.custom_pipelines``;
    3. loads the model from ``checkpoint_dir`` with that config;
    4. creates (if needed) and clones the Hub repository into the local
       ``lang-detect`` directory, copies every ``.py`` / ``.json`` file that
       sits next to this script into it, then commits and pushes;
    5. pushes the model, builds the pipeline from the Hub repository, pushes
       the pipeline and prints its output for a sample French sentence.

    Args:
        checkpoint_dir: Path handed directly to ``from_pretrained``; the
            latest-checkpoint lookup is not applied here.
        repo_name: Hub repository id used for ``create_repo``, for both
            ``push_to_hub`` calls and for loading the pipeline.

    Raises:
        subprocess.CalledProcessError: If a git command fails. A failing
            ``git pull`` is not fatal: it triggers a hard reset to
            ``origin/main`` instead.
    """
    # Imported locally: the module-level imports do not provide ``pipeline``.
    from transformers import pipeline

    checkpoint_path = checkpoint_dir

    config = ImpressoConfig.from_pretrained(checkpoint_path)
    print(config)
    config.pretrained_config = ImpressoConfig.from_pretrained(config.filename)
    # Save the enriched config locally and reload it from there.
    config.save_pretrained("floret")
    config = ImpressoConfig.from_pretrained("floret")

    PIPELINE_REGISTRY.register_pipeline(
        "lang-ident",
        pipeline_class=LangIdentPipeline,
        pt_model=ExtendedMultitaskModelForTokenClassification,
    )
    config.custom_pipelines = {
        "lang-ident": {
            "impl": "lang_ident.LangIdentPipeline",
            "pt": ["AutoModelForSequenceClassification"],
            "tf": [],
        }
    }

    model = ExtendedMultitaskModelForTokenClassification.from_pretrained(
        checkpoint_path, config=config
    )

    local_repo_path = "lang-detect"
    repo_url = HfApi().create_repo(repo_id=repo_name, exist_ok=True)
    # Instantiated for its side effect: cloning the repo into local_repo_path.
    Repository(local_dir=local_repo_path, clone_from=repo_url)

    try:
        subprocess.run(["git", "pull"], check=True, cwd=local_repo_path)
    except subprocess.CalledProcessError:
        # The pull failed (e.g. no fast-forward possible): make the local
        # branch match the remote one instead.
        subprocess.run(
            ["git", "reset", "--hard", "origin/main"],
            check=True,
            cwd=local_repo_path,
        )

    # Copy the Python and JSON files that live next to this script into the
    # local clone so they are published with the model.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    for filename in os.listdir(current_dir):
        if filename.endswith((".py", ".json")):
            shutil.copy(
                os.path.join(current_dir, filename),
                os.path.join(local_repo_path, filename),
            )

    ImpressoConfig.register_for_auto_class()
    AutoConfig.register("floret", ImpressoConfig)
    AutoModelForSequenceClassification.register(
        ImpressoConfig, ExtendedMultitaskModelForTokenClassification
    )
    ExtendedMultitaskModelForTokenClassification.register_for_auto_class(
        "AutoModelForSequenceClassification"
    )

    # Stage, commit and push the copied files. ``git commit`` exits with an
    # error when nothing is staged, so only commit when the tree is dirty;
    # this keeps re-runs with unchanged files from crashing.
    subprocess.run(["git", "add", "."], check=True, cwd=local_repo_path)
    status = subprocess.run(
        ["git", "status", "--porcelain"],
        check=True,
        cwd=local_repo_path,
        capture_output=True,
        text=True,
    )
    if status.stdout.strip():
        subprocess.run(
            ["git", "commit", "-m", "Initial commit including model and configuration"],
            check=True,
            cwd=local_repo_path,
        )
    subprocess.run(["git", "push"], check=True, cwd=local_repo_path)

    # Push the model weights and config, then load the pipeline back from
    # the same repository and push it as well.
    model.push_to_hub(repo_name)
    lang_pipeline = pipeline(
        "lang-ident", model=repo_name, trust_remote_code=True, device="cpu"
    )
    lang_pipeline.push_to_hub(repo_name)

    # Smoke test: run the freshly loaded pipeline on a French paragraph.
    sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
    print(lang_pipeline(sentence))

    print(f"Model and repo pushed to: {repo_url}")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and publish the model
    # found at --checkpoint_dir to the fixed Hub repository below.
    parser = argparse.ArgumentParser(description="Push NER model to Hugging Face Hub")
    # --model_type and --language are mandatory on the command line but are
    # not read anywhere in this block.
    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        help="Type of the model (e.g., langident)",
    )
    parser.add_argument(
        "--language",
        type=str,
        required=True,
        help="Language of the model (e.g., multilingual)",
    )
    # A required option never falls back to a default, so none is declared.
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        required=True,
        help="Directory containing checkpoint folders",
    )
    args = parser.parse_args()

    repo_name = "Maslionok/lang-detect"
    push_model_to_hub(args.checkpoint_dir, repo_name)
# PIPELINE_REGISTRY.register_pipeline(
# "generic-ner",
# pipeline_class=MultitaskTokenClassificationPipeline,
# pt_model=ExtendedMultitaskModelForTokenClassification,
# )
# model.config.custom_pipelines = {
# "generic-ner": {
# "impl": "generic_ner.MultitaskTokenClassificationPipeline",
# "pt": ["ExtendedMultitaskModelForTokenClassification"],
# "tf": [],
# }
# }
# classifier = pipeline(
# "generic-ner", model=model, tokenizer=tokenizer, label_map=label_map
# )
# from pprint import pprint
#
# pprint(
# classifier(
# "1. Le public est averti que Charlotte née Bourgoin, femme-de Joseph Digiez, et Maurice Bourgoin, enfant mineur représenté par le sieur Jaques Charles Gicot son curateur, ont été admis par arrêt du Conseil d'Etat du 5 décembre 1797, à solliciter une renonciation générale et absolue aux biens et aux dettes présentes et futures de Jean-Baptiste Bourgoin leur père."
# )
# )
# repo.push_to_hub(commit_message="Initial commit of the trained NER model with code")