| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| Script to find a candidate list of models to deprecate based on the number of downloads and the date of the last commit. |
| """ |
|
|
| import argparse |
| import glob |
| import json |
| import os |
| from collections import defaultdict |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
| from git import Repo |
| from huggingface_hub import HfApi |
|
|
|
|
# Shared Hub API client, used by HubModelLister below for all model-listing calls.
api = HfApi()

# Repository root: this file is assumed to live one directory below it.
PATH_TO_REPO = Path(__file__).parent.parent.resolve()
# GitPython handle on the repo, used to look up each model's first commit.
repo = Repo(PATH_TO_REPO)
|
|
|
|
class HubModelLister:
    """
    Iterable over Hub models matching the given tags.

    Any error raised while paging through the Hub listing is printed and simply
    ends the iteration, so a flaky Hub call never crashes the script.
    """

    def __init__(self, tags):
        self.tags = tags
        self.model_list = api.list_models(tags=tags)

    def __iter__(self):
        try:
            for hub_model in self.model_list:
                yield hub_model
        except Exception as exc:
            # Swallow Hub/network errors: report and stop iterating instead of raising.
            print(f"Error: {exc}")
            return
|
|
|
|
| def _extract_commit_hash(commits): |
| for commit in commits: |
| if commit.startswith("commit "): |
| return commit.split(" ")[1] |
| return "" |
|
|
|
|
def get_list_of_repo_model_paths(models_dir):
    """
    Return the paths of all PyTorch modeling files under `models_dir`, excluding
    TF/Flax implementations, the `auto` machinery, and already-deprecated models.
    """
    candidate_paths = glob.glob(os.path.join(models_dir, "*/modeling_*.py"))

    # Names of models already under models/deprecated, wrapped in "/" so they only
    # match as a full path component.
    deprecated_names = {
        "/" + os.path.basename(deprecated_path) + "/"
        for deprecated_path in glob.glob(os.path.join(models_dir, "deprecated", "*"))
    }

    kept_paths = []
    for path in candidate_paths:
        # Skip the TensorFlow and Flax variants of each model.
        if "_flax_" in path or "_tf_" in path:
            continue
        # Skip anything under the deprecated folder and the auto classes.
        if "/deprecated" in path or "/auto/" in path:
            continue
        # Skip models whose directory name matches a deprecated model.
        if any(name in path for name in deprecated_names):
            continue
        kept_paths.append(path)
    return kept_paths
|
|
|
|
def _build_models_info(model_paths):
    """Map each model name to its first commit (hash + datetime), model path, tags and a zeroed download count."""
    models_info = defaultdict(dict)
    for model_path in model_paths:
        model = model_path.split("/")[-2]
        if model in models_info:
            continue
        # `--diff-filter=A` restricts the log to the commit that Added the file.
        commits = repo.git.log("--diff-filter=A", "--", model_path).split("\n")
        commit_hash = _extract_commit_hash(commits)
        commit_obj = repo.commit(commit_hash)
        # Hub tags use "-" where the repo uses "_" (e.g. "xlm_roberta" -> "xlm-roberta"),
        # so query under both spellings when they differ.
        tags = [model]
        if "_" in model:
            tags.append(model.replace("_", "-"))
        models_info[model] = {
            "commit_hash": commit_hash,
            "first_commit_datetime": commit_obj.committed_datetime,
            "model_path": model_path,
            "downloads": 0,
            "tags": tags,
        }
    return models_info


def _add_hub_downloads(models_info, max_num_models):
    """Accumulate public Hub download counts into each model's `downloads`, over all of its tags."""
    n_seen = 0
    for model_info in models_info.values():
        for model_tag in model_info["tags"]:
            for i, hub_model in enumerate(HubModelLister(tags=model_tag)):
                n_seen += 1
                if i % 100 == 0:
                    print(f"Processing model {i} for tag {model_tag}")
                # Testing escape hatch: stop once enough hub models have been seen.
                # (The original compared `i > n_seen`, which is never true since
                # n_seen >= i + 1, so --max_num_models was silently ignored.)
                if max_num_models != -1 and n_seen > max_num_models:
                    return
                if hub_model.private:
                    continue
                model_info["downloads"] += hub_model.downloads


def get_list_of_models_to_deprecate(
    thresh_num_downloads=5_000,
    thresh_date=None,
    use_cache=False,
    save_model_info=False,
    max_num_models=-1,
):
    """
    Find candidate models to deprecate: models added before `thresh_date` whose total
    public Hub downloads fall below `thresh_num_downloads`.

    Args:
        thresh_num_downloads (`int`, defaults to 5000):
            Models with fewer downloads than this are flagged.
        thresh_date (`str`, *optional*):
            "YYYY-MM-DD" cutoff (interpreted as UTC); only models first committed before
            this date are considered. Defaults to one year ago from now.
        use_cache (`bool`, defaults to `False`):
            Load model info (including download counts) from `models_info.json` instead
            of querying git and the Hub.
        save_model_info (`bool`, defaults to `False`):
            Save the gathered model info to `models_info.json` (only on a cold run).
        max_num_models (`int`, defaults to -1):
            Cap on the number of hub models scanned; -1 means no cap. Useful for testing.

    Returns:
        `dict`: mapping of flagged model name -> its info dict.
    """
    if thresh_date is None:
        now = datetime.now(timezone.utc)
        thresh_date = now.replace(year=now.year - 1)
    else:
        thresh_date = datetime.strptime(thresh_date, "%Y-%m-%d").replace(tzinfo=timezone.utc)

    models_dir = PATH_TO_REPO / "src/transformers/models"
    model_paths = get_list_of_repo_model_paths(models_dir=models_dir)

    cache_hit = use_cache and os.path.exists("models_info.json")
    if cache_hit:
        with open("models_info.json", "r") as f:
            models_info = json.load(f)
        # Datetimes were serialized as ISO strings; restore them for the comparison below.
        for info in models_info.values():
            info["first_commit_datetime"] = datetime.fromisoformat(info["first_commit_datetime"])
    else:
        models_info = _build_models_info(model_paths)

    # Only models older than the threshold date are candidates.
    models_info = {
        model: info for model, info in models_info.items() if info["first_commit_datetime"] < thresh_date
    }

    if not cache_hit:
        # The cache already stores download counts; only query the Hub on a cold run,
        # otherwise the cached counts would be double-counted.
        _add_hub_downloads(models_info, max_num_models)

    if save_model_info and not cache_hit:
        # Make the dict JSON-serializable before dumping.
        for info in models_info.values():
            info["first_commit_datetime"] = info["first_commit_datetime"].isoformat()
        with open("models_info.json", "w") as f:
            json.dump(models_info, f, indent=4)

    print("\nFinding models to deprecate:")
    n_models_to_deprecate = 0
    models_to_deprecate = {}
    for model, info in models_info.items():
        n_downloads = info["downloads"]
        if n_downloads < thresh_num_downloads:
            n_models_to_deprecate += 1
            models_to_deprecate[model] = info
            print(f"\nModel: {model}")
            print(f"Downloads: {n_downloads}")
            print(f"Date: {info['first_commit_datetime']}")
    print("\nModels to deprecate: ", "\n" + "\n".join(models_to_deprecate.keys()))
    print(f"\nNumber of models to deprecate: {n_models_to_deprecate}")
    print("Before deprecating make sure to verify the models, including if they're used as a module in other models.")
    # The __main__ block assigns this return value; the original returned None.
    return models_to_deprecate
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: parse the thresholds/flags and print the deprecation candidates.
    parser = argparse.ArgumentParser()
    parser.add_argument("--save_model_info", action="store_true", help="Save the retrieved model info to a json file.")
    parser.add_argument(
        "--use_cache", action="store_true", help="Use the cached model info instead of calling the hub."
    )
    parser.add_argument(
        "--thresh_num_downloads",
        type=int,
        default=5_000,
        help="Threshold number of downloads below which a model should be deprecated. Default is 5,000.",
    )
    parser.add_argument(
        "--thresh_date",
        type=str,
        default=None,
        help="Date to consider the first commit from. Format: YYYY-MM-DD. If unset, defaults to one year ago from today.",
    )
    parser.add_argument(
        "--max_num_models",
        type=int,
        default=-1,
        help="Maximum number of models to consider from the hub. -1 means all models. Useful for testing.",
    )
    args = parser.parse_args()

    # NOTE(review): the result is assigned but not used further here.
    models_to_deprecate = get_list_of_models_to_deprecate(
        thresh_num_downloads=args.thresh_num_downloads,
        thresh_date=args.thresh_date,
        use_cache=args.use_cache,
        save_model_info=args.save_model_info,
        max_num_models=args.max_num_models,
    )
|
|