|
|
|
|
|
import pandas as pd |
|
|
from huggingface_hub import HfApi, list_models, hf_hub_download |
|
|
from datetime import datetime, timedelta, timezone |
|
|
from rich.console import Console |
|
|
from rich.table import Table |
|
|
import sys |
|
|
import re |
|
|
import traceback |
|
|
import time |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
# ---- Selection & display configuration ----

TOP_N_MODELS = 50    # number of rows shown in the final table
RECENCY_DAYS = 180   # only consider models updated within this many days
REQUIRED_TAGS = ['text-generation', 'conversational']  # model must carry at least one
MIN_DOWNLOADS = 5000  # popularity floor: download count
MIN_LIKES = 100       # popularity floor: like count
DEFAULT_OLLAMA_TAG_SUFFIX = ":latest"

# File extensions that count toward a model's total weight size.
MODEL_FILE_EXTENSIONS = ('.safetensors', '.bin', '.gguf')

# Models whose summed weight size falls below this many GB are skipped
# (likely metadata-only or placeholder repositories).
MIN_SIZE_GB = 0.01

# Shared Hub API client used for the per-model detail lookups.
hf_api = HfApi()
|
|
|
|
|
def fetch_popular_models_metadata(recency_days=RECENCY_DAYS, min_downloads=MIN_DOWNLOADS, min_likes=MIN_LIKES, required_tags=REQUIRED_TAGS):
    """Fetch metadata for popular, recently updated models from the Hugging Face Hub.

    Runs in three stages:
      1. Scan ``list_models`` (sorted newest-first) and keep models that pass
         the cheap filters: recency window, download/like floors, tag match.
      2. For each candidate, fetch detailed info (``files_metadata=True``) and
         sum the sizes of recognized weight files to get a size in GB.
      3. Score each model as downloads * likes / size_gb and sort descending.

    Args:
        recency_days: Only keep models last modified within this many days.
        min_downloads: Minimum download count a model must have.
        min_likes: Minimum like count a model must have.
        required_tags: Model must carry at least one of these tags; the list
            is also passed to the Hub API as a server-side filter.

    Returns:
        A pandas DataFrame sorted by 'popularity_per_gb' (descending), or
        ``None`` if nothing passed all filters or an unrecoverable error
        occurred while querying the Hub.
    """
    print("Querying Hugging Face Hub for model information...")
    print(f"(Initial Filters: Updated in last {recency_days} days, Downloads > {min_downloads}, Likes > {min_likes}, Tags contain {required_tags or 'any'})")
    print("[INFO] This version fetches file sizes and will be significantly slower.")

    models_data = []
    # Timezone-aware cutoff so comparisons against Hub timestamps are valid.
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=recency_days)

    try:
        model_iterator = list_models(
            sort="lastModified",
            direction=-1,
            filter=required_tags if required_tags else None,
            fetch_config=False,
        )

        print("Iterating through models and fetching details (this may take a long time)...")

        processed_count = 0
        fetched_count = 0
        limit_processed_initial = 10000  # hard cap on the cheap initial scan
        limit_fetched_final = 100        # hard cap on the slow detail fetches
        initial_candidates = []

        print("Step 1: Gathering initial candidates based on basic filters...")
        for model_info_basic in tqdm(model_iterator, desc="Initial Scan", unit=" model", smoothing=0.1):
            processed_count += 1

            # Results are sorted newest-first, so the first model older than
            # the cutoff means everything after it is too old as well.
            last_modified_aware = model_info_basic.lastModified
            if last_modified_aware and last_modified_aware < cutoff_date:
                print(f"\nReached cutoff date {cutoff_date.date()}. Stopping initial scan.")
                break

            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            if downloads is None or downloads < min_downloads or likes is None or likes < min_likes:
                continue

            # Client-side re-check with OR semantics: keep the model if it
            # carries at least one required tag (the server-side `filter`
            # parameter ANDs the tags together).
            tags = model_info_basic.tags or []
            if required_tags and not any(tag in tags for tag in required_tags):
                continue

            initial_candidates.append(model_info_basic)

            if processed_count >= limit_processed_initial:
                print(f"\nReached initial processing limit ({limit_processed_initial}). Moving to detailed fetch.")
                break

        if not initial_candidates:
            print("\nNo models passed the initial filtering stage.")
            return None

        print(f"\nFound {len(initial_candidates)} initial candidates. Proceeding to fetch details and file sizes...")

        print("Step 2: Fetching detailed info and calculating sizes...")
        for model_info_basic in tqdm(initial_candidates, desc="Fetching Details", unit=" model"):
            model_id = model_info_basic.modelId
            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            last_updated_aware = model_info_basic.lastModified
            tags = model_info_basic.tags or []

            try:
                # files_metadata=True populates sibling file sizes (slow call).
                detailed_info = hf_api.model_info(model_id, files_metadata=True)

                # Sum only recognized weight files; other repo files
                # (README, tokenizer configs, ...) are ignored.
                total_size_bytes = 0
                if detailed_info.siblings:
                    for sibling in detailed_info.siblings:
                        if sibling.rfilename.endswith(MODEL_FILE_EXTENSIONS) and sibling.size is not None:
                            total_size_bytes += sibling.size

                size_gb = total_size_bytes / (1024**3)

                # Skip repos with no meaningful weight payload.
                if size_gb < MIN_SIZE_GB:
                    continue

                last_updated_str = last_updated_aware.strftime('%Y-%m-%d') if last_updated_aware else "N/A"
                # Placeholder: parameter count is not derivable from this metadata.
                params_str = "N/A"
                model_type = "N/A"

                # Best-effort format guess from tags.
                if 'gguf' in tags: model_type = "GGUF"
                elif 'pytorch' in tags or 'safetensors' in tags: model_type = "PyTorch/SafeT."
                elif 'onnx' in tags: model_type = "ONNX"

                # Refine non-GGUF models by task/usage tags (overrides the
                # framework-based guess above).
                if model_type != "GGUF":
                    if 'text-generation' in tags and ('instruct' in tags or 'chat' in tags): model_type = "Instruct/Chat"
                    elif 'text-generation' in tags: model_type = "Base Model"
                    elif 'conversational' in tags: model_type = "Conversational"

                models_data.append({
                    'model_id': model_id,
                    'downloads': downloads,
                    'likes': likes,
                    'last_updated': last_updated_str,
                    'params': params_str,
                    'model_type': model_type,
                    'total_size_bytes': total_size_bytes,
                    'size_gb': size_gb,
                })
                fetched_count += 1

                if fetched_count >= limit_fetched_final:
                    print(f"\nReached limit of {limit_fetched_final} models with detailed info fetched.")
                    break

            except Exception as detail_e:
                # BUGFIX: previously this warning was gated on
                # `processed_count % 50 == 0`, but processed_count no longer
                # changes inside this loop, so the condition was constant —
                # warnings fired for every failure or for none, depending on
                # where the initial scan stopped. Warn once per failed model
                # instead (truncated to keep the progress bar readable) and
                # continue with the remaining candidates.
                print(f"\n[Warning] Error fetching details for {model_id} (logged): {str(detail_e)[:100]}...")

        if not models_data:
            print("\nNo models satisfied all criteria after fetching details and size.")
            return None

        print(f"\nFetched details for {len(models_data)} models.")

        print("Step 3: Calculating scores and sorting...")
        df = pd.DataFrame(models_data)

        # Normalize dtypes defensively before arithmetic.
        df['downloads'] = df['downloads'].fillna(0).astype(int)
        df['likes'] = df['likes'].fillna(0).astype(int)
        df['size_gb'] = df['size_gb'].fillna(0).astype(float)

        # Guard against division by zero in the score below.
        df = df[df['size_gb'] > 0].copy()
        if df.empty:
            print("\nNo models remaining after filtering for size > 0 GB.")
            return None

        # Popularity density: engagement per gigabyte of weights.
        df['popularity_per_gb'] = (df['downloads'] * df['likes']) / df['size_gb']

        df.sort_values(by='popularity_per_gb', ascending=False, inplace=True)
        df.reset_index(drop=True, inplace=True)

        print("Model scoring and sorting completed.")
        return df

    except Exception as e:
        print(f"\n[Error] Failed during model fetching or processing: {e}", file=sys.stderr)
        traceback.print_exc()
        return None
|
|
|
|
|
def display_models(df):
    """Render the scored model DataFrame as a Rich table with clickable links."""
    if df is None or df.empty:
        print("No model data to display.")
        return

    num_to_display = min(TOP_N_MODELS, len(df))
    console = Console()

    table = Table(
        title=f"Top {num_to_display} Hugging Face Models by Popularity/Size (Downloads*Likes/GB)",
        show_header=True,
        header_style="bold magenta",
    )

    # (header, column options) pairs, added in display order.
    column_specs = [
        ("#", dict(style="dim", width=4, justify="right")),
        ("Model ID (Link)", dict(style="cyan", min_width=40, no_wrap=False)),
        ("Downloads", dict(style="green", width=12, justify="right")),
        ("Likes", dict(style="red", width=8, justify="right")),
        ("Size (GB)", dict(style="purple", width=10, justify="right")),
        ("Score (Pop/GB)", dict(style="yellow", width=14, justify="right")),
        ("Last Updated", dict(style="blue", width=14)),
        ("Type (Guess)", dict(style="magenta", width=15)),
    ]
    for header, opts in column_specs:
        table.add_column(header, **opts)

    base_url = "https://huggingface.co/"

    for idx, row in df.head(num_to_display).iterrows():
        model_id = row.get('model_id', 'N/A')

        # Rich renders [link=...] markup as a clickable hyperlink in
        # terminals that support it.
        if model_id != 'N/A':
            model_id_display = f"[link={base_url}{model_id}]{model_id}[/link]"
        else:
            model_id_display = 'N/A'

        size_gb = row.get('size_gb')
        score = row.get('popularity_per_gb')

        table.add_row(
            str(idx + 1),
            model_id_display,
            f"{row.get('downloads', 0):,}",
            f"{row.get('likes', 0):,}",
            f"{size_gb:.2f}" if pd.notna(size_gb) else "N/A",
            f"{score:,.0f}" if pd.notna(score) else "N/A",
            str(row.get('last_updated', 'N/A')),
            str(row.get('model_type', 'N/A')),
        )

    console.print(table)
    if len(df) > num_to_display:
        print(f"(Showing Top {num_to_display} of {len(df)} models meeting criteria and size threshold)")
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Entry point: fetch, filter, and score Hub models, then display them.

    Exits with status 1 if no model data could be fetched or every model
    was filtered out.
    """
    df = fetch_popular_models_metadata()

    if df is None or df.empty:
        print("未能獲取或篩選到任何模型數據,程序退出。")
        sys.exit(1)

    display_models(df)
    # NOTE: a dead `max_choice = min(TOP_N_MODELS, len(df))` computation was
    # removed here — its result was never used.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Fail fast with an actionable message if a required third-party
    # library is missing, before any real work starts.
    try:
        import pandas
        import rich
        import huggingface_hub
        import tqdm as tqdm_module_check
        print(f"Using huggingface_hub version: {huggingface_hub.__version__}")
        # Older rich releases expose no __version__ attribute, so no version
        # number is printed for it. (A previous inner try/except around this
        # print had unreachable ImportError/AttributeError handlers — a bare
        # print cannot raise either — and was removed.)
        print("Checking rich version (attribute unavailable in older versions).")
        print(f"Using tqdm version: {tqdm_module_check.__version__}")
    except ImportError as err:
        print(f"[錯誤] 缺少必要的 Python 庫: {err}", file=sys.stderr)
        print("請運行: pip install --upgrade pandas huggingface-hub rich tqdm requests", file=sys.stderr)
        sys.exit(1)

    main()