Upload check_HF_models.py
Browse files- check_HF_models.py +304 -0
check_HF_models.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from huggingface_hub import HfApi, list_models, hf_hub_download # Need HfApi for model_info
|
| 4 |
+
from datetime import datetime, timedelta, timezone
|
| 5 |
+
from rich.console import Console # Import Console
|
| 6 |
+
from rich.table import Table # Import Table
|
| 7 |
+
import sys
|
| 8 |
+
import re
|
| 9 |
+
import traceback
|
| 10 |
+
import time # For potential sleep to avoid rate limits
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
# --- Configuration ---
# Number of top-ranked models to show in the final table.
TOP_N_MODELS = 50
# Only consider models updated within this many days.
RECENCY_DAYS = 180
REQUIRED_TAGS = ['text-generation', 'conversational']  # Filter for relevant models
MIN_DOWNLOADS = 5000  # Maybe increase min downloads to reduce model_info calls
MIN_LIKES = 100  # Maybe increase min likes
# NOTE(review): not referenced anywhere in this file — presumably kept for a
# planned Ollama tag-mapping feature; confirm before removing.
DEFAULT_OLLAMA_TAG_SUFFIX = ":latest"
# Model file extensions to consider for size calculation
MODEL_FILE_EXTENSIONS = ('.safetensors', '.bin', '.gguf')
# Minimum size in GB to be considered (filters out repos with only tiny files)
MIN_SIZE_GB = 0.01  # 10 MB

# --- Hugging Face API Client ---
# We need HfApi again to get detailed model info including file sizes
hf_api = HfApi()
|
| 28 |
+
|
| 29 |
+
def fetch_popular_models_metadata(recency_days=RECENCY_DAYS, min_downloads=MIN_DOWNLOADS, min_likes=MIN_LIKES, required_tags=REQUIRED_TAGS):
    """Fetch metadata for popular, recently updated models from the Hugging Face Hub.

    Works in three steps:
      1. Scan ``list_models`` (sorted newest-first) and keep candidates that
         pass the cheap filters: recency, download count, like count, tags.
      2. For each candidate call ``HfApi.model_info(files_metadata=True)``,
         sum the sizes of weight files (MODEL_FILE_EXTENSIONS), and drop
         repos smaller than MIN_SIZE_GB.
      3. Score each model as downloads * likes / size_gb and sort descending.

    Args:
        recency_days: Only keep models modified within this many days.
        min_downloads: Minimum download count a model must have.
        min_likes: Minimum like count a model must have.
        required_tags: Keep models carrying at least one of these tags; also
            passed as the server-side ``filter`` for ``list_models``.

    Returns:
        pandas.DataFrame sorted by 'popularity_per_gb' (descending), or None
        when no model passes every filter or an unexpected error occurs.
    """
    print("Querying Hugging Face Hub for model information...")
    print(f"(Initial Filters: Updated in last {recency_days} days, Downloads > {min_downloads}, Likes > {min_likes}, Tags contain {required_tags or 'any'})")
    print("[INFO] This version fetches file sizes and will be significantly slower.")

    models_data = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=recency_days)

    try:
        model_iterator = list_models(
            sort="lastModified",
            direction=-1,
            filter=required_tags if required_tags else None,
            fetch_config=False  # Config not needed here
        )

        print("Iterating through models and fetching details (this may take a long time)...")

        processed_count = 0
        fetched_count = 0
        # model_info calls are slow, so cap both the initial scan and the
        # number of models for which we fetch full details.
        limit_processed_initial = 10000  # Limit initial candidates to check
        limit_fetched_final = 100  # Limit how many models we actually store after getting size

        initial_candidates = []
        # --- Step 1: Initial Filtering based on list_models ---
        print("Step 1: Gathering initial candidates based on basic filters...")
        for model_info_basic in tqdm(model_iterator, desc="Initial Scan", unit=" model", smoothing=0.1):
            processed_count += 1

            # Results arrive newest-first, so the first model older than the
            # cutoff means everything after it is older too — stop scanning.
            last_modified_aware = model_info_basic.lastModified
            if last_modified_aware and last_modified_aware < cutoff_date:
                print(f"\nReached cutoff date {cutoff_date.date()}. Stopping initial scan.")
                break

            downloads = model_info_basic.downloads
            likes = model_info_basic.likes

            if downloads is None or downloads < min_downloads or likes is None or likes < min_likes:
                continue

            tags = model_info_basic.tags or []
            if required_tags and not any(tag in tags for tag in required_tags):
                continue

            # Passed initial filters, add to candidates for detailed check
            initial_candidates.append(model_info_basic)

            if processed_count >= limit_processed_initial:
                print(f"\nReached initial processing limit ({limit_processed_initial}). Moving to detailed fetch.")
                break

        if not initial_candidates:
            print("\nNo models passed the initial filtering stage.")
            return None

        print(f"\nFound {len(initial_candidates)} initial candidates. Proceeding to fetch details and file sizes...")

        # --- Step 2: Fetch Detailed Info and Size for Candidates ---
        print("Step 2: Fetching detailed info and calculating sizes...")
        detail_error_count = 0  # Failed model_info calls; used to throttle warnings.
        for model_info_basic in tqdm(initial_candidates, desc="Fetching Details", unit=" model"):
            model_id = model_info_basic.modelId
            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            last_updated_aware = model_info_basic.lastModified
            tags = model_info_basic.tags or []

            try:
                # files_metadata=True is generally faster than full=True and
                # is enough to get per-file sizes via `siblings`.
                detailed_info = hf_api.model_info(model_id, files_metadata=True)

                # Sum the sizes of recognised weight files only.
                total_size_bytes = 0
                if detailed_info.siblings:
                    for sibling in detailed_info.siblings:
                        if sibling.rfilename.endswith(MODEL_FILE_EXTENSIONS) and sibling.size is not None:
                            total_size_bytes += sibling.size

                size_gb = total_size_bytes / (1024**3)

                # Filter out models smaller than MIN_SIZE_GB.
                if size_gb < MIN_SIZE_GB:
                    continue

                last_updated_str = last_updated_aware.strftime('%Y-%m-%d') if last_updated_aware else "N/A"
                params_str = "N/A"  # Parameter count still hard to get reliably
                model_type = "N/A"  # Best-effort guess from tags

                if 'gguf' in tags:
                    model_type = "GGUF"
                elif 'pytorch' in tags or 'safetensors' in tags:
                    model_type = "PyTorch/SafeT."
                elif 'onnx' in tags:
                    model_type = "ONNX"

                # Task tags (instruct/chat/base) take precedence over the
                # framework guess for everything except GGUF.
                if model_type != "GGUF":
                    if 'text-generation' in tags and ('instruct' in tags or 'chat' in tags):
                        model_type = "Instruct/Chat"
                    elif 'text-generation' in tags:
                        model_type = "Base Model"
                    elif 'conversational' in tags:
                        model_type = "Conversational"

                models_data.append({
                    'model_id': model_id,
                    'downloads': downloads,
                    'likes': likes,
                    'last_updated': last_updated_str,
                    'params': params_str,
                    'model_type': model_type,
                    'total_size_bytes': total_size_bytes,  # Keep bytes for potential filtering
                    'size_gb': size_gb,
                })
                fetched_count += 1

                if fetched_count >= limit_fetched_final:
                    print(f"\nReached limit of {limit_fetched_final} models with detailed info fetched.")
                    break

            except Exception as detail_e:
                # BUG FIX: the original throttled warnings with
                # `processed_count % 50 == 0`, but processed_count stops
                # changing after Step 1, so warnings were printed for every
                # failure or never. Use a dedicated error counter instead:
                # report the first failure and then every 50th.
                detail_error_count += 1
                if detail_error_count % 50 == 1:
                    print(f"\n[Warning] Error fetching details for {model_id} (logged): {str(detail_e)[:100]}...")
                continue  # Continue to the next model

        if not models_data:
            print("\nNo models satisfied all criteria after fetching details and size.")
            return None

        print(f"\nFetched details for {len(models_data)} models.")

        # --- Step 3: Calculate Score and Sort ---
        print("Step 3: Calculating scores and sorting...")
        df = pd.DataFrame(models_data)

        # Ensure necessary columns are numeric and handle potential NaNs.
        df['downloads'] = df['downloads'].fillna(0).astype(int)
        df['likes'] = df['likes'].fillna(0).astype(int)
        df['size_gb'] = df['size_gb'].fillna(0).astype(float)

        # Guard against division by zero (redundant when MIN_SIZE_GB > 0, but safe).
        df = df[df['size_gb'] > 0].copy()
        if df.empty:
            print("\nNo models remaining after filtering for size > 0 GB.")
            return None

        # Score: popularity per GB of weight files.
        df['popularity_per_gb'] = (df['downloads'] * df['likes']) / df['size_gb']

        df.sort_values(by='popularity_per_gb', ascending=False, inplace=True)
        df.reset_index(drop=True, inplace=True)

        print("Model scoring and sorting completed.")
        return df

    except Exception as e:
        print(f"\n[Error] Failed during model fetching or processing: {e}", file=sys.stderr)
        traceback.print_exc()
        return None
|
| 195 |
+
|
| 196 |
+
def display_models(df):
    """Render the ranked model DataFrame as a Rich table with clickable model links."""
    if df is None or df.empty:
        print("No model data to display.")
        return

    row_limit = min(TOP_N_MODELS, len(df))
    out = Console()
    heading = f"Top {row_limit} Hugging Face Models by Popularity/Size (Downloads*Likes/GB)"
    grid = Table(title=heading, show_header=True, header_style="bold magenta")

    # Column layout mirrors the DataFrame columns, plus rank and score.
    grid.add_column("#", style="dim", width=4, justify="right")
    grid.add_column("Model ID (Link)", style="cyan", min_width=40, no_wrap=False)
    grid.add_column("Downloads", style="green", width=12, justify="right")
    grid.add_column("Likes", style="red", width=8, justify="right")
    grid.add_column("Size (GB)", style="purple", width=10, justify="right")
    grid.add_column("Score (Pop/GB)", style="yellow", width=14, justify="right")
    grid.add_column("Last Updated", style="blue", width=14)
    grid.add_column("Type (Guess)", style="magenta", width=15)

    base_url = "https://huggingface.co/"

    for position, record in df.head(row_limit).iterrows():
        model_id = record.get('model_id', 'N/A')

        # Wrap the id in Rich link markup ([link=URL]TEXT[/link]) so capable
        # terminals render it as a clickable hyperlink.
        if model_id == 'N/A':
            linked_id = 'N/A'
        else:
            linked_id = f"[link={base_url}{model_id}]{model_id}[/link]"

        size_gb = record.get('size_gb')
        score = record.get('popularity_per_gb')

        grid.add_row(
            str(position + 1),
            linked_id,
            f"{record.get('downloads', 0):,}",
            f"{record.get('likes', 0):,}",
            f"{size_gb:.2f}" if pd.notna(size_gb) else "N/A",
            f"{score:,.0f}" if pd.notna(score) else "N/A",
            str(record.get('last_updated', 'N/A')),
            str(record.get('model_type', 'N/A')),
        )

    out.print(grid)  # Console renders the link markup in the model-id column
    if len(df) > row_limit:
        print(f"(Showing Top {row_limit} of {len(df)} models meeting criteria and size threshold)")
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# --- main function remains largely unchanged ---
|
| 267 |
+
def main():
    """Entry point: fetch and score model metadata, then render the ranked table.

    Exits with status 1 when no model data could be fetched or everything was
    filtered out.
    """
    df = fetch_popular_models_metadata()

    if df is None or df.empty:
        print("未能獲取或篩選到任何模型數據,程序退出。")
        sys.exit(1)

    display_models(df)
    # NOTE: the original computed an unused `max_choice` for an interactive
    # model-selection prompt that was removed; the dead local is dropped here.
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
if __name__ == "__main__":
    # Verify the required third-party libraries up front so the user gets one
    # actionable install hint instead of a crash mid-run.
    try:
        import pandas
        import rich
        import huggingface_hub
        import tqdm as tqdm_module_check
        print(f"Using huggingface_hub version: {huggingface_hub.__version__}")
        # Older rich releases do not expose __version__; the original worked
        # around this with unreachable except branches — getattr is enough.
        rich_version = getattr(rich, "__version__", None)
        if rich_version is not None:
            print(f"Using rich version: {rich_version}")
        else:
            print("Checking rich version (attribute unavailable in older versions).")
        print(f"Using tqdm version: {tqdm_module_check.__version__}")
    except ImportError as err:
        print(f"[錯誤] 缺少必要的 Python 庫: {err}", file=sys.stderr)
        print("請運行: pip install --upgrade pandas huggingface-hub rich tqdm requests", file=sys.stderr)
        sys.exit(1)

    main()
|