# -*- coding: utf-8 -*-
"""Rank Hugging Face Hub models by popularity relative to their file size.

The script lists models from the Hub, keeps those that pass download/like/tag
filters, fetches per-file sizes for each candidate, scores every model as
``downloads * likes / size_gb`` and prints the best ones in a Rich table.
"""
import re
import sys
import time  # For potential sleep to avoid rate limits
import traceback
from datetime import datetime, timedelta, timezone

import pandas as pd
from huggingface_hub import HfApi, list_models, hf_hub_download  # Need HfApi for model_info
from rich.console import Console
from rich.table import Table
from tqdm import tqdm

# --- Configuration ---
TOP_N_MODELS = 50
RECENCY_DAYS = 180
REQUIRED_TAGS = ['text-generation', 'conversational']  # Filter for relevant models
MIN_DOWNLOADS = 5000  # Higher values reduce the number of model_info calls
MIN_LIKES = 100
DEFAULT_OLLAMA_TAG_SUFFIX = ":latest"
# Model file extensions to consider for size calculation
MODEL_FILE_EXTENSIONS = ('.safetensors', '.bin', '.gguf')
# Minimum size in GB to be considered (filters out repos with only tiny files)
MIN_SIZE_GB = 0.01  # 10 MB

# Upper bound on how many listed models the initial scan inspects.
_INITIAL_SCAN_LIMIT = 10000
# Upper bound on how many models are kept after their sizes were fetched.
_DETAILED_FETCH_LIMIT = 100
# A warning is printed for the 1st, 51st, 101st, ... failed detail fetch.
_WARN_EVERY_N_ERRORS = 50

# --- Hugging Face API Client ---
# HfApi is needed to get detailed model info including file sizes
hf_api = HfApi()


def _passes_basic_filters(model_info_basic, min_downloads, min_likes, required_tags):
    """Return True when a listed model meets the download, like and tag filters."""
    downloads = model_info_basic.downloads
    if downloads is None or downloads < min_downloads:
        return False
    likes = model_info_basic.likes
    if likes is None or likes < min_likes:
        return False
    tags = model_info_basic.tags or []
    if required_tags and not any(tag in tags for tag in required_tags):
        return False
    return True


def _gather_initial_candidates(model_iterator, cutoff_date, min_downloads,
                               min_likes, required_tags):
    """Scan the model listing and return the entries that pass the basic filters.

    The scan stops at the first model whose ``lastModified`` is older than
    ``cutoff_date`` (the listing is requested sorted by ``lastModified``), or
    once ``_INITIAL_SCAN_LIMIT`` models have been inspected.
    """
    candidates = []
    scan = tqdm(model_iterator, desc="Initial Scan", unit=" model", smoothing=0.1)
    for processed_count, model_info_basic in enumerate(scan, start=1):
        last_modified_aware = model_info_basic.lastModified
        if last_modified_aware and last_modified_aware < cutoff_date:
            print(f"\nReached cutoff date {cutoff_date.date()}. Stopping initial scan.")
            break

        if _passes_basic_filters(model_info_basic, min_downloads, min_likes, required_tags):
            candidates.append(model_info_basic)

        # Checked on every iteration so the limit also applies to models that
        # were filtered out, not only to the ones that were kept.
        if processed_count >= _INITIAL_SCAN_LIMIT:
            print(f"\nReached initial processing limit ({_INITIAL_SCAN_LIMIT}). Moving to detailed fetch.")
            break
    return candidates


def _total_model_file_bytes(siblings):
    """Sum the sizes of the repo files whose names end with a model extension."""
    return sum(
        sibling.size
        for sibling in siblings or []
        if sibling.rfilename.endswith(MODEL_FILE_EXTENSIONS) and sibling.size is not None
    )


def _guess_model_type(tags):
    """Derive a coarse, human-readable model type label from the repo tags."""
    model_type = "N/A"
    if 'gguf' in tags:
        model_type = "GGUF"
    elif 'pytorch' in tags or 'safetensors' in tags:
        model_type = "PyTorch/SafeT."
    elif 'onnx' in tags:
        model_type = "ONNX"

    # For everything but GGUF, a task-based label takes precedence over the
    # file-format label chosen above.
    if model_type != "GGUF":
        if 'text-generation' in tags and ('instruct' in tags or 'chat' in tags):
            model_type = "Instruct/Chat"
        elif 'text-generation' in tags:
            model_type = "Base Model"
        elif 'conversational' in tags:
            model_type = "Conversational"
    return model_type


def _build_model_record(model_info_basic):
    """Fetch file metadata for one candidate and return its table record.

    Returns ``None`` when the summed model-file size is below ``MIN_SIZE_GB``.
    Exceptions raised by ``hf_api.model_info`` propagate to the caller.
    """
    model_id = model_info_basic.modelId
    last_updated_aware = model_info_basic.lastModified
    tags = model_info_basic.tags or []

    # files_metadata=True asks the Hub to include per-file sizes in `siblings`.
    # NOTE(review): a short time.sleep() could be added here if rate limits bite.
    detailed_info = hf_api.model_info(model_id, files_metadata=True)

    total_size_bytes = _total_model_file_bytes(detailed_info.siblings)
    size_gb = total_size_bytes / (1024 ** 3)
    if size_gb < MIN_SIZE_GB:
        return None

    return {
        'model_id': model_id,
        'downloads': model_info_basic.downloads,
        'likes': model_info_basic.likes,
        'last_updated': last_updated_aware.strftime('%Y-%m-%d') if last_updated_aware else "N/A",
        'params': "N/A",  # Parameter count is hard to get reliably
        'model_type': _guess_model_type(tags),
        'total_size_bytes': total_size_bytes,  # Keep bytes for potential filtering
        'size_gb': size_gb,
    }


def _rank_by_popularity_per_gb(models_data):
    """Build a DataFrame from the records, score it and sort best-first.

    Returns ``None`` when no row has a size greater than zero.
    """
    df = pd.DataFrame(models_data)

    # Ensure the score inputs are numeric and free of NaNs.
    df['downloads'] = df['downloads'].fillna(0).astype(int)
    df['likes'] = df['likes'].fillna(0).astype(int)
    df['size_gb'] = df['size_gb'].fillna(0).astype(float)

    # Guard the division below (redundant while MIN_SIZE_GB > 0, but safe).
    df = df[df['size_gb'] > 0].copy()
    if df.empty:
        print("\nNo models remaining after filtering for size > 0 GB.")
        return None

    df['popularity_per_gb'] = (df['downloads'] * df['likes']) / df['size_gb']
    df = df.sort_values(by='popularity_per_gb', ascending=False)
    return df.reset_index(drop=True)


def fetch_popular_models_metadata(recency_days=RECENCY_DAYS, min_downloads=MIN_DOWNLOADS, min_likes=MIN_LIKES, required_tags=REQUIRED_TAGS):
    """Fetch, size and rank popular, recently updated models from the Hub.

    Args:
        recency_days: Models last modified more than this many days ago end
            the initial scan.
        min_downloads: Minimum download count a model needs to be kept.
        min_likes: Minimum like count a model needs to be kept.
        required_tags: A model is kept when it carries at least one of these
            tags; a falsy value disables tag filtering.

    Returns:
        A DataFrame sorted by ``popularity_per_gb`` (descending) with a fresh
        0-based index, or ``None`` when nothing qualified or an error occurred
        (the error and its traceback are printed to stderr).
    """
    print("Querying Hugging Face Hub for model information...")
    print(f"(Initial Filters: Updated in last {recency_days} days, Downloads > {min_downloads}, Likes > {min_likes}, Tags contain {required_tags or 'any'})")
    print("[INFO] This version fetches file sizes and will be significantly slower.")

    cutoff_date = datetime.now(timezone.utc) - timedelta(days=recency_days)

    try:
        model_iterator = list_models(
            sort="lastModified",
            direction=-1,
            filter=required_tags if required_tags else None,
            fetch_config=False,  # Config not needed here
        )
        print("Iterating through models and fetching details (this may take a long time)...")

        # --- Step 1: Initial filtering based on list_models ---
        print("Step 1: Gathering initial candidates based on basic filters...")
        initial_candidates = _gather_initial_candidates(
            model_iterator, cutoff_date, min_downloads, min_likes, required_tags
        )
        if not initial_candidates:
            print("\nNo models passed the initial filtering stage.")
            return None
        print(f"\nFound {len(initial_candidates)} initial candidates. Proceeding to fetch details and file sizes...")

        # --- Step 2: Fetch detailed info and size for candidates ---
        print("Step 2: Fetching detailed info and calculating sizes...")
        models_data = []
        error_count = 0
        for model_info_basic in tqdm(initial_candidates, desc="Fetching Details", unit=" model"):
            try:
                record = _build_model_record(model_info_basic)
            except Exception as detail_e:
                # Best effort: one failing model must not abort the whole run.
                # Warnings are throttled on the number of failures so far.
                error_count += 1
                if error_count % _WARN_EVERY_N_ERRORS == 1:
                    print(f"\n[Warning] Error fetching details for {model_info_basic.modelId} "
                          f"({error_count} failures so far): {str(detail_e)[:100]}...")
                continue

            if record is None:
                continue  # Below the size threshold
            models_data.append(record)
            if len(models_data) >= _DETAILED_FETCH_LIMIT:
                print(f"\nReached limit of {_DETAILED_FETCH_LIMIT} models with detailed info fetched.")
                break

        if error_count:
            print(f"\n[Warning] Skipped {error_count} models whose details could not be fetched.")
        if not models_data:
            print("\nNo models satisfied all criteria after fetching details and size.")
            return None
        print(f"\nFetched details for {len(models_data)} models.")

        # --- Step 3: Calculate score and sort ---
        print("Step 3: Calculating scores and sorting...")
        df = _rank_by_popularity_per_gb(models_data)
        if df is None:
            return None
        print("Model scoring and sorting completed.")
        return df

    except Exception as e:
        # Top-level boundary: report the failure and let the caller see None.
        print(f"\n[Error] Failed during model fetching or processing: {e}", file=sys.stderr)
        traceback.print_exc()
        return None


def display_models(df):
    """Print the top ``TOP_N_MODELS`` rows of ``df`` as a Rich table.

    Each model id is wrapped in Rich ``[link=URL]...[/link]`` markup pointing
    at its page under https://huggingface.co/. Prints a short notice and
    returns when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        print("No model data to display.")
        return

    num_to_display = min(TOP_N_MODELS, len(df))
    console = Console()

    title = f"Top {num_to_display} Hugging Face Models by Popularity/Size (Downloads*Likes/GB)"
    table = Table(title=title, show_header=True, header_style="bold magenta")

    table.add_column("#", style="dim", width=4, justify="right")
    table.add_column("Model ID (Link)", style="cyan", min_width=40, no_wrap=False)
    table.add_column("Downloads", style="green", width=12, justify="right")
    table.add_column("Likes", style="red", width=8, justify="right")
    table.add_column("Size (GB)", style="purple", width=10, justify="right")
    table.add_column("Score (Pop/GB)", style="yellow", width=14, justify="right")
    table.add_column("Last Updated", style="blue", width=14)
    table.add_column("Type (Guess)", style="magenta", width=15)

    base_url = "https://huggingface.co/"

    # Rank comes from the row position, so it is correct even when the caller
    # passes a frame whose index was not reset.
    for rank, (_, row) in enumerate(df.head(num_to_display).iterrows(), start=1):
        model_id = row.get('model_id', 'N/A')
        if model_id != 'N/A':
            model_id_display = f"[link={base_url}{model_id}]{model_id}[/link]"
        else:
            model_id_display = 'N/A'  # Fallback if the id is somehow missing

        size_gb = row.get('size_gb')
        score = row.get('popularity_per_gb')

        table.add_row(
            str(rank),
            model_id_display,
            f"{row.get('downloads', 0):,}",
            f"{row.get('likes', 0):,}",
            f"{size_gb:.2f}" if pd.notna(size_gb) else "N/A",
            f"{score:,.0f}" if pd.notna(score) else "N/A",  # Comma-grouped, no decimals
            str(row.get('last_updated', 'N/A')),
            str(row.get('model_type', 'N/A')),
        )

    console.print(table)  # The console renders the link markup

    if len(df) > num_to_display:
        print(f"(Showing Top {num_to_display} of {len(df)} models meeting criteria and size threshold)")


def main():
    """Fetch the ranked models and display them; exit with status 1 if none."""
    df = fetch_popular_models_metadata()

    if df is None or df.empty:
        print("未能獲取或篩選到任何模型數據,程序退出。")
        sys.exit(1)

    display_models(df)
    # NOTE: there is no interactive model selection here; add it after the
    # table is displayed if it is needed.

if __name__ == "__main__":
    # Verify the third-party dependencies and report their versions before
    # running, so a missing package yields an actionable message.
    try:
        import pandas
        import rich  # Keep the import check
        import huggingface_hub
        import tqdm as tqdm_module_check  # The module itself, for __version__

        print(f"Using huggingface_hub version: {huggingface_hub.__version__}")

        # Read rich's version from the installed distribution metadata rather
        # than a module attribute. PackageNotFoundError derives from
        # ImportError, so one handler also covers interpreters that lack
        # importlib.metadata.
        try:
            from importlib.metadata import version as _distribution_version
            rich_version = _distribution_version("rich")
        except ImportError:
            rich_version = "unknown"
        print(f"Using rich version: {rich_version}")

        print(f"Using tqdm version: {tqdm_module_check.__version__}")
    except ImportError as err:
        print(f"[錯誤] 缺少必要的 Python 庫: {err}", file=sys.stderr)
        print("請運行: pip install --upgrade pandas huggingface-hub rich tqdm requests", file=sys.stderr)
        sys.exit(1)

    main()