TsungChihTsai commited on
Commit
7a70f97
·
verified ·
1 Parent(s): d8a89db

Upload check_HF_models.py

Browse files
Files changed (1) hide show
  1. check_HF_models.py +304 -0
check_HF_models.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import pandas as pd
3
+ from huggingface_hub import HfApi, list_models, hf_hub_download # Need HfApi for model_info
4
+ from datetime import datetime, timedelta, timezone
5
+ from rich.console import Console # Import Console
6
+ from rich.table import Table # Import Table
7
+ import sys
8
+ import re
9
+ import traceback
10
+ import time # For potential sleep to avoid rate limits
11
+ from tqdm import tqdm
12
+
13
# --- Configuration ---
TOP_N_MODELS = 50  # Number of rows shown in the final Rich table
RECENCY_DAYS = 180  # Only models updated within this many days are considered
REQUIRED_TAGS = ['text-generation', 'conversational']  # Filter for relevant models
MIN_DOWNLOADS = 5000  # Maybe increase min downloads to reduce model_info calls
MIN_LIKES = 100  # Maybe increase min likes
# NOTE(review): DEFAULT_OLLAMA_TAG_SUFFIX is not referenced anywhere in this file —
# presumably intended for a downstream Ollama pull step; confirm before removing.
DEFAULT_OLLAMA_TAG_SUFFIX = ":latest"
# Model file extensions to consider for size calculation
MODEL_FILE_EXTENSIONS = ('.safetensors', '.bin', '.gguf')
# Minimum size in GB to be considered (filters out repos with only tiny files)
MIN_SIZE_GB = 0.01  # 10 MB

# --- Hugging Face API Client ---
# We need HfApi again to get detailed model info including file sizes
hf_api = HfApi()
28
+
29
def fetch_popular_models_metadata(recency_days=RECENCY_DAYS, min_downloads=MIN_DOWNLOADS, min_likes=MIN_LIKES, required_tags=REQUIRED_TAGS):
    """Fetch metadata for popular, recently updated Hub models and rank them.

    Two-pass strategy:
      1. Scan ``list_models`` (sorted newest-first by last-modified) and keep
         candidates passing the cheap download/like/tag filters.
      2. Call ``model_info(files_metadata=True)`` per candidate to sum the
         sizes of weight files, then score each model as downloads*likes/GB.

    Args:
        recency_days: Only models modified within this many days are kept.
        min_downloads: Minimum download count for a candidate.
        min_likes: Minimum like count for a candidate.
        required_tags: Model must carry at least one of these tags (None/empty
            disables the tag filter).

    Returns:
        A pandas DataFrame sorted by 'popularity_per_gb' descending, or None
        when nothing passes the filters or an unexpected error occurs.
    """
    print("Querying Hugging Face Hub for model information...")
    print(f"(Initial Filters: Updated in last {recency_days} days, Downloads > {min_downloads}, Likes > {min_likes}, Tags contain {required_tags or 'any'})")
    print("[INFO] This version fetches file sizes and will be significantly slower.")

    models_data = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=recency_days)

    try:
        model_iterator = list_models(
            sort="lastModified",
            direction=-1,
            filter=required_tags if required_tags else None,
            fetch_config=False  # Config not needed here
        )

        print("Iterating through models and fetching details (this may take a long time)...")

        processed_count = 0
        fetched_count = 0
        # model_info calls are slow, so cap both passes.
        limit_processed_initial = 10000  # Limit initial candidates to check
        limit_fetched_final = 100  # Limit how many models we actually store after getting size

        initial_candidates = []
        # --- Step 1: Initial filtering based on list_models ---
        print("Step 1: Gathering initial candidates based on basic filters...")
        for model_info_basic in tqdm(model_iterator, desc="Initial Scan", unit=" model", smoothing=0.1):
            processed_count += 1

            # Results arrive newest-first, so the first model older than the
            # cutoff implies every later result is older too: stop scanning.
            last_modified_aware = model_info_basic.lastModified
            if last_modified_aware and last_modified_aware < cutoff_date:
                print(f"\nReached cutoff date {cutoff_date.date()}. Stopping initial scan.")
                break

            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            if downloads is None or downloads < min_downloads or likes is None or likes < min_likes:
                continue

            tags = model_info_basic.tags or []
            if required_tags and not any(tag in tags for tag in required_tags):
                continue

            # Passed initial filters; queue for the expensive detailed check.
            initial_candidates.append(model_info_basic)

            if processed_count >= limit_processed_initial:
                print(f"\nReached initial processing limit ({limit_processed_initial}). Moving to detailed fetch.")
                break

        if not initial_candidates:
            print("\nNo models passed the initial filtering stage.")
            return None

        print(f"\nFound {len(initial_candidates)} initial candidates. Proceeding to fetch details and file sizes...")

        # --- Step 2: Fetch detailed info and size for candidates ---
        print("Step 2: Fetching detailed info and calculating sizes...")
        # BUGFIX: the original throttled warnings with `processed_count % 50 == 0`,
        # but processed_count is frozen after Step 1, so warnings either always
        # printed or never printed. Use a dedicated error counter instead.
        detail_error_count = 0
        for model_info_basic in tqdm(initial_candidates, desc="Fetching Details", unit=" model"):
            model_id = model_info_basic.modelId
            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            last_updated_aware = model_info_basic.lastModified
            tags = model_info_basic.tags or []

            try:
                # files_metadata=True returns per-file sizes and is generally
                # faster than full=True.
                detailed_info = hf_api.model_info(model_id, files_metadata=True)
                # time.sleep(0.05)  # Optional: add a small delay to be nice to the API

                total_size_bytes = 0
                if detailed_info.siblings:
                    for sibling in detailed_info.siblings:
                        # Only count weight files whose size is known.
                        if sibling.rfilename.endswith(MODEL_FILE_EXTENSIONS) and sibling.size is not None:
                            total_size_bytes += sibling.size

                # Convert to GB
                size_gb = total_size_bytes / (1024**3)

                # Drop repos that hold no meaningful weight files.
                if size_gb < MIN_SIZE_GB:
                    continue

                last_updated_str = last_updated_aware.strftime('%Y-%m-%d') if last_updated_aware else "N/A"
                params_str = "N/A"  # Parameter count still hard to get reliably
                model_type = "N/A"  # Determine type based on tags

                if 'gguf' in tags:
                    model_type = "GGUF"
                elif 'pytorch' in tags or 'safetensors' in tags:
                    model_type = "PyTorch/SafeT."
                elif 'onnx' in tags:
                    model_type = "ONNX"

                # Refine non-GGUF types using task/chat tags.
                if model_type != "GGUF":
                    if 'text-generation' in tags and ('instruct' in tags or 'chat' in tags):
                        model_type = "Instruct/Chat"
                    elif 'text-generation' in tags:
                        model_type = "Base Model"
                    elif 'conversational' in tags:
                        model_type = "Conversational"

                models_data.append({
                    'model_id': model_id,
                    'downloads': downloads,
                    'likes': likes,
                    'last_updated': last_updated_str,
                    'params': params_str,
                    'model_type': model_type,
                    'total_size_bytes': total_size_bytes,  # Keep bytes for potential filtering
                    'size_gb': size_gb,
                })
                fetched_count += 1

                if fetched_count >= limit_fetched_final:
                    print(f"\nReached limit of {limit_fetched_final} models with detailed info fetched.")
                    break

            except Exception as detail_e:
                # A single model failing must not abort the run; count it and
                # surface only the first and then every 50th failure to avoid
                # spamming stdout.
                detail_error_count += 1
                if detail_error_count % 50 == 1:
                    print(f"\n[Warning] Error fetching details for {model_id} (logged): {str(detail_e)[:100]}...")

        if not models_data:
            print("\nNo models satisfied all criteria after fetching details and size.")
            return None

        print(f"\nFetched details for {len(models_data)} models.")

        # --- Step 3: Calculate score and sort ---
        print("Step 3: Calculating scores and sorting...")
        df = pd.DataFrame(models_data)

        # Ensure necessary columns are numeric and handle potential NaNs.
        df['downloads'] = df['downloads'].fillna(0).astype(int)
        df['likes'] = df['likes'].fillna(0).astype(int)
        df['size_gb'] = df['size_gb'].fillna(0).astype(float)

        # Guard against division by zero (redundant given MIN_SIZE_GB > 0, but safe).
        df = df[df['size_gb'] > 0].copy()
        if df.empty:
            print("\nNo models remaining after filtering for size > 0 GB.")
            return None

        # Score: popularity per gigabyte of weights.
        df['popularity_per_gb'] = (df['downloads'] * df['likes']) / df['size_gb']

        df.sort_values(by='popularity_per_gb', ascending=False, inplace=True)
        df.reset_index(drop=True, inplace=True)

        print("Model scoring and sorting completed.")
        return df

    except Exception as e:
        print(f"\n[Error] Failed during model fetching or processing: {e}", file=sys.stderr)
        traceback.print_exc()
        return None
195
+
196
def display_models(df):
    """Render the ranked model DataFrame as a Rich table with clickable links.

    Shows up to TOP_N_MODELS rows; each model ID cell carries Rich
    `[link=...]` markup pointing at the model's Hub page. Prints a short
    notice when more models matched than are displayed. No-ops on None/empty.
    """
    if df is None or df.empty:
        print("No model data to display.")
        return

    num_to_display = min(TOP_N_MODELS, len(df))
    console = Console()
    # Title reflects the sorting metric used upstream.
    title = f"Top {num_to_display} Hugging Face Models by Popularity/Size (Downloads*Likes/GB)"
    table = Table(title=title, show_header=True, header_style="bold magenta")

    # Column layout: rank, linked ID, popularity stats, size, score, metadata.
    column_specs = [
        ("#", dict(style="dim", width=4, justify="right")),
        ("Model ID (Link)", dict(style="cyan", min_width=40, no_wrap=False)),
        ("Downloads", dict(style="green", width=12, justify="right")),
        ("Likes", dict(style="red", width=8, justify="right")),
        ("Size (GB)", dict(style="purple", width=10, justify="right")),
        ("Score (Pop/GB)", dict(style="yellow", width=14, justify="right")),
        ("Last Updated", dict(style="blue", width=14)),
        ("Type (Guess)", dict(style="magenta", width=15)),
    ]
    for header, opts in column_specs:
        table.add_column(header, **opts)

    base_url = "https://huggingface.co/"

    for index, row in df.head(num_to_display).iterrows():
        model_id = row.get('model_id', 'N/A')

        # Wrap the ID in Rich link markup so terminals render it clickable;
        # fall back to plain text when the ID is missing.
        if model_id != 'N/A':
            model_id_display = f"[link={base_url}{model_id}]{model_id}[/link]"
        else:
            model_id_display = 'N/A'

        size_gb = row.get('size_gb')
        score = row.get('popularity_per_gb')

        table.add_row(
            str(index + 1),
            model_id_display,  # Console.print renders the link markup
            f"{row.get('downloads', 0):,}",
            f"{row.get('likes', 0):,}",
            f"{size_gb:.2f}" if pd.notna(size_gb) else "N/A",
            f"{score:,.0f}" if pd.notna(score) else "N/A",
            str(row.get('last_updated', 'N/A')),
            str(row.get('model_type', 'N/A')),
        )

    console.print(table)
    if len(df) > num_to_display:
        print(f"(Showing Top {num_to_display} of {len(df)} models meeting criteria and size threshold)")
264
+
265
+
266
+ # --- main function remains largely unchanged ---
267
def main():
    """Entry point: fetch, rank, and display Hub model metadata.

    Exits with status 1 when no model data could be fetched or everything
    was filtered out.
    """
    df = fetch_popular_models_metadata()

    if df is None or df.empty:
        print("未能獲取或篩選到任何模型數據,程序退出。")
        sys.exit(1)

    display_models(df)
    # Removed the unused `max_choice` local left over from an interactive
    # model-selection prompt; re-add the prompt here if that feature returns.
280
+
281
+
282
if __name__ == "__main__":
    # Verify required third-party libraries are importable before doing any
    # work, and report versions to aid debugging.
    try:
        import pandas
        import rich
        import huggingface_hub
        import tqdm as tqdm_module_check
        print(f"Using huggingface_hub version: {huggingface_hub.__version__}")
        # Older rich releases lack __version__; report gracefully instead of
        # relying on dead try/except branches that never printed the version.
        print(f"Using rich version: {getattr(rich, '__version__', 'unknown (old release)')}")
        print(f"Using tqdm version: {tqdm_module_check.__version__}")
    except ImportError as err:
        print(f"[錯誤] 缺少必要的 Python 庫: {err}", file=sys.stderr)
        print("請運行: pip install --upgrade pandas huggingface-hub rich tqdm requests", file=sys.stderr)
        sys.exit(1)

    main()