TsungChihTsai commited on
Commit
7a70f97
·
verified ·
1 Parent(s): d8a89db

Upload check_HF_models.py

Browse files
Files changed (1) hide show
  1. check_HF_models.py +304 -0
check_HF_models.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import pandas as pd
3
+ from huggingface_hub import HfApi, list_models, hf_hub_download # Need HfApi for model_info
4
+ from datetime import datetime, timedelta, timezone
5
+ from rich.console import Console # Import Console
6
+ from rich.table import Table # Import Table
7
+ import sys
8
+ import re
9
+ import traceback
10
+ import time # For potential sleep to avoid rate limits
11
+ from tqdm import tqdm
12
+
13
# --- Configuration ---
TOP_N_MODELS = 50  # Number of rows shown in the final Rich table
RECENCY_DAYS = 180  # Only models updated within this many days are considered
REQUIRED_TAGS = ['text-generation', 'conversational']  # Filter for relevant models
MIN_DOWNLOADS = 5000  # Maybe increase min downloads to reduce model_info calls
MIN_LIKES = 100  # Maybe increase min likes
# NOTE(review): DEFAULT_OLLAMA_TAG_SUFFIX is not referenced anywhere in this file —
# presumably intended for a downstream Ollama pull step; confirm before removing.
DEFAULT_OLLAMA_TAG_SUFFIX = ":latest"
# Model file extensions to consider for size calculation
MODEL_FILE_EXTENSIONS = ('.safetensors', '.bin', '.gguf')
# Minimum size in GB to be considered (filters out repos with only tiny files)
MIN_SIZE_GB = 0.01  # 10 MB

# --- Hugging Face API Client ---
# We need HfApi again to get detailed model info including file sizes
hf_api = HfApi()
28
+
29
def fetch_popular_models_metadata(recency_days=RECENCY_DAYS, min_downloads=MIN_DOWNLOADS, min_likes=MIN_LIKES, required_tags=REQUIRED_TAGS):
    """Fetch metadata for popular, recently updated Hub models and rank them.

    Two-pass strategy:
      1. Scan ``list_models`` (sorted newest-first by last-modified) and keep
         candidates passing the cheap download/like/tag filters.
      2. Call ``model_info(files_metadata=True)`` per candidate to sum the
         sizes of weight files, then score each model as downloads*likes/GB.

    Args:
        recency_days: Only models modified within this many days are kept.
        min_downloads: Minimum download count for a candidate.
        min_likes: Minimum like count for a candidate.
        required_tags: Model must carry at least one of these tags (None/empty
            disables the tag filter).

    Returns:
        A pandas DataFrame sorted by 'popularity_per_gb' descending, or None
        when nothing passes the filters or an unexpected error occurs.
    """
    print("Querying Hugging Face Hub for model information...")
    print(f"(Initial Filters: Updated in last {recency_days} days, Downloads > {min_downloads}, Likes > {min_likes}, Tags contain {required_tags or 'any'})")
    print("[INFO] This version fetches file sizes and will be significantly slower.")

    models_data = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=recency_days)

    try:
        model_iterator = list_models(
            sort="lastModified",
            direction=-1,
            filter=required_tags if required_tags else None,
            fetch_config=False  # Config not needed here
        )

        print("Iterating through models and fetching details (this may take a long time)...")

        processed_count = 0
        fetched_count = 0
        # model_info calls are slow, so cap both passes.
        limit_processed_initial = 10000  # Limit initial candidates to check
        limit_fetched_final = 100  # Limit how many models we actually store after getting size

        initial_candidates = []
        # --- Step 1: Initial filtering based on list_models ---
        print("Step 1: Gathering initial candidates based on basic filters...")
        for model_info_basic in tqdm(model_iterator, desc="Initial Scan", unit=" model", smoothing=0.1):
            processed_count += 1

            # Results arrive newest-first, so the first model older than the
            # cutoff implies every later result is older too: stop scanning.
            last_modified_aware = model_info_basic.lastModified
            if last_modified_aware and last_modified_aware < cutoff_date:
                print(f"\nReached cutoff date {cutoff_date.date()}. Stopping initial scan.")
                break

            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            if downloads is None or downloads < min_downloads or likes is None or likes < min_likes:
                continue

            tags = model_info_basic.tags or []
            if required_tags and not any(tag in tags for tag in required_tags):
                continue

            # Passed initial filters; queue for the expensive detailed check.
            initial_candidates.append(model_info_basic)

            if processed_count >= limit_processed_initial:
                print(f"\nReached initial processing limit ({limit_processed_initial}). Moving to detailed fetch.")
                break

        if not initial_candidates:
            print("\nNo models passed the initial filtering stage.")
            return None

        print(f"\nFound {len(initial_candidates)} initial candidates. Proceeding to fetch details and file sizes...")

        # --- Step 2: Fetch detailed info and size for candidates ---
        print("Step 2: Fetching detailed info and calculating sizes...")
        # BUGFIX: the original throttled warnings with `processed_count % 50 == 0`,
        # but processed_count is frozen after Step 1, so warnings either always
        # printed or never printed. Use a dedicated error counter instead.
        detail_error_count = 0
        for model_info_basic in tqdm(initial_candidates, desc="Fetching Details", unit=" model"):
            model_id = model_info_basic.modelId
            downloads = model_info_basic.downloads
            likes = model_info_basic.likes
            last_updated_aware = model_info_basic.lastModified
            tags = model_info_basic.tags or []

            try:
                # files_metadata=True returns per-file sizes and is generally
                # faster than full=True.
                detailed_info = hf_api.model_info(model_id, files_metadata=True)
                # time.sleep(0.05)  # Optional: add a small delay to be nice to the API

                total_size_bytes = 0
                if detailed_info.siblings:
                    for sibling in detailed_info.siblings:
                        # Only count weight files whose size is known.
                        if sibling.rfilename.endswith(MODEL_FILE_EXTENSIONS) and sibling.size is not None:
                            total_size_bytes += sibling.size

                # Convert to GB
                size_gb = total_size_bytes / (1024**3)

                # Drop repos that hold no meaningful weight files.
                if size_gb < MIN_SIZE_GB:
                    continue

                last_updated_str = last_updated_aware.strftime('%Y-%m-%d') if last_updated_aware else "N/A"
                params_str = "N/A"  # Parameter count still hard to get reliably
                model_type = "N/A"  # Determine type based on tags

                if 'gguf' in tags:
                    model_type = "GGUF"
                elif 'pytorch' in tags or 'safetensors' in tags:
                    model_type = "PyTorch/SafeT."
                elif 'onnx' in tags:
                    model_type = "ONNX"

                # Refine non-GGUF types using task/chat tags.
                if model_type != "GGUF":
                    if 'text-generation' in tags and ('instruct' in tags or 'chat' in tags):
                        model_type = "Instruct/Chat"
                    elif 'text-generation' in tags:
                        model_type = "Base Model"
                    elif 'conversational' in tags:
                        model_type = "Conversational"

                models_data.append({
                    'model_id': model_id,
                    'downloads': downloads,
                    'likes': likes,
                    'last_updated': last_updated_str,
                    'params': params_str,
                    'model_type': model_type,
                    'total_size_bytes': total_size_bytes,  # Keep bytes for potential filtering
                    'size_gb': size_gb,
                })
                fetched_count += 1

                if fetched_count >= limit_fetched_final:
                    print(f"\nReached limit of {limit_fetched_final} models with detailed info fetched.")
                    break

            except Exception as detail_e:
                # A single model failing must not abort the run; count it and
                # surface only the first and then every 50th failure to avoid
                # spamming stdout.
                detail_error_count += 1
                if detail_error_count % 50 == 1:
                    print(f"\n[Warning] Error fetching details for {model_id} (logged): {str(detail_e)[:100]}...")

        if not models_data:
            print("\nNo models satisfied all criteria after fetching details and size.")
            return None

        print(f"\nFetched details for {len(models_data)} models.")

        # --- Step 3: Calculate score and sort ---
        print("Step 3: Calculating scores and sorting...")
        df = pd.DataFrame(models_data)

        # Ensure necessary columns are numeric and handle potential NaNs.
        df['downloads'] = df['downloads'].fillna(0).astype(int)
        df['likes'] = df['likes'].fillna(0).astype(int)
        df['size_gb'] = df['size_gb'].fillna(0).astype(float)

        # Guard against division by zero (redundant given MIN_SIZE_GB > 0, but safe).
        df = df[df['size_gb'] > 0].copy()
        if df.empty:
            print("\nNo models remaining after filtering for size > 0 GB.")
            return None

        # Score: popularity per gigabyte of weights.
        df['popularity_per_gb'] = (df['downloads'] * df['likes']) / df['size_gb']

        df.sort_values(by='popularity_per_gb', ascending=False, inplace=True)
        df.reset_index(drop=True, inplace=True)

        print("Model scoring and sorting completed.")
        return df

    except Exception as e:
        print(f"\n[Error] Failed during model fetching or processing: {e}", file=sys.stderr)
        traceback.print_exc()
        return None
195
+
196
def display_models(df):
    """Render the ranked model DataFrame as a Rich table with clickable links.

    Shows up to TOP_N_MODELS rows; each model ID cell carries Rich
    `[link=...]` markup pointing at the model's Hub page. Prints a short
    notice when more models matched than are displayed. No-ops on None/empty.
    """
    if df is None or df.empty:
        print("No model data to display.")
        return

    num_to_display = min(TOP_N_MODELS, len(df))
    console = Console()
    # Title reflects the sorting metric used upstream.
    title = f"Top {num_to_display} Hugging Face Models by Popularity/Size (Downloads*Likes/GB)"
    table = Table(title=title, show_header=True, header_style="bold magenta")

    # Column layout: rank, linked ID, popularity stats, size, score, metadata.
    column_specs = [
        ("#", dict(style="dim", width=4, justify="right")),
        ("Model ID (Link)", dict(style="cyan", min_width=40, no_wrap=False)),
        ("Downloads", dict(style="green", width=12, justify="right")),
        ("Likes", dict(style="red", width=8, justify="right")),
        ("Size (GB)", dict(style="purple", width=10, justify="right")),
        ("Score (Pop/GB)", dict(style="yellow", width=14, justify="right")),
        ("Last Updated", dict(style="blue", width=14)),
        ("Type (Guess)", dict(style="magenta", width=15)),
    ]
    for header, opts in column_specs:
        table.add_column(header, **opts)

    base_url = "https://huggingface.co/"

    for index, row in df.head(num_to_display).iterrows():
        model_id = row.get('model_id', 'N/A')

        # Wrap the ID in Rich link markup so terminals render it clickable;
        # fall back to plain text when the ID is missing.
        if model_id != 'N/A':
            model_id_display = f"[link={base_url}{model_id}]{model_id}[/link]"
        else:
            model_id_display = 'N/A'

        size_gb = row.get('size_gb')
        score = row.get('popularity_per_gb')

        table.add_row(
            str(index + 1),
            model_id_display,  # Console.print renders the link markup
            f"{row.get('downloads', 0):,}",
            f"{row.get('likes', 0):,}",
            f"{size_gb:.2f}" if pd.notna(size_gb) else "N/A",
            f"{score:,.0f}" if pd.notna(score) else "N/A",
            str(row.get('last_updated', 'N/A')),
            str(row.get('model_type', 'N/A')),
        )

    console.print(table)
    if len(df) > num_to_display:
        print(f"(Showing Top {num_to_display} of {len(df)} models meeting criteria and size threshold)")
264
+
265
+
266
+ # --- main function remains largely unchanged ---
267
def main():
    """Entry point: fetch, rank, and display Hub model metadata.

    Exits with status 1 when no model data could be fetched or everything
    was filtered out.
    """
    df = fetch_popular_models_metadata()

    if df is None or df.empty:
        print("未能獲取或篩選到任何模型數據,程序退出。")
        sys.exit(1)

    display_models(df)
    # Removed the unused `max_choice` local left over from an interactive
    # model-selection prompt; re-add the prompt here if that feature returns.
280
+
281
+
282
if __name__ == "__main__":
    # Verify required third-party libraries are importable before doing any
    # work, and report versions to aid debugging.
    try:
        import pandas
        import rich
        import huggingface_hub
        import tqdm as tqdm_module_check
        print(f"Using huggingface_hub version: {huggingface_hub.__version__}")
        # Older rich releases lack __version__; report gracefully instead of
        # relying on dead try/except branches that never printed the version.
        print(f"Using rich version: {getattr(rich, '__version__', 'unknown (old release)')}")
        print(f"Using tqdm version: {tqdm_module_check.__version__}")
    except ImportError as err:
        print(f"[錯誤] 缺少必要的 Python 庫: {err}", file=sys.stderr)
        print("請運行: pip install --upgrade pandas huggingface-hub rich tqdm requests", file=sys.stderr)
        sys.exit(1)

    main()