File size: 9,173 Bytes
d5b98e6
 
 
 
 
 
 
 
 
 
 
 
c9c8026
d5b98e6
 
 
637183f
 
d5b98e6
637183f
d5b98e6
 
637183f
d5b98e6
637183f
d5b98e6
 
 
 
 
637183f
d5b98e6
637183f
 
 
 
 
 
 
 
d5b98e6
637183f
d5b98e6
 
637183f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5b98e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
637183f
d5b98e6
 
 
637183f
d5b98e6
 
637183f
 
 
 
d5b98e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4254f01
 
d5b98e6
 
4254f01
 
d5b98e6
 
4254f01
 
d5b98e6
 
4254f01
 
d5b98e6
 
 
4254f01
 
 
d5b98e6
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""
Data loading and preprocessing for the Hugging Face model ecosystem dataset.
"""
import pandas as pd
from datasets import load_dataset
from typing import Optional, Dict, List
import numpy as np


class ModelDataLoader:
    """Load, sample, preprocess and filter model metadata from a Hugging Face dataset.

    The most recently loaded DataFrame is kept on ``self.df`` and is used as the
    default input for the preprocessing / filtering helpers.
    """

    # String encodings of "no parent" in the parent_model column
    # (the column stores a string representation of a list, '[]' for base models).
    _NO_PARENT_MARKERS = ('', '[]', 'null')

    def __init__(self, dataset_name: str = "modelbiome/ai_ecosystem"):
        self.dataset_name = dataset_name
        self.df: Optional[pd.DataFrame] = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _resolve_frame(self, df: Optional[pd.DataFrame]) -> pd.DataFrame:
        """Return a copy of *df*, or of ``self.df`` when *df* is None.

        Raises:
            ValueError: If *df* is None and no data has been loaded yet.
        """
        if df is not None:
            return df.copy()
        if self.df is None:
            raise ValueError(
                "No data available: pass a DataFrame or call load_data() first."
            )
        return self.df.copy()

    @staticmethod
    def _column_or_default(df: pd.DataFrame, name: str, default) -> pd.Series:
        """Return ``df[name]``, or a constant Series aligned to ``df`` if the column is absent."""
        if name in df.columns:
            return df[name]
        return pd.Series(default, index=df.index)

    @staticmethod
    def _popularity_score(df: pd.DataFrame) -> pd.Series:
        """Compute ``downloads + 100 * likes`` per row.

        Missing columns and missing / non-numeric values count as 0.
        """
        def numeric(column: str) -> pd.Series:
            if column not in df.columns:
                return pd.Series(0, index=df.index)
            return pd.to_numeric(df[column], errors='coerce').fillna(0)

        # Likes are weighted more heavily than downloads.
        return numeric('downloads') + numeric('likes') * 100

    @staticmethod
    def _diverse_sample(pool: pd.DataFrame, size: int) -> pd.DataFrame:
        """Draw up to *size* rows from *pool*, spread across ``library_name`` values.

        Falls back to a plain random sample when there is no usable library
        information, and tops up with random rows when the per-library pass
        yields fewer rows than requested.
        """
        size = min(size, len(pool))

        if 'library_name' not in pool.columns:
            return pool.sample(n=size, random_state=42)

        # value_counts() drops NaN, so this can be empty even for a non-empty pool.
        libraries = pool['library_name'].value_counts()
        if len(libraries) == 0:
            return pool.sample(n=size, random_state=42)

        per_library = max(1, size // len(libraries))
        picks = []
        for library in libraries.index:
            lib_models = pool[pool['library_name'] == library]
            picks.append(
                lib_models.sample(n=min(per_library, len(lib_models)), random_state=42)
            )
        diverse = pd.concat(picks).head(size)

        # Small libraries and rows without a library can leave the quota unmet;
        # fill the gap from rows that have not been picked yet.
        shortfall = size - len(diverse)
        if shortfall > 0:
            rest = pool[~pool.index.isin(diverse.index)]
            if len(rest) > 0:
                extra = rest.sample(n=min(shortfall, len(rest)), random_state=42)
                diverse = pd.concat([diverse, extra])

        return diverse

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def load_data(self, sample_size: Optional[int] = None, split: str = "train",
                  prioritize_base_models: bool = True) -> pd.DataFrame:
        """
        Load dataset from Hugging Face Hub with methodological sampling.

        Args:
            sample_size: If provided (and non-zero) and the dataset is larger,
                reduce the data to this many rows.
            split: Dataset split to load
            prioritize_base_models: If True, sample via ``_stratified_sample``
                (base models first); otherwise take a seeded random sample.

        Returns:
            DataFrame with model data (also stored on ``self.df``)
        """
        dataset = load_dataset(self.dataset_name, split=split)
        needs_sampling = bool(sample_size) and len(dataset) > sample_size

        if needs_sampling and not prioritize_base_models:
            # Random sampling: shrink the dataset before converting so the
            # full table is never materialised as a DataFrame.
            dataset = dataset.shuffle(seed=42).select(range(sample_size))
            df_full = dataset.to_pandas()
        else:
            df_full = dataset.to_pandas()
            if needs_sampling:
                # Methodological sampling: prioritize base models
                df_full = self._stratified_sample(df_full, sample_size)

        self.df = df_full
        return self.df

    def _stratified_sample(self, df: pd.DataFrame, sample_size: int) -> pd.DataFrame:
        """
        Stratified sampling prioritizing base models and popular models.

        Strategy:
        1. Include ALL base models (no parent) if they fit in sample_size;
           otherwise keep only the most popular base models.
        2. Spend half of the remaining budget on the most popular derived models
           (popularity = downloads + 100 * likes).
        3. Fill the rest with models spread across libraries.

        Args:
            df: Full DataFrame; must contain a ``parent_model`` column
            sample_size: Target sample size

        Returns:
            Sampled DataFrame with a fresh 0..n-1 index
        """
        parent = df['parent_model']
        is_base = parent.isna()
        for marker in self._NO_PARENT_MARKERS:
            is_base = is_base | (parent == marker)
        base_models = df[is_base]

        if len(base_models) > sample_size:
            # Too many base models - keep only the most popular ones.
            ranked = base_models.assign(
                popularity_score=self._popularity_score(base_models)
            )
            sampled = ranked.nlargest(sample_size, 'popularity_score')
            sampled = sampled.drop(columns=['popularity_score'], errors='ignore')
            return sampled.reset_index(drop=True)

        # All base models fit - include them all.
        sampled = base_models.copy()
        remaining_size = sample_size - len(sampled)
        # Select by mask rather than by index label so duplicate labels are safe.
        non_base = df[~is_base]

        if remaining_size > 0 and len(non_base) > 0:
            # assign() returns a new frame, so the caller's data is not written to.
            non_base = non_base.assign(
                popularity_score=self._popularity_score(non_base)
            )

            # Half of the remaining budget by popularity, the rest by library.
            popular_size = min(remaining_size // 2, len(non_base))
            diverse_size = remaining_size - popular_size

            popular_models = non_base.nlargest(popular_size, 'popularity_score')
            parts = [sampled, popular_models]

            if diverse_size > 0:
                leftovers = non_base[~non_base.index.isin(popular_models.index)]
                if len(leftovers) > 0:
                    parts.append(self._diverse_sample(leftovers, diverse_size))

            # Skip empty pieces so they cannot influence the result dtypes.
            non_empty = [part for part in parts if len(part) > 0]
            if non_empty:
                sampled = pd.concat(non_empty)
            sampled = sampled.drop(columns=['popularity_score'], errors='ignore')

        return sampled.reset_index(drop=True)

    def preprocess_for_embedding(self, df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Preprocess data for embedding generation.
        Combines text fields into a single ``combined_text`` column.

        Args:
            df: DataFrame to process (uses self.df if None)

        Returns:
            Copy of the DataFrame with missing text values replaced by '' and
            a ``combined_text`` column added

        Raises:
            ValueError: If df is None and no data has been loaded
        """
        df = self._resolve_frame(df)

        text_fields = ['tags', 'pipeline_tag', 'library_name', 'modelCard']
        for field in text_fields:
            if field in df.columns:
                df[field] = df[field].fillna('')

        def text(name: str) -> pd.Series:
            # Absent columns contribute an empty string instead of failing.
            return self._column_or_default(df, name, '').astype(str)

        # Build combined text from available fields
        df['combined_text'] = (
            text('tags') + ' ' + text('pipeline_tag') + ' ' + text('library_name')
        )

        # Add modelCard if available (only in withmodelcards dataset),
        # truncated to its first 500 characters.
        if 'modelCard' in df.columns:
            df['combined_text'] = (
                df['combined_text'] + ' ' + df['modelCard'].astype(str).str[:500]
            )

        return df

    def filter_data(
        self,
        df: Optional[pd.DataFrame] = None,
        min_downloads: Optional[int] = None,
        min_likes: Optional[int] = None,
        libraries: Optional[List[str]] = None,
        pipeline_tags: Optional[List[str]] = None,
        search_query: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Filter dataset based on criteria. Filters are combined with AND.

        Args:
            df: DataFrame to filter (uses self.df if None)
            min_downloads: Minimum download count
            min_likes: Minimum like count
            libraries: List of library names to include
            pipeline_tags: List of pipeline tags to include
            search_query: Case-insensitive literal substring to look for in
                model_id or tags (not interpreted as a regular expression)

        Returns:
            Filtered DataFrame

        Raises:
            ValueError: If df is None and no data has been loaded
        """
        df = self._resolve_frame(df)

        # A missing column behaves like a column of 0 (numeric) or '' (text).
        if min_downloads is not None:
            df = df[self._column_or_default(df, 'downloads', 0) >= min_downloads]

        if min_likes is not None:
            df = df[self._column_or_default(df, 'likes', 0) >= min_likes]

        if libraries:
            df = df[self._column_or_default(df, 'library_name', '').isin(libraries)]

        if pipeline_tags:
            df = df[self._column_or_default(df, 'pipeline_tag', '').isin(pipeline_tags)]

        if search_query:
            needle = search_query.lower()
            ids = self._column_or_default(df, 'model_id', '').astype(str).str.lower()
            tags = self._column_or_default(df, 'tags', '').astype(str).str.lower()
            # regex=False: user input such as "c++" or "(" must match literally.
            mask = (
                ids.str.contains(needle, regex=False, na=False)
                | tags.str.contains(needle, regex=False, na=False)
            )
            df = df[mask]

        return df

    def get_unique_values(self, column: str) -> List[str]:
        """Get the sorted, stringified unique non-null values of a column.

        Falsy values (e.g. '') are skipped. Returns an empty list when no data
        is loaded or the column does not exist.
        """
        if self.df is None or column not in self.df.columns:
            return []
        values = self.df[column].dropna().unique().tolist()
        return sorted(str(v) for v in values if v and str(v) != 'nan')