Spaces:
Running
Running
| import os | |
| import json | |
| import pandas as pd | |
| import pyarrow as pa | |
| import pyarrow.parquet as pq | |
def get_scratch_dir() -> str:
    """Return the scratch directory path.

    The value of the ``SCRATCH_DIR`` environment variable is used when the
    variable is set; otherwise the path falls back to ``/tmp/inat_tmp``.
    """
    fallback_dir = '/tmp/inat_tmp'
    return os.getenv('SCRATCH_DIR', fallback_dir)
def parse_inat_metadata(json_path):
    """Load an iNaturalist-style metadata JSON file and return its Aves rows.

    The file is expected to contain ``categories``, ``annotations`` and
    ``images`` lists. A category is kept when its ``class`` equals ``'Aves'``
    or its ``name`` contains ``'Aves'``. Annotations that reference a kept
    category are inner-joined to their image row and to the category
    (taxonomy) row.

    Args:
        json_path: Path to the metadata JSON file.

    Returns:
        A ``pandas.DataFrame`` of the joined Aves annotations, restricted to
        whichever of the expected metadata columns are present, or ``None``
        when the file has no Aves categories or no annotations at all.
    """
    print(f"Parsing {json_path}...")
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract categories and filter for Aves. Either column may be absent if
    # the dataset layout differs, so each test is applied only when possible.
    categories_df = pd.DataFrame(data['categories'])
    is_aves = pd.Series(False, index=categories_df.index, dtype=bool)
    if 'class' in categories_df.columns:
        is_aves = is_aves | (categories_df['class'] == 'Aves')
    if 'name' in categories_df.columns:
        is_aves = is_aves | categories_df['name'].str.contains('Aves', na=False)
    aves_cats = categories_df[is_aves]

    print(f"Found {len(aves_cats)} Aves categories out of {len(categories_df)}")
    if aves_cats.empty:
        return None

    # An empty or absent annotation list yields a frame without a
    # 'category_id' column; treat it as "nothing to extract" instead of
    # failing with a KeyError below.
    annotations_df = pd.DataFrame(data.get('annotations') or [])
    if annotations_df.empty:
        print(f"No annotations found in {json_path}; skipping.")
        return None

    # Keep only the annotations that point at an Aves category.
    aves_cat_ids = set(aves_cats['id'].tolist())
    aves_anns = annotations_df[annotations_df['category_id'].isin(aves_cat_ids)]

    # Join annotations to their images. Both sides carry an 'id' column, so
    # the suffixes keep them apart as 'id_ann' / 'id_img'.
    images_df = pd.DataFrame(data['images'])
    merged_df = pd.merge(
        aves_anns, images_df,
        left_on='image_id', right_on='id',
        suffixes=('_ann', '_img'),
    )

    # Attach the taxonomic hierarchy from the category rows.
    final_df = pd.merge(
        merged_df, aves_cats,
        left_on='category_id', right_on='id',
        suffixes=('', '_cat'),
    )

    cols_to_keep = [
        'image_id', 'file_name', 'latitude', 'longitude', 'date',
        'category_id', 'name', 'common_name', 'kingdom', 'phylum',
        'class', 'order', 'family', 'genus',
    ]
    # Some columns might be missing if the dataset structure changes
    # slightly, so only the ones actually present are selected.
    present_cols = [c for c in cols_to_keep if c in final_df.columns]
    return final_df[present_cols]
def main():
    """Build ``aves_metadata.parquet`` from the train/val metadata JSON files.

    ``train.json`` and ``val.json`` are looked up in the scratch directory.
    Each file that exists and yields Aves rows is tagged with its split name
    and collected; the collected frames are concatenated and written to a
    ``metadata`` directory located one level above this script's directory.
    Missing input files are reported and skipped.
    """
    scratch_dir = get_scratch_dir()
    script_dir = os.path.dirname(os.path.abspath(__file__))
    metadata_dir = os.path.join(os.path.dirname(script_dir), 'metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    frames = []
    for split_file in ['train.json', 'val.json']:
        json_path = os.path.join(scratch_dir, split_file)
        if not os.path.exists(json_path):
            print(f"Warning: {json_path} not found. Ensure download script ran successfully.")
            continue
        split_df = parse_inat_metadata(json_path)
        if split_df is None:
            # Nothing usable was extracted from this split.
            continue
        split_df['split'] = split_file.replace('.json', '')
        frames.append(split_df)

    if not frames:
        print("No Aves metadata processed. Exiting.")
        return

    combined_df = pd.concat(frames, ignore_index=True)
    out_path = os.path.join(metadata_dir, 'aves_metadata.parquet')
    print(f"Saving Parquet file to {out_path} with {len(combined_df)} records.")
    # Write a snappy-compressed Parquet file.
    arrow_table = pa.Table.from_pandas(combined_df)
    pq.write_table(arrow_table, out_path, compression='snappy')
    print("Metadata parsing complete.")
# Run the conversion only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()