import os import json import pandas as pd import pyarrow as pa import pyarrow.parquet as pq def get_scratch_dir(): return os.environ.get('SCRATCH_DIR', '/tmp/inat_tmp') def parse_inat_metadata(json_path): print(f"Parsing {json_path}...") with open(json_path, 'r') as f: data = json.load(f) # Extract categories and filter for Aves categories_df = pd.DataFrame(data['categories']) aves_cats = categories_df[ (categories_df['class'] == 'Aves') | (categories_df['name'].str.contains('Aves', na=False)) ] print(f"Found {len(aves_cats)} Aves categories out of {len(categories_df)}") if len(aves_cats) == 0: return None # Create an efficient lookup for aves category IDs aves_cat_ids = set(aves_cats['id'].tolist()) # Extract annotations for Aves annotations_df = pd.DataFrame(data['annotations']) aves_anns = annotations_df[annotations_df['category_id'].isin(aves_cat_ids)] # Extract images for Aves images_df = pd.DataFrame(data['images']) # Merge annotations with images merged_df = pd.merge(aves_anns, images_df, left_on='image_id', right_on='id', suffixes=('_ann', '_img')) # Merge with taxonomic hierarchy final_df = pd.merge(merged_df, aves_cats, left_on='category_id', right_on='id', suffixes=('', '_cat')) # Select and rename relevant columns cols_to_keep = [ 'image_id', 'file_name', 'latitude', 'longitude', 'date', 'category_id', 'name', 'common_name', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus' ] # Some columns might be missing if the dataset structure changes slightly, so we filter safely final_df = final_df[[c for c in cols_to_keep if c in final_df.columns]] return final_df def main(): scratch_dir = get_scratch_dir() metadata_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'metadata') os.makedirs(metadata_dir, exist_ok=True) dfs = [] for split in ['train.json', 'val.json']: json_path = os.path.join(scratch_dir, split) if os.path.exists(json_path): df = parse_inat_metadata(json_path) if df is not None: df['split'] = split.replace('.json', '') dfs.append(df) else: print(f"Warning: {json_path} not found. Ensure download script ran successfully.") if dfs: combined_df = pd.concat(dfs, ignore_index=True) out_path = os.path.join(metadata_dir, 'aves_metadata.parquet') print(f"Saving Parquet file to {out_path} with {len(combined_df)} records.") # Save as a highly compressed parquet file table = pa.Table.from_pandas(combined_df) pq.write_table(table, out_path, compression='snappy') print("Metadata parsing complete.") else: print("No Aves metadata processed. Exiting.") if __name__ == "__main__": main()