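# Side-Info_Generation / src / parse_metadata.py
# Last commit: "deploy bird explorer dashboard" (2572f0f), file size ~3 kB.
# NOTE: the lines above were file-viewer page residue (avatar alt text, Raw /
# History / Blame / Contribute / Delete buttons) and are kept only as comments.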
import os
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
def get_scratch_dir():
    """Return the scratch directory path.

    The value of the ``SCRATCH_DIR`` environment variable is used when it is
    set (even if empty); otherwise the fallback ``/tmp/inat_tmp`` is returned.
    """
    fallback_dir = '/tmp/inat_tmp'
    return os.getenv('SCRATCH_DIR', fallback_dir)
def parse_inat_metadata(json_path):
    """Load a metadata JSON file and return one row per Aves annotation.

    The JSON document must have the top-level keys ``categories``,
    ``annotations`` and ``images``. A category counts as Aves when its
    ``class`` field equals ``'Aves'`` or its ``name`` contains ``'Aves'``;
    either column may be absent from the file.

    Args:
        json_path: Path to the JSON metadata file.

    Returns:
        A ``pandas.DataFrame`` joining each Aves annotation with its image
        record and its category record, restricted to the known metadata
        columns that are actually present, or ``None`` when the file
        contains no Aves categories.
    """
    print(f"Parsing {json_path}...")
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract categories and filter for Aves. The mask is built column by
    # column so a missing 'class' or 'name' column (or an empty category
    # list) yields "no match" instead of a KeyError.
    categories_df = pd.DataFrame(data['categories'])
    aves_mask = pd.Series(False, index=categories_df.index)
    if 'class' in categories_df.columns:
        aves_mask |= categories_df['class'] == 'Aves'
    if 'name' in categories_df.columns:
        aves_mask |= categories_df['name'].str.contains('Aves', na=False)
    aves_cats = categories_df[aves_mask]
    print(f"Found {len(aves_cats)} Aves categories out of {len(categories_df)}")

    if aves_cats.empty:
        return None

    # Set of Aves category IDs for fast membership tests.
    aves_cat_ids = set(aves_cats['id'].tolist())

    # Keep only annotations whose category is an Aves category.
    annotations_df = pd.DataFrame(data['annotations'])
    aves_anns = annotations_df[annotations_df['category_id'].isin(aves_cat_ids)]

    # All images are loaded; the inner merge below keeps only those that are
    # referenced by an Aves annotation.
    images_df = pd.DataFrame(data['images'])

    # Merge annotations with images. Both sides carry an 'id' column, which
    # the suffixes turn into 'id_ann' / 'id_img'.
    merged_df = pd.merge(
        aves_anns, images_df,
        left_on='image_id', right_on='id',
        suffixes=('_ann', '_img'),
    )

    # Merge with the category records (taxonomic hierarchy).
    final_df = pd.merge(
        merged_df, aves_cats,
        left_on='category_id', right_on='id',
        suffixes=('', '_cat'),
    )

    cols_to_keep = [
        'image_id', 'file_name', 'latitude', 'longitude', 'date',
        'category_id', 'name', 'common_name', 'kingdom', 'phylum',
        'class', 'order', 'family', 'genus',
    ]
    # Some columns might be missing if the dataset structure changes
    # slightly, so only the ones that exist are selected. The explicit copy
    # hands callers an independent frame they can add columns to.
    present_cols = [c for c in cols_to_keep if c in final_df.columns]
    return final_df[present_cols].copy()
def main():
    """Parse the train/val metadata files and write the Aves rows to Parquet.

    Reads ``train.json`` and ``val.json`` from the scratch directory, tags
    each parsed frame with its split name, concatenates them and writes
    ``aves_metadata.parquet`` into the ``metadata`` directory that sits next
    to this file's parent directory. Missing input files only produce a
    warning.
    """
    scratch_dir = get_scratch_dir()

    # <parent of this file's directory>/metadata
    this_dir = os.path.dirname(os.path.abspath(__file__))
    metadata_dir = os.path.join(os.path.dirname(this_dir), 'metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    frames = []
    for split in ['train.json', 'val.json']:
        json_path = os.path.join(scratch_dir, split)
        if not os.path.exists(json_path):
            print(f"Warning: {json_path} not found. Ensure download script ran successfully.")
            continue

        split_df = parse_inat_metadata(json_path)
        if split_df is None:
            # No Aves categories in this file; nothing to collect.
            continue

        split_df['split'] = split.replace('.json', '')
        frames.append(split_df)

    if not frames:
        print("No Aves metadata processed. Exiting.")
        return

    combined_df = pd.concat(frames, ignore_index=True)
    out_path = os.path.join(metadata_dir, 'aves_metadata.parquet')
    print(f"Saving Parquet file to {out_path} with {len(combined_df)} records.")

    # Write a snappy-compressed Parquet file.
    table = pa.Table.from_pandas(combined_df)
    pq.write_table(table, out_path, compression='snappy')
    print("Metadata parsing complete.")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()