import os
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def get_scratch_dir():
    """Return the directory where the downloaded JSON files are looked up.

    The value of the ``SCRATCH_DIR`` environment variable is used when it is
    set; otherwise the fallback path ``/tmp/inat_tmp`` is returned.
    """
    fallback = '/tmp/inat_tmp'
    return os.getenv('SCRATCH_DIR', fallback)

def parse_inat_metadata(json_path):
    """Extract the Aves (bird) records from a COCO-style metadata JSON file.

    The file must be a JSON object with a ``categories`` list and an
    ``images`` list; an ``annotations`` list is used when present. A category
    counts as Aves when its ``class`` field equals ``'Aves'`` or its ``name``
    contains the substring ``'Aves'``. Either field may be absent from the
    file, in which case only the other one is consulted.

    Args:
        json_path: Path of the JSON metadata file to read.

    Returns:
        A DataFrame with one row per Aves annotation, joined with its image
        and category rows and restricted to whichever of the expected
        columns exist, or ``None`` when the file has no Aves categories or
        no usable annotations.
    """
    print(f"Parsing {json_path}...")
    # JSON is UTF-8 by specification; do not rely on the locale's default
    # encoding, which would corrupt or reject non-ASCII names on some systems.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Build the Aves mask from whichever identifying columns are present so a
    # file without a 'class' field does not fail with a KeyError.
    categories_df = pd.DataFrame(data['categories'])
    is_aves = pd.Series(False, index=categories_df.index)
    if 'class' in categories_df.columns:
        is_aves |= categories_df['class'] == 'Aves'
    if 'name' in categories_df.columns:
        is_aves |= categories_df['name'].str.contains('Aves', na=False)
    aves_cats = categories_df.loc[is_aves]
    print(f"Found {len(aves_cats)} Aves categories out of {len(categories_df)}")

    if aves_cats.empty:
        return None

    # A set gives constant-time membership tests for the annotation filter.
    aves_cat_ids = set(aves_cats['id'])

    # A missing or empty annotation list yields a frame without a
    # 'category_id' column; there is nothing to join in that case.
    annotations_df = pd.DataFrame(data.get('annotations', []))
    if 'category_id' not in annotations_df.columns:
        print(f"No annotations found in {json_path}")
        return None
    aves_anns = annotations_df[annotations_df['category_id'].isin(aves_cat_ids)]

    images_df = pd.DataFrame(data['images'])

    # Attach image fields to each annotation. Both tables carry an 'id'
    # column, so the suffixes keep them apart as 'id_ann' and 'id_img'.
    merged_df = pd.merge(
        aves_anns, images_df,
        left_on='image_id', right_on='id', suffixes=('_ann', '_img'),
    )

    # Attach the taxonomic fields of the matching category.
    final_df = pd.merge(
        merged_df, aves_cats,
        left_on='category_id', right_on='id', suffixes=('', '_cat'),
    )

    cols_to_keep = [
        'image_id', 'file_name', 'latitude', 'longitude', 'date',
        'category_id', 'name', 'common_name', 'kingdom', 'phylum',
        'class', 'order', 'family', 'genus',
    ]

    # Keep only the expected columns that actually exist in this file.
    present_cols = [c for c in cols_to_keep if c in final_df.columns]
    return final_df[present_cols]

def main():
    """Parse the train and val JSON files and write the Aves rows to Parquet.

    Input files are looked up in the scratch directory. The output,
    ``aves_metadata.parquet``, goes into a ``metadata`` directory located one
    level above the directory containing this script; that directory is
    created if needed. Each row is tagged with the split it came from.
    """
    input_dir = get_scratch_dir()
    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    metadata_dir = os.path.join(parent_dir, 'metadata')
    os.makedirs(metadata_dir, exist_ok=True)

    frames = []
    for split_name in ('train', 'val'):
        json_path = os.path.join(input_dir, f"{split_name}.json")
        if not os.path.exists(json_path):
            print(f"Warning: {json_path} not found. Ensure download script ran successfully.")
            continue

        frame = parse_inat_metadata(json_path)
        if frame is None:
            # The parser found no Aves categories in this split.
            continue

        frame['split'] = split_name
        frames.append(frame)

    if not frames:
        print("No Aves metadata processed. Exiting.")
        return

    combined_df = pd.concat(frames, ignore_index=True)
    out_path = os.path.join(metadata_dir, 'aves_metadata.parquet')

    print(f"Saving Parquet file to {out_path} with {len(combined_df)} records.")
    # Write the combined frame as a Snappy-compressed Parquet file.
    arrow_table = pa.Table.from_pandas(combined_df)
    pq.write_table(arrow_table, out_path, compression='snappy')
    print("Metadata parsing complete.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()