"""ReXVQA benchmark implementation."""

import json
import os
from typing import Dict, Optional, Any
from .base import Benchmark, BenchmarkDataPoint
from pathlib import Path
import tarfile
import zstandard as zstd
from huggingface_hub import hf_hub_download, list_repo_files
import os


def get_hf_token():
    """Get Hugging Face token from cache."""
    token_path = os.path.expanduser("~/.cache/huggingface/token")
    if os.path.exists(token_path):
        with open(token_path, 'r') as f:
            return f.read().strip()
    return None


class ReXVQABenchmark(Benchmark):
    """ReXVQA benchmark for chest radiology visual question answering.
    
    ReXVQA is a large-scale VQA dataset for chest radiology comprising approximately 
    696,000 questions paired with 160,000 chest X-rays. It tests 5 core radiological 
    reasoning skills: presence assessment, location analysis, negation detection, 
    differential diagnosis, and geometric reasoning.
    
    The dataset consists of two separate HuggingFace datasets:
    - ReXVQA: Contains questions, answers, and metadata
    - ReXGradient-160K: Contains metadata only (images are in separate part files)
    
    Paper: https://arxiv.org/abs/2506.04353
    Dataset: https://huggingface.co/datasets/rajpurkarlab/ReXVQA
    Images: https://huggingface.co/datasets/rajpurkarlab/ReXGradient-160K
    """

    def __init__(self, data_dir: str, **kwargs):
        """Initialize ReXVQA benchmark.
        
        Args:
            data_dir (str): Directory to store/cache downloaded data
            **kwargs: Additional configuration parameters
                split (str): Dataset split to use (default: 'test')
                trust_remote_code (bool): Whether to trust remote code (default: False)
                max_questions (int): Maximum number of questions to load (default: None, load all)
                images_dir (str): Directory containing extracted PNG images (default: None)
        """
        self.split = kwargs.get("split", "test")
        self.images_dir = f"{data_dir}/images/deid_png"

        super().__init__(data_dir, **kwargs)

    def _load_data(self) -> None:
        """Load ReXVQA data from HuggingFace."""
        try:
            # Download images and test_vqa_data.json locally if missing
            self.download_test_vqa_data_json(self.data_dir)
            self.download_rexgradient_images(self.data_dir, test_only=True)
            
            # Load JSON file
            json_file_path = os.path.join(self.data_dir, "metadata", "test_vqa_data.json")
            if not os.path.exists(json_file_path):
                raise FileNotFoundError(f"Could not find test_vqa_data.json in the expected location: {json_file_path}")
            print(f"Loading ReXVQA {self.split} split from local JSON file: {json_file_path}")
            with open(json_file_path, 'r', encoding='utf-8') as f:
                questions_data = json.load(f)
            
            # ReXVQA format: {question_id: {question_data}, ...}
            questions_list = []
            for question_id, question_data in questions_data.items():
                # Add the question_id to the question_data for reference
                question_data['id'] = question_id
                questions_list.append(question_data)
            print(f"Loaded {len(questions_list)} questions from local JSON file")
    
            # Process questions
            for i, item in enumerate(questions_list):
                try:
                    data_point = self._parse_rexvqa_item(item, i)
                    if data_point:
                        self.data_points.append(data_point)
                except Exception as e:
                    print(f"Error loading item {i}: {e}")
                    continue
                
        except Exception as e:
            raise RuntimeError(f"Failed to load ReXVQA dataset: {e}")

    def _parse_rexvqa_item(self, item: Dict[str, Any], index: int) -> Optional[BenchmarkDataPoint]:
        """Parse a ReXVQA dataset item.
        
        Args:
            item (Dict[str, Any]): Dataset item from JSON file
            index (int): Item index
            
        Returns:
            Optional[BenchmarkDataPoint]: Parsed data point
        """
        # Extract question ID
        question_id = item.get("id", f"rexvqa_{self.split}_{index}")

        # Extract question and options
        question = item.get("question", "")
        options = item.get("options", [])
        question_with_options = question + "\n\nOptions:\n" + "\n".join(options)
        
        # Extract correct answer
        correct_answer = item.get("correct_answer", "")
        
        # Extract images
        images = None
        if self.images_dir and "ImagePath" in item and item["ImagePath"]:
            images = []
            for rel_path in item["ImagePath"]:
                norm_rel_path = rel_path.lstrip("./")
                full_path = str(Path(self.images_dir).parent / norm_rel_path)
                images.append(full_path)
        
        # Extract metadata
        metadata = {
            "dataset": "rexvqa",
            "split": self.split,
            "study_id": item.get("study_id", ""),
            "study_instance_uid": item.get("StudyInstanceUid", ""),
            "reasoning_type": item.get("task_name", ""),  # task_name maps to reasoning_type
            "category": item.get("category", ""),
            "class": item.get("class", ""),
            "subcategory": item.get("subcategory", ""),
            "patient_id": item.get("PatientID", ""),
            "patient_age": item.get("PatientAge", ""),
            "patient_sex": item.get("PatientSex", ""),
            "study_date": item.get("StudyDate", ""),
            "indication": item.get("Indication", ""),
            "findings": item.get("Findings", ""),
            "impression": item.get("Impression", ""),
            "image_modality": item.get("ImageModality", []),
            "image_view_position": item.get("ImageViewPosition", []),
            "correct_answer_explanation": item.get("correct_answer_explanation", ""),
        }

        # Return data point
        return BenchmarkDataPoint(
            id=question_id,
            text=question_with_options,
            images=images,
            correct_answer=correct_answer,
            metadata=metadata
        )

    @staticmethod
    def download_rexgradient_images(output_dir: str = "benchmarking/data/rexvqa", repo_id: str = "rajpurkarlab/ReXGradient-160K", test_only: bool = True):
        """Download and extract ReXGradient-160K images if not already present.
        
        Args:
            output_dir: Directory to store downloaded and extracted images
            repo_id: HuggingFace repository ID for the dataset
            test_only: If True, only extract images from the test split (default: True)
        """
        output_dir = Path(output_dir)
        tar_path = output_dir / "deid_png.tar"
        images_dir = output_dir / "images"

        # Check if images already exist
        if images_dir.exists() and any(images_dir.rglob("*.png")):
            print(f"Images already exist in {images_dir}, skipping download.")
            return
            
        # Load test split metadata if test_only is True
        test_image_paths = set()
        if test_only:
            print("Loading test split metadata to identify test images...")
            try:
                # Load the test metadata to get image paths
                test_metadata_path = output_dir / "metadata" / "test_vqa_data.json"
                if test_metadata_path.exists():
                    with open(test_metadata_path, 'r', encoding='utf-8') as f:
                        test_data = json.load(f)
                    
                    # Extract all image paths from test data
                    for item in test_data.values():
                        if "ImagePath" in item and item["ImagePath"]:
                            for rel_path in item["ImagePath"]:
                                # Normalize path to match tar file structure
                                norm_path = rel_path.lstrip("./")
                                test_image_paths.add(norm_path)
                    
                    print(f"Found {len(test_image_paths)} test images to extract")
                else:
                    print("Warning: test_vqa_data.json not found, will extract all images")
                    test_only = False
            except Exception as e:
                print(f"Warning: Could not load test metadata: {e}, will extract all images")
                test_only = False
        output_dir.mkdir(parents=True, exist_ok=True)
        print(f"Output directory: {output_dir}")
        try:
            print("Listing files in repository...")
            files = list_repo_files(repo_id, repo_type='dataset', token=get_hf_token())
            part_files = [f for f in files if f.startswith("deid_png.part")]
            if not part_files:
                print("No part files found. The images might be in a different format.")
                return
            print(f"Found {len(part_files)} part files.")
            # Download part files
            for part_file in part_files:
                output_path = output_dir / part_file
                if output_path.exists():
                    print(f"Skipping {part_file} (already exists)")
                    continue
                print(f"Downloading {part_file}...")
                hf_hub_download(
                    repo_id=repo_id,
                    filename=part_file,
                    local_dir=output_dir,
                    local_dir_use_symlinks=False,
                    repo_type='dataset',
                    token=get_hf_token()
                )
            # Concatenate part files
            if not tar_path.exists():
                print("\nConcatenating part files...")
                with open(tar_path, 'wb') as tar_file:
                    for part_file in sorted(part_files):
                        part_path = output_dir / part_file
                        if part_path.exists():
                            print(f"Adding {part_file}...")
                            with open(part_path, 'rb') as f:
                                tar_file.write(f.read())
                        else:
                            print(f"Warning: {part_file} not found, skipping...")
                
                # Clean up part files after successful concatenation
                print("Cleaning up part files...")
                for part_file in part_files:
                    part_path = output_dir / part_file
                    if part_path.exists():
                        try:
                            part_path.unlink()
                            print(f"Deleted {part_file}")
                        except Exception as e:
                            print(f"Could not delete {part_file}: {e}")
            else:
                print(f"Tar file already exists: {tar_path}")
            # Extract tar file
            if tar_path.exists():
                print("\nExtracting images...")
                images_dir.mkdir(exist_ok=True)
                if any(images_dir.rglob("*.png")):
                    print("Images already extracted.")
                else:
                    try:
                        # Stream extract with filtering for test-only images (no seeking)
                        print("Stream extracting zstd-compressed tar file with filtering (streaming mode)...")

                        # Create a decompressor
                        dctx = zstd.ZstdDecompressor()

                        # Stream extract with filtering
                        extracted_count = 0
                        total_png_members = 0

                        with open(tar_path, 'rb') as compressed_file:
                            with dctx.stream_reader(compressed_file) as decompressed_stream:
                                # Use streaming tar mode to avoid seeks
                                with tarfile.open(fileobj=decompressed_stream, mode='r|') as tar:
                                    for member in tar:
                                        # Only consider PNG files
                                        if not member.isfile() or not member.name.endswith('.png'):
                                            continue
                                        total_png_members += 1

                                        # Normalize name to match entries gathered from JSON
                                        normalized_name = member.name.lstrip('./')

                                        # Decide whether to extract this file
                                        should_extract = True
                                        if test_only:
                                            should_extract = normalized_name in test_image_paths

                                        if not should_extract:
                                            # Must still advance the stream for this member
                                            tar.members = []  # no-op in stream mode; ensure we don't hold refs
                                            continue

                                        # Ensure parent directories exist and write file by streaming
                                        target_path = Path(images_dir) / normalized_name
                                        target_path.parent.mkdir(parents=True, exist_ok=True)

                                        extracted_file_obj = tar.extractfile(member)
                                        if extracted_file_obj is None:
                                            continue
                                        with open(target_path, 'wb') as out_f:
                                            while True:
                                                chunk = extracted_file_obj.read(1024 * 1024)
                                                if not chunk:
                                                    break
                                                out_f.write(chunk)

                                        extracted_count += 1
                                        if extracted_count % 100 == 0:
                                            print(f"Extracted {extracted_count} test images...")

                        print(f"Extraction completed! Extracted {extracted_count} matching PNGs out of {total_png_members} PNG members in the archive")
                        
                        # Clean up compressed tar file after successful extraction
                        print("Cleaning up compressed tar file...")
                        try:
                            tar_path.unlink()
                            print(f"Deleted {tar_path}")
                        except Exception as e:
                            print(f"Could not delete {tar_path}: {e}")
                    except Exception as e:
                        print(f"Error extracting tar file: {e}")
                        return
                png_files = list(images_dir.rglob("*.png"))
                print(f"Extracted {len(png_files)} PNG images to {images_dir}")

        except Exception as e:
            print(f"Error: {e}")

    @staticmethod
    def download_test_vqa_data_json(output_dir: str = "benchmarking/data/rexvqa", repo_id: str = "rajpurkarlab/ReXVQA"):
        """Download test_vqa_data.json from the ReXVQA HuggingFace repo if not already present."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        json_path = output_dir / "metadata" / "test_vqa_data.json"
        if json_path.exists():
            print(f"test_vqa_data.json already exists at {json_path}, skipping download.")
            return
        print(f"Downloading test_vqa_data.json to {json_path}...")
        try:
            hf_hub_download(
                repo_id=repo_id,
                filename="metadata/test_vqa_data.json",
                local_dir=output_dir,
                local_dir_use_symlinks=False,
                repo_type='dataset',
                token=get_hf_token()
            )
            print("Download complete.")
        except Exception as e:
            print(f"Error downloading test_vqa_data.json: {e}")
            print("You may need to accept the license agreement on HuggingFace.")