Spaces:

Suhasdev
/

Universal-prompt-Optimizer

Sleeping

File size: 8,891 Bytes

cacd4d0

"""
Data loading utilities for various file formats
"""

import json
import base64
import pandas as pd
from typing import Any, Optional, Union, List , Dict
from pathlib import Path
import logging

logger = logging.getLogger(__name__)

class DataLoader:
    """
    Utility class for loading data from various sources
    """
    
    def __init__(self):
        self.supported_formats = [
            '.csv', '.json', '.jsonl', '.txt', '.md', '.xlsx',
            '.png', '.jpg', '.jpeg'
        ]
    
    def load(self, source: Union[str, Path], format_hint: Optional[str] = None) -> Optional[Any]:
        """
        Load data from any supported source
        
        Args:
            source: File path or data source
            format_hint: Optional format hint to override auto-detection
            
        Returns:
            Loaded data or None if failed
        """
        try:
            path = Path(source)
            
            if not path.exists():
                logger.error(f"File not found: {source}")
                return None
            
            # Use format hint or detect from extension
            file_format = format_hint or path.suffix.lower()
            
            if file_format == '.csv':
                return self.load_csv(path)
            elif file_format == '.json':
                return self.load_json(path)
            elif file_format == '.jsonl':
                return self.load_jsonl(path)
            elif file_format in ['.txt', '.md']:
                return self.load_text(path)
            elif file_format == '.xlsx':
                return self.load_excel(path)
            elif file_format in ['.png', '.jpg', '.jpeg']:
                return self.load_image_base64(path)
            else:
                logger.warning(f"Unsupported format: {file_format}")
                return None
                
        except Exception as e:
            logger.error(f"Failed to load data from {source}: {str(e)}")
            return None
    
    def load_csv(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load CSV file as pandas DataFrame"""
        try:
            df = pd.read_csv(path)
            logger.info(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load CSV {path}: {str(e)}")
            return None
    
    def load_json(self, path: Union[str, Path]) -> Optional[Any]:
        """Load JSON file"""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            if isinstance(data, list):
                logger.info(f"Loaded JSON with {len(data)} items")
            else:
                logger.info("Loaded JSON object")
            
            return data
        except Exception as e:
            logger.error(f"Failed to load JSON {path}: {str(e)}")
            return None
    
    def load_jsonl(self, path: Union[str, Path]) -> Optional[List[Dict]]:
        """Load JSONL (JSON Lines) file"""
        try:
            data = []
            with open(path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            logger.warning(f"Invalid JSON on line {line_num}: {str(e)}")
            
            logger.info(f"Loaded JSONL with {len(data)} items")
            return data
        except Exception as e:
            logger.error(f"Failed to load JSONL {path}: {str(e)}")
            return None
    
    def load_text(self, path: Union[str, Path]) -> Optional[str]:
        """Load plain text file"""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            logger.info(f"Loaded text file with {len(content)} characters")
            return content
        except Exception as e:
            logger.error(f"Failed to load text {path}: {str(e)}")
            return None
    
    def load_excel(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load Excel file as pandas DataFrame"""
        try:
            df = pd.read_excel(path)
            logger.info(f"Loaded Excel with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load Excel {path}: {str(e)}")
            return None
            
    def load_image_base64(self, path: Union[str, Path]) -> Optional[str]:
        """Load image file and encode as Base64 string"""
        try:
            with open(path, 'rb') as f:
                encoded_string = base64.b64encode(f.read()).decode('utf-8')
            logger.info(f"Loaded image {path} and encoded to Base64")
            return encoded_string
        except Exception as e:
            logger.error(f"Failed to load image {path}: {str(e)}")
            return None

    def is_supported_format(self, file_path: Union[str, Path]) -> bool:
        """Check if file format is supported"""
        path = Path(file_path)
        return path.suffix.lower() in self.supported_formats
    
    def get_file_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """Get information about a file"""
        path = Path(file_path)
        
        if not path.exists():
            return {'exists': False}
        
        return {
            'exists': True,
            'size': path.stat().st_size,
            'format': path.suffix.lower(),
            'supported': self.is_supported_format(path),
            'name': path.name,
            'stem': path.stem,
            'parent': str(path.parent)
        }

    def load_ui_tree_dataset(self, json_dir: str, screenshots_dir: str) -> List[Dict[str, Any]]:
        """
        Load UI tree dataset by pairing JSON files with corresponding screenshots
        
        Args:
            json_dir: Directory containing JSON files (e.g., "json_tree")
            screenshots_dir: Directory containing screenshot images (e.g., "screenshots")
            
        Returns:
            List of dictionaries with 'input', 'output', and 'image' keys
        """
        json_path = Path(json_dir)
        screenshots_path = Path(screenshots_dir)
        
        if not json_path.exists():
            raise FileNotFoundError(f"JSON directory not found: {json_dir}")
        if not screenshots_path.exists():
            raise FileNotFoundError(f"Screenshots directory not found: {screenshots_dir}")
        
        dataset = []
        
        # Get all JSON files
        json_files = list(json_path.glob("*.json"))
        logger.info(f"Found {len(json_files)} JSON files in {json_dir}")
        
        for json_file in json_files:
            # Extract filename without extension (e.g., "2" from "2.json")
            file_stem = json_file.stem
            
            # Look for corresponding image file
            image_extensions = ['.jpg', '.jpeg', '.png']
            image_file = None
            
            for ext in image_extensions:
                potential_image = screenshots_path / f"{file_stem}{ext}"
                if potential_image.exists():
                    image_file = potential_image
                    break
            
            if not image_file:
                logger.warning(f"No corresponding image found for {json_file.name}")
                continue
                
            try:
                # Load JSON content
                json_data = self.load_json(json_file)
                if not json_data:
                    logger.warning(f"Failed to load JSON: {json_file}")
                    continue
                    
                # Load image as base64
                image_base64 = self.load_image_base64(image_file)
                if not image_base64:
                    logger.warning(f"Failed to load image: {image_file}")
                    continue
                
                # Create dataset entry
                dataset_entry = {
                    'input': 'Extract UI elements from this screenshot and provide the complete UI tree structure',
                    'output': json.dumps(json_data, indent=2),  # Convert JSON to string
                    'image': image_base64
                }
                
                dataset.append(dataset_entry)
                logger.debug(f"Loaded pair: {json_file.name} + {image_file.name}")
                
            except Exception as e:
                logger.error(f"Error loading {json_file.name}: {str(e)}")
                continue
        
        logger.info(f"Successfully loaded {len(dataset)} image-JSON pairs")
        return dataset