File size: 4,419 Bytes
0baf78e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
import pandas as pd
from pathlib import Path
from typing import List, Dict
import logging
from datetime import datetime

class DatasetConverter:
    """Convert a JSON file of instruction/response QA pairs into a
    HuggingFace-style chat dataset (CSV + JSON) plus a statistics file."""

    def __init__(self, input_file: str, output_dir: str):
        """
        Initialize the converter with input and output paths.

        Args:
            input_file (str): Path to the input JSON file.
            output_dir (str): Directory to save the output files
                (created if it does not exist).
        """
        self.input_file = Path(input_file)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Set up logging on the root logger (kept as-is: original behavior)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _format_message(self, instruction: str, response: str) -> List[Dict]:
        """
        Format a single instruction-response pair into the chat message format.

        Args:
            instruction (str): The user instruction/question.
            response (str): The assistant's response.

        Returns:
            List[Dict]: Two dicts — a 'user' message followed by an
            'assistant' message, each with 'content' and 'role' keys.
        """
        return [
            {"content": instruction, "role": "user"},
            {"content": response, "role": "assistant"}
        ]

    def convert(self) -> None:
        """
        Convert the input JSON file to HuggingFace dataset format.

        Reads the input JSON — either a raw list of pairs or an object
        with a 'pairs' key — then writes timestamped CSV and JSON dataset
        files plus a dataset_stats JSON file into `output_dir`. Pairs
        missing 'instruction' or 'response' keys are skipped with a warning.

        Raises:
            Exception: re-raised after logging if any step fails.
        """
        try:
            # Read input JSON file
            logging.info(f"Reading input file: {self.input_file}")
            with open(self.input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # BUG FIX: the original called data.get('pairs') unconditionally,
            # which raises AttributeError when the file holds a raw list —
            # the isinstance fallback was unreachable. Check the type first.
            if isinstance(data, list):
                qa_pairs = data
            else:
                qa_pairs = data.get('pairs', [])

            logging.info(f"Found {len(qa_pairs)} QA pairs")

            # Create dataset records, skipping malformed pairs
            dataset_records = []
            for idx, pair in enumerate(qa_pairs):
                try:
                    messages = self._format_message(
                        pair['instruction'],
                        pair['response']
                    )
                    dataset_records.append({
                        'id': f'bloomington_{idx:05d}',
                        'messages': messages
                    })
                except KeyError as e:
                    logging.warning(f"Skipping invalid pair at index {idx}: {e}")

            # Convert to DataFrame
            df = pd.DataFrame(dataset_records)

            # Save as CSV and JSON with a shared timestamp
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            csv_path = self.output_dir / f'bloomington_dataset_{timestamp}.csv'
            json_path = self.output_dir / f'bloomington_dataset_{timestamp}.json'

            df.to_csv(csv_path, index=False)
            df.to_json(json_path, orient='records', indent=2)

            logging.info(f"Successfully converted {len(dataset_records)} records")
            logging.info(f"Saved dataset to:\n- CSV: {csv_path}\n- JSON: {json_path}")

            # Generate and save dataset statistics.
            # BUG FIX: guard the averages against division by zero when no
            # valid records were produced (empty input or all pairs skipped).
            n_records = len(dataset_records)
            stats = {
                'total_records': n_records,
                'avg_instruction_length': (
                    sum(len(r['messages'][0]['content']) for r in dataset_records) / n_records
                    if n_records else 0.0
                ),
                'avg_response_length': (
                    sum(len(r['messages'][1]['content']) for r in dataset_records) / n_records
                    if n_records else 0.0
                ),
                'timestamp': timestamp
            }

            with open(self.output_dir / f'dataset_stats_{timestamp}.json', 'w') as f:
                json.dump(stats, f, indent=2)

        except Exception as e:
            logging.error(f"Error converting dataset: {e}", exc_info=True)
            raise

if __name__ == "__main__":
    # Example usage: convert the final QA pairs into HuggingFace format.
    source_json = "data/final/final_qa_pairs.json"
    target_dir = "data/huggingface"
    DatasetConverter(input_file=source_json, output_dir=target_dir).convert()