File size: 4,419 Bytes
0baf78e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
import pandas as pd
from pathlib import Path
from typing import List, Dict
import logging
from datetime import datetime

class DatasetConverter:
    """Convert a JSON file of instruction/response QA pairs into a
    HuggingFace-style chat dataset (CSV + JSON) plus a statistics file."""

    def __init__(self, input_file: str, output_dir: str):
        """
        Initialize the converter with input and output paths.

        Args:
            input_file (str): Path to the input JSON file.
            output_dir (str): Directory to save the output files
                (created if it does not exist).
        """
        self.input_file = Path(input_file)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Set up logging on the root logger (kept as-is: original behavior)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _format_message(self, instruction: str, response: str) -> List[Dict]:
        """
        Format a single instruction-response pair into the chat message format.

        Args:
            instruction (str): The user instruction/question.
            response (str): The assistant's response.

        Returns:
            List[Dict]: Two dicts — a 'user' message followed by an
            'assistant' message, each with 'content' and 'role' keys.
        """
        return [
            {"content": instruction, "role": "user"},
            {"content": response, "role": "assistant"}
        ]

    def convert(self) -> None:
        """
        Convert the input JSON file to HuggingFace dataset format.

        Reads the input JSON — either a raw list of pairs or an object
        with a 'pairs' key — then writes timestamped CSV and JSON dataset
        files plus a dataset_stats JSON file into `output_dir`. Pairs
        missing 'instruction' or 'response' keys are skipped with a warning.

        Raises:
            Exception: re-raised after logging if any step fails.
        """
        try:
            # Read input JSON file
            logging.info(f"Reading input file: {self.input_file}")
            with open(self.input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # BUG FIX: the original called data.get('pairs') unconditionally,
            # which raises AttributeError when the file holds a raw list —
            # the isinstance fallback was unreachable. Check the type first.
            if isinstance(data, list):
                qa_pairs = data
            else:
                qa_pairs = data.get('pairs', [])

            logging.info(f"Found {len(qa_pairs)} QA pairs")

            # Create dataset records, skipping malformed pairs
            dataset_records = []
            for idx, pair in enumerate(qa_pairs):
                try:
                    messages = self._format_message(
                        pair['instruction'],
                        pair['response']
                    )
                    dataset_records.append({
                        'id': f'bloomington_{idx:05d}',
                        'messages': messages
                    })
                except KeyError as e:
                    logging.warning(f"Skipping invalid pair at index {idx}: {e}")

            # Convert to DataFrame
            df = pd.DataFrame(dataset_records)

            # Save as CSV and JSON with a shared timestamp
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            csv_path = self.output_dir / f'bloomington_dataset_{timestamp}.csv'
            json_path = self.output_dir / f'bloomington_dataset_{timestamp}.json'

            df.to_csv(csv_path, index=False)
            df.to_json(json_path, orient='records', indent=2)

            logging.info(f"Successfully converted {len(dataset_records)} records")
            logging.info(f"Saved dataset to:\n- CSV: {csv_path}\n- JSON: {json_path}")

            # Generate and save dataset statistics.
            # BUG FIX: guard the averages against division by zero when no
            # valid records were produced (empty input or all pairs skipped).
            n_records = len(dataset_records)
            stats = {
                'total_records': n_records,
                'avg_instruction_length': (
                    sum(len(r['messages'][0]['content']) for r in dataset_records) / n_records
                    if n_records else 0.0
                ),
                'avg_response_length': (
                    sum(len(r['messages'][1]['content']) for r in dataset_records) / n_records
                    if n_records else 0.0
                ),
                'timestamp': timestamp
            }

            with open(self.output_dir / f'dataset_stats_{timestamp}.json', 'w') as f:
                json.dump(stats, f, indent=2)

        except Exception as e:
            logging.error(f"Error converting dataset: {e}", exc_info=True)
            raise

if __name__ == "__main__":
    # Example usage: convert the final QA pairs into HuggingFace format.
    source_json = "data/final/final_qa_pairs.json"
    target_dir = "data/huggingface"
    DatasetConverter(input_file=source_json, output_dir=target_dir).convert()