import json
import pandas as pd
from pathlib import Path
from typing import List, Dict
import logging
from datetime import datetime


class DatasetConverter:
    """Convert a JSON file of instruction/response pairs into chat-style records.

    The input may be either a top-level JSON list of pairs or a JSON object
    holding the list under the ``'pairs'`` key. Each pair is a mapping with
    ``'instruction'`` and ``'response'`` keys. The converter writes a CSV
    file, a JSON file and a small statistics file into ``output_dir``.
    """

    def __init__(self, input_file: str, output_dir: str):
        """
        Initialize the converter with input and output paths

        Creates ``output_dir`` (including parents) if it does not exist and
        configures root logging at INFO level via ``logging.basicConfig``.

        Args:
            input_file (str): Path to the input JSON file
            output_dir (str): Directory to save the output files
        """
        self.input_file = Path(input_file)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _format_message(self, instruction: str, response: str) -> List[Dict]:
        """
        Format a single instruction-response pair into the required message format

        Args:
            instruction (str): The user instruction/question
            response (str): The assistant's response

        Returns:
            List[Dict]: Two messages, the user turn followed by the assistant turn
        """
        return [
            {"content": instruction, "role": "user"},
            {"content": response, "role": "assistant"}
        ]

    def _load_pairs(self) -> List:
        """Read the input file and return its list of QA pairs.

        Raises:
            ValueError: If the top-level JSON value is neither a list nor an object.
        """
        with open(self.input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check the container type BEFORE calling .get(): a raw list has no
        # .get() method, so the nested lookup must only run on objects.
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            return data.get('pairs', [])
        raise ValueError(
            f"Unsupported JSON structure in {self.input_file}: "
            f"expected a list or an object with a 'pairs' key, "
            f"got {type(data).__name__}"
        )

    def _build_records(self, qa_pairs: List) -> List[Dict]:
        """Turn raw pairs into dataset records, skipping malformed entries."""
        dataset_records = []
        for idx, pair in enumerate(qa_pairs):
            try:
                messages = self._format_message(
                    pair['instruction'],
                    pair['response']
                )
            except (KeyError, TypeError) as e:
                # KeyError: a required key is missing.
                # TypeError: the entry is not a mapping at all (e.g. a string).
                logging.warning(f"Skipping invalid pair at index {idx}: {e}")
                continue
            # The id keeps the pair's original index, so skipped entries
            # leave gaps in the numbering.
            dataset_records.append({
                'id': f'bloomington_{idx:05d}',
                'messages': messages
            })
        return dataset_records

    def _compute_stats(self, dataset_records: List[Dict], timestamp: str) -> Dict:
        """Compute summary statistics; averages are 0.0 for an empty dataset."""
        total = len(dataset_records)
        if total:
            avg_instruction = sum(
                len(record['messages'][0]['content']) for record in dataset_records
            ) / total
            avg_response = sum(
                len(record['messages'][1]['content']) for record in dataset_records
            ) / total
        else:
            # Avoid dividing by zero when no valid pairs were found.
            avg_instruction = 0.0
            avg_response = 0.0
        return {
            'total_records': total,
            'avg_instruction_length': avg_instruction,
            'avg_response_length': avg_response,
            'timestamp': timestamp
        }

    def convert(self) -> None:
        """
        Convert the input JSON file to HuggingFace dataset format

        Writes ``bloomington_dataset_<timestamp>.csv``,
        ``bloomington_dataset_<timestamp>.json`` and
        ``dataset_stats_<timestamp>.json`` into the output directory.

        Raises:
            Exception: Any error raised while reading, converting or writing
                is logged with its traceback and then re-raised.
        """
        try:
            # Read input JSON file
            logging.info(f"Reading input file: {self.input_file}")
            qa_pairs = self._load_pairs()
            logging.info(f"Found {len(qa_pairs)} QA pairs")

            # Create dataset records
            dataset_records = self._build_records(qa_pairs)

            # Explicit columns keep the header present even with zero records.
            df = pd.DataFrame(dataset_records, columns=['id', 'messages'])

            # Save as CSV and JSON
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            csv_path = self.output_dir / f'bloomington_dataset_{timestamp}.csv'
            json_path = self.output_dir / f'bloomington_dataset_{timestamp}.json'

            df.to_csv(csv_path, index=False)
            df.to_json(json_path, orient='records', indent=2)

            logging.info(f"Successfully converted {len(dataset_records)} records")
            logging.info(f"Saved dataset to:\n- CSV: {csv_path}\n- JSON: {json_path}")

            # Generate and save dataset statistics
            stats = self._compute_stats(dataset_records, timestamp)
            stats_path = self.output_dir / f'dataset_stats_{timestamp}.json'
            with open(stats_path, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=2)

        except Exception as e:
            logging.error(f"Error converting dataset: {e}", exc_info=True)
            raise


if __name__ == "__main__":
    # Example usage
    converter = DatasetConverter(
        input_file="data/final/final_qa_pairs.json",
        output_dir="data/huggingface"
    )
    converter.convert()