File size: 4,419 Bytes
0baf78e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import json
import pandas as pd
from pathlib import Path
from typing import List, Dict
import logging
from datetime import datetime
class DatasetConverter:
    """Convert a JSON file of instruction/response QA pairs into
    HuggingFace-style chat datasets (CSV + JSON), plus a stats file."""

    def __init__(self, input_file: str, output_dir: str):
        """
        Initialize the converter with input and output paths

        Args:
            input_file (str): Path to the input JSON file
            output_dir (str): Directory to save the output files
        """
        self.input_file = Path(input_file)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _format_message(self, instruction: str, response: str) -> List[Dict]:
        """
        Format a single instruction-response pair into the required message format

        Args:
            instruction (str): The user instruction/question
            response (str): The assistant's response

        Returns:
            List[Dict]: Formatted message list (user turn, then assistant turn)
        """
        return [
            {"content": instruction, "role": "user"},
            {"content": response, "role": "assistant"}
        ]

    def convert(self) -> None:
        """
        Convert the input JSON file to HuggingFace dataset format

        Reads the input JSON (either a raw list of pairs or a dict with a
        'pairs' key), writes timestamped CSV/JSON dataset files and a
        statistics JSON into the output directory.

        Raises:
            Exception: re-raised after logging if any step fails.
        """
        try:
            # Read input JSON file
            logging.info(f"Reading input file: {self.input_file}")
            with open(self.input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Extract QA pairs from the JSON structure.
            # BUG FIX: the original called data.get('pairs', []) before the
            # list check, so a raw-list JSON root raised AttributeError and
            # the fallback was unreachable. Check the type first.
            if isinstance(data, list):
                qa_pairs = data
            elif isinstance(data, dict):
                qa_pairs = data.get('pairs', [])
            else:
                qa_pairs = []
            logging.info(f"Found {len(qa_pairs)} QA pairs")

            # Create dataset records
            dataset_records = []
            for idx, pair in enumerate(qa_pairs):
                try:
                    messages = self._format_message(
                        pair['instruction'],
                        pair['response']
                    )
                    dataset_records.append({
                        'id': f'bloomington_{idx:05d}',
                        'messages': messages
                    })
                except KeyError as e:
                    # Pair missing 'instruction' or 'response' — skip it.
                    logging.warning(f"Skipping invalid pair at index {idx}: {e}")

            # Convert to DataFrame
            df = pd.DataFrame(dataset_records)

            # Save as CSV and JSON
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            csv_path = self.output_dir / f'bloomington_dataset_{timestamp}.csv'
            json_path = self.output_dir / f'bloomington_dataset_{timestamp}.json'
            df.to_csv(csv_path, index=False)
            df.to_json(json_path, orient='records', indent=2)
            logging.info(f"Successfully converted {len(dataset_records)} records")
            logging.info(f"Saved dataset to:\n- CSV: {csv_path}\n- JSON: {json_path}")

            # Generate and save dataset statistics.
            # BUG FIX: guard the averages — an empty record list previously
            # raised ZeroDivisionError.
            n_records = len(dataset_records)
            stats = {
                'total_records': n_records,
                'avg_instruction_length': (
                    sum(len(record['messages'][0]['content'])
                        for record in dataset_records) / n_records
                    if n_records else 0.0
                ),
                'avg_response_length': (
                    sum(len(record['messages'][1]['content'])
                        for record in dataset_records) / n_records
                    if n_records else 0.0
                ),
                'timestamp': timestamp
            }
            with open(self.output_dir / f'dataset_stats_{timestamp}.json', 'w') as f:
                json.dump(stats, f, indent=2)
        except Exception as e:
            logging.error(f"Error converting dataset: {e}", exc_info=True)
            raise
if __name__ == "__main__":
# Example usage
converter = DatasetConverter(
input_file="data/final/final_qa_pairs.json",
output_dir="data/huggingface"
)
converter.convert() |