|
|
import json
|
|
|
import pandas as pd
|
|
|
from pathlib import Path
|
|
|
from typing import List, Dict
|
|
|
import logging
|
|
|
from datetime import datetime
|
|
|
|
|
|
class DatasetConverter:
    """Convert a JSON file of instruction/response pairs into chat-style dataset files.

    The input may be either a JSON object holding the pairs under the
    ``'pairs'`` key or a bare JSON list of pairs. Each valid pair becomes one
    record with an ``id`` and a two-element ``messages`` list (user turn,
    assistant turn). Records are written as CSV and JSON, alongside a small
    JSON file with summary statistics.
    """

    def __init__(self, input_file: str, output_dir: str):
        """
        Initialize the converter with input and output paths

        The output directory (and any missing parents) is created immediately,
        and the root logger is configured at INFO level.

        Args:
            input_file (str): Path to the input JSON file
            output_dir (str): Directory to save the output files
        """
        self.input_file = Path(input_file)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

    def _format_message(self, instruction: str, response: str) -> List[Dict]:
        """
        Format a single instruction-response pair into the required message format

        Args:
            instruction (str): The user instruction/question
            response (str): The assistant's response

        Returns:
            List[Dict]: Two messages, the user turn followed by the assistant turn
        """
        return [
            {"content": instruction, "role": "user"},
            {"content": response, "role": "assistant"}
        ]

    @staticmethod
    def _extract_pairs(data) -> List:
        """
        Pull the list of QA pairs out of the decoded JSON document

        Args:
            data: Decoded JSON; either a dict with a 'pairs' key or a list of pairs

        Returns:
            List: The QA pairs (possibly empty)

        Raises:
            ValueError: If the top-level JSON value is neither an object nor a list
        """
        # Check the type first: calling .get() on a list raises AttributeError,
        # which previously made the bare-list input format unusable.
        if isinstance(data, dict):
            return data.get('pairs') or []
        if isinstance(data, list):
            return data
        raise ValueError(
            f"Unsupported JSON structure: expected an object or a list, "
            f"got {type(data).__name__}"
        )

    def _build_records(self, qa_pairs: List) -> List[Dict]:
        """
        Turn raw QA pairs into dataset records, skipping malformed entries

        Args:
            qa_pairs (List): Raw pairs, each expected to be a dict with
                'instruction' and 'response' keys

        Returns:
            List[Dict]: Records with 'id' and 'messages' keys
        """
        records = []
        for idx, pair in enumerate(qa_pairs):
            try:
                instruction = pair['instruction']
                response = pair['response']
            except (KeyError, TypeError) as e:
                # KeyError: a required field is missing.
                # TypeError: the entry is not a mapping at all (e.g. str, None).
                logging.warning(f"Skipping invalid pair at index {idx}: {e}")
                continue

            # The id keeps the pair's position in the input, so skipped
            # entries leave gaps rather than renumbering later records.
            records.append({
                'id': f'bloomington_{idx:05d}',
                'messages': self._format_message(instruction, response)
            })
        return records

    @staticmethod
    def _average_content_length(records: List[Dict], position: int) -> float:
        """
        Average length of the message content at ``position`` across records

        Args:
            records (List[Dict]): Dataset records produced by _build_records
            position (int): Index into each record's 'messages' list
                (0 = user instruction, 1 = assistant response)

        Returns:
            float: The mean content length, or 0.0 when there are no records
        """
        if not records:
            # Avoid ZeroDivisionError for empty or fully-invalid inputs.
            return 0.0
        total = sum(len(record['messages'][position]['content']) for record in records)
        return total / len(records)

    def convert(self) -> None:
        """
        Convert the input JSON file to HuggingFace dataset format

        Writes three timestamped files into the output directory: the dataset
        as CSV, the dataset as JSON, and a JSON file of summary statistics.

        Raises:
            Exception: Any error raised while reading, converting or writing
                is logged with its traceback and then re-raised
        """
        try:
            logging.info(f"Reading input file: {self.input_file}")
            with open(self.input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            qa_pairs = self._extract_pairs(data)
            logging.info(f"Found {len(qa_pairs)} QA pairs")

            dataset_records = self._build_records(qa_pairs)

            # Pin the columns so an empty result still has the expected schema.
            df = pd.DataFrame(dataset_records, columns=['id', 'messages'])

            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            csv_path = self.output_dir / f'bloomington_dataset_{timestamp}.csv'
            json_path = self.output_dir / f'bloomington_dataset_{timestamp}.json'

            df.to_csv(csv_path, index=False)
            df.to_json(json_path, orient='records', indent=2)

            logging.info(f"Successfully converted {len(dataset_records)} records")
            logging.info(f"Saved dataset to:\n- CSV: {csv_path}\n- JSON: {json_path}")

            stats = {
                'total_records': len(dataset_records),
                'avg_instruction_length': self._average_content_length(dataset_records, 0),
                'avg_response_length': self._average_content_length(dataset_records, 1),
                'timestamp': timestamp
            }

            stats_path = self.output_dir / f'dataset_stats_{timestamp}.json'
            with open(stats_path, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=2)

        except Exception as e:
            logging.error(f"Error converting dataset: {e}", exc_info=True)
            raise
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: convert the project's final QA pairs into the
    # HuggingFace-style dataset files using the default repository paths.
    converter = DatasetConverter(input_file="data/final/final_qa_pairs.json", output_dir="data/huggingface")
    converter.convert()