krishna3103
/

Bloomington

Model card Files Files and versions

Bloomington / post_processor.py

krishna3103's picture

Upload 8 files

0baf78e verified about 1 year ago

history blame contribute delete

4.42 kB

	import json
	import pandas as pd
	from pathlib import Path
	from typing import List, Dict
	import logging
	from datetime import datetime

	class DatasetConverter:
	    """Convert a JSON file of instruction/response pairs into dataset files.

	    Every usable pair becomes a record holding an ``id`` and a two-entry
	    ``messages`` list (user turn followed by assistant turn). ``convert``
	    writes the records as CSV and JSON, plus a small statistics file, into
	    ``output_dir``; all three file names share one timestamp.
	    """

	    def __init__(self, input_file: str, output_dir: str):
	        """
	        Initialize the converter with input and output paths

	        Args:
	            input_file (str): Path to the input JSON file
	            output_dir (str): Directory to save the output files; it is
	                created (including parents) if it does not exist yet
	        """
	        self.input_file = Path(input_file)
	        self.output_dir = Path(output_dir)
	        self.output_dir.mkdir(parents=True, exist_ok=True)

	        # Set up logging on the root logger (kept here for backward
	        # compatibility with callers that rely on it being configured).
	        logging.basicConfig(
	            level=logging.INFO,
	            format='%(asctime)s - %(levelname)s - %(message)s'
	        )

	    def _format_message(self, instruction: str, response: str) -> List[Dict]:
	        """
	        Format a single instruction-response pair into the required message format

	        Args:
	            instruction (str): The user instruction/question
	            response (str): The assistant's response

	        Returns:
	            List[Dict]: Two messages, the user turn first and the
	                assistant turn second
	        """
	        return [
	            {"content": instruction, "role": "user"},
	            {"content": response, "role": "assistant"}
	        ]

	    @staticmethod
	    def _extract_pairs(data: object) -> List:
	        """
	        Pull the list of QA pairs out of the decoded JSON document

	        Args:
	            data: Decoded JSON; either a raw list of pairs or a dict that
	                holds the list under the ``'pairs'`` key

	        Returns:
	            List: The QA pairs (empty when a dict has no usable ``'pairs'``)

	        Raises:
	            ValueError: If the document is neither a list nor a dict, or
	                if ``'pairs'`` holds something other than a list
	        """
	        # Check for a raw list first: lists have no ``.get`` so the dict
	        # lookup must never be attempted on them.
	        if isinstance(data, list):
	            return data
	        if not isinstance(data, dict):
	            raise ValueError(
	                "Expected a JSON list or an object with a 'pairs' list, "
	                f"got {type(data).__name__}"
	            )
	        # Missing, null or empty 'pairs' all mean "no pairs".
	        pairs = data.get('pairs') or []
	        if not isinstance(pairs, list):
	            raise ValueError(
	                f"'pairs' must be a list, got {type(pairs).__name__}"
	            )
	        return pairs

	    def _build_records(self, qa_pairs: List) -> List[Dict]:
	        """
	        Turn raw QA pairs into dataset records, skipping malformed entries

	        Args:
	            qa_pairs (List): Raw entries; each is expected to be a dict
	                with ``'instruction'`` and ``'response'`` keys

	        Returns:
	            List[Dict]: Records with ``'id'`` and ``'messages'`` keys. The
	                id embeds the entry's index in ``qa_pairs``, so skipped
	                entries leave gaps in the numbering.
	        """
	        records = []
	        for idx, pair in enumerate(qa_pairs):
	            try:
	                messages = self._format_message(
	                    pair['instruction'],
	                    pair['response']
	                )
	            except (KeyError, TypeError) as e:
	                # KeyError: a required key is missing.
	                # TypeError: the entry is not a mapping at all
	                # (e.g. a bare string or null).
	                logging.warning(
	                    "Skipping invalid pair at index %s: %s", idx, e
	                )
	                continue
	            records.append({
	                'id': f'bloomington_{idx:05d}',
	                'messages': messages
	            })
	        return records

	    @staticmethod
	    def _compute_stats(dataset_records: List[Dict], timestamp: str) -> Dict:
	        """
	        Summarise the converted records

	        Args:
	            dataset_records (List[Dict]): Records produced by
	                ``_build_records``
	            timestamp (str): Timestamp string stored alongside the numbers

	        Returns:
	            Dict: Record count and average instruction/response lengths
	                in characters; the averages are 0.0 for an empty dataset
	        """
	        total = len(dataset_records)
	        instruction_chars = sum(
	            len(record['messages'][0]['content']) for record in dataset_records
	        )
	        response_chars = sum(
	            len(record['messages'][1]['content']) for record in dataset_records
	        )
	        return {
	            'total_records': total,
	            # Guard the division so an empty dataset does not raise.
	            'avg_instruction_length': instruction_chars / total if total else 0.0,
	            'avg_response_length': response_chars / total if total else 0.0,
	            'timestamp': timestamp
	        }

	    def convert(self) -> None:
	        """
	        Convert the input JSON file to HuggingFace dataset format

	        Writes ``bloomington_dataset_<timestamp>.csv``,
	        ``bloomington_dataset_<timestamp>.json`` and
	        ``dataset_stats_<timestamp>.json`` into the output directory.

	        Raises:
	            Exception: Any failure is logged with its traceback and
	                re-raised unchanged
	        """
	        try:
	            # Read input JSON file
	            logging.info("Reading input file: %s", self.input_file)
	            with open(self.input_file, 'r', encoding='utf-8') as f:
	                data = json.load(f)

	            # Handle both raw list and nested {'pairs': [...]} structure
	            qa_pairs = self._extract_pairs(data)
	            logging.info("Found %s QA pairs", len(qa_pairs))

	            # Create dataset records
	            dataset_records = self._build_records(qa_pairs)

	            # Explicit columns keep the header/schema even with no records
	            df = pd.DataFrame(dataset_records, columns=['id', 'messages'])

	            # Save as CSV and JSON
	            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
	            csv_path = self.output_dir / f'bloomington_dataset_{timestamp}.csv'
	            json_path = self.output_dir / f'bloomington_dataset_{timestamp}.json'

	            df.to_csv(csv_path, index=False)
	            df.to_json(json_path, orient='records', indent=2)

	            logging.info(
	                "Successfully converted %s records", len(dataset_records)
	            )
	            logging.info(
	                "Saved dataset to:\n- CSV: %s\n- JSON: %s", csv_path, json_path
	            )

	            # Generate and save dataset statistics
	            stats = self._compute_stats(dataset_records, timestamp)
	            stats_path = self.output_dir / f'dataset_stats_{timestamp}.json'
	            with open(stats_path, 'w', encoding='utf-8') as f:
	                json.dump(stats, f, indent=2)

	        except Exception as e:
	            logging.error("Error converting dataset: %s", e, exc_info=True)
	            raise

	if __name__ == "__main__":
	    # Example usage: convert the project's final QA pairs and write the
	    # resulting dataset files under data/huggingface.
	    converter = DatasetConverter(
	        "data/final/final_qa_pairs.json", "data/huggingface"
	    )
	    converter.convert()