| | |
| | """ |
| | Extract 50k samples without tool calls from Dolci-Instruct-SFT dataset |
| | """ |
| | import os |
| | import json |
| | import pyarrow.parquet as pq |
| | from pathlib import Path |
| |
|
| | |
# Directory holding the source parquet shards of the Dolci-Instruct-SFT dataset.
SOURCE_DIR = "/shared_workspace_mfs/ximing/datasets/Dolci-Instruct-SFT/data"
# Destination directory where the filtered dataset JSON is written (LLaMA-Factory data dir).
DEST_DIR = "/shared_workspace_mfs/ximing/LLaMA-Factory/data"
# Filename of the output JSON containing the filtered samples.
OUTPUT_FILE = "dolci_50k_no_tool_call.json"
# Stop collecting once this many no-tool-call samples have been gathered.
TARGET_COUNT = 50000
| |
|
def has_tool_call(conversations):
    """Return True if any message in *conversations* involves a tool call.

    A message counts as a tool call when its role is "tool" or "function",
    or when it carries a non-empty "tool_calls" or "function_call" field
    (OpenAI-style assistant tool-invocation markers).

    Args:
        conversations: iterable of message dicts, or None/empty.

    Returns:
        bool: True if any tool-call marker is found, False otherwise.
    """
    if not conversations:
        return False

    for msg in conversations:
        # Messages emitted by tool/function execution.
        if msg.get("role", "") in ("tool", "function"):
            return True

        # Assistant-side tool invocation markers; .get() covers both the
        # "key present" and "value truthy" checks in one lookup.
        if msg.get("tool_calls"):
            return True

        if msg.get("function_call"):
            return True

    return False
| |
|
def main():
    """Scan the source parquet shards, collect up to TARGET_COUNT samples
    whose conversations contain no tool calls, and dump them as one JSON
    file under DEST_DIR.

    Side effects: reads parquet files from SOURCE_DIR, creates DEST_DIR if
    missing, writes DEST_DIR/OUTPUT_FILE, and prints progress/summary.
    """
    parquet_files = sorted(Path(SOURCE_DIR).glob("train-*.parquet"))
    print(f"Found {len(parquet_files)} parquet files")

    no_tool_call_samples = []
    total_processed = 0

    for parquet_file in parquet_files:
        print(f"\nProcessing: {parquet_file.name}")

        table = pq.read_table(parquet_file)
        # Ask the table for its row count directly instead of probing an
        # arbitrary first column (which would crash on a zero-column table).
        num_rows = table.num_rows
        data = table.to_pydict()
        print(f" Total rows: {num_rows}")

        for idx in range(num_rows):
            total_processed += 1

            # Materialize one row as a plain dict of column -> value.
            row_data = {key: column[idx] for key, column in data.items()}
            conversations = row_data.get("conversations", [])

            if not has_tool_call(conversations):
                no_tool_call_samples.append(row_data)

            if len(no_tool_call_samples) >= TARGET_COUNT:
                print(f"\n✓ Reached target of {TARGET_COUNT} samples!")
                break

            if (idx + 1) % 1000 == 0:
                print(f" Progress: {idx + 1}/{num_rows}, no tool call: {len(no_tool_call_samples)}")

        print(f" No tool call samples so far: {len(no_tool_call_samples)}")

        if len(no_tool_call_samples) >= TARGET_COUNT:
            break

    # Ensure the destination directory exists before writing the output.
    os.makedirs(DEST_DIR, exist_ok=True)
    output_path = os.path.join(DEST_DIR, OUTPUT_FILE)
    print(f"\nSaving {len(no_tool_call_samples)} samples to: {output_path}")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(no_tool_call_samples, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*60}")
    print(f"Summary:")
    print(f" Total processed: {total_processed}")
    print(f" No tool call samples: {len(no_tool_call_samples)}")
    print(f" Output file: {output_path}")
    print(f" File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
    print(f"{'='*60}")
|
# Script entry point: run the extraction only when executed directly.
if __name__ == "__main__":
    main()
| |
|