llm / check_original_format.py
dongxx1104's picture
Upload folder using huggingface_hub
db704cb verified
#!/usr/bin/env python3
"""
Check original Dolci dataset format to understand the structure.
"""
from datasets import load_dataset
import json
def _preview(value, limit):
    """Return the first *limit* items of *value*, plus '...' if it was cut."""
    return f"{value[:limit]}{'...' if len(value) > limit else ''}"


def main():
    """Print a truncated preview of a few samples near the end of the dataset.

    Loads the ``train`` split of ``allenai/Dolci-Instruct-SFT-Tool-Use`` and
    inspects three rows located 0, 100 and 500 rows after the start of the
    final 20,000-row window. For each row it prints the number of messages
    and, per message, the role, a preview of ``content`` and
    ``function_calls`` (cut at 200 items, i.e. characters when the field is
    a string) and the length of ``functions`` when those fields are truthy.

    If the split holds fewer than 20,000 rows the window starts at row 0,
    and any sample index past the end of the split is skipped with a notice
    rather than wrapping around or raising ``IndexError``.
    """
    tail_size = 20000
    sample_offsets = (0, 100, 500)
    preview_len = 200
    rule = "=" * 60

    print("Loading dataset...")
    dataset = load_dataset("allenai/Dolci-Instruct-SFT-Tool-Use", split="train")
    total_rows = len(dataset)

    # Check samples from the last 20k. Clamp at 0: on a split with fewer
    # than 20k rows an unclamped start would be negative, which either
    # indexes from the end (wrong rows) or falls out of range.
    start_idx = max(0, total_rows - tail_size)
    sample_indices = [start_idx + offset for offset in sample_offsets]

    for idx in sample_indices:
        if idx >= total_rows:
            print(f"\nSkipping sample {idx}: dataset has only {total_rows} rows")
            continue

        print(f"\n{rule}")
        print(f"Sample {idx}:")
        print(f"{rule}")

        sample = dataset[idx]
        messages = sample['messages']
        print(f"\nTotal messages: {len(messages)}")

        for i, msg in enumerate(messages):
            role = msg.get('role', '')
            content = msg.get('content', '')
            function_calls = msg.get('function_calls')
            functions = msg.get('functions')

            print(f"\n--- Message {i} ---")
            print(f"Role: {role}")
            # Each optional field is printed only when present and non-empty.
            if content:
                print(f"Content: {_preview(content, preview_len)}")
            if function_calls:
                print(f"Function calls: {_preview(function_calls, preview_len)}")
            if functions:
                print(f"Has functions: True (length: {len(functions)})")
# Run the inspection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()