| | |
| | """ |
| | Check original Dolci dataset format to understand the structure. |
| | """ |
| |
|
| | from datasets import load_dataset |
| | import json |
| |
|
def _preview(value, limit=200):
    """Return *value* as text, cut to *limit* characters with '...' appended if cut.

    Non-string values are converted with ``str()`` first so that slicing and
    the length check always operate on characters rather than on list/dict
    elements.
    """
    text = value if isinstance(value, str) else str(value)
    return f"{text[:limit]}{'...' if len(text) > limit else ''}"


def main(tail_size=20000, offsets=(0, 100, 500), preview_len=200):
    """Print a few samples from the Dolci tool-use dataset to inspect their layout.

    Loads the ``train`` split of ``allenai/Dolci-Instruct-SFT-Tool-Use``, picks
    rows relative to the start of the last ``tail_size`` rows, and prints each
    message's role plus truncated previews of its content and function calls.

    Args:
        tail_size: How many rows from the end of the split the sampling window
            starts at. Clamped so the window never starts before row 0.
        offsets: Row offsets, relative to the window start, of the samples to
            print. Offsets that land outside the dataset are skipped.
        preview_len: Maximum number of characters shown for ``content`` and
            ``function_calls`` before the preview is cut with ``...``.
    """
    print("Loading dataset...")
    dataset = load_dataset("allenai/Dolci-Instruct-SFT-Tool-Use", split="train")

    total = len(dataset)
    # Clamp to 0: with fewer than ``tail_size`` rows the plain subtraction
    # would go negative and index outside the dataset.
    start_idx = max(0, total - tail_size)
    sample_indices = [
        start_idx + offset
        for offset in offsets
        if 0 <= start_idx + offset < total
    ]

    banner = "=" * 60
    for idx in sample_indices:
        print(f"\n{banner}")
        print(f"Sample {idx}:")
        print(banner)

        sample = dataset[idx]
        messages = sample['messages']

        print(f"\nTotal messages: {len(messages)}")

        for i, msg in enumerate(messages):
            role = msg.get('role', '')
            content = msg.get('content', '')
            # NOTE(review): presumably a serialized string of tool calls;
            # _preview also copes with list/dict values -- confirm the schema.
            function_calls = msg.get('function_calls')
            functions = msg.get('functions')

            print(f"\n--- Message {i} ---")
            print(f"Role: {role}")
            # Empty or missing fields are skipped to keep the output compact.
            if content:
                print(f"Content: {_preview(content, preview_len)}")
            if function_calls:
                print(f"Function calls: {_preview(function_calls, preview_len)}")
            if functions:
                print(f"Has functions: True (length: {len(functions)})")
| |
|
# Run the inspection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
| |
|