"""debug_downloaded_data.py - Inspect the downloaded conversation format"""

import json

# Default location of the raw JSONL dump inspected by this script.
DEFAULT_DATA_PATH = "data/conversation_raw/OpenAssistant_oasst1_raw.jsonl"

# Top-level keys that are explored one level deeper when they hold a dict/list.
NESTED_KEYS = ("prompt", "conversation", "messages")


def _summarize_value(value, max_len=100):
    """Return a compact printable stand-in for *value*.

    Long strings are truncated to *max_len* characters, dicts are replaced by
    their key list and lists by their length; anything else is returned as is.
    """
    if isinstance(value, str) and len(value) > max_len:
        return value[:max_len] + "..."
    if isinstance(value, dict):
        return f"Dict with keys: {list(value.keys())}"
    if isinstance(value, list):
        return f"List with {len(value)} items"
    return value


def _explore_nested(key, nested):
    """Print a one-level-deep overview of the dict or list stored under *key*."""
    print(f"\n Exploring {key}:")
    if isinstance(nested, dict):
        print(f" Keys: {list(nested.keys())}")
        # Only the first three entries are shown to keep the output short.
        for nkey, nvalue in list(nested.items())[:3]:
            if isinstance(nvalue, str) and len(nvalue) > 50:
                nvalue = nvalue[:50] + "..."
            print(f" {nkey}: {nvalue}")
    elif isinstance(nested, list) and nested:
        first = nested[0]
        print(f" First item type: {type(first)}")
        # Inspect the first *element*: the list itself is never a dict, so
        # testing the container here would make this branch unreachable.
        if isinstance(first, dict):
            print(f" First item keys: {list(first.keys())}")


def inspect_downloaded_data(data_path=DEFAULT_DATA_PATH, max_records=5):
    """Inspect the first few records to understand the format.

    Reads up to *max_records* lines from the JSONL file at *data_path*, parses
    each non-blank line as JSON and prints its top-level keys, a shortened
    preview of every value, and a one-level-deep look at the keys listed in
    ``NESTED_KEYS``.

    Args:
        data_path: Path of the JSONL file to read (UTF-8).
        max_records: Maximum number of lines to read from the start of the
            file. Blank lines count towards this limit but are not printed.

    Returns:
        None. All results are written to standard output.

    I/O, decoding and JSON parsing errors are not raised; they are reported
    on standard output as ``Error reading file: ...`` and inspection stops.
    """
    print("🔍 Inspecting downloaded OpenAssistant data...")
    print("=" * 50)

    try:
        with open(data_path, "r", encoding="utf-8") as f:
            # zip() stops once the range is exhausted, so at most
            # max_records lines are ever pulled from the file.
            for number, raw_line in zip(range(1, max_records + 1), f):
                line = raw_line.strip()
                if not line:
                    continue

                record = json.loads(line)
                print(f"\nRecord {number}:")

                if not isinstance(record, dict):
                    # A valid JSON line need not be an object; report it
                    # instead of failing on the missing .keys().
                    print(f"Top-level value is {type(record).__name__}, not an object")
                    continue

                print(f"Top-level keys: {list(record.keys())}")

                # Show sample content for each key
                for key, value in record.items():
                    print(f" {key}: {_summarize_value(value)}")

                # If there's a nested structure, explore it
                for key in NESTED_KEYS:
                    nested = record.get(key)
                    if isinstance(nested, (dict, list)):
                        _explore_nested(key, nested)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading file: {e}")


if __name__ == "__main__":
    inspect_downloaded_data()