"""Generate a synthetic autism-screening sample dataset and save it as CSV.

Running this file as a script builds a reproducible random dataset and
writes it to ``data/autism_screening.csv``, creating the ``data`` directory
when it does not exist yet.
"""

from pathlib import Path

import numpy as np
import pandas as pd

# Binary (0/1) questionnaire items, in the column order written to the CSV.
QUESTION_COLUMNS = (
    'A1_prefer_detail_not_big_picture',
    'A2_must_have_sameness',
    'A3_prefer_reading_systematically',
    'A4_feel_anxious_in_social',
    'A5_prefer_talking_one_to_one',
    'A6_notice_small_changes',
    'A7_trouble_focus_on_changing',
    'A8_often_daydream',
    'A9_focused_on_one_topic',
    'A10_difficult_small_talk',
)

# Categorical columns and the values each one is sampled from.
# NOTE: 'jundice' is spelled this way on purpose -- it is the column name
# written to the CSV, so renaming it would break any reader of that file.
CATEGORICAL_CHOICES = {
    'gender': ['M', 'F'],
    'ethnicity': ['White', 'Asian', 'Black', 'Others'],
    'jundice': ['yes', 'no'],
    'autism_family_member': ['yes', 'no'],
    'country': ['USA', 'UK', 'Canada', 'India'],
    'used_app_before': ['yes', 'no'],
    'screening_type': ['Questionnaire', 'Interview'],
}

# Only these five items contribute to the score that decides the label.
SCORED_COLUMNS = (
    'A1_prefer_detail_not_big_picture',
    'A2_must_have_sameness',
    'A4_feel_anxious_in_social',
    'A9_focused_on_one_topic',
    'A10_difficult_small_talk',
)

# A record is labelled 'YES' when its score reaches this value.
CLASS_THRESHOLD = 3

DEFAULT_OUTPUT_PATH = Path('data') / 'autism_screening.csv'


def build_dataset(n_samples: int = 704, seed: int = 42) -> pd.DataFrame:
    """Return a reproducible synthetic screening dataset.

    Args:
        n_samples: Number of records (rows) to generate.
        seed: Seed passed to ``np.random.seed``; the same seed and sample
            count always produce the same frame.

    Returns:
        A DataFrame with the ten questionnaire columns, ``age``, the
        categorical columns and a ``Class`` column holding ``'YES'`` or
        ``'NO'``.
    """
    # Seeds NumPy's global generator. The draws below must stay in this
    # exact order, otherwise the generated values change for a given seed.
    np.random.seed(seed)

    data = {
        column: np.random.randint(0, 2, n_samples)
        for column in QUESTION_COLUMNS
    }
    data['age'] = np.random.randint(18, 80, n_samples)  # upper bound exclusive
    for column, choices in CATEGORICAL_CHOICES.items():
        data[column] = np.random.choice(choices, n_samples)

    # Element-wise sum of the scored items, one total per record.
    autism_score = sum(data[column] for column in SCORED_COLUMNS)
    data['Class'] = [
        'YES' if score >= CLASS_THRESHOLD else 'NO' for score in autism_score
    ]

    return pd.DataFrame(data)


def main(output_path: "str | Path" = DEFAULT_OUTPUT_PATH) -> None:
    """Build the default dataset, write it to *output_path* and print a summary.

    Args:
        output_path: Destination CSV file. Missing parent directories are
            created first.
    """
    df = build_dataset()

    destination = Path(output_path)
    # to_csv does not create directories; without this a missing folder
    # makes the write fail with OSError.
    destination.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, index=False)

    print('✅ Sample dataset created!')
    print(f' Records: {len(df)}')
    print(f' Features: {len(df.columns)}')
    # as_posix keeps the forward-slash form of the path on every platform.
    print(f' Saved to: {destination.as_posix()}')
    print('\nClass Distribution:')
    print(df['Class'].value_counts())


if __name__ == '__main__':
    main()