"""Inspect a merged dataset and emit Pydantic / SQLAlchemy schema stubs.

Reads the merged CSV, drops identifier/target columns, and generates:
  * a Pydantic ``InputSchema`` class body (one typed field per feature), and
  * matching SQLAlchemy ``Column(...)`` lines for reference.
Both are printed and written to a text file.
"""

import pickle  # NOTE(review): unused in this script — confirm before removing

import pandas as pd

# Columns that are identifiers or prediction targets, not model features.
EXCLUDED_COLUMNS = [
    'id_employee',
    'eval_number',
    'code_sondage',
    'a_quitte_l_entreprise',
    'turnover',
]

# pandas dtype name -> (Python type for Pydantic, SQLAlchemy column type).
# Single table so the two mappings cannot drift out of sync.
DTYPE_MAP = {
    'int64': ('int', 'Integer'),
    'float64': ('float', 'Float'),
    'object': ('str', 'String'),
    'bool': ('bool', 'Boolean'),
}


def inspect_and_generate_schemas(csv_path='01_merged_data.csv',
                                 output_path='generated_schemas_v2.txt'):
    """Generate schema stubs from the columns/dtypes of *csv_path*.

    Parameters
    ----------
    csv_path : str
        CSV file to inspect (default: the merged dataset).
    output_path : str
        Where the generated schema text is written.

    Side effects: prints progress and the generated schemas to stdout,
    and writes them to *output_path*.
    """
    print("Loading data...")
    df = pd.read_csv(csv_path)

    # Features are every column not explicitly excluded above.
    features = [c for c in df.columns if c not in EXCLUDED_COLUMNS]
    print(f"Selected features: {features}")

    pydantic_fields = []
    sql_columns = []
    for feature in features:
        dtype = str(df[feature].dtype)
        # Unknown dtypes fall back to plain strings.
        py_type, sql_type = DTYPE_MAP.get(dtype, ('str', 'String'))
        pydantic_fields.append(f"    {feature}: {py_type}")
        sql_columns.append(f"    {feature} = Column({sql_type})")

    # Guard against an empty feature list: a class with no body is a
    # SyntaxError, so emit `pass` instead.
    if not pydantic_fields:
        pydantic_fields.append("    pass")

    pydantic_schema = "class InputSchema(BaseModel):\n" + "\n".join(pydantic_fields)
    print("\n--- Generated Pydantic Schema ---")
    print(pydantic_schema)
    print("\n--- Generated SQL Columns (for reference) ---")
    print("\n".join(sql_columns))

    # Explicit encoding so output is identical across platforms.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(pydantic_schema)
        f.write("\n\n")
        f.write("\n".join(sql_columns))


if __name__ == "__main__":
    inspect_and_generate_schemas()