# prediction-api / inspect_data.py
# feat(analysis): Add data inspection script and schema generation
# (commit f81c1a5)
import pandas as pd
import pickle
def inspect_and_generate_schemas(
    csv_path='01_merged_data.csv',
    output_path='generated_schemas_v2.txt',
):
    """Inspect the merged dataset and emit Pydantic/SQLAlchemy schema text.

    Reads the CSV at *csv_path*, drops identifier/target columns, maps each
    remaining column's pandas dtype to a Python annotation (for a Pydantic
    ``InputSchema``) and a SQLAlchemy ``Column`` type, prints both schemas,
    and writes them to *output_path*.

    Args:
        csv_path: Path to the merged data CSV. Default preserves the
            original hard-coded behavior.
        output_path: File the generated schema text is written to.

    Returns:
        Tuple ``(pydantic_schema, sql_columns_text)`` — the two generated
        text blocks, for programmatic use (previously nothing was returned,
        so this is backward compatible).
    """
    print("Loading data...")
    df = pd.read_csv(csv_path)

    # Identifier and target columns are excluded from the model features.
    excluded_cols = [
        'id_employee', 'eval_number', 'code_sondage',
        'a_quitte_l_entreprise', 'turnover'
    ]
    features = [c for c in df.columns if c not in excluded_cols]
    print(f"Selected features: {features}")

    # pandas dtype -> Python annotation for the Pydantic model.
    type_mapping = {
        'int64': 'int',
        'float64': 'float',
        'object': 'str',
        'bool': 'bool'
    }
    # pandas dtype -> SQLAlchemy Column type (reference output only).
    sql_type_mapping = {
        'int64': 'Integer',
        'float64': 'Float',
        'object': 'String',
        'bool': 'Boolean'
    }

    pydantic_fields = []
    sql_columns = []
    for feature in features:
        dtype = str(df[feature].dtype)
        # Unknown dtypes fall back to str/String, as in the original.
        py_type = type_mapping.get(dtype, 'str')
        sql_type = sql_type_mapping.get(dtype, 'String')
        pydantic_fields.append(f"    {feature}: {py_type}")
        sql_columns.append(f"    {feature} = Column({sql_type})")

    pydantic_schema = "class InputSchema(BaseModel):\n" + "\n".join(pydantic_fields)
    # Join once; previously this was recomputed for print and for write.
    sql_columns_text = "\n".join(sql_columns)

    print("\n--- Generated Pydantic Schema ---")
    print(pydantic_schema)
    print("\n--- Generated SQL Columns (for reference) ---")
    print(sql_columns_text)

    # Persist both generated blocks for later copy/paste into the API code.
    with open(output_path, 'w') as f:
        f.write(pydantic_schema)
        f.write("\n\n")
        f.write(sql_columns_text)

    return pydantic_schema, sql_columns_text
if __name__ == "__main__":
inspect_and_generate_schemas()