Spaces:
Runtime error
Runtime error
| import pandas as pd | |
def augmentDataWithVectorSpaceAlgorithm(data: pd.DataFrame) -> pd.DataFrame:
    """
    Derive new rows by switching off one active indicator at a time.

    Parameters
    ----------
    data:
        Frame to augment. Every column from the second one onward is
        treated as an indicator column; a cell equal to 1 counts as
        "active". If no column name starts with ``symptoms_`` the first
        column is treated as an indicator as well. The frame is not
        modified.

    Returns
    -------
    pd.DataFrame
        Frame with the same columns as ``data`` and a fresh 0..n-1 index,
        holding the rows that survive the steps below.

    Working
    -------
    - For every row and every indicator cell equal to 1, create a copy of
      the row with that single cell set to 0 (row by row, left to right).
    - Stack ``data`` and those copies, then drop rows whose numeric
      columns sum to 0.
    - Stack ``data`` on top of that result and drop *all* members of any
      group of rows that agree on every column except ``disease``
      (``keep=False``).

    Because ``data`` is stacked twice, every original row with a non-zero
    numeric sum is removed by the last step, as are derived rows that
    coincide with an original row or with each other.

    NOTE(review): the previous docstring said the new rows are added to
    the original data, which suggests the original rows were meant to be
    kept. The long-standing behaviour is preserved here; confirm with
    callers before changing ``keep=False``.
    """
    # The original code split the columns into "symptoms" (1:n) and
    # "causes" (n:) and processed both identically, so together they cover
    # every column from position min(1, n) onward.
    symptom_count = sum(1 for col in data.columns if col.startswith('symptoms_'))
    first_toggle = min(1, symptom_count)

    # Positions of all cells equal to 1; nonzero() reports them in
    # row-major order, matching a row-by-row, left-to-right scan.
    hit_rows, hit_cols = data.iloc[:, first_toggle:].eq(1).to_numpy().nonzero()

    # One copy of the source row per active cell, then clear that cell.
    # Working column by column keeps each column's dtype intact.
    variants = data.iloc[hit_rows].copy()
    for offset in sorted(set(hit_cols.tolist())):
        variants.iloc[hit_cols == offset, first_toggle + offset] = 0

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to stack frames.
    candidates = pd.concat([data, variants], ignore_index=True)
    print(f"data before drop_duplicates: {candidates}")

    # Discard rows that no longer carry any numeric signal.
    candidates = candidates[candidates.sum(axis=1, numeric_only=True) != 0]

    combined = pd.concat([data, candidates], ignore_index=True)
    comparison_columns = list(combined.columns.difference(['disease']))
    combined = combined.drop_duplicates(subset=comparison_columns, keep=False)
    combined = combined.reset_index(drop=True)
    print(f"final data: {combined}")
    return combined