Spaces:

HaiderSultanArc
/

Unani-Medicine-AI-Engine

Runtime error

App Files Files Community

Unani-Medicine-AI-Engine / tasks /data /dataAugmentation.py

HaiderSultanArc

AI Engine API

ba600a6 over 2 years ago

raw

history blame contribute delete

2.31 kB

	import pandas as pd


	def augmentDataWithVectorSpaceAlgorithm(data: pd.DataFrame) -> pd.DataFrame:
	    """
	    Augment the Data
	    =================
	    Parameters:
	    -----------
	    data:
	        description: Data to augment. Column names must be strings. Columns are
	            addressed by position: the first column is skipped when at least one
	            'symptoms_' column exists (presumably it is the 'disease' label --
	            TODO confirm against the caller). The input frame is not modified.
	        type: pd.DataFrame
	    -----------
	    Returns:
	    --------
	    data:
	        description: Rows whose non-'disease' values occur exactly once among the
	            input rows combined with the generated variants (index reset to 0..n-1).
	        type: pd.DataFrame
	    --------------------------------------------------------------------------------------------
	    Working:
	    --------
	    - Count the columns with the 'symptoms_' prefix and take the positional slices
	      columns[1:count] and columns[count:] as the columns to perturb
	    - For every row and every such column holding 1, create a copy of the row with
	      that single value set to 0 (a "variant")
	    - Stack the variants under the original rows
	    - Remove the rows whose numeric columns sum to 0
	    - Append the result to the original data
	    - Remove every row whose non-'disease' values appear more than once
	      (drop_duplicates with keep=False removes all copies, not just the extras)
	    - Reset the index and return the resulting DataFrame
	    --------------------------------------------------------------------------------------------
	    NOTE(review): the original rows are present in both frames that get combined,
	    so every input row with a non-zero numeric sum is itself a duplicate and is
	    removed from the result. Only variants with a unique feature vector (plus any
	    unique zero-sum input rows) survive. This behaviour is kept as-is; confirm with
	    the callers whether the original rows were meant to be retained.
	    """

	    # Get the number of columns with symptoms_ prefix
	    numberOfSymptoms = len([col for col in data.columns if col.startswith('symptoms_')])

	    # Both slices receive identical treatment, so they are processed as one
	    # ordered list (same column order as handling them one after the other).
	    symptoms = data.columns[1:numberOfSymptoms]
	    causes = data.columns[numberOfSymptoms:]
	    featureColumns = [*symptoms, *causes]

	    # Collect the variants first and concatenate once: DataFrame.append was
	    # removed in pandas 2.0, and growing a frame row by row is quadratic.
	    variants = []
	    for _, row in data.iterrows():
	        for column in featureColumns:
	            if row[column] == 1:  # type: ignore
	                variant = row.copy()  # never write into the row being iterated
	                variant[column] = 0
	                variants.append(variant)

	    df = data
	    if variants:
	        # iterrows() yields object-dtype rows for mixed frames; infer_objects()
	        # restores numeric dtypes so the numeric_only sum below still sees them.
	        augmented = pd.DataFrame(variants, columns=data.columns).infer_objects()
	        df = pd.concat([data, augmented], ignore_index=True)

	    print(f"data before drop_duplicates: {df}")

	    # Drop rows that carry no active numeric value at all.
	    df = df[(df.sum(axis=1, numeric_only=True) != 0)]
	    data = pd.concat([data, df], ignore_index=True)
	    # keep=False: a feature vector seen more than once is removed entirely.
	    data = data.drop_duplicates(subset=list(df.columns.difference(['disease'])), keep=False)
	    data.reset_index(drop=True, inplace=True)

	    print(f"final data: {data}")

	    return data