Spaces:
Sleeping
Sleeping
Create clean_admission_data.py
Browse files- src/clean_admission_data.py +99 -0
src/clean_admission_data.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
def _combine_categories(df_raw, prefix, class_label):
    """Build the per-class rows: minority + non-minority counts summed.

    Args:
        df_raw: Raw DataFrame whose count columns are already integers.
        prefix: Column-name prefix of the class to extract ('V' or 'Inter').
        class_label: Value written to the 'Class' column of every row.

    Returns:
        A new DataFrame with the identifying columns plus 'Class',
        'Sanctioned' and 'Admitted'.
    """
    # .copy() so the assignments below never write into a view of df_raw.
    combined = df_raw[['S.No', 'District', 'Institution Name']].copy()
    combined['Class'] = class_label
    combined['Sanctioned'] = (
        df_raw[f'{prefix} Minorities Sanctioned']
        + df_raw[f'{prefix} NonMinorities Sanctioned']
    )
    combined['Admitted'] = (
        df_raw[f'{prefix} Minorities Admitted']
        + df_raw[f'{prefix} NonMinorities Admitted']
    )
    return combined


def clean_admission_data(input_file_path, output_csv_path):
    """
    Reads raw admission data from an Excel file, cleans it, combines minority and non-minority
    sanctioned/admitted numbers, calculates vacancies, and saves to a new CSV.

    The output has one row per institution per class ('V' and 'Inter 1st Year') with the
    columns S.No, District, Institution Name, Class, Sanctioned, Admitted, Vacancies.
    Institution names are written unchanged, and the raw 'S.No' is reused for both classes.

    Errors are reported with print() rather than raised; the function always returns None.

    Args:
        input_file_path (str): Path to the raw input Excel file (e.g., 'Admission 24-07-2025.xlsx').
        output_csv_path (str): Path where the cleaned CSV file will be saved.
    """
    try:
        # Imported here because the module has no top-level pandas import;
        # without it every `pd.` call below fails with NameError.
        import pandas as pd

        # Only the read step may report "input file not found". A missing
        # output directory also raises FileNotFoundError (from to_csv) and
        # must not be blamed on the input path.
        try:
            # skiprows=4 drops the four title rows; the fifth row is consumed
            # as the header and then overwritten by the explicit names below,
            # so data effectively starts at the sixth row of the sheet.
            df_raw = pd.read_excel(input_file_path, skiprows=4)
        except FileNotFoundError:
            print(f"Error: The input file '{input_file_path}' was not found. Please ensure it's in the correct directory.")
            return

        # Must match the sheet's column count exactly (12 columns); a
        # mismatch raises ValueError, reported by the outer handler.
        df_raw.columns = [
            'S.No', 'District', 'Institution Name',
            'V Minorities Sanctioned', 'V Minorities Admitted',
            'V NonMinorities Sanctioned', 'V NonMinorities Admitted',
            'Course',
            'Inter Minorities Sanctioned', 'Inter Minorities Admitted',
            'Inter NonMinorities Sanctioned', 'Inter NonMinorities Admitted',
        ]

        # Drop the last row of the input, which is expected to hold summary
        # statistics or other unwanted data rather than an institution.
        if not df_raw.empty:
            df_raw = df_raw.iloc[:-1]
            print("Last row of the raw data has been removed.")
        else:
            print("Raw DataFrame is empty, no row to remove.")

        numeric_cols = [
            'V Minorities Sanctioned', 'V Minorities Admitted',
            'V NonMinorities Sanctioned', 'V NonMinorities Admitted',
            'Inter Minorities Sanctioned', 'Inter Minorities Admitted',
            'Inter NonMinorities Sanctioned', 'Inter NonMinorities Admitted',
        ]
        # Non-numeric or blank cells become 0 so the sums below stay integral.
        for col in numeric_cols:
            df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').fillna(0).astype(int)

        # Stack the Class V rows on top of the Inter 1st Year rows.
        df_final = pd.concat(
            [
                _combine_categories(df_raw, 'V', 'V'),
                _combine_categories(df_raw, 'Inter', 'Inter 1st Year'),
            ],
            ignore_index=True,
        )

        df_final['Vacancies'] = df_final['Sanctioned'] - df_final['Admitted']

        # Fix the output column order.
        df_final = df_final[['S.No', 'District', 'Institution Name', 'Class', 'Sanctioned', 'Admitted', 'Vacancies']]

        df_final.to_csv(output_csv_path, index=False)

    except Exception as e:
        # Deliberate best-effort boundary: report the problem, do not raise.
        print(f"An unexpected error occurred: {e}")
|