AhmedUddin2002 committed on
Commit
de490e4
·
verified ·
1 Parent(s): 3298bea

Create clean_admission_data.py

Browse files
Files changed (1) hide show
  1. src/clean_admission_data.py +99 -0
src/clean_admission_data.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
def clean_admission_data(input_file_path, output_csv_path):
    """
    Read raw admission data from an Excel file, reshape it, and save it as a CSV.

    The raw sheet holds one row per institution with separate minority and
    non-minority sanctioned/admitted counts for Class V and for Inter 1st Year.
    The output holds one row per institution *per class*, with the minority and
    non-minority counts summed into 'Sanctioned' and 'Admitted', plus a derived
    'Vacancies' column (Sanctioned - Admitted).

    Errors are reported with ``print`` rather than raised, so callers never see
    an exception from this function; on failure no CSV is written.

    Args:
        input_file_path (str): Path to the raw input Excel file (e.g., 'Admission 24-07-2025.xlsx').
        output_csv_path (str): Path where the cleaned CSV file will be saved.
    """
    # Column layout of the raw sheet, in order. 'Course' sits between the
    # Class V and Inter groups and is not carried into the output.
    raw_columns = [
        'S.No', 'District', 'Institution Name',
        'V Minorities Sanctioned', 'V Minorities Admitted',
        'V NonMinorities Sanctioned', 'V NonMinorities Admitted',
        'Course',
        'Inter Minorities Sanctioned', 'Inter Minorities Admitted',
        'Inter NonMinorities Sanctioned', 'Inter NonMinorities Admitted',
    ]
    numeric_cols = [
        name for name in raw_columns
        if name.endswith(('Sanctioned', 'Admitted'))
    ]
    final_columns = [
        'S.No', 'District', 'Institution Name',
        'Class', 'Sanctioned', 'Admitted', 'Vacancies',
    ]

    try:
        # Imported here because the module has no top-level pandas import;
        # without it every call failed with "name 'pd' is not defined".
        import pandas as pd

        # Skip the four title rows. The fifth row is consumed as the header
        # (and overwritten below), so data starts from the 6th row of the sheet.
        df_raw = pd.read_excel(input_file_path, skiprows=4)

        # Fail with a readable message instead of pandas' generic
        # "Length mismatch" when the sheet layout is not the expected one.
        if len(df_raw.columns) != len(raw_columns):
            raise ValueError(
                f"Expected {len(raw_columns)} columns in the input sheet, "
                f"found {len(df_raw.columns)}."
            )
        df_raw.columns = raw_columns

        # The last row of the sheet is assumed to hold summary statistics or
        # other unwanted data, so it is dropped. The explicit copy keeps the
        # column assignments below from targeting a slice of the original frame.
        if not df_raw.empty:
            df_raw = df_raw.iloc[:-1].copy()
            print("Last row of the raw data has been removed.")
        else:
            print("Raw DataFrame is empty, no row to remove.")

        # Non-numeric or missing counts are treated as 0.
        for col in numeric_cols:
            df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').fillna(0).astype(int)

        # Stack the Class V rows on top of the Inter 1st Year rows.
        df_final = pd.concat(
            [
                _combine_class_totals(df_raw, 'V', 'V'),
                _combine_class_totals(df_raw, 'Inter', 'Inter 1st Year'),
            ],
            ignore_index=True,
        )
        df_final['Vacancies'] = df_final['Sanctioned'] - df_final['Admitted']

        # The 'S.No' from the raw data is kept as the final S.No.
        df_final = df_final[final_columns]
        df_final.to_csv(output_csv_path, index=False)

    except FileNotFoundError:
        print(f"Error: The input file '{input_file_path}' was not found. Please ensure it's in the correct directory.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def _combine_class_totals(df_raw, prefix, class_label):
    """Return identifier columns, 'Class', and minority + non-minority totals for one class."""
    df_class = df_raw[['S.No', 'District', 'Institution Name']].copy()
    df_class['Class'] = class_label
    for measure in ('Sanctioned', 'Admitted'):
        df_class[measure] = (
            df_raw[f'{prefix} Minorities {measure}']
            + df_raw[f'{prefix} NonMinorities {measure}']
        )
    return df_class