Spaces:
Sleeping
Sleeping
'data'
Browse files- data_processor.py +7 -7
data_processor.py
CHANGED
|
@@ -195,7 +195,7 @@ class DataProcessor:
|
|
| 195 |
|
| 196 |
# Remove rows with missing critical data
|
| 197 |
before_dropna = len(courses)
|
| 198 |
-
courses = courses.dropna(subset=["kode_mk", "kategori_mk"])
|
| 199 |
if len(courses) < before_dropna:
|
| 200 |
logger.info(
|
| 201 |
f" Removed {before_dropna - len(courses)} rows with missing kode_mk or kategori_mk"
|
|
@@ -228,19 +228,19 @@ class DataProcessor:
|
|
| 228 |
initial_count = len(students)
|
| 229 |
|
| 230 |
# Remove rows with missing critical data
|
| 231 |
-
students = students.dropna(subset=["kode_mk", "thn", "smt", "kode_mhs"])
|
| 232 |
if len(students) < initial_count:
|
| 233 |
logger.info(
|
| 234 |
f" Removed {initial_count - len(students)} rows with missing critical data"
|
| 235 |
)
|
| 236 |
|
| 237 |
# Ensure correct data types
|
| 238 |
-
students["thn"] = pd.to_numeric(students["thn"], errors="coerce")
|
| 239 |
-
students["smt"] = pd.to_numeric(students["smt"], errors="coerce")
|
| 240 |
|
| 241 |
# Remove rows with invalid year/semester after conversion
|
| 242 |
before_invalid = len(students)
|
| 243 |
-
students = students.dropna(subset=["thn", "smt"])
|
| 244 |
if len(students) < before_invalid:
|
| 245 |
logger.info(
|
| 246 |
f" Removed {before_invalid - len(students)} rows with invalid year/semester values"
|
|
@@ -253,7 +253,7 @@ class DataProcessor:
|
|
| 253 |
logger.warning(
|
| 254 |
f" Found {invalid_sem.sum()} records with invalid semester values"
|
| 255 |
)
|
| 256 |
-
students = students[~invalid_sem]
|
| 257 |
|
| 258 |
# Validate year range
|
| 259 |
current_year = pd.Timestamp.now().year
|
|
@@ -262,7 +262,7 @@ class DataProcessor:
|
|
| 262 |
logger.warning(
|
| 263 |
f" Found {invalid_year.sum()} records with unreasonable year values"
|
| 264 |
)
|
| 265 |
-
students = students[~invalid_year]
|
| 266 |
|
| 267 |
# Remove exact duplicate enrollments (same student, course, semester)
|
| 268 |
before_dedup = len(students)
|
|
|
|
| 195 |
|
| 196 |
# Remove rows with missing critical data
|
| 197 |
before_dropna = len(courses)
|
| 198 |
+
courses = courses.dropna(subset=["kode_mk", "kategori_mk"]).copy()
|
| 199 |
if len(courses) < before_dropna:
|
| 200 |
logger.info(
|
| 201 |
f" Removed {before_dropna - len(courses)} rows with missing kode_mk or kategori_mk"
|
|
|
|
| 228 |
initial_count = len(students)
|
| 229 |
|
| 230 |
# Remove rows with missing critical data
|
| 231 |
+
students = students.dropna(subset=["kode_mk", "thn", "smt", "kode_mhs"]).copy()
|
| 232 |
if len(students) < initial_count:
|
| 233 |
logger.info(
|
| 234 |
f" Removed {initial_count - len(students)} rows with missing critical data"
|
| 235 |
)
|
| 236 |
|
| 237 |
# Ensure correct data types
|
| 238 |
+
students.loc[:, "thn"] = pd.to_numeric(students["thn"], errors="coerce")
|
| 239 |
+
students.loc[:, "smt"] = pd.to_numeric(students["smt"], errors="coerce")
|
| 240 |
|
| 241 |
# Remove rows with invalid year/semester after conversion
|
| 242 |
before_invalid = len(students)
|
| 243 |
+
students = students.dropna(subset=["thn", "smt"]).copy()
|
| 244 |
if len(students) < before_invalid:
|
| 245 |
logger.info(
|
| 246 |
f" Removed {before_invalid - len(students)} rows with invalid year/semester values"
|
|
|
|
| 253 |
logger.warning(
|
| 254 |
f" Found {invalid_sem.sum()} records with invalid semester values"
|
| 255 |
)
|
| 256 |
+
students = students[~invalid_sem].copy()
|
| 257 |
|
| 258 |
# Validate year range
|
| 259 |
current_year = pd.Timestamp.now().year
|
|
|
|
| 262 |
logger.warning(
|
| 263 |
f" Found {invalid_year.sum()} records with unreasonable year values"
|
| 264 |
)
|
| 265 |
+
students = students[~invalid_year].copy()
|
| 266 |
|
| 267 |
# Remove exact duplicate enrollments (same student, course, semester)
|
| 268 |
before_dedup = len(students)
|