Spaces:

alwaysgood
/

my-tide-env

Sleeping

App Files Files Community

alwaysgood committed on Aug 20, 2025

Commit

902ce76

verified ·

1 Parent(s): fa82f2c

Update preprocessing.py

Browse files

Files changed (1) hide show

preprocessing.py +182 -4

preprocessing.py CHANGED Viewed

@@ -99,7 +99,17 @@ def convert_tide_level_to_residual(df, station_id):
         if harmonic_level is None:
             harmonic_level = find_closest_harmonic(normalized_time, harmonic_dict, max_diff_minutes=5)
-        if harmonic_level is not None:
             residual = tide_level - harmonic_level
             residual_values.append(residual)
             successful_conversions += 1
@@ -184,6 +194,158 @@ def create_mock_residual_data(df):
     print(f"✅ 가상 residual 데이터 {len(residual_values)}개 생성")
     return df
 def validate_input_data(df):
     """입력 데이터 유효성 검증"""
     print("🔍 입력 데이터 검증 중...")
@@ -535,7 +697,7 @@ def handle_missing_values(df, station_id=None):
 def preprocess_uploaded_file(file_path, station_id):
     """
     업로드된 파일의 전체 전처리 파이프라인
-    tide_level → residual 변환 + 검증
     """
     try:
         print(f"\n🚀 {station_id} 관측소 데이터 전처리 시작")
@@ -550,10 +712,26 @@ def preprocess_uploaded_file(file_path, station_id):
         if not is_valid:
             return None, f"입력 데이터 오류:\n" + "\n".join(issues)
-        # 3. 결측치 처리
         df_cleaned = handle_missing_values(df, station_id)
-        # 4. tide_level → residual 변환
         converted_df = convert_tide_level_to_residual(df_cleaned, station_id)
         # 5. 변환된 데이터를 임시 파일로 저장

         if harmonic_level is None:
             harmonic_level = find_closest_harmonic(normalized_time, harmonic_dict, max_diff_minutes=5)
+        # 이상치 플래그 확인
+        is_outlier = False
+        if '_tide_outlier_flag' in df_last_144.columns:
+            is_outlier = df_last_144.at[idx, '_tide_outlier_flag'] if not pd.isna(df_last_144.at[idx, '_tide_outlier_flag']) else False
+        if is_outlier:
+            # 이상치로 탐지된 경우 residual = 0 (harmonic만 사용)
+            residual_values.append(0.0)
+            successful_conversions += 1
+            print(f"   🚨 이상치 탐지된 시점 {timestamp}: residual=0 적용")
+        elif harmonic_level is not None:
             residual = tide_level - harmonic_level
             residual_values.append(residual)
             successful_conversions += 1
     print(f"✅ 가상 residual 데이터 {len(residual_values)}개 생성")
     return df
# Plausible physical range for tide_level; anything outside is an outlier.
_TIDE_LEVEL_MIN = -300
_TIDE_LEVEL_MAX = 2000


def _physical_tide_outliers(tide_levels):
    """Return a boolean Series flagging tide levels outside the physical range.

    NaN values compare False on both sides and are therefore never flagged.
    """
    return (tide_levels < _TIDE_LEVEL_MIN) | (tide_levels > _TIDE_LEVEL_MAX)


def detect_harmonic_based_outliers(df, station_id):
    """Detect tide_level outliers using harmonic (astronomical) predictions.

    For every row the residual ``tide_level - harmonic_level`` is computed
    against the harmonic prediction for the same 5-minute slot. Rows whose
    residual deviates from the residual mean by more than 3 standard
    deviations are flagged. Rows with no usable harmonic prediction are
    judged by the physical range only and are excluded from the statistics.

    Args:
        df: DataFrame with a ``date`` column (parseable by ``pd.to_datetime``;
            naive values are treated as KST) and a ``tide_level`` column.
        station_id: Station identifier forwarded to
            ``get_harmonic_predictions``.

    Returns:
        Boolean ``pd.Series`` indexed like ``df``; True marks an outlier.
        All False when ``tide_level`` is missing. When no harmonic data is
        available, or any error occurs, the physical-range check alone is
        returned.
    """
    print("🌊 Harmonic 기반 tide_level 이상치 탐지 시작...")
    if 'tide_level' not in df.columns:
        print("⚠️ tide_level 컬럼이 없어서 이상치 탐지를 건너뜁니다.")
        return pd.Series(False, index=df.index)
    try:
        # 1. Work on a copy with timezone-aware KST timestamps.
        df_copy = df.copy()
        df_copy['date'] = pd.to_datetime(df_copy['date'])
        kst = pytz.timezone('Asia/Seoul')
        if df_copy['date'].dt.tz is None:
            df_copy['date'] = df_copy['date'].dt.tz_localize(kst)
        else:
            df_copy['date'] = df_copy['date'].dt.tz_convert(kst)
        start_time = df_copy['date'].min()
        end_time = df_copy['date'].max()
        print(f"📅 이상치 탐지 시간 범위: {start_time} ~ {end_time}")

        # 2. Fetch the harmonic predictions covering the data's time span.
        harmonic_data = get_harmonic_predictions(station_id, start_time, end_time)
        if not harmonic_data:
            print("❌ 조화 예측 데이터가 없어서 물리적 한계로 대체합니다.")
            return _physical_tide_outliers(df['tide_level'])
        print(f"📊 조화 예측 데이터: {len(harmonic_data)}개 조회")

        # 3. Index predictions by their KST timestamp floored to 5 minutes.
        harmonic_dict = {}
        for h_data in harmonic_data:
            try:
                h_time = parse_time_string(h_data['predicted_at'])
                if h_time is None:
                    continue
                # Naive prediction timestamps are interpreted as UTC.
                if h_time.tzinfo is None:
                    h_time = pytz.UTC.localize(h_time)
                h_time = h_time.astimezone(kst)
                minutes = (h_time.minute // 5) * 5
                h_time = h_time.replace(minute=minutes, second=0, microsecond=0)
                harmonic_dict[h_time] = float(h_data['harmonic_level'])
            except Exception as e:
                # Best effort: one malformed record must not abort detection.
                print(f"⚠️ 조화 데이터 파싱 오류: {h_data}, {e}")
                continue
        print(f"📊 사용 가능한 조화 예측: {len(harmonic_dict)}개")

        # 4. Compute one residual and one preliminary flag per row.
        residuals = []
        outlier_flags = []
        for tide_level, timestamp in zip(df_copy['tide_level'], df_copy['date']):
            minutes = (timestamp.minute // 5) * 5
            normalized_time = timestamp.replace(minute=minutes, second=0, microsecond=0)
            # Exact slot first, then the nearest prediction within 5 minutes.
            harmonic_level = harmonic_dict.get(normalized_time)
            if harmonic_level is None:
                harmonic_level = find_closest_harmonic(normalized_time, harmonic_dict, max_diff_minutes=5)
            if harmonic_level is not None:
                residuals.append(tide_level - harmonic_level)
                outlier_flags.append(False)  # provisional; the 3-sigma test decides
            else:
                # No harmonic reference: judge by the physical range only.
                outlier_flags.append(bool(tide_level < _TIDE_LEVEL_MIN or tide_level > _TIDE_LEVEL_MAX))
                # NaN (not 0.0) keeps this row out of the residual statistics
                # and out of the 3-sigma test below.
                residuals.append(np.nan)

        # 5. 3-sigma test on the residuals that have a harmonic reference.
        outlier_mask = np.asarray(outlier_flags, dtype=bool)
        if len(residuals) > 0:
            residuals_array = np.asarray(residuals, dtype=float)
            valid_residuals = residuals_array[~np.isnan(residuals_array)]
            if len(valid_residuals) > 3:  # more than 3 valid samples required
                residual_mean = np.mean(valid_residuals)
                residual_std = np.std(valid_residuals)
                print(f"📈 Residual 통계: 평균={residual_mean:.1f}cm, 표준편차={residual_std:.1f}cm")
                if residual_std > 0:
                    threshold = 3 * residual_std
                    # NaN residuals compare False and keep their earlier flag.
                    with np.errstate(invalid='ignore'):
                        outlier_mask |= np.abs(residuals_array - residual_mean) > threshold
                    outlier_count = int(outlier_mask.sum())
                    print(f"🚨 Harmonic 기반 이상치 탐지: {outlier_count}개 (3σ={threshold:.1f}cm 기준)")
                else:
                    print("📊 Residual 표준편차가 0이므로 물리적 한계만 적용")
            else:
                print("📊 유효한 residual이 부족하여 물리적 한계만 적용")
        return pd.Series(outlier_mask, index=df.index)
    except Exception as e:
        # Deliberate catch-all: detection is best effort and falls back to
        # the physical-range check instead of failing the whole pipeline.
        print(f"❌ Harmonic 기반 이상치 탐지 실패: {e}")
        traceback.print_exc()
        return _physical_tide_outliers(df['tide_level'])
def detect_weather_outliers(df):
    """Flag weather readings that fall outside fixed physical limits.

    Only the columns ``air_pres``, ``wind_speed``, ``air_temp`` and
    ``wind_dir`` are checked, and only when present in ``df``. Limits are
    inclusive, so a value equal to a bound is not an outlier.

    Values are coerced to numbers before comparison so that text cells
    do not raise ``TypeError``; cells that cannot be parsed, and NaN, are
    never flagged.

    Args:
        df: DataFrame holding the weather observations.

    Returns:
        Boolean DataFrame with the same index and columns as ``df``; True
        marks an out-of-range cell. Unchecked columns are all False.
    """
    print("🌡️ 기상 데이터 물리적 한계 기반 이상치 탐지 시작...")
    # (min, max) per column; unit notes carried over from the original.
    PHYSICAL_LIMITS = {
        'air_pres': (850, 1100),         # hPa (includes extreme weather)
        'wind_speed': (0, 80),           # m/s (allows for Korea's maximum wind speed)
        'air_temp': (-35, 45),           # deg C (Korean record extremes)
        'wind_dir': (0, 360),            # degrees
    }
    outliers = pd.DataFrame(False, index=df.index, columns=df.columns)
    for col, (min_val, max_val) in PHYSICAL_LIMITS.items():
        if col not in df.columns:
            continue
        # Coerce first: object-dtype columns would otherwise raise on `<`.
        values = pd.to_numeric(df[col], errors='coerce')
        col_outliers = (values < min_val) | (values > max_val)
        outlier_count = col_outliers.sum()
        if outlier_count > 0:
            print(f"🌡️ {col} 물리적 한계 이상치: {outlier_count}개 (범위: {min_val}~{max_val})")
        outliers[col] = col_outliers
    return outliers
 def validate_input_data(df):
     """입력 데이터 유효성 검증"""
     print("🔍 입력 데이터 검증 중...")
 def preprocess_uploaded_file(file_path, station_id):
     """
     업로드된 파일의 전체 전처리 파이프라인
+    이상치 탐지 → 결측치 처리 → tide_level → residual 변환 + 검증
     """
     try:
         print(f"\n🚀 {station_id} 관측소 데이터 전처리 시작")
         if not is_valid:
             return None, f"입력 데이터 오류:\n" + "\n".join(issues)
+        # 3. 이상치 탐지 및 처리
+        print("\n🔍 이상치 탐지 및 처리 단계")
+        # 3-1. Harmonic 기반 tide_level 이상치 탐지
+        tide_outliers = detect_harmonic_based_outliers(df, station_id)
+        if tide_outliers.any():
+            print(f"🌊 tide_level 이상치 {tide_outliers.sum()}개 → residual=0 처리 예정")
+            df.loc[tide_outliers, '_tide_outlier_flag'] = True
+        # 3-2. 기상 데이터 물리적 한계 기반 이상치 탐지
+        weather_outliers = detect_weather_outliers(df)
+        for col in weather_outliers.columns:
+            if weather_outliers[col].any():
+                print(f"🌡️ {col} 이상치 {weather_outliers[col].sum()}개 → NaN 변환")
+                df.loc[weather_outliers[col], col] = np.nan
+        # 4. 결측치 처리
         df_cleaned = handle_missing_values(df, station_id)
+        # 5. tide_level → residual 변환 (이상치 플래그 반영)
         converted_df = convert_tide_level_to_residual(df_cleaned, station_id)
         # 5. 변환된 데이터를 임시 파일로 저장