Spaces:

AlanRex
/

AITEST

Sleeping

App Files Files Community

AlanRex committed on Sep 25

Commit

f9e7f22

verified ·

1 Parent(s): 05993cc

Update model_predictor.py

Browse files

Files changed (1) hide show

model_predictor.py +329 -184

model_predictor.py CHANGED Viewed

@@ -1,276 +1,421 @@
 # model_predictor.py - 支援漲幅百分比輸出的XGBoost模型預測器
 # 修改版本：輸出改為漲幅百分比而非絕對價格
 import os
-import pandas as pd
 import numpy as np
 import xgboost as xgb
-from sklearn.preprocessing import StandardScaler
-import pickle
 import joblib
 class XGBoostModel:
     def __init__(self):
         """
-        初始化 XGBoost 模型預測器
-        【重要更新】
-        - 模型現在輸出漲幅百分比而非絕對價格
-        - 支援 1日、5日、10日、20日的漲幅預測
         """
-        self.model = None
-        self.scaler = None
         self.feature_columns = [
-            'close',             # 前一日收盤價
-            'return_t-1',        # 前一日報酬率
-            'return_t-5',        # 過去 5 日累積報酬率
-            'MA5_close',         # 5 日移動平均價
-            'volatility_5d',     # 5 日報酬標準差
-            'volume_ratio_5d',   # 今日成交量 ÷ 5 日均量
-            'MACD_diff',         # MACD - signal
-            'dji_return_t-1',    # 前一日道瓊指數報酬率
-            'sox_return_t-1',    # 前一日費半指數報酬率
-            'NEWS'               # 新聞情緒分數
         ]
-        # 【新增】輸出目標對應表
-        self.output_targets = {
-            1: 'Change_pct_t1_pred',    # 1天後漲幅%
-            5: 'Change_pct_t5_pred',    # 5天後漲幅%
-            10: 'Change_pct_t10_pred',  # 10天後漲幅%
-            20: 'Change_pct_t20_pred'   # 20天後漲幅%
         }
-        print("XGBoost 模型預測器初始化完成")
-        print("輸出格式：漲幅百分比 (1日, 5日, 10日, 20日)")
-    def load_model(self, model_path):
         """
-        載入預訓練的 XGBoost 模型
         Args:
-            model_path (str): 模型檔案路徑 (.json 格式)
         Returns:
-            bool: 是否成功載入
         """
-        try:
-            # 檢查模型檔案是否存在
-            if not os.path.exists(model_path):
-                print(f"錯誤：找不到模型檔案 {model_path}")
-                return False
-            # 載入 XGBoost 模型
-            self.model = xgb.XGBRegressor()
-            self.model.load_model(model_path)
-            print(f"成功載入模型：{model_path}")
-            print(f"預期特徵數量：{len(self.feature_columns)}")
-            return True
-        except Exception as e:
-            print(f"載入模型時發生錯誤：{e}")
-            return False
-    def load_scaler(self, scaler_path):
         """
-        載入特徵標準化器
         Args:
-            scaler_path (str): 標準化器檔案路徑 (.pkl 格式)
         Returns:
-            bool: 是否成功載入
         """
         try:
-            if os.path.exists(scaler_path):
-                self.scaler = joblib.load(scaler_path)
-                print(f"成功載入標準化器：{scaler_path}")
-                return True
             else:
-                print(f"警告：找不到標準化器檔案 {scaler_path}")
-                print("將使用預設標準化器")
-                self.scaler = StandardScaler()
                 return False
         except Exception as e:
-            print(f"載入標準化器時發生錯誤：{e}")
-            self.scaler = StandardScaler()
             return False
-    def preprocess_features(self, input_df):
         """
-        預處理輸入特徵
         Args:
-            input_df (pd.DataFrame): 輸入特徵 DataFrame
         Returns:
-            pd.DataFrame: 預處理後的特徵
         """
         try:
-            # 確保輸入包含所有必要特徵
-            missing_features = [f for f in self.feature_columns if f not in input_df.columns]
             if missing_features:
-                print(f"警告：缺少以下特徵：{missing_features}")
-                # 用 0 填補缺少的特徵
-                for feature in missing_features:
-                    input_df[feature] = 0
-            # 按照預期順序重新排列特徵
-            input_df = input_df[self.feature_columns]
-            # 處理 NaN 值
-            input_df = input_df.fillna(0)
-            # 如果有標準化器，進行標準化
             if self.scaler is not None:
-                try:
-                    # 嘗試使用已訓練的標準化器
-                    scaled_features = self.scaler.transform(input_df)
-                    input_df = pd.DataFrame(scaled_features,
-                                          columns=input_df.columns,
-                                          index=input_df.index)
-                except Exception as scaler_error:
-                    print(f"標準化過程發生錯誤：{scaler_error}")
-                    print("跳過標準化步驟")
-            return input_df
         except Exception as e:
-            print(f"特徵預處理時發生錯誤：{e}")
-            return input_df
-    def predict(self, model_name, input_df):
         """
-        進行股價漲幅預測
         Args:
-            model_name (str): 模型名稱（用於載入對應模型）
-            input_df (pd.DataFrame): 輸入特徵
         Returns:
-            dict: 預測結果，包含各時間點的漲幅百分比
         """
         try:
-            # 載入模型（如果尚未載入）
-            if self.model is None:
-                model_path = f"{model_name}.json"
-                if not self.load_model(model_path):
-                    return None
-            # 載入標準化器（如果存在）
-            if self.scaler is None:
-                scaler_path = f"{model_name}_scaler.pkl"
-                self.load_scaler(scaler_path)
-            # 預處理特徵
-            processed_df = self.preprocess_features(input_df.copy())
-            # 進行預測
-            predictions = self.model.predict(processed_df)
-            # 【重要修改】將預測結果格式化為漲幅百分比
-            if predictions.ndim == 1:
-                # 如果只有一個輸出，假設是 1 日預測
-                result = {
-                    'Change_pct_t1_pred': float(predictions[0])
-                }
             else:
-                # 多輸出情況：1日, 5日, 10日, 20日
-                result = {
-                    'Change_pct_t1_pred': float(predictions[0][0]) if len(predictions[0]) > 0 else 0.0,
-                    'Change_pct_t5_pred': float(predictions[0][1]) if len(predictions[0]) > 1 else 0.0,
-                    'Change_pct_t10_pred': float(predictions[0][2]) if len(predictions[0]) > 2 else 0.0,
-                    'Change_pct_t20_pred': float(predictions[0][3]) if len(predictions[0]) > 3 else 0.0
-                }
-            # 輸出預測結果摘要
-            print("=== 漲幅預測結果 ===")
-            for key, value in result.items():
-                days = key.split('_')[2][1:]  # 提取天數
-                direction = "上漲" if value > 0 else "下跌"
-                print(f"  {days}日後預測: {value:+.2f}% ({direction})")
-            return result
         except Exception as e:
-            print(f"預測過程中發生錯誤：{e}")
-            import traceback
-            traceback.print_exc()
-            return None
-    def predict_single_timeframe(self, model_name, input_df, days):
         """
-        預測特定時間框架的漲幅
         Args:
-            model_name (str): 模型名稱
-            input_df (pd.DataFrame): 輸入特徵
-            days (int): 預測天數 (1, 5, 10, 20)
         Returns:
-            float: 預測的漲幅百分比
         """
         try:
-            predictions = self.predict(model_name, input_df)
-            if predictions is None:
-                return None
-            # 根據天數選擇對應的預測結果
-            target_key = f'Change_pct_t{days}_pred'
-            if target_key in predictions:
-                return predictions[target_key]
-            else:
-                print(f"警告：找不到 {days} 日預測結果")
-                return None
         except Exception as e:
-            print(f"單一時間框架預測時發生錯誤：{e}")
-            return None
-    def get_prediction_confidence(self, input_df):
         """
-        評估預測的信心度
         Args:
-            input_df (pd.DataFrame): 輸入特徵
         Returns:
-            float: 信心度 (0-1)
         """
         try:
-            # 基於特徵完整性和質量評估信心度
-            feature_completeness = 0
-            total_features = len(self.feature_columns)
-            for feature in self.feature_columns:
-                if feature in input_df.columns:
-                    value = input_df[feature].iloc[0]
-                    if not pd.isna(value) and value != 0:
-                        feature_completeness += 1
-            completeness_ratio = feature_completeness / total_features
-            # 基於數據質量調整信心度
-            base_confidence = max(0.5, completeness_ratio)
-            # 如果重要特徵缺失，降低信心度
-            important_features = ['close', 'return_t-1', 'MA5_close']
-            missing_important = 0
-            for feature in important_features:
-                if feature not in input_df.columns or pd.isna(input_df[feature].iloc[0]):
-                    missing_important += 1
-            if missing_important > 0:
-                base_confidence *= (1 - missing_important * 0.1)
-            return min(0.9, max(0.3, base_confidence))
         except Exception as e:
-            print(f"計算信心度時發生錯誤：{e}")
             return 0.5
     def validate_input(self, input_df):

 # model_predictor.py - 支援漲幅百分比輸出的XGBoost模型預測器
 # 修改版本：輸出改為漲幅百分比而非絕對價格
+# model_predictor.py - 修正版本，對應訓練腳本的確切配置
 import os
 import numpy as np
+import pandas as pd
 import xgboost as xgb
+from sklearn.preprocessing import MinMaxScaler
 import joblib
+import warnings
+warnings.filterwarnings('ignore')
 class XGBoostModel:
     def __init__(self):
         """
+        初始化 XGBoost 模型類別
+        根據訓練腳本 xgboost_for_stock_trend_&_prices_prediction_gpu_v_2_1_3.py 的配置
         """
+        # 根據訓練腳本的 new_feature_columns，確保順序完全一致
         self.feature_columns = [
+            'close',                    # 前一日收盤價
+            'return_t-1',              # 前一日報酬率
+            'return_t-5',              # 過去 5 日累積報酬率
+            'MA5_close',               # 5 日移動平均價
+            'volatility_5d',           # 5 日報酬標準差
+            'volume_ratio_5d',         # 今日成交量 ÷ 5 日均量
+            'MACD_diff',               # MACD - signal
+            'dji_return_t-1',          # 前一日道瓊指數報酬率
+            'sox_return_t-1',          # 前一日費半指數報酬率
+            'NEWS',                    # 新聞情緒分數
+            'MACDvol',                 # MACD柱狀圖
+            'RSI_14',                  # 14日RSI
+            'ADX',                     # ADX指標
+            'volume_weighted_return'   # 成交量加權報酬率
         ]
+        # 預測目標對應（根據訓練腳本的 train_y）
+        self.prediction_mapping = {
+            'Change_pct_t1_pred': 1,   # 1天後漲幅%
+            'Change_pct_t5_pred': 5,   # 5天後漲幅%
+            'Change_pct_t10_pred': 10, # 10天後漲幅%
+            'Change_pct_t20_pred': 20  # 20天後漲幅%
         }
+        self.model = None
+        self.scaler = None
+        self.is_model_loaded = False
+        # 模型檔案路徑
+        self.model_path = 'xgboost_model.json'
+        self.scaler_path = 'feature_scaler.pkl'
+    def create_features_from_stock_data(self, stock_data):
         """
+        從股票資料創建所需的特徵
+        完全對應訓練腳本中的 create_new_features 函數
         Args:
+            stock_data: yfinance 格式的股票資料 DataFrame
         Returns:
+            processed_df: 包含所有特徵的 DataFrame
         """
+        df = stock_data.copy()
+        # 確保必要的基礎欄位存在
+        required_base_columns = ['Close', 'Volume', 'High', 'Low']
+        for col in required_base_columns:
+            if col not in df.columns:
+                raise ValueError(f"缺少必要的基礎欄位: {col}")
+        # 統一欄位名稱（yfinance 使用大寫）
+        df['close'] = df['Close']
+        df['volume'] = df['Volume']
+        # 1. return_t-1 — 前一日報酬率
+        df['return_t-1'] = df['close'].pct_change()
+        # 2. return_t-5 — 過去 5 日累積報酬率
+        df['return_t-5'] = (df['close'] / df['close'].shift(5) - 1)
+        # 3. MA5_close — 5 日移動平均價
+        df['MA5_close'] = df['close'].rolling(window=5).mean()
+        # 4. volatility_5d — 5 日報酬標準差
+        df['volatility_5d'] = df['return_t-1'].rolling(window=5).std()
+        # 5. volume_ratio_5d — 今日成交量 ÷ 5 日均量
+        df['volume_5d_avg'] = df['volume'].rolling(window=5).mean()
+        df['volume_ratio_5d'] = df['volume'] / df['volume_5d_avg']
+        # 6. MACD_diff — MACD - signal
+        exp1 = df['close'].ewm(span=12).mean()
+        exp2 = df['close'].ewm(span=26).mean()
+        macd_line = exp1 - exp2
+        signal_line = macd_line.ewm(span=9).mean()
+        df['MACD_diff'] = macd_line - signal_line
+        # 7-8. 美股指數報酬率（需要外部資料，暫設為0）
+        df['dji_return_t-1'] = 0.0  # 這需要從外部獲取道瓊指數資料
+        df['sox_return_t-1'] = 0.0  # 這需要從外部獲取費半指數資料
+        # 9. NEWS — 新聞情緒分數（需要外部資料，暫設為0）
+        df['NEWS'] = 0.0
+        # 10. MACDvol — MACD柱狀圖
+        df['MACDvol'] = macd_line - signal_line
+        # 11. RSI_14 — 14日RSI
+        delta = df['close'].diff()
+        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
+        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
+        rs = gain / loss
+        df['RSI_14'] = 100 - (100 / (1 + rs))
+        # 12. ADX — 平均趨向指標
+        df['up_move'] = df['High'] - df['High'].shift(1)
+        df['down_move'] = df['Low'].shift(1) - df['Low']
+        df['+DM'] = np.where((df['up_move'] > df['down_move']) & (df['up_move'] > 0), df['up_move'], 0)
+        df['-DM'] = np.where((df['down_move'] > df['up_move']) & (df['down_move'] > 0), df['down_move'], 0)
+        high_low = df['High'] - df['Low']
+        high_close_prev = np.abs(df['High'] - df['close'].shift(1))
+        low_close_prev = np.abs(df['Low'] - df['close'].shift(1))
+        df['TR'] = np.maximum.reduce([high_low, high_close_prev, low_close_prev])
+        df['+DI'] = (df['+DM'].ewm(com=13, adjust=False).mean() / df['TR'].ewm(com=13, adjust=False).mean()) * 100
+        df['-DI'] = (df['-DM'].ewm(com=13, adjust=False).mean() / df['TR'].ewm(com=13, adjust=False).mean()) * 100
+        df['DX'] = np.abs(df['+DI'] - df['-DI']) / (df['+DI'] + df['-DI']) * 100
+        df['ADX'] = df['DX'].ewm(com=13, adjust=False).mean()
+        # 13. volume_weighted_return — 成交量加權報酬率
+        df['volume_weighted_return'] = np.abs(df['return_t-1']) * df['volume']
+        # 清理輔助欄位
+        cleanup_columns = ['volume_5d_avg', 'up_move', 'down_move', '+DM', '-DM', 'TR', '+DI', '-DI', 'DX']
+        df.drop(columns=[col for col in cleanup_columns if col in df.columns], inplace=True)
+        # 填補 NaN 值
+        df.fillna(method='ffill', inplace=True)
+        df.fillna(0, inplace=True)  # 剩餘的 NaN 用 0 填補
+        return df
+    def load_model(self, model_name='xgboost_model'):
         """
+        載入訓練好的模型和標準化器
         Args:
+            model_name: 模型名稱
         Returns:
+            bool: 載入是否成功
         """
         try:
+            # 載入 XGBoost 模型
+            if os.path.exists(self.model_path):
+                self.model = xgb.XGBRegressor()
+                self.model.load_model(self.model_path)
+                print(f"成功載入模型: {self.model_path}")
             else:
+                print(f"警告：模型檔案 {self.model_path} 不存在")
                 return False
+            # 嘗試載入標準化器（如果存在）
+            if os.path.exists(self.scaler_path):
+                self.scaler = joblib.load(self.scaler_path)
+                print(f"成功載入標準化器: {self.scaler_path}")
+            else:
+                print(f"警告：未找到標準化器檔案 {self.scaler_path}，將使用原始數據進行預測")
+                # 根據訓練腳本，模型沒有使用標準化，所以這是正常的
+                self.scaler = None
+            self.is_model_loaded = True
+            return True
         except Exception as e:
+            print(f"載入模型時發生錯誤: {e}")
             return False
+    def predict(self, model_name, input_data):
         """
+        使用載入的模型進行預測
         Args:
+            model_name: 模型名稱（保持接口一致性）
+            input_data: 輸入特徵 DataFrame 或 numpy array
         Returns:
+            dict: 預測結果字典，包含各時間框架的漲幅百分比
         """
+        if not self.is_model_loaded:
+            if not self.load_model(model_name):
+                raise RuntimeError("模型載入失敗，無法進行預測")
         try:
+            # 確保輸入是 DataFrame 格式
+            if isinstance(input_data, np.ndarray):
+                if input_data.shape[1] != len(self.feature_columns):
+                    raise ValueError(f"輸入特徵數量不匹配。期望: {len(self.feature_columns)}, 實際: {input_data.shape[1]}")
+                input_df = pd.DataFrame(input_data, columns=self.feature_columns)
+            elif isinstance(input_data, pd.DataFrame):
+                input_df = input_data.copy()
+            else:
+                raise ValueError("輸入數據必須是 DataFrame 或 numpy array")
+            # 確保所有必需的特徵都存在
+            missing_features = [col for col in self.feature_columns if col not in input_df.columns]
             if missing_features:
+                raise ValueError(f"缺少必要的特徵欄位: {missing_features}")
+            # 選擇並排序特徵
+            input_features = input_df[self.feature_columns]
+            # 檢查 NaN 值
+            if input_features.isnull().any().any():
+                print("警告：輸入數據包含 NaN 值，將用 0 填補")
+                input_features = input_features.fillna(0)
+            # 應用標準化（如果有的話）
             if self.scaler is not None:
+                input_features_scaled = self.scaler.transform(input_features)
+            else:
+                input_features_scaled = input_features.values
+            # 進行預測
+            predictions = self.model.predict(input_features_scaled)
+            # 處理預測結果的維度
+            if predictions.ndim == 1:
+                # 如果是單一樣本的預測，reshape 成 (1, 4)
+                if len(predictions) == 4:
+                    predictions = predictions.reshape(1, -1)
+                else:
+                    raise ValueError(f"預測結果維度不正確: {predictions.shape}")
+            # 確保結果是 (n_samples, 4) 的形狀
+            if predictions.shape[1] != 4:
+                raise ValueError(f"模型預測輸出維度錯誤，期望 4 個輸出，實際: {predictions.shape[1]}")
+            # 構建預測結果字典（取第一個樣本的預測）
+            result = {}
+            prediction_keys = ['Change_pct_t1_pred', 'Change_pct_t5_pred', 'Change_pct_t10_pred', 'Change_pct_t20_pred']
+            for i, key in enumerate(prediction_keys):
+                result[key] = float(predictions[0, i])  # 取第一個樣本的第 i 個預測
+            return result
         except Exception as e:
+            print(f"預測過程中發生錯誤: {e}")
+            raise
+    def predict_single_timeframe(self, stock_data, days, news_score=0.0, us_market_data=None):
         """
+        預測單一時間框架的漲幅
         Args:
+            stock_data: 股票歷史數據 (yfinance格式)
+            days: 預測天數 (1, 5, 10, 20)
+            news_score: 新聞情緒分數
+            us_market_data: 美股市場數據 (可選)
         Returns:
+            float: 預測的漲幅百分比
         """
         try:
+            # 創建特徵
+            processed_df = self.create_features_from_stock_data(stock_data)
+            # 使用最新的數據點
+            latest_data = processed_df.iloc[-1:].copy()
+            # 更新新聞分數
+            latest_data.loc[latest_data.index[0], 'NEWS'] = news_score
+            # 更新美股數據（如果提供）
+            if us_market_data:
+                if 'DJI' in us_market_data and len(us_market_data) > 1:
+                    dji_return = (us_market_data['DJI'][-1] - us_market_data['DJI'][-2]) / us_market_data['DJI'][-2]
+                    latest_data.loc[latest_data.index[0], 'dji_return_t-1'] = dji_return
+                if 'SOX' in us_market_data and len(us_market_data) > 1:
+                    sox_return = (us_market_data['SOX'][-1] - us_market_data['SOX'][-2]) / us_market_data['SOX'][-2]
+                    latest_data.loc[latest_data.index[0], 'sox_return_t-1'] = sox_return
+            # 進行預測
+            predictions = self.predict('xgboost_model', latest_data)
+            # 根據天數返回對應的預測值
+            if days == 1:
+                return predictions['Change_pct_t1_pred']
+            elif days == 5:
+                return predictions['Change_pct_t5_pred']
+            elif days == 10:
+                return predictions['Change_pct_t10_pred']
+            elif days == 20:
+                return predictions['Change_pct_t20_pred']
             else:
+                # 對於其他天數，使用最接近的預測值
+                if days <= 3:
+                    return predictions['Change_pct_t1_pred']
+                elif days <= 7:
+                    return predictions['Change_pct_t5_pred']
+                elif days <= 15:
+                    return predictions['Change_pct_t10_pred']
+                else:
+                    return predictions['Change_pct_t20_pred']
         except Exception as e:
+            print(f"單一時間框架預測失敗: {e}")
+            return 0.0
+    def validate_input_features(self, input_data):
         """
+        驗證輸入特徵的完整性和有效性
         Args:
+            input_data: 輸入的特徵數據
+        Returns:
+            dict: 驗證結果
+        """
+        validation_result = {
+            'is_valid': True,
+            'missing_features': [],
+            'invalid_values': [],
+            'warnings': []
+        }
+        try:
+            if isinstance(input_data, np.ndarray):
+                if input_data.shape[1] != len(self.feature_columns):
+                    validation_result['is_valid'] = False
+                    validation_result['warnings'].append(f"特徵數量不匹配: 期望{len(self.feature_columns)}, 實際{input_data.shape[1]}")
+                return validation_result
+            # 檢查缺失特徵
+            if isinstance(input_data, pd.DataFrame):
+                missing_features = [col for col in self.feature_columns if col not in input_data.columns]
+                if missing_features:
+                    validation_result['missing_features'] = missing_features
+                    validation_result['is_valid'] = False
+                # 檢查數值有效性
+                for feature in self.feature_columns:
+                    if feature in input_data.columns:
+                        if input_data[feature].isnull().any():
+                            validation_result['invalid_values'].append(f"{feature}: 包含NaN值")
+                        if np.isinf(input_data[feature]).any():
+                            validation_result['invalid_values'].append(f"{feature}: 包含無限值")
+            return validation_result
+        except Exception as e:
+            validation_result['is_valid'] = False
+            validation_result['warnings'].append(f"驗證過程出錯: {e}")
+            return validation_result
+    def get_feature_importance(self):
+        """
+        獲取模型的特徵重要性
         Returns:
+            dict: 特徵重要性字典
         """
+        if not self.is_model_loaded:
+            return {}
         try:
+            importance_scores = self.model.feature_importances_
+            importance_dict = {}
+            for i, feature in enumerate(self.feature_columns):
+                importance_dict[feature] = float(importance_scores[i])
+            # 按重要性排序
+            sorted_importance = dict(sorted(importance_dict.items(), key=lambda x: x[1], reverse=True))
+            return sorted_importance
         except Exception as e:
+            print(f"獲取特徵重要性失敗: {e}")
+            return {}
+    def get_prediction_confidence(self, input_data):
         """
+        估算預測信心度
         Args:
+            input_data: 輸入特徵數據
         Returns:
+            float: 信心度分數 (0-1)
         """
         try:
+            # 基礎信心度檢查
+            validation_result = self.validate_input_features(input_data)
+            if not validation_result['is_valid']:
+                return 0.3  # 數據有問題時給予較低信心度
+            # 根據特徵完整性調整信心度
+            base_confidence = 0.7
+            if validation_result['missing_features']:
+                base_confidence -= len(validation_result['missing_features']) * 0.05
+            if validation_result['invalid_values']:
+                base_confidence -= len(validation_result['invalid_values']) * 0.05
+            return max(0.3, min(0.9, base_confidence))
         except Exception as e:
+            print(f"計算預測信心度失敗: {e}")
             return 0.5
     def validate_input(self, input_df):