Spaces:

superxu520
/

sync_stock

Paused

App Files Files Community

superxu520 committed on Mar 13

Commit

aa468bc

1 Parent(s): c6a3487

feat:incremental-sync

Browse files

Files changed (5) hide show

app/__init__.py +3 -1
app/database.py +275 -29
main.py +85 -90
requirements.txt +0 -2
sync_data.py +873 -55

app/__init__.py CHANGED Viewed

	@@ -1 +1,3 @@
1	- ~~# App package for sync space~~

+"""
+App package for stock data sync
+"""

app/database.py CHANGED Viewed

@@ -62,63 +62,81 @@ class DatabaseManager:
         """
         conn = self.conn
-        # 检查本地是否已有数据
         if not force_download:
             try:
-                # 尝试查询现有数据
                 count = conn.execute("SELECT COUNT(*) FROM stock_list").fetchone()[0]
                 if count > 0:
-                    logger.info(f"Local data exists ({count} stocks). Skip downloading.")
                     return
             except Exception:
-                # 表不存在，需要下载
                 pass
-        # 从 HF Dataset 下载数据
         if HF_TOKEN and DATASET_REPO_ID:
             logger.info("Downloading remote Parquet files from HF Dataset...")
             try:
                 from huggingface_hub import list_repo_files, hf_hub_download
-                # 1. 动态获取文件列表
                 all_files = list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
-                # 2. 股票列表：下载并加载
-                list_file = "data/stock_list.parquet"
-                if list_file in all_files:
-                    local_list_path = hf_hub_download(repo_id=DATASET_REPO_ID, filename=list_file, repo_type="dataset")
-                    conn.execute(f"CREATE OR REPLACE TABLE stock_list AS SELECT * FROM read_parquet('{local_list_path}')")
-                    logger.info("Local stock_list table created from remote parquet")
-                # 3. 日线数据：下载并加载
                 parquet_files = [f for f in all_files if f.startswith("data/parquet/") and f.endswith(".parquet")]
                 if parquet_files:
-                    local_paths = []
                     for f in parquet_files:
-                        # hf_hub_download 会自动使用缓存，不会重复下载
-                        path = hf_hub_download(repo_id=DATASET_REPO_ID, filename=f, repo_type="dataset")
-                        local_paths.append(f"'{path}'")
-                    files_sql = ", ".join(local_paths)
-                    conn.execute("DROP VIEW IF EXISTS stock_daily")
-                    conn.execute("DROP TABLE IF EXISTS stock_daily")
-                    conn.execute(f"CREATE OR REPLACE VIEW stock_daily AS SELECT * FROM read_parquet([{files_sql}])")
-                    logger.info(f"Remote stock_daily view created with {len(parquet_files)} partitions")
                 else:
-                    logger.warning("No parquet data files found in dataset")
                     self._create_tables()
-                # 验证
-                count = conn.execute("SELECT COUNT(*) FROM stock_list").fetchone()[0]
-                logger.info(f"Verification successful: {count} stocks found")
             except Exception as e:
                 logger.error(f"Failed to load remote Parquet: {e}")
                 self._create_tables()
         else:
             self._create_tables()
             logger.info("Local database initialized")
     def upload_db(self) -> None:
         """上传 Parquet 分区到 Hugging Face Dataset"""
@@ -156,6 +174,94 @@ class DatabaseManager:
                         repo_type="dataset",
                     )
             logger.info(f"Parquet files uploaded to HF Dataset: {DATASET_REPO_ID}")
         except Exception as e:
             logger.error(f"Failed to upload to HF: {e}")
@@ -166,7 +272,7 @@ class DatabaseManager:
         """创建数据库表结构"""
         conn = self.conn
-        # 日线行情表
         conn.execute("""
             CREATE TABLE IF NOT EXISTS stock_daily (
                 code VARCHAR,
@@ -193,11 +299,151 @@ class DatabaseManager:
             )
         """)
         # 创建索引
         conn.execute("""
             CREATE INDEX IF NOT EXISTS idx_code_date
             ON stock_daily (code, trade_date)
         """)
         logger.info("Database tables created/verified")

         """
         conn = self.conn
+        # 1. 检查本地是否已有数据表
         if not force_download:
             try:
                 count = conn.execute("SELECT COUNT(*) FROM stock_list").fetchone()[0]
                 if count > 0:
+                    logger.info(f"Local database tables exist ({count} stocks).")
+                    # 即使表存在，也要确保视图被创建（如果本地有 parquet 文件）
+                    self._refresh_views()
                     return
             except Exception:
                 pass
+        # 2. 尝试从本地 Parquet 文件恢复（Space 没重启的情况）
+        parquet_dir = Path(os.path.dirname(DUCKDB_PATH)) / "parquet"
+        list_file = Path(os.path.dirname(DUCKDB_PATH)) / "stock_list.parquet"
+        if not force_download and list_file.exists():
+            try:
+                conn.execute(f"CREATE OR REPLACE TABLE stock_list AS SELECT * FROM read_parquet('{list_file}')")
+                self._refresh_views()
+                logger.info("Database restored from local parquet files.")
+                return
+            except Exception as e:
+                logger.warning(f"Failed to restore from local parquet: {e}")
+        # 3. 从 HF Dataset 下载数据（Space 重启后的情况）
         if HF_TOKEN and DATASET_REPO_ID:
             logger.info("Downloading remote Parquet files from HF Dataset...")
             try:
                 from huggingface_hub import list_repo_files, hf_hub_download
                 all_files = list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
+                # 下载股票列表
+                if "data/stock_list.parquet" in all_files:
+                    local_list_path = hf_hub_download(repo_id=DATASET_REPO_ID, filename="data/stock_list.parquet", repo_type="dataset")
+                    # 拷贝到本地数据目录
+                    import shutil
+                    shutil.copy(local_list_path, list_file)
+                    conn.execute(f"CREATE OR REPLACE TABLE stock_list AS SELECT * FROM read_parquet('{list_file}')")
+                # 下载日线数据分区
                 parquet_files = [f for f in all_files if f.startswith("data/parquet/") and f.endswith(".parquet")]
                 if parquet_files:
                     for f in parquet_files:
+                        remote_path = hf_hub_download(repo_id=DATASET_REPO_ID, filename=f, repo_type="dataset")
+                        dest_path = Path(os.path.dirname(DUCKDB_PATH)) / f.replace("data/", "")
+                        dest_path.parent.mkdir(parents=True, exist_ok=True)
+                        import shutil
+                        shutil.copy(remote_path, dest_path)
+                    self._refresh_views()
+                    logger.info(f"Remote data downloaded and views created.")
                 else:
                     self._create_tables()
             except Exception as e:
                 logger.error(f"Failed to load remote Parquet: {e}")
                 self._create_tables()
         else:
             self._create_tables()
             logger.info("Local database initialized")
def _refresh_views(self) -> None:
    """Rebuild the ``stock_daily`` view over the local Parquet partitions.

    Looks for ``*.parquet`` files in the ``parquet`` directory that sits
    next to ``DUCKDB_PATH``. If that directory is missing or holds no
    Parquet files, nothing is executed and any existing ``stock_daily``
    object is left as it is.
    """
    conn = self.conn
    parquet_dir = Path(os.path.dirname(DUCKDB_PATH)) / "parquet"
    if not parquet_dir.exists():
        return

    # glob() yields entries in filesystem order; sort so the file list
    # handed to read_parquet (and thus the view definition) is stable.
    partitions = sorted(parquet_dir.glob("*.parquet"))
    if not partitions:
        return

    # Paths are embedded as SQL string literals, so double any single quote.
    files_sql = ", ".join("'" + str(p).replace("'", "''") + "'" for p in partitions)

    # NOTE(review): if stock_daily currently exists as a TABLE (created by
    # _create_tables), this DROP VIEW may raise in DuckDB - confirm.
    conn.execute("DROP VIEW IF EXISTS stock_daily")
    conn.execute(f"CREATE OR REPLACE VIEW stock_daily AS SELECT * FROM read_parquet([{files_sql}])")
    logger.info(f"Database views refreshed with {len(partitions)} partitions")
     def upload_db(self) -> None:
         """上传 Parquet 分区到 Hugging Face Dataset"""
                         repo_type="dataset",
                     )
+            # 3. 上传资金流向数据
+            fund_flow_path = Path(os.path.dirname(DUCKDB_PATH)) / "fund_flow.parquet"
+            if fund_flow_path.exists():
+                upload_file(
+                    path_or_fileobj=str(fund_flow_path),
+                    path_in_repo="data/fund_flow.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Fund flow data uploaded")
+            # 4. 上传估值指标数据
+            valuation_path = Path(os.path.dirname(DUCKDB_PATH)) / "valuation.parquet"
+            if valuation_path.exists():
+                upload_file(
+                    path_or_fileobj=str(valuation_path),
+                    path_in_repo="data/valuation.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Valuation data uploaded")
+            # 5. 上传融资融券数据
+            margin_path = Path(os.path.dirname(DUCKDB_PATH)) / "margin.parquet"
+            if margin_path.exists():
+                upload_file(
+                    path_or_fileobj=str(margin_path),
+                    path_in_repo="data/margin.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Margin data uploaded")
+            # 6. 上传财务指标数据
+            financial_path = Path(os.path.dirname(DUCKDB_PATH)) / "financial_indicator.parquet"
+            if financial_path.exists():
+                upload_file(
+                    path_or_fileobj=str(financial_path),
+                    path_in_repo="data/financial_indicator.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Financial indicator data uploaded")
+            # 7. 上传股东户数数据
+            holder_path = Path(os.path.dirname(DUCKDB_PATH)) / "holder_num.parquet"
+            if holder_path.exists():
+                upload_file(
+                    path_or_fileobj=str(holder_path),
+                    path_in_repo="data/holder_num.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Holder number data uploaded")
+            # 8. 上传分红数据
+            dividend_path = Path(os.path.dirname(DUCKDB_PATH)) / "dividend.parquet"
+            if dividend_path.exists():
+                upload_file(
+                    path_or_fileobj=str(dividend_path),
+                    path_in_repo="data/dividend.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Dividend data uploaded")
+            # 9. 上传十大股东数据
+            top_holders_path = Path(os.path.dirname(DUCKDB_PATH)) / "top_holders.parquet"
+            if top_holders_path.exists():
+                upload_file(
+                    path_or_fileobj=str(top_holders_path),
+                    path_in_repo="data/top_holders.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Top holders data uploaded")
+            # 10. 上传限售解禁数据
+            restricted_path = Path(os.path.dirname(DUCKDB_PATH)) / "restricted_unlock.parquet"
+            if restricted_path.exists():
+                upload_file(
+                    path_or_fileobj=str(restricted_path),
+                    path_in_repo="data/restricted_unlock.parquet",
+                    repo_id=DATASET_REPO_ID,
+                    repo_type="dataset",
+                )
+                logger.info("Restricted unlock data uploaded")
             logger.info(f"Parquet files uploaded to HF Dataset: {DATASET_REPO_ID}")
         except Exception as e:
             logger.error(f"Failed to upload to HF: {e}")
         """创建数据库表结构"""
         conn = self.conn
+        # 日线行情表（保持原有结构不变）
         conn.execute("""
             CREATE TABLE IF NOT EXISTS stock_daily (
                 code VARCHAR,
             )
         """)
+        # 资金流向表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_fund_flow (
+                code VARCHAR,
+                trade_date DATE,
+                close DOUBLE,
+                pct_chg DOUBLE,
+                main_net_inflow DOUBLE,
+                main_net_inflow_pct DOUBLE,
+                huge_net_inflow DOUBLE,
+                huge_net_inflow_pct DOUBLE,
+                large_net_inflow DOUBLE,
+                large_net_inflow_pct DOUBLE,
+                medium_net_inflow DOUBLE,
+                medium_net_inflow_pct DOUBLE,
+                small_net_inflow DOUBLE,
+                small_net_inflow_pct DOUBLE,
+                PRIMARY KEY (code, trade_date)
+            )
+        """)
+        # 估值指标表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_valuation (
+                code VARCHAR,
+                trade_date DATE,
+                pe_ttm DOUBLE,
+                pe_static DOUBLE,
+                pb DOUBLE,
+                ps_ttm DOUBLE,
+                dv_ratio DOUBLE,
+                total_mv DOUBLE,
+                circ_mv DOUBLE,
+                PRIMARY KEY (code, trade_date)
+            )
+        """)
+        # 融资融券表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_margin (
+                code VARCHAR,
+                trade_date DATE,
+                rzye DOUBLE,
+                rzmre DOUBLE,
+                rzche DOUBLE,
+                rqye DOUBLE,
+                rqmcl DOUBLE,
+                rzrqye DOUBLE,
+                PRIMARY KEY (code, trade_date)
+            )
+        """)
+        # 财务指标表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_financial_indicator (
+                code VARCHAR,
+                trade_date DATE,
+                roe DOUBLE,
+                roa DOUBLE,
+                gross_margin DOUBLE,
+                net_margin DOUBLE,
+                debt_ratio DOUBLE,
+                current_ratio DOUBLE,
+                quick_ratio DOUBLE,
+                inventory_turnover DOUBLE,
+                receivable_turnover DOUBLE,
+                total_asset_turnover DOUBLE,
+                PRIMARY KEY (code, trade_date)
+            )
+        """)
+        # 股东户数表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_holder_num (
+                code VARCHAR,
+                trade_date DATE,
+                holder_num BIGINT,
+                avg_share DOUBLE,
+                avg_value DOUBLE,
+                total_share DOUBLE,
+                total_value DOUBLE,
+                PRIMARY KEY (code, trade_date)
+            )
+        """)
+        # 历史分红表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_dividend (
+                code VARCHAR,
+                trade_date DATE,
+                dividend_type VARCHAR,
+                dividend_amount DOUBLE,
+                record_date DATE,
+                ex_date DATE,
+                pay_date DATE,
+                PRIMARY KEY (code, trade_date, dividend_type)
+            )
+        """)
+        # 十大股东表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_top_holders (
+                code VARCHAR,
+                trade_date DATE,
+                holder_name VARCHAR,
+                holder_type VARCHAR,
+                hold_num DOUBLE,
+                hold_ratio DOUBLE,
+                hold_change DOUBLE,
+                hold_change_ratio DOUBLE,
+                PRIMARY KEY (code, trade_date, holder_name)
+            )
+        """)
+        # 限售解禁表
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS stock_restricted_unlock (
+                code VARCHAR,
+                trade_date DATE,
+                unlock_date DATE,
+                unlock_num DOUBLE,
+                unlock_value DOUBLE,
+                unlock_ratio DOUBLE,
+                lock_type VARCHAR,
+                PRIMARY KEY (code, unlock_date)
+            )
+        """)
         # 创建索引
         conn.execute("""
             CREATE INDEX IF NOT EXISTS idx_code_date
             ON stock_daily (code, trade_date)
         """)
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_fund_flow_code_date
+            ON stock_fund_flow (code, trade_date)
+        """)
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_valuation_code_date
+            ON stock_valuation (code, trade_date)
+        """)
+        conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_margin_code_date
+            ON stock_margin (code, trade_date)
+        """)
         logger.info("Database tables created/verified")

main.py CHANGED Viewed

@@ -1,6 +1,8 @@
 """
-Sync Space 启动脚本 - 纯 Python 版本
-替代 start_sync.sh，更简洁、跨平台
 """
 import os
@@ -9,8 +11,13 @@ import logging
 import schedule
 import time
 import subprocess
 from datetime import datetime
-from pathlib import Path
 # 配置日志
 logging.basicConfig(
@@ -19,138 +26,126 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-def check_environment():
-    """检查必需的环境变量"""
-    logger.info("=" * 60)
-    logger.info("  Stock Data Sync Space")
-    logger.info("=" * 60)
-    # 检查 HF_TOKEN
-    hf_token = os.getenv("HF_TOKEN")
-    if not hf_token:
-        logger.error("ERROR: HF_TOKEN not set!")
-        logger.error("Please set HF_TOKEN in Space secrets")
-        sys.exit(1)
-    # 检查 DATASET_REPO_ID
-    dataset_repo = os.getenv("DATASET_REPO_ID")
-    if not dataset_repo:
-        logger.error("ERROR: DATASET_REPO_ID not set!")
-        logger.error("Please set DATASET_REPO_ID in Space secrets")
-        sys.exit(1)
-    logger.info(f"Dataset: {dataset_repo}")
-    return hf_token, dataset_repo
 def run_sync():
-    """执行数据同步"""
     logger.info("=" * 60)
-    logger.info(f"Starting sync at {datetime.now()}")
     logger.info("=" * 60)
     try:
-        # 运行同步脚本
         result = subprocess.run(
             [sys.executable, "-u", "sync_data.py"],
-            cwd="/app",
             capture_output=False,
             text=True
         )
         if result.returncode == 0:
-            logger.info("✅ Sync completed successfully!")
         else:
-            logger.error(f"❌ Sync failed with return code {result.returncode}")
     except Exception as e:
-        logger.error(f"❌ Sync error: {e}")
-def run_once():
-    """一次性同步模式"""
-    logger.info("Running one-time sync...")
-    run_sync()
-    logger.info("Sync completed! Space will stop.")
-    sys.exit(0)
 def parse_sync_times() -> list:
-    """
-    解析同步时间配置
-    支持两种格式：
-    1. 单个时间：SYNC_TIME=18:00
-    2. 多个时间：SYNC_TIME=08:00,12:00,18:00
-    """
     sync_time_str = os.getenv("SYNC_TIME", "18:00")
-    # 分割多个时间点
     times = [t.strip() for t in sync_time_str.split(",")]
-    # 验证时间格式
     valid_times = []
     for t in times:
         try:
-            # 验证格式 HH:MM
             hour, minute = t.split(":")
             if 0 <= int(hour) <= 23 and 0 <= int(minute) <= 59:
                 valid_times.append(t)
-            else:
-                logger.warning(f"Invalid time format: {t}, skipping")
-        except Exception as e:
-            logger.warning(f"Invalid time format: {t}, skipping. Error: {e}")
-    if not valid_times:
-        logger.warning("No valid sync times found, using default: 18:00")
-        return ["18:00"]
-    return valid_times
-def run_schedule():
-    """定时同步模式"""
     sync_times = parse_sync_times()
-    sync_on_start = os.getenv("SYNC_ON_START", "false").lower() == "true"
-    if len(sync_times) == 1:
-        logger.info(f"Schedule mode: Daily sync at {sync_times[0]} (Beijing time)")
-    else:
-        logger.info(f"Schedule mode: Daily sync at {', '.join(sync_times)} (Beijing time)")
-    # 设置多个定时任务
-    for sync_time in sync_times:
-        schedule.every().day.at(sync_time).do(run_sync)
-        logger.info(f"  - Scheduled sync at {sync_time}")
-    # 启动时立即执行一次（可选）
     if sync_on_start:
         logger.info("Running initial sync on startup...")
         run_sync()
-    logger.info(f"Scheduler started. Total {len(sync_times)} sync time(s) configured")
-    # 保持运行
-    while True:
         schedule.run_pending()
-        time.sleep(60)
 def main():
-    """主入口"""
-    # 检查环境
-    check_environment()
-    # 获取同步模式
-    sync_mode = os.getenv("SYNC_MODE", "schedule")
-    logger.info(f"Sync mode: {sync_mode}")
-    # 根据模式运行
     if sync_mode == "once":
-        run_once()
     else:
-        run_schedule()
 if __name__ == "__main__":
     main()

 """
+Sync Space 启动脚本 - 增强版
+1. 启动轻量级 Web 服务以绕过 HF 健康检查
+2. 异步运行数据同步调度器
+3. 支持多时间点定时触发
 """
 import os
 import schedule
 import time
 import subprocess
+import threading
+import signal
 from datetime import datetime
+from http.server import BaseHTTPRequestHandler, HTTPServer
+# 全局退出标志
+_shutdown_requested = False
 # 配置日志
 logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
+# ==================== 1. 健康检查 Web 服务 ====================
class HealthCheckHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler that answers every GET request with a 200 page."""

    def do_GET(self):
        """Reply with a short HTML body that includes the current time."""
        body = f"Stock Data Sync Space is running.<br>Last check: {datetime.now()}".encode()
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Discard per-request access logs so the console stays clean."""
        return None
def run_health_check_server(port=7860):
    """Serve health-check responses on *port*, blocking in ``serve_forever``.

    The server binds to all interfaces (empty host string) and uses
    ``HealthCheckHandler`` for every request.
    """
    bind_to = ('', port)
    server = HTTPServer(bind_to, HealthCheckHandler)
    logger.info(f"Health check server started on port {port}")
    server.serve_forever()
+# ==================== 2. 同步逻辑与调度 ====================
 def run_sync():
     """Run ``sync_data.py`` as a child process and log how it finished.

     The script is started with the current interpreter from the directory
     containing this file. A non-zero exit status is logged as a failure;
     any exception raised while launching or waiting for the child is
     logged and swallowed so the caller keeps running.
     """
     logger.info("=" * 60)
     logger.info(f"Starting sync task at {datetime.now()}")
     logger.info("=" * 60)
     try:
         # -u keeps the child's output unbuffered so its logs appear live.
         # check=False: the exit status is inspected below, so run() never
         # raises CalledProcessError and no handler for it is needed.
         result = subprocess.run(
             [sys.executable, "-u", "sync_data.py"],
             cwd=os.path.dirname(os.path.abspath(__file__)),
             capture_output=False,
             text=True,
             check=False,
         )
     except Exception as e:
         logger.error(f"❌ Sync task error: {e}")
         return

     if result.returncode == 0:
         logger.info("✅ Sync task completed successfully!")
     else:
         logger.error(f"❌ Sync task failed with return code {result.returncode}")
 def parse_sync_times() -> list:
     """Parse the ``SYNC_TIME`` environment variable into ``HH:MM`` strings.

     ``SYNC_TIME`` is a comma-separated list such as ``14:40,18:00`` and
     defaults to ``18:00`` when unset. Each entry must be ``hour:minute``
     with hour in 0-23 and minute in 0-59. Accepted entries are returned
     zero-padded (``8:05`` becomes ``08:05``) so they match the strict
     ``HH:MM`` form expected by the scheduler's daily ``at()`` call.

     Returns:
         The valid times in their original order, or ``["18:00"]`` when no
         entry is valid. Invalid entries are logged and skipped.
     """
     raw = os.getenv("SYNC_TIME", "18:00")
     valid_times = []
     for entry in (part.strip() for part in raw.split(",")):
         try:
             hour_text, minute_text = entry.split(":")
             hour, minute = int(hour_text), int(minute_text)
         except ValueError:
             # Wrong number of ':' separated fields or non-numeric parts.
             logger.warning(f"Invalid time format: {entry}, skipping")
             continue
         if 0 <= hour <= 23 and 0 <= minute <= 59:
             valid_times.append(f"{hour:02d}:{minute:02d}")
         else:
             logger.warning(f"Invalid time format: {entry}, skipping")
     return valid_times if valid_times else ["18:00"]
def scheduler_loop():
    """Register the daily sync jobs and run them until shutdown is requested.

    Reads the sync times from ``parse_sync_times()`` and schedules
    ``run_sync`` once per day at each of them. When ``SYNC_ON_START`` is
    ``true`` (the default) a sync is also run immediately. The loop ends
    once the module-level ``_shutdown_requested`` flag becomes true.
    """
    sync_times = parse_sync_times()
    sync_on_start = os.getenv("SYNC_ON_START", "true").lower() == "true"

    logger.info(f"Scheduler configured for times: {', '.join(sync_times)}")
    for t in sync_times:
        schedule.every().day.at(t).do(run_sync)

    # Optionally run one sync right away at startup.
    if sync_on_start:
        logger.info("Running initial sync on startup...")
        run_sync()

    while not _shutdown_requested:
        schedule.run_pending()
        # time.sleep() resumes after a signal handler returns (PEP 475), so
        # one 30 s sleep would delay shutdown by up to 30 s. Sleep in 1 s
        # slices and re-check the flag between them.
        for _ in range(30):
            if _shutdown_requested:
                break
            time.sleep(1)

    logger.info("Scheduler loop exiting gracefully.")
+# ==================== 3. 主入口 ====================
 def main():
     """Entry point: validate config, start the health server, run syncs.

     Exits with status 1 when ``HF_TOKEN`` or ``DATASET_REPO_ID`` is not
     set. Otherwise starts the health-check server on a daemon thread
     (port from ``PORT``, default 7860), installs SIGTERM/SIGINT handlers
     that set the module-level ``_shutdown_requested`` flag, and then
     either runs a single sync (``SYNC_MODE=once``) and idles, or enters
     the scheduler loop.
     """
     # 1. Required environment variables.
     hf_token = os.getenv("HF_TOKEN")
     dataset_repo = os.getenv("DATASET_REPO_ID")
     if not hf_token or not dataset_repo:
         logger.error("HF_TOKEN or DATASET_REPO_ID not set! Please check Space secrets.")
         sys.exit(1)

     # 2. Health-check web server on a background daemon thread.
     port = int(os.getenv("PORT", 7860))
     web_thread = threading.Thread(target=run_health_check_server, args=(port,), daemon=True)
     web_thread.start()

     # 3. Signal handling: only flip the flag; the loops below poll it.
     def signal_handler(signum, frame):
         global _shutdown_requested
         logger.info(f"Received signal {signum}, shutting down gracefully...")
         _shutdown_requested = True

     signal.signal(signal.SIGTERM, signal_handler)
     signal.signal(signal.SIGINT, signal_handler)

     # 4. Run the sync work on the main thread.
     sync_mode = os.getenv("SYNC_MODE", "schedule")
     if sync_mode == "once":
         logger.info("Mode: ONCE - Running sync and exiting.")
         run_sync()
         logger.info("Sync completed. Keeping container alive for health check.")
         # Keep the main thread alive, otherwise the container would exit.
         # Poll in short sleeps: time.sleep() resumes after a signal handler
         # returns (PEP 475), so a single sleep(3600) could ignore a
         # shutdown request for up to an hour.
         while not _shutdown_requested:
             time.sleep(1)
     else:
         logger.info("Mode: SCHEDULE - Starting scheduler.")
         scheduler_loop()
 if __name__ == "__main__":
     main()

requirements.txt CHANGED Viewed

@@ -4,7 +4,5 @@ pandas>=2.0.0
 duckdb>=0.9.0
 huggingface-hub>=0.20.0
 python-dotenv>=1.0.0
-yfinance>=0.2.0
-pytz>=2023.3
 schedule>=1.2.0  # 定时任务调度
 pyarrow>=14.0.0  # Parquet 文件支持

 duckdb>=0.9.0
 huggingface-hub>=0.20.0
 python-dotenv>=1.0.0
 schedule>=1.2.0  # 定时任务调度
 pyarrow>=14.0.0  # Parquet 文件支持

sync_data.py CHANGED Viewed

@@ -30,8 +30,63 @@ logger = logging.getLogger(__name__)
 # 配置
 YEARS_OF_DATA = 10
-MAX_WORKERS = 5  # 降低并发数，减少超时
-SYNC_LIMIT = -1
 def get_stock_list() -> pd.DataFrame:
     """获取全市场标的列表"""
@@ -163,17 +218,30 @@ def get_target_daily(code: str, start_date: str, market: str) -> Optional[pd.Dat
     return None
 def get_last_trading_day() -> str:
-    """获取最近一个交易日"""
     try:
         df = ak.stock_zh_index_daily_em(symbol="sh000300")
         if df is not None and not df.empty:
             date_col = 'date' if 'date' in df.columns else ('日期' if '日期' in df.columns else None)
             if date_col:
                 return pd.to_datetime(df[date_col].iloc[-1]).strftime('%Y-%m-%d')
-    except Exception as e:
-        logger.warning(f"Failed to get last trading day: {e}")
-    # 回退：按工作日估算
     d = get_beijing_time()
     while d.weekday() >= 5:
         d -= timedelta(days=1)
@@ -237,11 +305,17 @@ def sync_stock_daily(targets: List[Dict[str, str]], last_trade_day: str) -> int:
         t['start_dt'] = start_dt
         pending.append(t)
     if not pending: return 0
     logger.info(f"Syncing {len(pending)} targets...")
     all_new_data = []
-    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
         futures = {executor.submit(get_target_daily, t['code'], t['start_dt'], t['market']): t['code'] for t in pending}
         for i, future in enumerate(as_completed(futures), 1):
             res = future.result()
@@ -259,62 +333,806 @@ def sync_stock_daily(targets: List[Dict[str, str]], last_trade_day: str) -> int:
             local_path = Path(f"/tmp/data/parquet/{filename}")  # Sync Space 使用 /tmp
             local_path.parent.mkdir(parents=True, exist_ok=True)
-            # 云端增量核心：先从云端拉取旧的月份文件
-            repo_id = os.getenv("DATASET_REPO_ID")
-            if repo_id:
                 try:
-                    old_file = hf_hub_download(repo_id=repo_id, filename=f"data/parquet/{filename}", repo_type="dataset")
-                    old_df = pd.read_parquet(old_file)
-                    # 合并
-                    month_inc = inc_df[(inc_df['trade_date'].dt.year == yr) & (inc_df['trade_date'].dt.month == mo)]
-                    final_month_df = pd.concat([old_df, month_inc]).drop_duplicates(subset=['code', 'trade_date'])
-                    final_month_df.to_parquet(local_path)
-                    logger.info(f"Merged cloud data for {filename}")
-                    continue
-                except: pass
-            # 如果云端没有，直接保存
             month_inc = inc_df[(inc_df['trade_date'].dt.year == yr) & (inc_df['trade_date'].dt.month == mo)]
-            month_inc.to_parquet(local_path)
-            logger.info(f"Saved new data for {filename}")
     return len(all_new_data)
-def main():
-    logger.info("=" * 60)
-    logger.info("Stock Data Sync Started")
-    logger.info("=" * 60)
-    db = get_db()
-    db.init_db()
-    # 1. 列表同步
-    target_list = get_stock_list()
-    list_parquet = Path("/tmp/data/stock_list.parquet")
-    list_parquet.parent.mkdir(parents=True, exist_ok=True)
-    target_list.to_parquet(list_parquet)
-    # 2. 行情同步
-    last_day = get_last_trading_day()
-    logger.info(f"Last trading day: {last_day}")
-    sync_count = sync_stock_daily(target_list.to_dict('records'), last_day)
-    logger.info(f"Synced {sync_count} stock-day records")
-    # 3. 指数同步
-    idx_df = get_index_daily('000300')
-    if idx_df is not None:
-        idx_path = Path("/tmp/data/parquet/index_000300.parquet")
-        idx_path.parent.mkdir(parents=True, exist_ok=True)
-        idx_df.to_parquet(idx_path)
-        logger.info("Index data synced")
-    # 4. 上传
-    logger.info("Uploading to Hugging Face Dataset...")
-    db.upload_db()
     logger.info("=" * 60)
-    logger.info("Sync Completed Successfully!")
     logger.info("=" * 60)
 if __name__ == "__main__":
-    main()

 # 配置
 YEARS_OF_DATA = 10
+def _safe_int_env(var_name: str, default: int) -> int:
+    """Read an environment variable and convert it to an integer.
+
+    Args:
+        var_name: Name of the environment variable to read.
+        default: Value returned when the variable is unset or when
+            ``int()`` cannot parse it.
+
+    Returns:
+        The parsed integer, or ``default``.
+    """
+    try:
+        value = os.getenv(var_name)
+        if value is None:
+            return default
+        return int(value)
+    except (ValueError, TypeError):
+        # A malformed value is logged and replaced by the default instead of raising.
+        logger.warning(f"Invalid value for {var_name}, using default: {default}")
+        return default
+# Dynamic thread-count configuration (computed lazily so that importing this module does not trigger multiprocessing)
+def get_thread_config():
+    """Build the per-task thread pool sizes.
+
+    Each default is a multiple of ``multiprocessing.cpu_count()`` and can be
+    overridden through its own ``MAX_WORKERS_*`` environment variable. When
+    the legacy ``MAX_WORKERS`` variable parses to a positive integer, that
+    single value replaces every entry.
+
+    Returns:
+        A ``(cpu_count, config)`` tuple, where ``config`` maps the task names
+        'daily', 'fund', 'valuation', 'margin', 'financial' and 'dividend'
+        to a worker count.
+    """
+    import multiprocessing
+    cpu_count = multiprocessing.cpu_count()
+    # Tiered concurrency strategy
+    config = {
+        'daily': _safe_int_env("MAX_WORKERS_DAILY", cpu_count * 4),
+        'fund': _safe_int_env("MAX_WORKERS_FUND", cpu_count * 3),
+        'valuation': _safe_int_env("MAX_WORKERS_VALUATION", cpu_count * 3),
+        'margin': _safe_int_env("MAX_WORKERS_MARGIN", cpu_count * 3),
+        'financial': _safe_int_env("MAX_WORKERS_FINANCIAL", cpu_count * 2),
+        'dividend': _safe_int_env("MAX_WORKERS_DIVIDEND", cpu_count * 2),
+    }
+    # Backward compatibility: a positive MAX_WORKERS overrides every tier
+    legacy = _safe_int_env("MAX_WORKERS", 0)
+    if legacy > 0:
+        config = {k: legacy for k in config}
+    return cpu_count, config
+# Lazily initialised thread configuration (initialisation is triggered from main)
+_CPU_COUNT = None
+_THREAD_CONFIG = None
+_thread_config_lock = threading.Lock()
+def get_thread_config_safe():
+    """Return the cached thread configuration, computing it on first use.
+
+    The first caller populates the module-level ``_CPU_COUNT`` and
+    ``_THREAD_CONFIG`` from ``get_thread_config()`` while holding
+    ``_thread_config_lock`` and logs the resulting pool sizes; later calls
+    return the cached dict.
+
+    Returns:
+        The dict of per-task worker counts produced by ``get_thread_config()``.
+    """
+    global _CPU_COUNT, _THREAD_CONFIG
+    if _THREAD_CONFIG is None:
+        with _thread_config_lock:
+            # Check again under the lock: another thread may have initialised it meanwhile
+            if _THREAD_CONFIG is None:
+                _CPU_COUNT, _THREAD_CONFIG = get_thread_config()
+                logger.info(f"Thread pool config: CPU={_CPU_COUNT}, "
+                            f"Daily={_THREAD_CONFIG['daily']}, Fund={_THREAD_CONFIG['fund']}, "
+                            f"Valuation={_THREAD_CONFIG['valuation']}, Margin={_THREAD_CONFIG['margin']}, "
+                            f"Financial={_THREAD_CONFIG['financial']}, Dividend={_THREAD_CONFIG['dividend']}")
+    return _THREAD_CONFIG
+def init_thread_config():
+    """Initialise the thread configuration (intended to be called from main).
+
+    Delegates to ``get_thread_config_safe()`` and discards its return value,
+    so the only effect is populating the cached configuration.
+    """
+    get_thread_config_safe()
 def get_stock_list() -> pd.DataFrame:
     """获取全市场标的列表"""
     return None
 def get_last_trading_day() -> str:
+    """获取最近一个交易日（优先使用交易日历）"""
+    try:
+        # 获取交易日历（新浪接口，包含未来日期）
+        df = ak.tool_trade_date_hist_sina()
+        if df is not None and not df.empty:
+            # 转换为日期格式并过滤出 <= 今天的日期
+            df['trade_date'] = pd.to_datetime(df['trade_date']).dt.date
+            today = get_beijing_time().date()
+            # 找到最后一个交易日
+            last_day = df[df['trade_date'] <= today]['trade_date'].iloc[-1]
+            return last_day.strftime('%Y-%m-%d')
+    except Exception as e:
+        logger.warning(f"Failed to get trading calendar: {e}")
+    # 回退：使用指数行情
     try:
         df = ak.stock_zh_index_daily_em(symbol="sh000300")
         if df is not None and not df.empty:
             date_col = 'date' if 'date' in df.columns else ('日期' if '日期' in df.columns else None)
             if date_col:
                 return pd.to_datetime(df[date_col].iloc[-1]).strftime('%Y-%m-%d')
+    except Exception: pass
+    # 最终回退：按工作日估算
     d = get_beijing_time()
     while d.weekday() >= 5:
         d -= timedelta(days=1)
         t['start_dt'] = start_dt
         pending.append(t)
+    # 应用 SYNC_LIMIT 限制
+    sync_limit = int(os.getenv("SYNC_LIMIT", -1))
+    if sync_limit > 0 and len(pending) > sync_limit:
+        logger.info(f"Limiting sync to first {sync_limit} targets (out of {len(pending)})")
+        pending = pending[:sync_limit]
     if not pending: return 0
     logger.info(f"Syncing {len(pending)} targets...")
     all_new_data = []
+    with ThreadPoolExecutor(max_workers=get_thread_config_safe()['daily']) as executor:
         futures = {executor.submit(get_target_daily, t['code'], t['start_dt'], t['market']): t['code'] for t in pending}
         for i, future in enumerate(as_completed(futures), 1):
             res = future.result()
             local_path = Path(f"/tmp/data/parquet/{filename}")  # Sync Space 使用 /tmp
             local_path.parent.mkdir(parents=True, exist_ok=True)
+            # 增量核心：先检查本地是否有，没有再从云端拉取
+            old_df = None
+            if local_path.exists():
                 try:
+                    old_df = pd.read_parquet(local_path)
+                    logger.info(f"Using local cache for {filename}")
+                except Exception: pass
+            if old_df is None:
+                repo_id = os.getenv("DATASET_REPO_ID")
+                if repo_id:
+                    try:
+                        old_file = hf_hub_download(repo_id=repo_id, filename=f"data/parquet/{filename}", repo_type="dataset")
+                        old_df = pd.read_parquet(old_file)
+                        logger.info(f"Downloaded {filename} from cloud")
+                    except Exception:
+                        pass
+            # 合并新数据
             month_inc = inc_df[(inc_df['trade_date'].dt.year == yr) & (inc_df['trade_date'].dt.month == mo)]
+            if old_df is not None:
+                final_month_df = pd.concat([old_df, month_inc]).drop_duplicates(subset=['code', 'trade_date'])
+            else:
+                final_month_df = month_inc
+            final_month_df.to_parquet(local_path)
+            logger.info(f"Saved updated data for {filename}")
     return len(all_new_data)
+# ==================== New: fund flow data sync ====================
+def get_stock_fund_flow(code: str, market: str) -> Optional[pd.DataFrame]:
+    """Fetch fund flow data for a single stock.
+
+    Makes up to three attempts at ``ak.stock_individual_fund_flow``. A
+    non-empty result is renamed to English column names, tagged with ``code``
+    and trimmed to the known columns that are present.
+
+    Args:
+        code: Stock code; its prefix selects the exchange argument.
+        market: Market label of the stock; '北交所' forces the 'bj' exchange.
+
+    Returns:
+        The normalised DataFrame, or ``None`` when no attempt produced a
+        non-empty frame.
+    """
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            # Determine the market argument
+            if market == '北交所' or code.startswith(('8', '4', '920')):
+                mk = 'bj'
+            elif code.startswith(('6', '9')):
+                mk = 'sh'
+            else:
+                mk = 'sz'
+            df = ak.stock_individual_fund_flow(stock=code, market=mk)
+            if df is not None and not df.empty:
+                # Normalise column names
+                rename_map = {
+                    '日期': 'trade_date', '收盘价': 'close', '涨跌幅': 'pct_chg',
+                    '主力净流入-净额': 'main_net_inflow',
+                    '主力净流入-净占比': 'main_net_inflow_pct',
+                    '超大单净流入-净额': 'huge_net_inflow',
+                    '超大单净流入-净占比': 'huge_net_inflow_pct',
+                    '大单净流入-净额': 'large_net_inflow',
+                    '大单净流入-净占比': 'large_net_inflow_pct',
+                    '中单净流入-净额': 'medium_net_inflow',
+                    '中单净流入-净占比': 'medium_net_inflow_pct',
+                    '小单净流入-净额': 'small_net_inflow',
+                    '小单净流入-净占比': 'small_net_inflow_pct',
+                }
+                df = df.rename(columns=rename_map)
+                df['trade_date'] = pd.to_datetime(df['trade_date'])
+                df['code'] = code
+                cols = ['code', 'trade_date', 'close', 'pct_chg',
+                        'main_net_inflow', 'main_net_inflow_pct',
+                        'huge_net_inflow', 'huge_net_inflow_pct',
+                        'large_net_inflow', 'large_net_inflow_pct',
+                        'medium_net_inflow', 'medium_net_inflow_pct',
+                        'small_net_inflow', 'small_net_inflow_pct']
+                return df[[c for c in cols if c in df.columns]]
+        except Exception as e:
+            if attempt == max_retries - 1:
+                pass  # Fail silently; many stocks may have no fund flow data
+            # The pause runs after every failed attempt, including the last one.
+            time.sleep(0.5)
+    return None
+def sync_fund_flow(targets: List[Dict[str, str]], last_trade_day: str) -> int:
+    """Sync fund flow data incrementally into /tmp/data/fund_flow.parquet.
+
+    Loads the existing dataset (local file first, then the Hugging Face
+    dataset named by ``DATASET_REPO_ID``), takes its newest ``trade_date`` as
+    a global watermark, fetches per-stock data in a thread pool and appends
+    only the rows that are new.
+
+    Args:
+        targets: Records with at least 'code' and 'market' keys.
+        last_trade_day: Latest trading day, compared as a string against the
+            'YYYY-MM-DD' watermark (assumes the same format; confirm with caller).
+
+    Returns:
+        The number of stocks that contributed at least one new row (not the
+        number of rows); 0 when the sync is skipped.
+    """
+    logger.info("Syncing fund flow data...")
+    flow_path = Path("/tmp/data/fund_flow.parquet")
+    flow_path.parent.mkdir(parents=True, exist_ok=True)
+    old_df = None
+    global_latest_date = "2000-01-01"
+    existing_codes = set()
+    # 1. Prefer the local cache
+    if flow_path.exists():
+        try:
+            old_df = pd.read_parquet(flow_path)
+            global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+            existing_codes = set(old_df['code'].unique())
+            logger.info(f"Local fund flow cache found, latest date: {global_latest_date}")
+        except Exception as e:
+            logger.warning(f"Failed to read local fund flow cache: {e}")
+    # 2. No local cache: try to pull it from the cloud
+    if old_df is None:
+        repo_id = os.getenv("DATASET_REPO_ID")
+        if repo_id:
+            try:
+                old_file = hf_hub_download(repo_id=repo_id, filename="data/fund_flow.parquet", repo_type="dataset")
+                old_df = pd.read_parquet(old_file)
+                global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+                existing_codes = set(old_df['code'].unique())
+                old_df.to_parquet(flow_path) # store in the local cache
+                logger.info(f"Downloaded fund flow from cloud, latest date: {global_latest_date}")
+            except Exception:
+                # Any download or read failure is treated as "no remote data yet".
+                logger.info("No existing fund flow data found in cloud.")
+    # 3. Global watermark check + new-stock detection
+    stock_targets = [t for t in targets if t['market'] not in ['ETF', 'LOF', 'REITs', '可转债']]
+    new_codes = [t for t in stock_targets if t['code'] not in existing_codes]
+    if global_latest_date >= last_trade_day and not new_codes:
+        logger.info(f"Fund flow data is already up to date ({global_latest_date}) and no new stocks. Skip.")
+        return 0
+    # 4. Incremental fetch
+    if global_latest_date >= last_trade_day:
+        logger.info(f"Global date is up to date, but found {len(new_codes)} new stocks. Syncing new stocks only.")
+        sync_targets = new_codes
+    else:
+        logger.info(f"Syncing fund flow from {global_latest_date} to {last_trade_day}...")
+        sync_targets = stock_targets
+    all_data = []
+    success_count = 0
+    with ThreadPoolExecutor(max_workers=get_thread_config_safe()['fund']) as executor:
+        futures = {executor.submit(get_stock_fund_flow, t['code'], t['market']): t['code'] for t in sync_targets}
+        for i, future in enumerate(as_completed(futures), 1):
+            res = future.result()
+            if res is not None and not res.empty:
+                # Keep only rows for new dates
+                code = futures[future]
+                # For a stock already in the dataset, keep only rows newer than the global latest date
+                # NOTE(review): the watermark is global, so a stock whose earlier fetch failed
+                # never gets its rows on or before global_latest_date back-filled.
+                if code in existing_codes:
+                    res = res[res['trade_date'] > pd.to_datetime(global_latest_date)]
+                if not res.empty:
+                    all_data.append(res)
+                    success_count += 1
+            if i % 500 == 0:
+                logger.info(f"Fund flow progress: {i}/{len(sync_targets)}, success: {success_count}")
+    # 5. Merge and save
+    if all_data:
+        new_df = pd.concat(all_data, ignore_index=True)
+        final_df = pd.concat([old_df, new_df]) if old_df is not None else new_df
+        final_df = final_df.drop_duplicates(subset=['code', 'trade_date'])
+        final_df.to_parquet(flow_path)
+        logger.info(f"Fund flow updated: {len(final_df)} total records")
+    return success_count
+# ==================== New: valuation indicator data sync ====================
+def get_stock_valuation(code: str) -> Optional[pd.DataFrame]:
+    """Fetch valuation indicator data for a single stock.
+
+    Makes up to three attempts at ``ak.stock_a_lg_indicator``. A non-empty
+    result is renamed to English column names, tagged with ``code`` and
+    trimmed to the known columns that are present.
+
+    Args:
+        code: Stock code passed to the data source as ``symbol``.
+
+    Returns:
+        The normalised DataFrame, or ``None`` when no attempt produced a
+        non-empty frame.
+    """
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            df = ak.stock_a_lg_indicator(symbol=code)
+            if df is not None and not df.empty:
+                # Normalise column names
+                # NOTE(review): '市盈率' and '市盈率TTM' both map to 'pe_ttm'; if a
+                # frame carries both, the rename yields duplicate column labels.
+                rename_map = {
+                    '日期': 'trade_date',
+                    '市盈率': 'pe_ttm',
+                    '市盈率TTM': 'pe_ttm',
+                    '静态市盈率': 'pe_static',
+                    '市净率': 'pb',
+                    '市销率': 'ps_ttm',
+                    '股息率': 'dv_ratio',
+                    '总市值': 'total_mv',
+                    '流通市值': 'circ_mv',
+                }
+                df = df.rename(columns=rename_map)
+                df['trade_date'] = pd.to_datetime(df['trade_date'])
+                df['code'] = code
+                cols = ['code', 'trade_date', 'pe_ttm', 'pe_static', 'pb',
+                        'ps_ttm', 'dv_ratio', 'total_mv', 'circ_mv']
+                available_cols = [c for c in cols if c in df.columns]
+                return df[available_cols]
+        except Exception:
+            # Any failure (including a missing 'trade_date' column) pauses briefly and retries.
+            time.sleep(0.5)
+    return None
+def sync_valuation(targets: List[Dict[str, str]], last_trade_day: str) -> int:
+    """Sync valuation indicator data incrementally into /tmp/data/valuation.parquet.
+
+    Loads the existing dataset (local file first, then the Hugging Face
+    dataset named by ``DATASET_REPO_ID``), takes its newest ``trade_date`` as
+    a global watermark, fetches per-stock data in a thread pool and appends
+    only the rows that are new.
+
+    Args:
+        targets: Records with at least 'code' and 'market' keys.
+        last_trade_day: Latest trading day, compared as a string against the
+            'YYYY-MM-DD' watermark (assumes the same format; confirm with caller).
+
+    Returns:
+        The number of stocks that contributed at least one new row; 0 when
+        the sync is skipped.
+    """
+    logger.info("Syncing valuation data...")
+    val_path = Path("/tmp/data/valuation.parquet")
+    val_path.parent.mkdir(parents=True, exist_ok=True)
+    old_df = None
+    global_latest_date = "2000-01-01"
+    existing_codes = set()
+    # 1. Prefer the local cache
+    if val_path.exists():
+        try:
+            old_df = pd.read_parquet(val_path)
+            global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+            existing_codes = set(old_df['code'].unique())
+            logger.info(f"Local valuation cache found, latest date: {global_latest_date}")
+        except Exception as e:
+            logger.warning(f"Failed to read local valuation cache: {e}")
+    # 2. No local cache: try to pull it from the cloud
+    if old_df is None:
+        repo_id = os.getenv("DATASET_REPO_ID")
+        if repo_id:
+            try:
+                old_file = hf_hub_download(repo_id=repo_id, filename="data/valuation.parquet", repo_type="dataset")
+                old_df = pd.read_parquet(old_file)
+                global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+                existing_codes = set(old_df['code'].unique())
+                # Store the downloaded copy as the local cache
+                old_df.to_parquet(val_path)
+                logger.info(f"Downloaded valuation from cloud, latest date: {global_latest_date}")
+            except Exception:
+                # Any download or read failure is treated as "no remote data yet".
+                logger.info("No existing valuation data found in cloud.")
+    # 3. Global watermark check + new-stock detection
+    stock_targets = [t for t in targets if t['market'] not in ['ETF', 'LOF', 'REITs', '可转债']]
+    new_codes = [t for t in stock_targets if t['code'] not in existing_codes]
+    if global_latest_date >= last_trade_day and not new_codes:
+        logger.info(f"Valuation data is already up to date ({global_latest_date}) and no new stocks. Skip.")
+        return 0
+    # 4. Incremental fetch
+    if global_latest_date >= last_trade_day:
+        logger.info(f"Global date is up to date, but found {len(new_codes)} new stocks. Syncing new stocks only.")
+        sync_targets = new_codes
+    else:
+        logger.info(f"Syncing valuation from {global_latest_date} to {last_trade_day}...")
+        sync_targets = stock_targets
+    all_data = []
+    success_count = 0
+    with ThreadPoolExecutor(max_workers=get_thread_config_safe()['valuation']) as executor:
+        futures = {executor.submit(get_stock_valuation, t['code']): t['code'] for t in sync_targets}
+        for i, future in enumerate(as_completed(futures), 1):
+            res = future.result()
+            if res is not None and not res.empty:
+                code = futures[future]
+                # Stocks already in the dataset keep only rows newer than the global watermark.
+                # NOTE(review): the watermark is global, so rows a stock missed on or before
+                # that date are never back-filled.
+                if code in existing_codes:
+                    res = res[res['trade_date'] > pd.to_datetime(global_latest_date)]
+                if not res.empty:
+                    all_data.append(res)
+                    success_count += 1
+            if i % 500 == 0:
+                logger.info(f"Valuation progress: {i}/{len(sync_targets)}, success: {success_count}")
+    # 5. Merge and save
+    if all_data:
+        new_df = pd.concat(all_data, ignore_index=True)
+        final_df = pd.concat([old_df, new_df]) if old_df is not None else new_df
+        final_df = final_df.drop_duplicates(subset=['code', 'trade_date'])
+        final_df.to_parquet(val_path)
+        logger.info(f"Valuation updated: {len(final_df)} total records")
+    return success_count
+# ==================== New: margin trading data sync ====================
+def get_stock_margin(code: str) -> Optional[pd.DataFrame]:
+    """Fetch margin trading data for a single stock.
+
+    Makes up to three attempts. Codes starting with '6' go to
+    ``ak.stock_margin_detail_sh``; all other codes go to
+    ``ak.stock_margin_detail_sz``. A non-empty result is renamed to short
+    column names, tagged with ``code`` and trimmed to the known columns that
+    are present.
+
+    Args:
+        code: Stock code passed to the data source as ``symbol``.
+
+    Returns:
+        The normalised DataFrame, or ``None`` when no attempt produced a
+        non-empty frame. The frame may lack 'trade_date' if the source has
+        no '日期' column.
+    """
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            # Shanghai-listed codes (prefix '6') use the SH endpoint, everything else the SZ one
+            # NOTE(review): confirm that both akshare functions accept a `symbol` keyword.
+            if code.startswith('6'):
+                df = ak.stock_margin_detail_sh(symbol=code)
+            else:
+                df = ak.stock_margin_detail_sz(symbol=code)
+            if df is not None and not df.empty:
+                # Normalise column names
+                rename_map = {
+                    '日期': 'trade_date',
+                    '融资余额': 'rzye',
+                    '融资买入额': 'rzmre',
+                    '融资偿还额': 'rzche',
+                    '融券余额': 'rqye',
+                    '融券卖出量': 'rqmcl',
+                    '融资融券余额': 'rzrqye',
+                }
+                df = df.rename(columns=rename_map)
+                if 'trade_date' in df.columns:
+                    df['trade_date'] = pd.to_datetime(df['trade_date'])
+                df['code'] = code
+                cols = ['code', 'trade_date', 'rzye', 'rzmre', 'rzche', 'rqye', 'rqmcl', 'rzrqye']
+                available_cols = [c for c in cols if c in df.columns]
+                return df[available_cols]
+        except Exception as e:
+            # This branch is a no-op: the final failure is swallowed without logging.
+            if attempt == max_retries - 1:
+                pass
+            time.sleep(0.5)
+    return None
+def sync_margin(targets: List[Dict[str, str]], last_trade_day: str) -> int:
+    """Sync margin trading data incrementally into /tmp/data/margin.parquet.
+
+    Loads the existing dataset (local file first, then the Hugging Face
+    dataset named by ``DATASET_REPO_ID``), takes its newest ``trade_date`` as
+    a global watermark, fetches per-stock data in a thread pool and appends
+    only the rows that are new. Only targets whose market is '主板', '创业板'
+    or '科创板' are considered.
+
+    Args:
+        targets: Records with at least 'code' and 'market' keys.
+        last_trade_day: Latest trading day, compared as a string against the
+            'YYYY-MM-DD' watermark (assumes the same format; confirm with caller).
+
+    Returns:
+        The number of stocks that contributed at least one new row; 0 when
+        the sync is skipped.
+    """
+    logger.info("Syncing margin trading data...")
+    margin_path = Path("/tmp/data/margin.parquet")
+    margin_path.parent.mkdir(parents=True, exist_ok=True)
+    old_df = None
+    global_latest_date = "2000-01-01"
+    existing_codes = set()
+    # 1. Prefer the local cache
+    if margin_path.exists():
+        try:
+            old_df = pd.read_parquet(margin_path)
+            global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+            existing_codes = set(old_df['code'].unique())
+            logger.info(f"Local margin cache found, latest date: {global_latest_date}")
+        except Exception as e:
+            logger.warning(f"Failed to read local margin cache: {e}")
+    # 2. No local cache: try to pull it from the cloud
+    if old_df is None:
+        repo_id = os.getenv("DATASET_REPO_ID")
+        if repo_id:
+            try:
+                old_file = hf_hub_download(repo_id=repo_id, filename="data/margin.parquet", repo_type="dataset")
+                old_df = pd.read_parquet(old_file)
+                global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+                existing_codes = set(old_df['code'].unique())
+                # Store the downloaded copy as the local cache
+                old_df.to_parquet(margin_path)
+                logger.info(f"Downloaded margin from cloud, latest date: {global_latest_date}")
+            except Exception:
+                # Any download or read failure is treated as "no remote data yet".
+                logger.info("No existing margin data found in cloud.")
+    # 3. Global watermark check + new-stock detection
+    stock_targets = [t for t in targets if t['market'] in ['主板', '创业板', '科创板']]
+    new_codes = [t for t in stock_targets if t['code'] not in existing_codes]
+    if global_latest_date >= last_trade_day and not new_codes:
+        logger.info(f"Margin data is already up to date ({global_latest_date}) and no new stocks. Skip.")
+        return 0
+    # 4. Incremental fetch
+    if global_latest_date >= last_trade_day:
+        logger.info(f"Global date is up to date, but found {len(new_codes)} new stocks. Syncing new stocks only.")
+        sync_targets = new_codes
+    else:
+        logger.info(f"Syncing margin data from {global_latest_date} to {last_trade_day}...")
+        sync_targets = stock_targets
+    all_data = []
+    success_count = 0
+    with ThreadPoolExecutor(max_workers=get_thread_config_safe()['margin']) as executor:
+        futures = {executor.submit(get_stock_margin, t['code']): t['code'] for t in sync_targets}
+        for i, future in enumerate(as_completed(futures), 1):
+            res = future.result()
+            if res is not None and not res.empty:
+                code = futures[future]
+                # Stocks already in the dataset keep only rows newer than the global watermark.
+                # NOTE(review): get_stock_margin converts 'trade_date' only when the column
+                # exists; a frame without it would raise KeyError here. Confirm it is always present.
+                if code in existing_codes:
+                    res = res[res['trade_date'] > pd.to_datetime(global_latest_date)]
+                if not res.empty:
+                    all_data.append(res)
+                    success_count += 1
+            if i % 500 == 0:
+                logger.info(f"Margin progress: {i}/{len(sync_targets)}, success: {success_count}")
+    # 5. Merge and save
+    if all_data:
+        new_df = pd.concat(all_data, ignore_index=True)
+        final_df = pd.concat([old_df, new_df]) if old_df is not None else new_df
+        final_df = final_df.drop_duplicates(subset=['code', 'trade_date'])
+        final_df.to_parquet(margin_path)
+        logger.info(f"Margin updated: {len(final_df)} total records")
+    return success_count
+# ==================== New: financial indicator data sync ====================
+def get_stock_financial_indicator(code: str) -> Optional[pd.DataFrame]:
+    """Fetch financial indicator data for a single stock.
+
+    Makes up to three attempts at ``ak.stock_financial_analysis_indicator``.
+    A non-empty result is renamed to English column names, tagged with
+    ``code`` and trimmed to the known columns that are present.
+
+    Args:
+        code: Stock code passed to the data source as ``symbol``.
+
+    Returns:
+        The normalised DataFrame, or ``None`` when no attempt produced a
+        non-empty frame. The frame may lack 'trade_date' if the source has
+        no '日期' column.
+    """
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            df = ak.stock_financial_analysis_indicator(symbol=code)
+            if df is not None and not df.empty:
+                # Normalise column names
+                # NOTE(review): only exact header matches are renamed; confirm these
+                # labels against the columns the data source actually returns.
+                rename_map = {
+                    '日期': 'trade_date',
+                    '净资产收益率': 'roe',
+                    '总资产净利率': 'roa',
+                    '销售毛利率': 'gross_margin',
+                    '销售净利率': 'net_margin',
+                    '资产负债率': 'debt_ratio',
+                    '流动比率': 'current_ratio',
+                    '速动比率': 'quick_ratio',
+                    '存货周转率': 'inventory_turnover',
+                    '应收账款周转率': 'receivable_turnover',
+                    '总资产周转率': 'total_asset_turnover',
+                }
+                df = df.rename(columns=rename_map)
+                if 'trade_date' in df.columns:
+                    df['trade_date'] = pd.to_datetime(df['trade_date'])
+                df['code'] = code
+                cols = ['code', 'trade_date', 'roe', 'roa', 'gross_margin', 'net_margin',
+                        'debt_ratio', 'current_ratio', 'quick_ratio',
+                        'inventory_turnover', 'receivable_turnover', 'total_asset_turnover']
+                available_cols = [c for c in cols if c in df.columns]
+                return df[available_cols]
+        except Exception:
+            # Any failure pauses briefly and retries; nothing is logged.
+            time.sleep(0.5)
+    return None
+def sync_financial_indicator(targets: List[Dict[str, str]]) -> int:
+    """Sync financial indicator data incrementally into /tmp/data/financial_indicator.parquet.
+
+    Loads the existing dataset (local file first, then the Hugging Face
+    dataset named by ``DATASET_REPO_ID``). The data counts as recent when its
+    newest ``trade_date`` is less than 90 days before the current Beijing
+    time; in that case only stocks absent from the dataset are fetched,
+    otherwise every non-fund, non-bond target is fetched.
+
+    Args:
+        targets: Records with at least 'code' and 'market' keys.
+
+    Returns:
+        The number of stocks that contributed at least one new row; 0 when
+        the sync is skipped.
+    """
+    logger.info("Syncing financial indicator data...")
+    fi_path = Path("/tmp/data/financial_indicator.parquet")
+    fi_path.parent.mkdir(parents=True, exist_ok=True)
+    old_df = None
+    global_latest_date = "2000-01-01"
+    existing_codes = set()
+    # 1. Prefer the local cache
+    if fi_path.exists():
+        try:
+            old_df = pd.read_parquet(fi_path)
+            global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+            existing_codes = set(old_df['code'].unique())
+            logger.info(f"Local financial cache found, latest date: {global_latest_date}")
+        except Exception as e:
+            logger.warning(f"Failed to read local financial cache: {e}")
+    # 2. No local cache: try to pull it from the cloud
+    if old_df is None:
+        repo_id = os.getenv("DATASET_REPO_ID")
+        if repo_id:
+            try:
+                old_file = hf_hub_download(repo_id=repo_id, filename="data/financial_indicator.parquet", repo_type="dataset")
+                old_df = pd.read_parquet(old_file)
+                global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+                existing_codes = set(old_df['code'].unique())
+                # Store the downloaded copy as the local cache
+                old_df.to_parquet(fi_path)
+                logger.info(f"Downloaded financial from cloud, latest date: {global_latest_date}")
+            except Exception:
+                # Any download or read failure is treated as "no remote data yet".
+                logger.info("No existing financial data found in cloud.")
+    # 3. Financial-indicator specific freshness check + new-stock detection
+    stock_targets = [t for t in targets if t['market'] not in ['ETF', 'LOF', 'REITs', '可转债']]
+    new_codes = [t for t in stock_targets if t['code'] not in existing_codes]
+    today = get_beijing_time()
+    is_recent = False
+    # "2000-01-01" is the sentinel meaning that no existing data was loaded.
+    if global_latest_date != "2000-01-01":
+        # NOTE(review): if get_beijing_time() returns a timezone-aware datetime, subtracting
+        # the naive Timestamp below raises TypeError; confirm it is naive.
+        days_diff = (today - pd.to_datetime(global_latest_date)).days
+        if days_diff < 90:
+            is_recent = True
+    if is_recent and not new_codes:
+        logger.info(f"Financial data is recent ({global_latest_date}) and no new stocks. Skip.")
+        return 0
+    # 4. Incremental fetch
+    if is_recent:
+        logger.info(f"Financial data is recent, but found {len(new_codes)} new stocks. Syncing new stocks only.")
+        sync_targets = new_codes
+    else:
+        logger.info(f"Syncing financial indicators (last update: {global_latest_date})...")
+        sync_targets = stock_targets
+    all_data = []
+    success_count = 0
+    with ThreadPoolExecutor(max_workers=get_thread_config_safe()['financial']) as executor:
+        futures = {executor.submit(get_stock_financial_indicator, t['code']): t['code'] for t in sync_targets}
+        for i, future in enumerate(as_completed(futures), 1):
+            res = future.result()
+            if res is not None and not res.empty:
+                code = futures[future]
+                # Stocks already in the dataset keep only rows newer than the global watermark.
+                if code in existing_codes:
+                    res = res[res['trade_date'] > pd.to_datetime(global_latest_date)]
+                if not res.empty:
+                    all_data.append(res)
+                    success_count += 1
+            if i % 500 == 0:
+                logger.info(f"Financial indicator progress: {i}/{len(sync_targets)}, success: {success_count}")
+    # 5. Merge and save
+    if all_data:
+        new_df = pd.concat(all_data, ignore_index=True)
+        final_df = pd.concat([old_df, new_df]) if old_df is not None else new_df
+        final_df = final_df.drop_duplicates(subset=['code', 'trade_date'])
+        final_df.to_parquet(fi_path)
+        logger.info(f"Financial updated: {len(final_df)} total records")
+    return success_count
+# ==================== New: shareholder count data sync ====================
+def sync_holder_num() -> int:
+    """Sync shareholder count data with one bulk request.
+
+    Calls ``ak.stock_zh_a_gdhs(symbol="全部")``, renames the known columns to
+    English and overwrites /tmp/data/holder_num.parquet with the result.
+    Columns not listed in the rename map are kept under their original names.
+
+    Returns:
+        The number of rows saved, or 0 when the fetch returned nothing or
+        any step raised (the exception is logged as a warning).
+    """
+    logger.info("Syncing holder number data...")
+    try:
+        # Fetch shareholder count data for the most recent reporting period
+        df = ak.stock_zh_a_gdhs(symbol="全部")
+        if df is not None and not df.empty:
+            # Normalise column names
+            rename_map = {
+                '代码': 'code',
+                '股东户数': 'holder_num',
+                '户均持股数量': 'avg_share',
+                '户均持股金额': 'avg_value',
+                '总股本': 'total_share',
+                '总市值': 'total_value',
+                '日期': 'trade_date',
+            }
+            df = df.rename(columns=rename_map)
+            if 'trade_date' in df.columns:
+                df['trade_date'] = pd.to_datetime(df['trade_date'])
+            # Save to Parquet (replaces any previous file; no merge with older data)
+            hn_path = Path("/tmp/data/holder_num.parquet")
+            hn_path.parent.mkdir(parents=True, exist_ok=True)
+            df.to_parquet(hn_path)
+            logger.info(f"Holder number data saved: {len(df)} records")
+            return len(df)
+    except Exception as e:
+        logger.warning(f"Failed to sync holder number data: {e}")
+    return 0
+# ==================== New: dividend data sync ====================
+def get_stock_dividend(code: str) -> Optional[pd.DataFrame]:
+    """Fetch dividend data for a single stock.
+
+    Makes up to three attempts at ``ak.stock_history_dividend``. A non-empty
+    result is renamed to English column names (the announcement date becomes
+    'trade_date'), tagged with ``code`` and trimmed to the known columns that
+    are present.
+
+    Args:
+        code: Stock code passed to the data source as ``symbol``.
+
+    Returns:
+        The normalised DataFrame, or ``None`` when no attempt produced a
+        non-empty frame.
+    """
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            df = ak.stock_history_dividend(symbol=code)
+            if df is not None and not df.empty:
+                # Normalise column names
+                rename_map = {
+                    '公告日期': 'trade_date',
+                    '分红方案': 'dividend_type',
+                    '分红金额': 'dividend_amount',
+                    '股权登记日': 'record_date',
+                    '除权除息日': 'ex_date',
+                    '派息日': 'pay_date',
+                }
+                df = df.rename(columns=rename_map)
+                df['trade_date'] = pd.to_datetime(df['trade_date'])
+                df['code'] = code
+                cols = ['code', 'trade_date', 'dividend_type', 'dividend_amount', 'record_date', 'ex_date', 'pay_date']
+                available_cols = [c for c in cols if c in df.columns]
+                return df[available_cols]
+        except Exception:
+            # Any failure (including a missing '公告日期' column) pauses briefly and retries.
+            time.sleep(0.5)
+    return None
+def sync_dividend(targets: List[Dict[str, str]]) -> int:
+    """Sync dividend data incrementally into /tmp/data/dividend.parquet.
+
+    Loads the existing dataset (local file first, then the Hugging Face
+    dataset named by ``DATASET_REPO_ID``). The data counts as recent when its
+    newest ``trade_date`` is less than 90 days before the current Beijing
+    time; in that case only stocks absent from the dataset are fetched,
+    otherwise every non-fund, non-bond target is fetched.
+
+    Args:
+        targets: Records with at least 'code' and 'market' keys.
+
+    Returns:
+        The number of stocks that contributed at least one new row; 0 when
+        the sync is skipped.
+    """
+    logger.info("Syncing dividend data...")
+    div_path = Path("/tmp/data/dividend.parquet")
+    div_path.parent.mkdir(parents=True, exist_ok=True)
+    old_df = None
+    global_latest_date = "2000-01-01"
+    existing_codes = set()
+    # Prefer the local cache; a read failure is ignored and falls through to the cloud copy
+    if div_path.exists():
+        try:
+            old_df = pd.read_parquet(div_path)
+            global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+            existing_codes = set(old_df['code'].unique())
+        except Exception: pass
+    # No usable local cache: try the cloud copy; failures are ignored silently
+    if old_df is None:
+        repo_id = os.getenv("DATASET_REPO_ID")
+        if repo_id:
+            try:
+                old_file = hf_hub_download(repo_id=repo_id, filename="data/dividend.parquet", repo_type="dataset")
+                old_df = pd.read_parquet(old_file)
+                global_latest_date = old_df['trade_date'].max().strftime('%Y-%m-%d')
+                existing_codes = set(old_df['code'].unique())
+                old_df.to_parquet(div_path)
+            except Exception: pass
+    # Re-check every 90 days + new-stock detection
+    stock_targets = [t for t in targets if t['market'] not in ['ETF', 'LOF', 'REITs', '可转债']]
+    new_codes = [t for t in stock_targets if t['code'] not in existing_codes]
+    today = get_beijing_time()
+    is_recent = False
+    # "2000-01-01" is the sentinel meaning that no existing data was loaded.
+    if global_latest_date != "2000-01-01":
+        # NOTE(review): if get_beijing_time() returns a timezone-aware datetime, subtracting
+        # the naive Timestamp below raises TypeError; confirm it is naive.
+        if (today - pd.to_datetime(global_latest_date)).days < 90:
+            is_recent = True
+    if is_recent and not new_codes:
+        logger.info(f"Dividend data is recent and no new stocks. Skip.")
+        return 0
+    # 4. Incremental fetch
+    if is_recent:
+        logger.info(f"Dividend data is recent, but found {len(new_codes)} new stocks. Syncing new stocks only.")
+        sync_targets = new_codes
+    else:
+        logger.info(f"Syncing dividend data (last update: {global_latest_date})...")
+        sync_targets = stock_targets
+    all_data = []
+    success_count = 0
+    with ThreadPoolExecutor(max_workers=get_thread_config_safe()['dividend']) as executor:
+        futures = {executor.submit(get_stock_dividend, t['code']): t['code'] for t in sync_targets}
+        for i, future in enumerate(as_completed(futures), 1):
+            res = future.result()
+            if res is not None and not res.empty:
+                code = futures[future]
+                # Stocks already in the dataset keep only rows newer than the global watermark.
+                if code in existing_codes:
+                    res = res[res['trade_date'] > pd.to_datetime(global_latest_date)]
+                if not res.empty:
+                    all_data.append(res)
+                    success_count += 1
+            if i % 500 == 0:
+                logger.info(f"Dividend progress: {i}/{len(sync_targets)}, success: {success_count}")
+    # Merge and save; 'dividend_type' is part of the de-duplication key here
+    if all_data:
+        new_df = pd.concat(all_data, ignore_index=True)
+        final_df = pd.concat([old_df, new_df]) if old_df is not None else new_df
+        final_df = final_df.drop_duplicates(subset=['code', 'trade_date', 'dividend_type'])
+        final_df.to_parquet(div_path)
+        logger.info(f"Dividend updated: {len(final_df)} total records")
+    return success_count
+# ==================== 新增：十大股东数据同步 ====================
+def _recent_quarter_ends(today, count: int = 4) -> list:
+    """Return the `count` latest quarter-end dates on or before `today`.
+
+    Dates are formatted as YYYYMMDD strings, newest first.
+    """
+    quarter_ends = ((12, 31), (9, 30), (6, 30), (3, 31))
+    current = (today.year, today.month, today.day)
+    dates = []
+    year = today.year
+    while len(dates) < count:
+        for month, day in quarter_ends:
+            if (year, month, day) <= current and len(dates) < count:
+                dates.append(f"{year:04d}{month:02d}{day:02d}")
+        year -= 1
+    return dates
+def sync_top_holders() -> int:
+    """Sync top-10 shareholder data (one bulk request) to /tmp/data/top_holders.parquet.
+
+    The previous file is overwritten on success.
+
+    Returns:
+        Number of rows saved, or 0 when nothing was fetched or an error occurred.
+    """
+    logger.info("Syncing top holders data...")
+    try:
+        # NOTE(review): akshare documents `date` as the last day of a report
+        # quarter, so an arbitrary calendar day generally matches nothing.
+        # Walk back through recent quarter ends until one has published data.
+        df = None
+        for report_date in _recent_quarter_ends(get_beijing_time()):
+            try:
+                candidate = ak.stock_gdfx_holding_analyse_em(date=report_date)
+            except Exception as e:
+                logger.info(f"No top holders data for report date {report_date}: {e}")
+                continue
+            if candidate is not None and not candidate.empty:
+                df = candidate
+                break
+        if df is None:
+            logger.warning("Failed to sync top holders: no data for recent report dates")
+            return 0
+        # NOTE(review): source column names are assumed to match the akshare
+        # output; a missing '公告日期' column makes the conversion below fail.
+        rename_map = {
+            '股票代码': 'code',
+            '公告日期': 'trade_date',
+            '股东名称': 'holder_name',
+            '持股数量': 'hold_num',
+            '持股比例': 'hold_ratio',
+            '持股变动': 'hold_change',
+        }
+        df = df.rename(columns=rename_map)
+        df['trade_date'] = pd.to_datetime(df['trade_date'])
+        path = Path("/tmp/data/top_holders.parquet")
+        path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_parquet(path)
+        logger.info(f"Top holders data saved: {len(df)} records")
+        return len(df)
+    except Exception as e:
+        logger.warning(f"Failed to sync top holders: {e}")
+    return 0
+# ==================== 新增：限售解禁数据同步 ====================
+def sync_restricted_unlock() -> int:
+    """Fetch the market-wide restricted-share unlock table and store it as Parquet.
+
+    The result of ``ak.stock_restricted_shares(stock="all")`` is renamed to
+    English column names, stamped with the sync time in ``trade_date`` and
+    written to /tmp/data/restricted_unlock.parquet.
+
+    Returns:
+        Number of rows written; 0 when the fetch is empty or any step fails.
+    """
+    logger.info("Syncing restricted unlock data...")
+    out_file = Path("/tmp/data/restricted_unlock.parquet")
+    out_file.parent.mkdir(parents=True, exist_ok=True)
+    column_names = {
+        '代码': 'code',
+        '名称': 'name',
+        '解禁日期': 'unlock_date',
+        '解禁数量': 'unlock_num',
+        '解禁股本占总股本比例': 'unlock_ratio',
+    }
+    try:
+        # One request covers the whole market.
+        raw = ak.stock_restricted_shares(stock="all")
+        if raw is None or raw.empty:
+            return 0
+        table = raw.rename(columns=column_names)
+        table['unlock_date'] = pd.to_datetime(table['unlock_date'])
+        # trade_date records when the sync ran, not when shares unlock.
+        table['trade_date'] = get_beijing_time()
+        table.to_parquet(out_file)
+        logger.info(f"Restricted unlock data saved: {len(table)} records")
+        return len(table)
+    except Exception as e:
+        logger.warning(f"Failed to sync restricted unlock: {e}")
+    return 0
+def main() -> int:
+    """
+    Run the complete data sync pipeline: stock list, daily quotes, index,
+    per-stock indicators, then upload to the Hugging Face dataset.
+    Returns:
+        int: exit code, 0 on success and 1 on failure
+    """
     logger.info("=" * 60)
+    logger.info("Stock Data Sync Started")
     logger.info("=" * 60)
+    try:
+        # Initialise thread configuration
+        init_thread_config()
+        db = get_db()
+        db.init_db()
+        # 1. Stock list
+        target_list = get_stock_list()
+        list_parquet = Path("/tmp/data/stock_list.parquet")
+        list_parquet.parent.mkdir(parents=True, exist_ok=True)
+        target_list.to_parquet(list_parquet)
+        # 2. Daily quotes
+        last_day = get_last_trading_day()
+        logger.info(f"Last trading day: {last_day}")
+        sync_count = sync_stock_daily(target_list.to_dict('records'), last_day)
+        # 3. Index
+        idx_df = get_index_daily('000300')
+        if idx_df is not None:
+            idx_path = Path("/tmp/data/parquet/index_000300.parquet")
+            idx_path.parent.mkdir(parents=True, exist_ok=True)
+            idx_df.to_parquet(idx_path)
+        # 4-10. Indicator datasets; each step receives a fresh records list
+        logger.info("-" * 40)
+        fund_flow_count = sync_fund_flow(target_list.to_dict('records'), last_day)
+        logger.info("-" * 40)
+        valuation_count = sync_valuation(target_list.to_dict('records'), last_day)
+        logger.info("-" * 40)
+        margin_count = sync_margin(target_list.to_dict('records'), last_day)
+        logger.info("-" * 40)
+        financial_count = sync_financial_indicator(target_list.to_dict('records'))
+        logger.info("-" * 40)
+        holder_count = sync_holder_num()
+        logger.info("-" * 40)
+        dividend_count = sync_dividend(target_list.to_dict('records'))
+        logger.info("-" * 40)
+        top_holders_count = sync_top_holders()
+        logger.info("-" * 40)
+        restricted_count = sync_restricted_unlock()
+        # 11. Upload
+        logger.info("-" * 40)
+        logger.info("Uploading to Hugging Face Dataset...")
+        db.upload_db()
+        logger.info("=" * 60)
+        logger.info("Sync Completed Successfully!")
+        summary = (f"Daily={sync_count}, FundFlow={fund_flow_count}, Valuation={valuation_count}, "
+                   f"Margin={margin_count}, Financial={financial_count}, Holder={holder_count}, "
+                   f"Dividend={dividend_count}, TopHolders={top_holders_count}, Restricted={restricted_count}")
+        logger.info(f"Summary: {summary}")
+        logger.info("=" * 60)
+        return 0
+    except Exception as e:
+        # logger.exception logs at ERROR level and appends the traceback, so
+        # the failing step can be identified from the log alone.
+        logger.exception(f"Sync failed with error: {e}")
+        return 1
 if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0 on success and 1 on failure; use it as the process exit status