Spaces:

superxu520
/

sync_stock

Paused

App Files Community

superxu520 committed on Mar 16

Commit

3d1ce67

1 Parent(s): 91985a9

"fix:optimize-daily-sync-global-watermark"

Browse files

Files changed (1) hide show

sync_data.py +70 -12

sync_data.py CHANGED Viewed

@@ -471,21 +471,79 @@ def get_index_daily(code: str) -> Optional[pd.DataFrame]:
 def sync_stock_daily(targets: List[Dict[str, str]], last_trade_day: str) -> Dict[str, Any]:
-    """增量同步逻辑，返回详细结果"""
-    db = get_db()
-    # 获取现有数据的最新日期
-    existing_latest = db.conn.execute("SELECT code, CAST(MAX(trade_date) AS VARCHAR) FROM stock_daily GROUP BY code").fetchall()
-    latest_map = {row[0]: row[1] for row in existing_latest}
     pending = []
-    for t in targets:
-        code = t['code']
-        if code in latest_map:
-            if latest_map[code] >= last_trade_day: continue
-            start_dt = (pd.to_datetime(latest_map[code]) + timedelta(days=1)).strftime('%Y-%m-%d')
-        else:
-            start_dt = (get_beijing_time() - timedelta(days=YEARS_OF_DATA * 365)).strftime('%Y-%m-%d')
         t['start_dt'] = start_dt
         pending.append(t)

 def sync_stock_daily(targets: List[Dict[str, str]], last_trade_day: str) -> Dict[str, Any]:
+    """增量同步逻辑，返回详细结果（采用全局水位线机制）"""
+    logger.info("Syncing daily data...")
+    # 1. 扫描本地 parquet 文件获取全局最新日期（类似其他指标）
+    parquet_dir = Path("/tmp/data/parquet")
+    parquet_dir.mkdir(parents=True, exist_ok=True)
+    global_latest_date = "2000-01-01"
+    existing_codes = set()
+    for f in parquet_dir.glob("*.parquet"):
+        if f.name.startswith('index_'):  # 跳过指数文件
+            continue
+        try:
+            df = pd.read_parquet(f)
+            if not df.empty and 'trade_date' in df.columns:
+                max_date = df['trade_date'].max()
+                if isinstance(max_date, pd.Timestamp):
+                    max_date = max_date.strftime('%Y-%m-%d')
+                if max_date > global_latest_date:
+                    global_latest_date = max_date
+                existing_codes.update(df['code'].unique())
+        except Exception:
+            pass
+    # 2. 如果本地没有数据，尝试从云端下载最近3个月作为基准
+    if global_latest_date == "2000-01-01":
+        repo_id = os.getenv("DATASET_REPO_ID")
+        if repo_id:
+            try:
+                files = list_repo_files(repo_id=repo_id, repo_type="dataset")
+                parquet_files = sorted([f for f in files if f.startswith("data/parquet/") and f.endswith(".parquet")])
+                # 下载最近3个月的数据作为基准
+                for pf in parquet_files[-3:]:
+                    try:
+                        local_file = hf_hub_download(repo_id=repo_id, filename=pf, repo_type="dataset")
+                        df = pd.read_parquet(local_file)
+                        if not df.empty and 'trade_date' in df.columns:
+                            max_date = df['trade_date'].max()
+                            if isinstance(max_date, pd.Timestamp):
+                                max_date = max_date.strftime('%Y-%m-%d')
+                            if max_date > global_latest_date:
+                                global_latest_date = max_date
+                            existing_codes.update(df['code'].unique())
+                    except Exception:
+                        pass
+                logger.info(f"Downloaded daily data from cloud, latest date: {global_latest_date}")
+            except Exception as e:
+                logger.info(f"No existing daily data in cloud: {e}")
+    # 3. 区分新股和存量股票
+    new_codes = [t for t in targets if t['code'] not in existing_codes]
+    # 4. 全局水位线拦截
+    if global_latest_date >= last_trade_day and not new_codes:
+        logger.info(f"Daily data is already up to date ({global_latest_date}) and no new stocks. Skip.")
+        return {'count': 0, 'failed_codes': [], 'status': 'skipped', 'message': f'Already up to date ({global_latest_date})'}
+    # 5. 确定同步策略
+    if global_latest_date >= last_trade_day:
+        logger.info(f"Global date is up to date, but found {len(new_codes)} new stocks. Syncing new stocks only.")
+        sync_targets = new_codes
+        # 新股只获取最近1年数据（而非10年）
+        start_dt = (get_beijing_time() - timedelta(days=365)).strftime('%Y-%m-%d')
+    else:
+        logger.info(f"Syncing daily data from {global_latest_date} to {last_trade_day}...")
+        sync_targets = targets
+        start_dt = (pd.to_datetime(global_latest_date) + timedelta(days=1)).strftime('%Y-%m-%d')
+    # 设置每只股票的start_dt
     pending = []
+    for t in sync_targets:
         t['start_dt'] = start_dt
         pending.append(t)