Beracles committed on
Commit
346738b
·
1 Parent(s): e7d4173

优化日志加载和清理功能,支持时间戳管理和日期范围过滤

Browse files
Files changed (1) hide show
  1. logging_helper.py +130 -81
logging_helper.py CHANGED
@@ -8,13 +8,15 @@ from apscheduler.schedulers.background import BackgroundScheduler
8
  from utils import beijing, md5, json_to_str
9
  from huggingface_hub import HfApi
10
  import pandas as pd
11
- import datetime
12
  from zoneinfo import ZoneInfo
13
  import glob
14
 
15
  hf = HfApi()
16
  hf.token = os.environ.get("hf_token")
17
 
 
 
18
 
19
  class LoggingHelper:
20
 
@@ -39,6 +41,7 @@ class LoggingHelper:
39
  self.scheduler = BackgroundScheduler()
40
  self.buffer = dict[str, ds.Dataset]()
41
  self.need_push = dict[str, bool]()
 
42
  self.today = beijing().date()
43
  ds.disable_progress_bar()
44
  self.dataframe: pd.DataFrame
@@ -58,6 +61,7 @@ class LoggingHelper:
58
  self.buffer[remotepath] = self.buffer[remotepath].add_item(log) # type: ignore
59
  else:
60
  self.buffer[remotepath] = ds.Dataset.from_dict({})
 
61
  self.buffer[remotepath] = self.buffer[remotepath].add_item(log) # type: ignore
62
  self.need_push[remotepath] = True
63
  self.dataframe_refresh_needed = True
@@ -149,17 +153,34 @@ class LoggingHelper:
149
  try:
150
  res = hf.snapshot_download(
151
  repo_id=self.repo_id,
152
- repo_type="dataset",
153
  local_dir=self.local_dir,
154
  )
155
  print(f"[pull] Downloaded to {res}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  except Exception as e:
157
  print(f"[pull] {type(e)}: {e}")
158
  print("[pull] Done")
159
 
160
- def get_pathes_between(
161
- self, from_date: datetime.date, to_date: datetime.date
162
- ) -> list[str]:
163
  """
164
  获取指定日期范围内的路径列表
165
 
@@ -167,79 +188,109 @@ class LoggingHelper:
167
  :param to_date: 结束日期(格式:YYYY-MM-DD 或 datetime.date),含该日期
168
  :return: 日期范围内的路径列表,格式为 ["YYYY/MM/DD", ...]
169
  """
170
- pathes = []
171
  current_date = from_date
172
  while current_date <= to_date:
173
- pathes.append(f"{current_date.year}/{current_date.month}/{current_date.day}")
174
- current_date += datetime.timedelta(days=1)
 
 
 
 
175
  return pathes
176
 
177
- def load_logs(self):
 
 
178
  """
179
  在启动时加载最近30天的日志数据到内存buffer
180
  """
181
- print("[load_logs] Starting to load recent 30 days logs")
182
-
183
  try:
184
- today = beijing().date()
185
- start_date = today - datetime.timedelta(days=self.cache_days)
186
- print(f"Loading logs from {start_date} to {today}")
187
- # 生成最近30天的日期范围
188
- pathes = self.get_pathes_between(start_date, today)
189
- total_files_loaded = 0
190
 
191
- # 遍历每一天的日志
192
- for path in pathes:
193
- date_path = "/".join([self.local_dir, path])
194
- print(f"[load_logs] Processing directory: {date_path}")
195
- # 检查该日期的目录是否存在
196
- if not os.path.exists(date_path):
197
- print(f"[load_logs] Directory not found: {date_path}")
 
 
 
 
198
  continue
199
- # 加载该目录下的所有JSON文件
200
- json_files = glob.glob(os.path.join(date_path, "*.json"))
201
-
202
- for json_file in json_files:
203
- # 构造相对路径作为buffer的key
204
- relative_path = os.path.relpath(json_file, self.local_dir).replace(
205
- os.sep, "/"
206
- )
207
- try:
208
- # 检查文件是否为空
209
- if os.path.getsize(json_file) == 0:
210
- print(f"[load_logs] Skipping empty file: {relative_path}")
211
- continue
212
-
213
- # 加载JSON数据到Dataset
214
- dataset = ds.Dataset.from_json(json_file)
215
- if isinstance(dataset, ds.Dataset):
216
- self.buffer[relative_path] = dataset
217
- self.need_push[relative_path] = False
218
- total_files_loaded += 1
219
- except Exception as e:
220
- print(f"[load_logs] Error loading {relative_path}: {e}")
221
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  print(f"[load_logs] Successfully loaded {total_files_loaded} log files")
223
  print(f"[load_logs] Total datasets in buffer: {len(self.buffer)}")
224
  except Exception as e:
225
  print(f"[load_logs] Error: {type(e)}: {e}")
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  def cleanup_old_logs(self):
228
- """清理buffer中超过30天的日志数据"""
 
 
 
 
 
229
  try:
230
  print("[cleanup_old_logs] Starting cleanup of old logs")
231
- cache_dir_to_remove = (
232
- self.today - datetime.timedelta(days=(self.cache_days + 1))
233
- ).strftime("%Y/%m/%d")
234
- print(
235
- f"[cleanup_old_logs] Removing logs in {cache_dir_to_remove} from buffer"
236
- )
237
  removed_count = 0
238
  for filepath in list(self.buffer.keys()):
239
- if filepath.startswith(cache_dir_to_remove):
 
 
 
 
 
 
240
  del self.buffer[filepath]
241
  del self.need_push[filepath]
242
  removed_count += 1
 
 
 
 
243
 
244
  print(f"[cleanup_old_logs] Cleaned up {removed_count} old log files")
245
  print(
@@ -266,9 +317,7 @@ class LoggingHelper:
266
  self.scheduler.start()
267
 
268
  def refresh_dataframe(self) -> pd.DataFrame:
269
- """
270
- 加载最近30天的日志文件并返回合并后的DataFrame
271
- """
272
  datasets = list(self.buffer.values())
273
  merged_dataset = ds.concatenate_datasets(datasets)
274
  self.dataframe = merged_dataset.to_pandas() # type: ignore
@@ -276,42 +325,42 @@ class LoggingHelper:
276
  self.dataframe_refresh_needed = False
277
  return self.dataframe # type: ignore
278
 
279
- def refresh(self, from_date=None, to_date=None) -> list[dict]:
280
  """
281
- 获取刷新后的日志列表,从内存buffer中合并Dataset,支持日期范围过滤
282
 
 
283
  基于timestamp字段进行日期过滤。时间戳格式为 ISO 8601 格式(如 "2025-09-08T16:01:07.526954+08:00")
284
 
285
  :param from_date: 开始日期(格式:YYYY-MM-DD 或 datetime.date),含该日期的所有日志
286
  :param to_date: 结束日期(格式:YYYY-MM-DD 或 datetime.date),含该日期的所有日志
287
  :return: 按时间戳降序排列的日志字典列表
288
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  if self.dataframe_refresh_needed:
290
  self.refresh_dataframe()
291
-
292
  df = self.dataframe
293
- # 将字符串日期转换为 datetime.date 对象
294
- tz = ZoneInfo("Asia/Shanghai")
295
- if isinstance(from_date, str):
296
- from_date = (
297
- datetime.datetime.strptime(from_date, "%Y-%m-%d")
298
- .astimezone(tz)
299
- .isoformat(timespec="microseconds")
300
- )
301
- if isinstance(to_date, str):
302
- to_date = datetime.datetime.strptime(to_date, "%Y-%m-%d").astimezone(tz)
303
- to_date += datetime.timedelta(days=1) # 包含结束日期全天
304
- to_date = to_date.isoformat(timespec="microseconds")
305
  print(f"[refresh] Filtering logs from {from_date} to {to_date}")
306
- # 按timestamp范围过滤(包含边界日期的全天数据)
307
- if from_date is not None or to_date is not None:
308
- # 创建日期范围过滤条件
309
- filter_condition = pd.Series([True] * len(df), index=df.index)
310
- if from_date is not None:
311
- filter_condition = filter_condition & (df["timestamp"] >= from_date)
312
- if to_date is not None:
313
- filter_condition = filter_condition & (df["timestamp"] < to_date)
314
- df = df[filter_condition]
315
  # 按timestamp降序排序(最新日志在前)
316
  df = df.sort_values(by="timestamp", ascending=False)
317
  print(f"[refresh] Returning {len(df)} logs")
 
8
  from utils import beijing, md5, json_to_str
9
  from huggingface_hub import HfApi
10
  import pandas as pd
11
+ from datetime import datetime, date, timedelta
12
  from zoneinfo import ZoneInfo
13
  import glob
14
 
15
  hf = HfApi()
16
  hf.token = os.environ.get("hf_token")
17
 
18
+ TIMEZONE = ZoneInfo("Asia/Shanghai")
19
+
20
 
21
  class LoggingHelper:
22
 
 
41
  self.scheduler = BackgroundScheduler()
42
  self.buffer = dict[str, ds.Dataset]()
43
  self.need_push = dict[str, bool]()
44
+ self.timestamps = dict[str, str]()
45
  self.today = beijing().date()
46
  ds.disable_progress_bar()
47
  self.dataframe: pd.DataFrame
 
61
  self.buffer[remotepath] = self.buffer[remotepath].add_item(log) # type: ignore
62
  else:
63
  self.buffer[remotepath] = ds.Dataset.from_dict({})
64
+ self.timestamps[remotepath] = beijing().isoformat(timespec="microseconds")
65
  self.buffer[remotepath] = self.buffer[remotepath].add_item(log) # type: ignore
66
  self.need_push[remotepath] = True
67
  self.dataframe_refresh_needed = True
 
153
  try:
154
  res = hf.snapshot_download(
155
  repo_id=self.repo_id,
156
+ repo_type=self.repo_type,
157
  local_dir=self.local_dir,
158
  )
159
  print(f"[pull] Downloaded to {res}")
160
+ remotepathes = hf.list_repo_files(
161
+ repo_id=self.repo_id, repo_type=self.repo_type
162
+ )
163
+ jsonfiles = [f for f in remotepathes if f.endswith(".json")]
164
+ print(f"[pull] {len(jsonfiles)} files found in remote repo")
165
+ print("[pull] Parsing timestamps")
166
+ for remotepath in jsonfiles:
167
+ try:
168
+ parts = remotepath.split("/")
169
+ year, month, day = parts[0], parts[1], parts[2]
170
+ date_obj = date(int(year), int(month), int(day))
171
+ timestamp = datetime.combine(
172
+ date_obj, datetime.min.time()
173
+ ).isoformat(timespec="microseconds")
174
+ self.timestamps[remotepath] = timestamp
175
+ except Exception as e:
176
+ print(f"[pull] Error parsing timestamp of {remotepath}: {e}")
177
+ continue
178
+ print("[pull] Done")
179
  except Exception as e:
180
  print(f"[pull] {type(e)}: {e}")
181
  print("[pull] Done")
182
 
183
def get_pathes_between(self, from_date: date, to_date: date) -> dict[str, str]:
    """Build the repo sub-paths for every day in a date range.

    :param from_date: first date (inclusive)
    :param to_date: last date (inclusive)
    :return: mapping of "YYYY/M/D" path (no zero padding, matching the
        remote repo layout) to the ISO-8601 timestamp of that day's midnight
    """
    # NOTE(review): the timestamps produced here are naive (no tzinfo),
    # while cutoff_timestamp() elsewhere in this class emits tz-aware
    # strings — lexicographic comparison across the two styles is suspect;
    # confirm before mixing them.
    pathes: dict[str, str] = {}
    current_date = from_date
    while current_date <= to_date:
        key = f"{current_date.year}/{current_date.month}/{current_date.day}"
        pathes[key] = datetime.combine(
            current_date, datetime.min.time()
        ).isoformat(timespec="microseconds")
        current_date += timedelta(days=1)
    return pathes
 
202
def load_logs(
    self, from_timestamp: str | None = None, to_timestamp: str | None = None
):
    """Load on-disk JSON log files within a timestamp window into the buffer.

    Iterates ``self.timestamps`` (remote path -> ISO-8601 timestamp), skips
    entries outside the window, already-buffered files, missing files and
    empty files, and loads the rest as ``ds.Dataset`` objects. Newly loaded
    files are marked as not needing a push, and the dataframe is flagged for
    refresh when anything was loaded.

    :param from_timestamp: inclusive ISO-8601 lower bound; defaults to the
        retention cutoff returned by ``cutoff_timestamp()``
    :param to_timestamp: exclusive ISO-8601 upper bound; defaults to the end
        of today in Beijing time
    """
    try:
        from_timestamp = from_timestamp or self.cutoff_timestamp()
        to_timestamp = to_timestamp or (
            beijing()
            .replace(hour=23, minute=59, second=59, microsecond=999999)
            .isoformat(timespec="microseconds")
        )
        total_files_loaded = 0
        for remotepath, timestamp in self.timestamps.items():
            # NOTE(review): string comparison assumes every stored timestamp
            # uses one consistent ISO format/offset — confirm, since pull()
            # stores naive midnights while appended logs may carry tz-aware
            # stamps.
            if timestamp < from_timestamp or timestamp >= to_timestamp:
                continue
            # Cheapest check first: skip before touching the filesystem.
            if remotepath in self.buffer:
                print(f"[load_logs] File already loaded: {remotepath}")
                continue
            localpath = "/".join([self.local_dir, remotepath])
            print(f"[load_logs] Loading file {localpath}")
            if not os.path.exists(localpath):
                print(f"[load_logs] File not found: {localpath}")
                continue
            try:
                # Empty files would make Dataset.from_json fail.
                if os.path.getsize(localpath) == 0:
                    print(f"[load_logs] Skipping empty file: {remotepath}")
                    continue
                # Load the JSON data into a Dataset.
                dataset = ds.Dataset.from_json(localpath)
                if isinstance(dataset, ds.Dataset):
                    self.buffer[remotepath] = dataset
                    # Freshly pulled from disk, so nothing to push back.
                    self.need_push[remotepath] = False
                    total_files_loaded += 1
            except Exception as e:
                print(f"[load_logs] Error loading {remotepath}: {e}")
                continue
        if total_files_loaded > 0:
            self.dataframe_refresh_needed = True
        print(f"[load_logs] Successfully loaded {total_files_loaded} log files")
        print(f"[load_logs] Total datasets in buffer: {len(self.buffer)}")
    except Exception as e:
        print(f"[load_logs] Error: {type(e)}: {e}")
252
 
253
def cutoff_timestamp(self) -> str:
    """Return the oldest timestamp (ISO-8601, Asia/Shanghai) still retained.

    Logs with timestamps strictly before this value fall outside the
    ``cache_days`` retention window.

    :return: tz-aware ISO-8601 string for Beijing midnight of
        ``today - cache_days``
    """
    cutoff_date = self.today - timedelta(days=self.cache_days)
    # Attach the Beijing zone directly. The previous code built a naive
    # datetime and called .astimezone(TIMEZONE), which first interprets the
    # naive value in the *server's* local zone — on a non-Beijing host the
    # cutoff would be shifted by the zone-offset difference.
    return datetime.combine(
        cutoff_date, datetime.min.time(), tzinfo=TIMEZONE
    ).isoformat(timespec="microseconds")
266
+
267
  def cleanup_old_logs(self):
268
+ """
269
+ 清理buffer中超过30天的日志数据
270
+
271
+ 保留逻辑:保留最近cache_days天的日志
272
+ 删除逻辑:删除早于 (today - cache_days) 的所有日志
273
+ """
274
  try:
275
  print("[cleanup_old_logs] Starting cleanup of old logs")
276
+ # 计算应该保留的最早日期(含这一天)
277
+ start_timestamp = self.cutoff_timestamp()
 
 
 
 
278
  removed_count = 0
279
  for filepath in list(self.buffer.keys()):
280
+ # filepath 格式类似 "2025/9/23/xx.json"
281
+ # 提取日期部分 "2025/9/23"
282
+ try:
283
+ timestamp = self.timestamps[filepath]
284
+ # 如果文件日期早于截断日期,则删除
285
+ if timestamp >= start_timestamp:
286
+ continue
287
  del self.buffer[filepath]
288
  del self.need_push[filepath]
289
  removed_count += 1
290
+ print(f"[cleanup_old_logs] Removed {filepath}")
291
+ except (ValueError, IndexError) as e:
292
+ print(f"[cleanup_old_logs] Error parsing filepath {filepath}: {e}")
293
+ continue
294
 
295
  print(f"[cleanup_old_logs] Cleaned up {removed_count} old log files")
296
  print(
 
317
  self.scheduler.start()
318
 
319
  def refresh_dataframe(self) -> pd.DataFrame:
320
+ """内存中所有日志数据合并为一个DataFrame"""
 
 
321
  datasets = list(self.buffer.values())
322
  merged_dataset = ds.concatenate_datasets(datasets)
323
  self.dataframe = merged_dataset.to_pandas() # type: ignore
 
325
  self.dataframe_refresh_needed = False
326
  return self.dataframe # type: ignore
327
 
328
+ def refresh(self, from_date: str | None, to_date: str | None) -> list[dict]:
329
  """
330
+ 获取刷新后的日志列表,支持查询任意时间范围的日志(包括超过30天前的日志)
331
 
332
+ 当查询超过30天前的日志时,会动态从磁盘加载相应数据。
333
  基于timestamp字段进行日期过滤。时间戳格式为 ISO 8601 格式(如 "2025-09-08T16:01:07.526954+08:00")
334
 
335
  :param from_date: 开始日期(格式:YYYY-MM-DD 或 datetime.date),含该日期的所有日志
336
  :param to_date: 结束日期(格式:YYYY-MM-DD 或 datetime.date),含该日期的所有日志
337
  :return: 按时间戳降序排列的日志字典列表
338
  """
339
+ from_timestamp = None
340
+ if from_date is not None:
341
+ from_datetime = datetime.strptime(from_date, "%Y-%m-%d").astimezone(
342
+ TIMEZONE
343
+ )
344
+ from_timestamp = from_datetime.isoformat(timespec="microseconds")
345
+ to_timestamp = None
346
+ if to_date is not None:
347
+ to_datetime = datetime.strptime(to_date, "%Y-%m-%d").astimezone(
348
+ TIMEZONE
349
+ ) + timedelta(days=1)
350
+ to_timestamp = to_datetime.isoformat(timespec="microseconds")
351
+
352
+ print("[load_logs] Starting to load recent 30 days logs")
353
+ # 如果查询范围超出缓存范围,则加载相应的日志文件
354
+ self.load_logs(from_timestamp=from_timestamp, to_timestamp=to_timestamp)
355
  if self.dataframe_refresh_needed:
356
  self.refresh_dataframe()
 
357
  df = self.dataframe
 
 
 
 
 
 
 
 
 
 
 
 
358
  print(f"[refresh] Filtering logs from {from_date} to {to_date}")
359
+ # 创建日期范围过滤条件
360
+ filter_condition = pd.Series([True] * len(df), index=df.index)
361
+ filter_condition = filter_condition & (df["timestamp"] >= from_date)
362
+ filter_condition = filter_condition & (df["timestamp"] < to_date)
363
+ df = df[filter_condition]
 
 
 
 
364
  # 按timestamp降序排序(最新日志在前)
365
  df = df.sort_values(by="timestamp", ascending=False)
366
  print(f"[refresh] Returning {len(df)} logs")