Spaces:
Sleeping
Sleeping
| from collections import deque | |
| from datetime import datetime | |
| from typing import List, Dict, Any | |
| import json | |
| import numpy as np | |
| from lpm_kernel.L1.bio import Cluster | |
| import logging | |
# strftime pattern shared by timestamp helpers, e.g. "2024-01-31 13:45:00".
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def get_cur_time() -> str:
    """Return the current time rendered with ``TIME_FORMAT``.

    The timestamp is taken from ``datetime.now()`` with no timezone argument,
    so it is the naive local time of the running process.

    Returns:
        str: The current time formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    now = datetime.now()
    return now.strftime(TIME_FORMAT)
def find_connected_components(
    cluster_list: List[Cluster], cluster_merge_distance: float
) -> List[List[Cluster]]:
    """Group clusters whose centers are transitively closer than a threshold.

    Two clusters are linked when the Euclidean distance between their
    ``cluster_center`` values is strictly less than ``cluster_merge_distance``.
    Groups are the connected components of that link graph, discovered with a
    breadth-first search.

    Args:
        cluster_list: Clusters to group. Each must expose a ``cluster_center``
            that supports subtraction and ``np.linalg.norm``.
        cluster_merge_distance: Exclusive upper bound on the center-to-center
            distance for two clusters to be linked.

    Returns:
        List[List[Cluster]]: One list per connected component. Components are
        ordered by their lowest-index member; within a component, members
        appear in breadth-first visiting order.
    """
    total = len(cluster_list)
    centers = [cluster.cluster_center for cluster in cluster_list]

    # Full pairwise distance table; entry [i, j] is the distance between
    # the centers of cluster i and cluster j.
    distances = np.array(
        [[np.linalg.norm(src - dst) for dst in centers] for src in centers]
    )

    seen = [False] * total
    grouped = []

    for seed in range(total):
        if seen[seed]:
            continue

        # Breadth-first expansion from the first not-yet-assigned cluster.
        seen[seed] = True
        frontier = deque([seed])
        members = []
        while frontier:
            current = frontier.popleft()
            members.append(cluster_list[current])
            for other in range(total):
                if seen[other]:
                    continue
                if distances[current, other] < cluster_merge_distance:
                    # Mark on enqueue so an index is never queued twice.
                    seen[other] = True
                    frontier.append(other)
        grouped.append(members)

    return grouped
def is_valid_note(note: Dict[str, Any]) -> bool:
    """Report whether a note carries a non-empty ``createTime`` value.

    Args:
        note: Dictionary containing note data.

    Returns:
        bool: True when ``"createTime"`` is present and truthy; False when the
        key is missing or its value is falsy (``None``, ``""``, ``0``, ...).
    """
    return bool(note.get("createTime"))
def is_valid_todo(todo: Dict[str, Any]) -> bool:
    """Report whether a todo item carries a non-empty ``createTime`` value.

    Args:
        todo: Dictionary containing todo data.

    Returns:
        bool: True when ``"createTime"`` is present and truthy; False when the
        key is missing or its value is falsy.
    """
    created_at = todo.get("createTime")
    return True if created_at else False
def is_valid_chat(chat: Dict[str, Any]) -> bool:
    """Report whether a chat has both a creation time and a summary.

    Args:
        chat: Dictionary containing chat data.

    Returns:
        bool: True only when both ``"createTime"`` and ``"summary"`` are
        present with truthy values; False otherwise.
    """
    required_fields = ("createTime", "summary")
    # all() stops at the first missing/falsy field, checking createTime first.
    return all(chat.get(field) for field in required_fields)
def save_true_topics(true_topics_res: Dict[str, Dict], topics_path: str) -> None:
    """Write topic clustering results to a JSON file, keeping selected fields.

    For every cluster only the fields listed below are written; any other
    keys present in the input (such as embedding data) are left out. The
    input dictionary itself is not modified.

    Args:
        true_topics_res: Mapping of cluster id to that cluster's data. Each
            cluster dict must contain every exported field.
        topics_path: Path of the JSON file to create or overwrite.

    Raises:
        KeyError: If a cluster dict lacks one of the exported fields.
    """
    # Fields copied into the output, in the order they appear in the file.
    exported_fields = (
        "indices",
        "docIds",
        "contents",
        "chunkIds",
        "tags",
        "topic",
        "topicId",
        "recTimes",
    )

    # Build a fresh structure so the caller's dictionaries stay untouched.
    topics_to_save = {
        cluster_id: {field: cluster_data[field] for field in exported_fields}
        for cluster_id, cluster_data in true_topics_res.items()
    }

    with open(topics_path, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(topics_to_save, out_file, ensure_ascii=False, indent=4)