import pickle
import subprocess
from typing import Dict, List

import numpy as np
import pandas as pd
import torch

# This file contains helper functions for the backend operations
def build_full_prompt(prompt: str, prompt_prefix: str, prompt_suffix: str) -> str:
    """
    Build the full prompt by wrapping the content with instruction text

    Args:
        prompt: Original financial statement content (without instructions)
        prompt_prefix: Instruction text placed before the content
        prompt_suffix: Instruction text placed after the content

    Returns:
        Full prompt with instructions
    """
    return f"{prompt_prefix}{prompt}{prompt_suffix}"
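
# Usage sketch (the prefix/suffix strings below are hypothetical examples,
# not values defined elsewhere in this module):
# full_prompt = build_full_prompt(
#     prompt="Revenue increased 12% year over year.",
#     prompt_prefix="Classify the sentiment of the following statement:\n",
#     prompt_suffix="\nAnswer with positive, negative, or neutral.",
# )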
def check_gpu_utilization():
    """Print detailed GPU utilization information"""
    if not torch.cuda.is_available():
        print("❌ CUDA is not available. Running on CPU.")
        return False

    # Print GPU device information
    device_count = torch.cuda.device_count()
    print(f"✅ Found {device_count} CUDA device(s):")
    for i in range(device_count):
        device_props = torch.cuda.get_device_properties(i)
        print(f"  Device {i}: {device_props.name}")
        print(f"  Memory: {device_props.total_memory / 1024**3:.2f} GB")

    # Print current GPU usage
    current_device = torch.cuda.current_device()
    print(f"\nCurrent device: {current_device} ({torch.cuda.get_device_name(current_device)})")
    print(f"  Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"  Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

    # Try nvidia-smi for more detailed information
    try:
        print("\nnvidia-smi output:")
        subprocess.run(['nvidia-smi'], check=True)
    except (OSError, subprocess.CalledProcessError):
        print("Failed to run nvidia-smi command")
    return True
def jensen_shannon_distance(p: Dict[str, float], q: Dict[str, float]) -> float:
    """
    Calculate the Jensen-Shannon distance between two probability distributions

    Args:
        p: First probability distribution as a dictionary
        q: Second probability distribution as a dictionary

    Returns:
        Jensen-Shannon distance (0 = identical, 1 = maximally different)
    """
    # Ensure all keys are present in both distributions
    all_keys = set(p.keys()) | set(q.keys())
    p_vec = np.array([p.get(k, 0.0) for k in all_keys])
    q_vec = np.array([q.get(k, 0.0) for k in all_keys])

    # Normalize distributions
    p_vec = p_vec / np.sum(p_vec) if np.sum(p_vec) > 0 else p_vec
    q_vec = q_vec / np.sum(q_vec) if np.sum(q_vec) > 0 else q_vec

    # Calculate the midpoint distribution
    m_vec = 0.5 * (p_vec + q_vec)

    # Clamp with a small epsilon to avoid log(0)
    eps = 1e-10
    p_vec = np.maximum(p_vec, eps)
    q_vec = np.maximum(q_vec, eps)
    m_vec = np.maximum(m_vec, eps)

    # KL divergences in base 2, so the divergence is bounded by 1 and the
    # distance matches the documented [0, 1] range
    kl_p_m = np.sum(p_vec * np.log2(p_vec / m_vec))
    kl_q_m = np.sum(q_vec * np.log2(q_vec / m_vec))

    # Jensen-Shannon divergence
    js_divergence = 0.5 * (kl_p_m + kl_q_m)

    # Convert divergence to distance
    return float(np.sqrt(js_divergence))
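
# Worked example (the token distributions below are hypothetical, for illustration only):
# p = {"buy": 0.7, "hold": 0.2, "sell": 0.1}
# q = {"buy": 0.1, "hold": 0.2, "sell": 0.7}
# jensen_shannon_distance(p, p)  -> 0.0 (identical distributions)
# jensen_shannon_distance(p, q)  -> a value between 0 and 1; the more the
#     probability mass disagrees, the closer the result is to 1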
def load_dataset(file_path: str) -> List:
    """
    Load a pickled mutant dataset

    Args:
        file_path: Path to the pickle file produced by the mutation pipeline

    Returns:
        A two-element list [metadata, mutants]
    """
    with open(file_path, 'rb') as f:
        content = pickle.load(f)
    print("Loaded mutant data of type:", type(content))

    # Expecting a two-element list: [metadata, mutants]
    metadata = content[0]  # e.g., a dictionary including the header info
    mutants = content[1]   # list of rows (each row is a list)
    return [metadata, mutants]
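
# Usage sketch ("mutants.pkl" is a hypothetical path; the real file is whatever
# pickle the upstream mutation step produced as a [metadata, mutants] pair):
# metadata, mutants = load_dataset("mutants.pkl")
# print(metadata)       # e.g. a dict holding the header/column info
# print(len(mutants))   # number of mutant rows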
def store_mutant_results(results_data, output_file):
    """Store results to Excel file"""
    header = results_data['header']
    results = results_data['results']

    # Create and save DataFrame
    df = pd.DataFrame(results, columns=header)
    df.to_excel(output_file, index=False)
    print('Results stored in', output_file)
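
# Minimal end-to-end sketch, assuming the upstream pickle stores a
# [metadata, mutants] pair and that metadata carries the column names under a
# "header" key (the paths and the key name here are illustrative assumptions):
if __name__ == "__main__":
    metadata, mutants = load_dataset("mutants.pkl")
    store_mutant_results(
        {"header": metadata["header"], "results": mutants},
        "mutant_results.xlsx",
    )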