import pickle
import subprocess
from typing import Dict, List

import numpy as np
import pandas as pd
import torch

# This file contains helper functions for the backend operations
def build_full_prompt(prompt: str, prompt_prefix: str, prompt_suffix: str) -> str:
    """
    Build the full prompt by wrapping the content with instruction text

    Args:
        prompt: Original financial statement content (without instructions)
        prompt_prefix: Instruction text placed before the content
        prompt_suffix: Instruction text placed after the content

    Returns:
        Full prompt with instructions
    """
    return f"{prompt_prefix}{prompt}{prompt_suffix}"
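
# Usage sketch (the prefix/suffix strings below are hypothetical examples,
# not values defined elsewhere in this module):
# full_prompt = build_full_prompt(
#     prompt="Revenue increased 12% year over year.",
#     prompt_prefix="Classify the sentiment of the following statement:\n",
#     prompt_suffix="\nAnswer with positive, negative, or neutral.",
# )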
def check_gpu_utilization():
    """Print detailed GPU utilization information"""
    if not torch.cuda.is_available():
        print("❌ CUDA is not available. Running on CPU.")
        return False

    # Print GPU device information
    device_count = torch.cuda.device_count()
    print(f"✅ Found {device_count} CUDA device(s):")
    for i in range(device_count):
        device_props = torch.cuda.get_device_properties(i)
        print(f"  Device {i}: {device_props.name}")
        print(f"  Memory: {device_props.total_memory / 1024**3:.2f} GB")

    # Print current GPU usage
    current_device = torch.cuda.current_device()
    print(f"\nCurrent device: {current_device} ({torch.cuda.get_device_name(current_device)})")
    print(f"  Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"  Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

    # Try nvidia-smi for more detailed information
    try:
        print("\nnvidia-smi output:")
        subprocess.run(['nvidia-smi'], check=True)
    except (OSError, subprocess.CalledProcessError):
        print("Failed to run nvidia-smi command")
    return True
def jensen_shannon_distance(p: Dict[str, float], q: Dict[str, float]) -> float:
    """
    Calculate the Jensen-Shannon distance between two probability distributions

    Args:
        p: First probability distribution as a dictionary
        q: Second probability distribution as a dictionary

    Returns:
        Jensen-Shannon distance (0 = identical, 1 = maximally different)
    """
    # Ensure all keys are present in both distributions
    all_keys = set(p.keys()) | set(q.keys())
    p_vec = np.array([p.get(k, 0.0) for k in all_keys])
    q_vec = np.array([q.get(k, 0.0) for k in all_keys])

    # Normalize distributions
    p_vec = p_vec / np.sum(p_vec) if np.sum(p_vec) > 0 else p_vec
    q_vec = q_vec / np.sum(q_vec) if np.sum(q_vec) > 0 else q_vec

    # Calculate the midpoint distribution
    m_vec = 0.5 * (p_vec + q_vec)

    # Clamp with a small epsilon to avoid log(0)
    eps = 1e-10
    p_vec = np.maximum(p_vec, eps)
    q_vec = np.maximum(q_vec, eps)
    m_vec = np.maximum(m_vec, eps)

    # KL divergences in base 2, so the divergence is bounded by 1 and the
    # distance matches the documented [0, 1] range
    kl_p_m = np.sum(p_vec * np.log2(p_vec / m_vec))
    kl_q_m = np.sum(q_vec * np.log2(q_vec / m_vec))

    # Jensen-Shannon divergence
    js_divergence = 0.5 * (kl_p_m + kl_q_m)

    # Convert divergence to distance
    return float(np.sqrt(js_divergence))
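
# Worked example (the token distributions below are hypothetical, for illustration only):
# p = {"buy": 0.7, "hold": 0.2, "sell": 0.1}
# q = {"buy": 0.1, "hold": 0.2, "sell": 0.7}
# jensen_shannon_distance(p, p)  -> 0.0 (identical distributions)
# jensen_shannon_distance(p, q)  -> a value between 0 and 1; the more the
#     probability mass disagrees, the closer the result is to 1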
def load_dataset(file_path: str) -> List:
    """
    Load a pickled mutant dataset

    Args:
        file_path: Path to the pickle file produced by the mutation pipeline

    Returns:
        A two-element list [metadata, mutants]
    """
    with open(file_path, 'rb') as f:
        content = pickle.load(f)
    print("Loaded mutant data of type:", type(content))

    # Expecting a two-element list: [metadata, mutants]
    metadata = content[0]  # e.g., a dictionary including the header info
    mutants = content[1]   # list of rows (each row is a list)
    return [metadata, mutants]
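
# Usage sketch ("mutants.pkl" is a hypothetical path; the real file is whatever
# pickle the upstream mutation step produced as a [metadata, mutants] pair):
# metadata, mutants = load_dataset("mutants.pkl")
# print(metadata)       # e.g. a dict holding the header/column info
# print(len(mutants))   # number of mutant rows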
def store_mutant_results(results_data, output_file):
    """Store results to Excel file"""
    header = results_data['header']
    results = results_data['results']

    # Create and save DataFrame
    df = pd.DataFrame(results, columns=header)
    df.to_excel(output_file, index=False)
    print('Results stored in', output_file)
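
# Minimal end-to-end sketch, assuming the upstream pickle stores a
# [metadata, mutants] pair and that metadata carries the column names under a
# "header" key (the paths and the key name here are illustrative assumptions):
if __name__ == "__main__":
    metadata, mutants = load_dataset("mutants.pkl")
    store_mutant_results(
        {"header": metadata["header"], "results": mutants},
        "mutant_results.xlsx",
    )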