# Loki/src/loki/preprocess.py
# (repository upload artifact: "Upload 42 files", commit 1e315b6, user osakemon)
import json
import logging
import os

import numpy as np
import pandas as pd
import scanpy as sc
from PIL import Image
def generate_gene_df(ad, house_keeping_genes, todense=True):
    """
    Build a one-column DataFrame whose 'label' entries list the 50 most highly
    expressed genes (space-separated) for each observation in `ad`.

    Genes whose symbols contain '.' or '-' are dropped first, as are any genes
    listed under the 'genesymbol' column of `house_keeping_genes`.

    :param ad: AnnData object holding the expression matrix in `.X`.
    :type ad: anndata.AnnData
    :param house_keeping_genes: DataFrame/Series exposing a 'genesymbol' column
        of housekeeping gene names to exclude.
    :type house_keeping_genes: pandas.DataFrame or pandas.Series
    :param todense: If True, densify `ad.X` (assumed sparse) before wrapping it
        in a DataFrame; if False, use `ad.X` as-is.
    :type todense: bool
    :return: DataFrame with a single 'label' column of top-50 gene strings.
    :rtype: pandas.DataFrame
    """
    # Build one boolean mask for all three exclusions (literal matches, no regex).
    gene_names = ad.var.index
    keep = (
        ~gene_names.str.contains('.', regex=False)
        & ~gene_names.str.contains('-', regex=False)
        & ~gene_names.isin(house_keeping_genes['genesymbol'])
    )
    filtered = ad[:, keep]
    # Wrap the (possibly densified) matrix in a DataFrame indexed by obs/var names.
    matrix = filtered.X.todense() if todense else filtered.X
    expr = pd.DataFrame(matrix, index=filtered.obs.index, columns=filtered.var.index)
    # Per observation, take the names of the 50 largest expression values.
    top_k = expr.apply(lambda row, n: pd.Series(row.nlargest(n).index), axis=1, n=50)
    # Join each row's gene names into one space-separated label string.
    labels = top_k.astype(str).apply(' '.join, axis=1)
    return pd.DataFrame({'label': labels})
def segment_patches(img_array, coord, patch_dir, height=20, width=20):
    """
    Crop fixed-size patches centered on each spot coordinate and write each
    one to `patch_dir` as "<spot_id>_hires.png".

    :param img_array: Full-resolution image as a NumPy array, shape (H, W[, C]).
    :type img_array: numpy.ndarray
    :param coord: DataFrame with "pixel_x" and "pixel_y" columns; its index
        supplies the spot IDs used in the output filenames.
    :type coord: pandas.DataFrame
    :param patch_dir: Destination directory (created if it does not exist).
    :type patch_dir: str
    :param height: Patch height in pixels (y extent).
    :type height: int
    :param width: Patch width in pixels (x extent).
    :type width: int
    :return: None; patches are written to disk, out-of-bounds spots are skipped.
    """
    os.makedirs(patch_dir, exist_ok=True)
    img_h, img_w = img_array.shape[:2]
    for spot_id in coord.index:
        # NOTE(review): the y-center is read from "pixel_x" and the x-center
        # from "pixel_y". This appears intentional — it mirrors the swapped
        # column assignment done when the coordinates were loaded — but
        # verify the axis convention against the data producer.
        ycenter, xcenter = coord.loc[spot_id, ["pixel_x", "pixel_y"]]
        left = round(xcenter - width / 2)
        top = round(ycenter - height / 2)
        right = left + width
        bottom = top + height
        # Skip any patch that would fall partly outside the image bounds.
        if left < 0 or top < 0 or right > img_w or bottom > img_h:
            print(f"Patch {spot_id} is out of range and will be skipped.")
            continue
        # Crop, cast to uint8 for PIL, and save as "<spot_id>_hires.png".
        crop = img_array[top:bottom, left:right].astype(np.uint8)
        Image.fromarray(crop).save(os.path.join(patch_dir, f"{spot_id}_hires.png"))
def read_gct(file_path):
    """
    Read a GCT expression file and return its data matrix as a pandas DataFrame.

    GCT layout: line 1 is the version tag (e.g. "#1.2"), line 2 holds the
    matrix dimensions ("<rows>\\t<cols>"), line 3 is the column header, and the
    remaining lines are tab-separated data. By convention the first two columns
    are "Name" and "Description" (gene identifiers and annotations) and the
    rest hold expression values.

    :param file_path: Path to the GCT file to be read.
    :return: pandas DataFrame with the header row as columns and at most
        <rows> data rows.
    :raises ValueError: If the dimensions line does not contain two integers.
    """
    with open(file_path, 'r') as file:
        # Skip the version line ("#1.2" or similar).
        file.readline()
        # The second line declares the matrix dimensions; validate it instead
        # of letting a bad file surface as an opaque IndexError.
        dims = file.readline().strip().split()
        if len(dims) < 2:
            raise ValueError(f"Malformed GCT dimensions line in {file_path!r}")
        num_rows = int(dims[0])  # declared number of data rows (genes)
        # dims[1] (sample count) is implied by the header row, so it is not
        # needed; the original parsed it into an unused variable.
        # Read the remaining tab-delimited content, capped at the declared
        # number of data rows.
        data = pd.read_csv(file, sep='\t', header=0, nrows=num_rows)
    return data
def get_library_id(adata):
    """
    Return the first library ID stored under `adata.uns['spatial']`.

    :param adata: AnnData object whose `.uns['spatial']` dict maps library
        IDs to per-library spatial data.
    :return: The first library ID, or None if no library IDs are present
        (the failure is logged).
    :raises AssertionError: If 'spatial' is missing from `adata.uns`.
    """
    # Check if 'spatial' is present in adata.uns; raises an error if not found.
    assert 'spatial' in adata.uns, "spatial not present in adata.uns"
    # Library IDs are the keys of the 'spatial' dictionary.
    library_ids = list(adata.uns['spatial'])
    if not library_ids:
        # Fix: the original referenced an undefined `logger`, which would
        # raise NameError here instead of logging. Use the module's named
        # logger and return None, matching the original's implicit return.
        logging.getLogger(__name__).error('No library_id found in adata')
        return None
    return library_ids[0]
def get_scalefactors(adata, library_id=None):
    """
    Return the scalefactors dict for `library_id` from `adata.uns['spatial']`.

    :param adata: AnnData object with spatial metadata in `.uns['spatial']`.
    :param library_id: Library ID to look up. If None, the first available
        library ID is used.
    :return: The scalefactors dictionary, or None if it cannot be found
        (the failure is logged).
    """
    # Default to the first library ID when none is supplied.
    if library_id is None:
        library_id = get_library_id(adata)
    try:
        return adata.uns['spatial'][library_id]['scalefactors']
    except KeyError:
        # Fix: the original referenced an undefined `logger`, which would
        # raise NameError instead of logging. Return None explicitly,
        # matching the original's implicit return.
        logging.getLogger(__name__).error('scalefactors not found in adata')
        return None
def get_spot_diameter_in_pixels(adata, library_id=None):
    """
    Return the spot diameter in full-resolution pixels from the scalefactors
    of the given (or first available) library ID.

    :param adata: AnnData object with spatial metadata in `.uns['spatial']`.
    :param library_id: Library ID to look up; defaults to the first available.
    :return: The 'spot_diameter_fullres' value, or None if the scalefactors
        or the key are missing (the missing key is logged).
    """
    scalef = get_scalefactors(adata, library_id=library_id)
    if scalef is None:
        # get_scalefactors already logged its failure; mirror the original's
        # silent TypeError path by returning None.
        return None
    try:
        return scalef['spot_diameter_fullres']
    except KeyError:
        # Fix: the original referenced an undefined `logger`, which would
        # raise NameError instead of logging.
        logging.getLogger(__name__).error('spot_diameter_fullres not found in adata')
        return None
def prepare_data_for_alignment(data_path, scale_type='tissue_hires_scalef'):
    """
    Load a Visium .h5ad file and return the pieces needed for image alignment.

    :param data_path: Path to the AnnData (.h5ad) file with the Visium data.
    :param scale_type: Key of the scale factor applied to the spatial
        coordinates ('tissue_hires_scalef' by default).
    :return:
        - adata: the loaded AnnData object.
        - coords: NumPy array of spatial coordinates scaled to the requested
          resolution.
        - image: the hires tissue image, converted to 8-bit when it arrives
          normalized to [0, 1].
    :raises ValueError: When the scale factor, the spatial coordinates, or
        the hires image is missing from the AnnData object.
    """
    adata = sc.read_h5ad(data_path)
    # Deduplicate gene names up front to avoid downstream conflicts.
    adata.var_names_make_unique()
    # Each required piece is fetched under its own guard so the caller gets a
    # specific error message for whichever part is missing.
    try:
        factor = get_scalefactors(adata)[scale_type]
    except KeyError:
        raise ValueError(f"Scale factor '{scale_type}' not found in ad.uns['spatial']")
    try:
        coords = np.array(adata.obsm['spatial']) * factor
    except KeyError:
        raise ValueError("Spatial coordinates not found in ad.obsm['spatial']")
    try:
        image = adata.uns['spatial'][get_library_id(adata)]['images']['hires']
    except KeyError:
        raise ValueError("High-resolution image not found in ad.uns['spatial']")
    # Images stored as floats in [0, 1] are rescaled to 8-bit for compatibility.
    if image.max() < 1.1:
        image = (image * 255).astype('uint8')
    return adata, coords, image
def load_data_for_annotation(st_data_path, json_path, in_tissue=True):
    """
    Loads spatial transcriptomics (ST) data from an .h5ad file and prepares it for annotation.
    :param st_data_path: Path to the .h5ad file containing the spatial transcriptomics data.
    :param json_path: Path to the JSON file holding the region-of-interest polygon.
    :param in_tissue: Boolean flag to filter the data to include only spots that are in tissue. Default is True.
    :return:
        - st_ad: AnnData object containing the spatial transcriptomics data, with spatial coordinates in `obs`.
        - library_id: The library ID associated with the spatial data.
        - roi_polygon: Region of interest polygon loaded from a JSON file for further annotation or analysis.
    """
    # Load the spatial transcriptomics data into an AnnData object
    st_ad = sc.read_h5ad(st_data_path)
    # Optionally filter the data to include only spots that are within the tissue
    if in_tissue:
        st_ad = st_ad[st_ad.obs['in_tissue'] == 1]
    # Initialize pixel coordinates for spatial information
    st_ad.obs[["pixel_y", "pixel_x"]] = None  # Ensure the columns exist
    # NOTE(review): the first obsm['spatial'] column lands in "pixel_y" and the
    # second in "pixel_x" — an axis swap relative to the usual (x, y) order.
    # Downstream code reads the columns with the same swap, so they cancel out,
    # but confirm the intended axis convention against the data producer.
    st_ad.obs[["pixel_y", "pixel_x"]] = st_ad.obsm['spatial']  # Copy spatial coordinates into obs
    # Retrieve the library ID associated with the spatial data
    library_id = get_library_id(st_ad)
    # Load the region of interest (ROI) polygon from a JSON file
    with open(json_path) as f:
        roi_polygon = json.load(f)
    return st_ad, library_id, roi_polygon
def read_polygons(file_path, slide_id):
    """
    Load the polygon annotations recorded for one slide from a JSON config.

    :param file_path: Path to a JSON file mapping slide IDs to lists of
        polygon dicts with 'coords', 'color', and 'thickness' keys.
    :param slide_id: Slide whose polygon data should be returned.
    :return:
        - polygons: list of numpy arrays of polygon coordinates.
        - polygon_colors: list of per-polygon color values.
        - polygon_thickness: list of per-polygon border thicknesses.
        All three are None when `slide_id` is absent from the file.
    """
    with open(file_path, 'r') as fh:
        configs = json.load(fh)
    # Unknown slide IDs yield a sentinel triple rather than raising.
    if slide_id not in configs:
        return None, None, None
    slide_entries = configs[slide_id]
    polygons = [np.array(entry['coords']) for entry in slide_entries]
    polygon_colors = [entry['color'] for entry in slide_entries]
    polygon_thickness = [entry['thickness'] for entry in slide_entries]
    return polygons, polygon_colors, polygon_thickness