RCP79 committed on
Commit
5498f6f
·
verified ·
1 Parent(s): d079ffd

I have this code:

Browse files

#%%
#!/usr/bin/env python3
# Required installations (run these in your environment if not already installed):
# pip install multimolecule umap-learn pacmap plotly torch transformers pandas numpy h5py

import pandas as pd
import numpy as np
import torch
from multimolecule import RnaTokenizer, RnaErnieModel
import umap
import pacmap
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from torch.utils.data import DataLoader, Dataset
import h5py # For saving to HDF5
import os

# =============================================================================
# CONFIGURATION: Customize these parameters for different data frames
# =============================================================================
# Module-level configuration consumed by every pipeline step below.
# Edit these values (not the functions) to point the pipeline at a new dataset.
CONFIG = {
    'file_path': '/Users/roger/Desktop/Feina/Colaboracions/Claire/BGI_20250811_data/Analysis/Saved_data/df_merge.h5',  # Path to input H5 file
    'h5_key': 'df_merge',  # Key for the DataFrame in H5 (set to None to auto-detect or use first key)
    'seq_col': 'full_sequence',  # Column name containing raw sequences (with T instead of U)
    'output_seq_col': 'sequence',  # New column name for processed sequences (U-replaced)
    'filter_query': "(counts_pwt03 > 0) | (counts_pwt04 > 0)",  # Pandas query string for filtering (e.g., "col1 > 0 & col2 == 'value'"); set to None for no filter
    'from_char': 'T',  # Character to replace in sequences (DNA to RNA)
    'to_char': 'U',  # Replacement character
    'batch_size': 512,  # Batch size for inference (tune based on seq length and RAM)
    'num_workers': 4,  # DataLoader workers (increase for faster I/O on multi-core)
    'max_length': None,  # Max sequence length for tokenization (auto-computed from data if None)
    'model_name': 'multimolecule/rnaernie',  # HuggingFace model name (assumes RNAErnie-compatible)
    'output_file': 'rna_embeddings_reductions.h5',  # Output H5 file path
    'umap_neighbors': 15,  # UMAP n_neighbors (lower = faster)
    'umap_min_dist': 0.1,  # UMAP min_dist
    'pacmap_iters': 50,  # PaCMAP num_iters (lower = faster)
    'pacmap_mn_ratio': 0.5  # PaCMAP MN_ratio
}

# =============================================================================
# Custom Dataset Class (Moved to top level to allow pickling for multiprocessing)
# =============================================================================
class SequenceDataset(Dataset):
    """Map-style dataset that tokenizes one raw RNA sequence per item.

    Kept at module top level so it can be pickled by DataLoader worker
    processes (num_workers > 0). Each item is a dict of 1-D tensors of
    length ``max_length`` so the default collate_fn can stack them into
    (batch_size, max_length).
    """

    def __init__(self, sequences, tokenizer, max_length):
        # sequences: list[str] of processed (U-replaced) sequences.
        # tokenizer: HuggingFace-style callable returning tensor dicts.
        # max_length: fixed length every item is padded/truncated to.
        self.sequences = sequences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # BUGFIX: the original used padding=True, which pads only to the
        # longest sequence *within the call* — a no-op when tokenizing a
        # single sequence. Items then had varying lengths and the default
        # DataLoader collate_fn fails on variable-length tensors.
        # padding='max_length' guarantees every item has shape (max_length,).
        tokenized = self.tokenizer(
            self.sequences[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # Drop the singleton batch dim added by return_tensors="pt" so the
        # DataLoader collates items to (batch_size, max_length).
        return {k: v.squeeze(0) for k, v in tokenized.items()}

# =============================================================================
# Load and Preprocess Data
# =============================================================================
def load_and_preprocess(config):
    """Load a DataFrame from HDF5, derive the RNA sequence column, and filter.

    Returns a tuple ``(df, df_filtered, sequences, max_len)`` where
    ``sequences`` is the list of processed sequences from the filtered frame
    and ``max_len`` is ``config['max_length']`` or, when that is None, the
    longest observed sequence length.

    Raises ValueError when the configured sequence column is missing.
    """
    file_path = config['file_path']
    h5_key = config['h5_key']

    # Inspect top-level keys so the user can see what the file contains.
    with h5py.File(file_path, 'r') as f:
        keys = list(f.keys())
    print("Keys in the H5 file:", keys)

    # Fall back to the first available key when none was configured.
    if h5_key is None:
        h5_key = keys[0] if keys else None
    df = pd.read_hdf(file_path, key=h5_key)

    print("DataFrame shape:", df.shape)
    print("Columns:", list(df.columns))

    seq_col = config['seq_col']
    output_seq_col = config['output_seq_col']
    if seq_col not in df.columns:
        raise ValueError(f"Sequence column '{seq_col}' not found in DataFrame.")

    # Literal (non-regex) character replacement, e.g. DNA 'T' -> RNA 'U'.
    # regex=False makes the intent explicit and future-proofs against the
    # pandas default-regex deprecation.
    df[output_seq_col] = df[seq_col].str.replace(
        config['from_char'], config['to_char'], regex=False
    )

    # Optional row filter expressed in pandas query syntax.
    df_filtered = df
    if config['filter_query']:
        df_filtered = df.query(config['filter_query'])
        print(f"Filtered DataFrame shape (query: '{config['filter_query']}'):", df_filtered.shape)

    sequences = df_filtered[output_seq_col].tolist()

    # Auto-compute the tokenization length when not configured.
    max_len = config['max_length']
    if max_len is None:
        # default=0 keeps an empty selection from raising ValueError.
        max_len = max((len(seq) for seq in sequences), default=0)

    # BUGFIX: the original printed a hard-coded "18,844" instead of the
    # actual computed max_len.
    print(f"Number of sequences: {len(sequences)}, Max sequence length: {max_len:,}")

    return df, df_filtered, sequences, max_len

# =============================================================================
# Model and Embedding Generation
# =============================================================================
def generate_embeddings(sequences, tokenizer, model, device, batch_size, num_workers, max_length):
    """Run the model over all sequences and return pooled embeddings.

    Returns a numpy array of shape (len(sequences), hidden_size): one
    masked-mean-pooled vector per input sequence, in input order
    (shuffle=False keeps rows aligned with the source DataFrame).
    """
    dataset = SequenceDataset(sequences, tokenizer, max_length)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # preserve row order for downstream joins
        num_workers=num_workers,
    )

    model.eval()  # disable dropout etc. for deterministic inference
    embeddings = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            hidden = outputs.last_hidden_state  # (B, L, H)

            if 'attention_mask' in batch:
                # Masked mean pooling: average only over real (non-pad) tokens.
                # Broadcasting the (B, L, 1) mask avoids the explicit expand().
                mask = batch['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                summed = (hidden * mask).sum(dim=1)    # (B, H)
                counts = mask.sum(dim=1).clamp(min=1)  # guard all-pad rows
                emb = (summed / counts).cpu().numpy()
            else:
                emb = hidden.mean(dim=1).cpu().numpy()

            # BUGFIX: the original called .squeeze() here, which collapses the
            # batch dimension whenever a batch contains exactly one row (e.g.
            # the final partial batch), so extend() appended hidden_size
            # scalars instead of a single embedding vector.
            embeddings.extend(emb)

    return np.array(embeddings)

# =============================================================================
# Dimensionality Reduction
# =============================================================================
def reduce_dimensions(embeddings, config):
    """Project embeddings to 2D and 3D with UMAP and UMAP-initialised PaCMAP.

    Returns a dict with keys 'umap_2d', 'umap_3d', 'pacmap_2d', 'pacmap_3d',
    each an (n_samples, n_components) array.
    """
    seed = 42
    dist_metric = 'cosine'

    def _run_umap(dims):
        # One UMAP projection to `dims` components with the shared settings.
        reducer = umap.UMAP(
            n_components=dims,
            n_neighbors=config['umap_neighbors'],
            min_dist=config['umap_min_dist'],
            random_state=seed,
            metric=dist_metric,
        )
        return reducer.fit_transform(embeddings)

    def _run_pacmap(dims, warm_start):
        # PaCMAP projection seeded with the matching UMAP layout.
        reducer = pacmap.PaCMAP(
            n_components=dims,
            random_state=seed,
            MN_ratio=config['pacmap_mn_ratio'],
            num_iters=config['pacmap_iters'],
        )
        return reducer.fit_transform(embeddings, init_low_dim=warm_start)

    umap_2d = _run_umap(2)
    umap_3d = _run_umap(3)

    return {
        'umap_2d': umap_2d,
        'umap_3d': umap_3d,
        'pacmap_2d': _run_pacmap(2, umap_2d),
        'pacmap_3d': _run_pacmap(3, umap_3d),
    }

# =============================================================================
# Save Outputs
# =============================================================================
def save_outputs(df, df_filtered, embeddings, reductions, output_file):
    """Persist DataFrames, raw embeddings, and all reductions to one HDF5 file.

    Writes the pandas frames first via ``DataFrame.to_hdf`` (which requires a
    file *path* or HDFStore), then appends the numeric arrays with h5py in
    append mode so the pandas content is preserved.
    """
    # BUGFIX: the original passed an open h5py.File handle to
    # DataFrame.to_hdf (it only accepts a path/HDFStore) and mixed
    # mode='w'/'r+' while the file was already open — both fail. Write the
    # frames by path instead: first with mode='w' to start a fresh file,
    # then mode='a' to keep the first frame.
    df.to_hdf(output_file, key='data_full', mode='w')
    df_filtered.to_hdf(output_file, key='data_filtered', mode='a')

    # Append raw arrays alongside the pandas groups ('a' preserves them).
    with h5py.File(output_file, 'a') as f:
        f.create_dataset('embeddings', data=embeddings)
        f.create_dataset('umap_2d', data=reductions['umap_2d'])
        f.create_dataset('umap_3d', data=reductions['umap_3d'])
        f.create_dataset('pacmap_2d', data=reductions['pacmap_2d'])
        f.create_dataset('pacmap_3d', data=reductions['pacmap_3d'])

    print(f"Saved all data to {output_file}")

# =============================================================================
# Plotting
# =============================================================================
def plot_reductions(reductions):
# 2D comparison
fig_2d = make_subplots(
rows=1, cols=2,
subplot_titles=('UMAP 2D', 'PaCMAP 2D')
)

fig_2d.add_trace(
go.Scatter(
x=reductions['umap_2d'][:, 0],
y=reductions['umap_2d'][:, 1],
mode='markers',
marker=dict(size=5, opacity=0.7),
name='Points'
),
row=1, col=1
)

fig_2d.add_trace(
go.Scatter(
x=reductions['pacmap_2d'][:, 0],
y=reductions['pacmap_2d'][:, 1],
mode='markers',
marker=dict(size=5, opacity=0.7),
name='Points'
),
row=1, col=2
)

fig_2d.update_layout(
title_text='2D Dimensionality Reduction Comparison',
height=500
)
fig_2d.show()

# UMAP 3D
fig_umap_3d = go.Figure(data=go.Scatter3d(
x=reductions['umap_3d'][:, 0],
y=reductions['umap_3d'][:, 1],
z=reductions['umap_3d'][:, 2],
mode='markers',
marker=dict(size=5, opacity=0.7),
name='Points'
))
fig_umap_3d.update_layout(
title='UMAP 3D',
scene=dict(xaxis_title='Dim 1', yaxis_title='Dim 2', zaxis_title='Dim 3'),
height=600
)
fig_umap_3d.show()

# PaCMAP 3D
fig_pac_3d = go.Figure(data=go.Scatter3d(
x=reductions['pacmap_3d'][:, 0],
y=reductions['pacmap_3d'][:, 1],
z=reductions['pacmap_3d'][:, 2],
mode='markers',
marker=dict(size=5, opacity=0.7),
name='Points'
))
fig_pac_3d.update_layout(
title='PaCMAP 3D',

Files changed (1) hide show
  1. README.md +8 -5
README.md CHANGED
@@ -1,10 +1,13 @@
1
  ---
2
- title: Deepsite Project
3
- emoji: 📉
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: static
7
  pinned: false
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: DeepSite Project
3
+ colorFrom: gray
4
+ colorTo: purple
5
+ emoji: 🐳
6
  sdk: static
7
  pinned: false
8
+ tags:
9
+ - deepsite-v3
10
  ---
11
 
12
+ # Welcome to your new DeepSite project!
13
+ This project was created with [DeepSite](https://deepsite.hf.co).