Spaces:
Starting
Starting
Ashkan Taghipour (The University of Western Australia)
UI overhaul: immersive chapter-based experience
14ba315 | """Offline precomputation for the Pigeon Pea Pangenome Atlas.""" | |
| import json | |
| import re | |
| from collections import Counter, defaultdict | |
| import numpy as np | |
| import pandas as pd | |
| from scipy.spatial.distance import pdist, squareform | |
| from sklearn.cluster import KMeans | |
| from sklearn.metrics import silhouette_score | |
| from src.utils import logger, timer, parse_country | |
def compute_gene_frequency(pav: pd.DataFrame,
                           core_min_pct: float = 95.0,
                           shell_min_pct: float = 15.0) -> pd.DataFrame:
    """
    Compute per-gene frequency and core class.

    Rows of ``pav`` are treated as genes and columns as lines. A gene's
    frequency is its row sum, reported as a count and as a percentage of the
    number of columns (rounded to 2 decimals).

    Args:
        pav: Presence/absence matrix indexed by gene_id, one column per line.
        core_min_pct: Inclusive lower bound of freq_pct for the "core" class.
        shell_min_pct: Inclusive lower bound of freq_pct for the "shell"
            class; anything below it is "cloud".

    Output columns: gene_id, freq_count, freq_pct, core_class
    """
    n_lines = pav.shape[1]
    freq_count = pav.sum(axis=1).astype(int)
    if n_lines > 0:
        freq_pct = (freq_count / n_lines * 100).round(2)
    else:
        # No lines at all: report 0% instead of the NaN produced by 0/0.
        freq_pct = pd.Series(0.0, index=pav.index)

    # Classify on the rounded percentage so the label always agrees with the
    # freq_pct value stored next to it. Conditions are checked in order, so
    # "core" wins over "shell" when both thresholds are met.
    pct_values = freq_pct.to_numpy()
    core_class = np.select(
        [pct_values >= core_min_pct, pct_values >= shell_min_pct],
        ["core", "shell"],
        default="cloud",
    )

    df = pd.DataFrame({
        "gene_id": pav.index,
        "freq_count": freq_count.values,
        "freq_pct": freq_pct.values,
        "core_class": core_class,
    })
    logger.info(f"Gene frequency: {(df['core_class']=='core').sum()} core, "
                f"{(df['core_class']=='shell').sum()} shell, "
                f"{(df['core_class']=='cloud').sum()} cloud")
    return df
def compute_line_stats(pav: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-line statistics.

    For every column (line) of ``pav`` this reports how many genes are
    present and how many of those are unique to the line, i.e. have a row
    sum of exactly 1 across all lines. The country is derived from the line
    id by ``parse_country``.

    Output columns: line_id, country, genes_present_count, unique_genes_count
    """
    columns = ["line_id", "country", "genes_present_count", "unique_genes_count"]

    # Loop-invariant: genes present in exactly one line overall. Computing it
    # once avoids re-summing the whole matrix for every line.
    singleton_genes = pav.sum(axis=1) == 1

    records = []
    for line_id in pav.columns:
        presence = pav[line_id]
        # Unique genes: present in this line but no others
        unique_mask = singleton_genes & (presence == 1)
        records.append({
            "line_id": line_id,
            "country": parse_country(line_id),
            "genes_present_count": int(presence.sum()),
            "unique_genes_count": int(unique_mask.sum()),
        })

    # Explicit columns keep the schema stable even when pav has no lines.
    df = pd.DataFrame(records, columns=columns)
    logger.info(f"Line stats computed for {len(df)} lines")
    return df
def compute_line_embedding(pav: pd.DataFrame) -> pd.DataFrame:
    """
    UMAP embedding + KMeans clustering of lines.

    The matrix is transposed so each line is one sample, embedded in 2D with
    a Jaccard metric, and clustered with KMeans. The number of clusters is
    chosen from k = 3 .. min(8, n_lines - 1) by best silhouette score. When
    that range is empty (fewer than 4 lines) every line is placed in a single
    cluster 0 instead of failing.

    Output columns: line_id, umap_x, umap_y, cluster_id
    """
    import umap

    # Transpose: rows = lines, columns = genes
    X = pav.T.values.astype(np.float32)
    line_ids = list(pav.columns)

    # UMAP
    reducer = umap.UMAP(n_components=2, metric="jaccard", n_neighbors=15,
                        min_dist=0.1, random_state=42)
    embedding = reducer.fit_transform(X)

    # KMeans clustering — pick k by silhouette
    best_k, best_score = 3, -1
    best_labels = None
    for k in range(3, min(9, len(line_ids))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(embedding)
        score = silhouette_score(embedding, labels)
        # The first candidate is always accepted so best_labels is defined
        # even if its score equals the initial sentinel.
        if best_labels is None or score > best_score:
            best_k, best_score = k, score
            best_labels = labels

    if best_labels is None:
        # No k was evaluated (too few lines): fall back to one cluster.
        best_labels = np.zeros(len(line_ids), dtype=int)
        logger.info(f"UMAP + KMeans: only {len(line_ids)} lines, "
                    f"no k evaluated; assigning a single cluster")
    else:
        logger.info(f"UMAP + KMeans: best k={best_k}, silhouette={best_score:.3f}")

    df = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": embedding[:, 0],
        "umap_y": embedding[:, 1],
        "cluster_id": best_labels,
    })
    return df
def compute_similarity_topk(pav: pd.DataFrame, k: int = 15) -> pd.DataFrame:
    """
    Pairwise Jaccard similarity, keep top-K neighbors per line.

    Similarity is ``1 - jaccard distance`` between the columns of ``pav``.
    A line is never listed as its own neighbor, so each line gets at most
    ``min(k, n_lines - 1)`` rows, ordered by descending similarity.

    Output columns: line_id, neighbor_line_id, jaccard_score
    """
    columns = ["line_id", "neighbor_line_id", "jaccard_score"]
    line_ids = list(pav.columns)
    n = len(line_ids)
    if n < 2:
        # Nothing to compare against.
        logger.info(f"Similarity top-{k}: 0 pairs")
        return pd.DataFrame(columns=columns)

    X = pav.T.values.astype(np.float32)

    # Compute pairwise Jaccard distance, convert to similarity
    sim_mat = 1.0 - squareform(pdist(X, metric="jaccard"))

    records = []
    for i, line_id in enumerate(line_ids):
        scores = sim_mat[i].copy()
        scores[i] = -1  # push self to the end of the ranking
        ranked = np.argsort(scores)[::-1]
        # Drop self explicitly: relying on the -1 sentinel alone would emit
        # the line as its own neighbor whenever k >= n.
        top_idx = ranked[ranked != i][:k]
        for j in top_idx:
            records.append({
                "line_id": line_id,
                "neighbor_line_id": line_ids[j],
                "jaccard_score": round(float(scores[j]), 4),
            })

    df = pd.DataFrame(records, columns=columns)
    logger.info(f"Similarity top-{k}: {len(df)} pairs")
    return df
def build_gff_gene_parquet(gff_genes: pd.DataFrame, output_path: str) -> None:
    """Write the parsed GFF gene table to ``output_path`` as parquet (index dropped)."""
    gff_genes.to_parquet(path=output_path, index=False)
    saved_message = f"GFF gene index saved: {output_path}"
    logger.info(saved_message)
def build_protein_parquet(protein_df: pd.DataFrame, output_path: str) -> None:
    """Write the protein index table to ``output_path`` as parquet (index dropped)."""
    protein_df.to_parquet(path=output_path, index=False)
    saved_message = f"Protein index saved: {output_path}"
    logger.info(saved_message)
def save_contig_index(contig_index: dict, contig_mapping: dict, output_path: str) -> None:
    """
    Save contig index as JSON.

    Args:
        contig_index: Maps contig id -> length.
        contig_mapping: Maps GFF seqid -> FASTA contig id.
        output_path: Destination JSON file.

    Each contig is written as ``{"length", "gff_seqid", "fasta_header"}``.
    ``gff_seqid`` is the first GFF seqid (in mapping order) that maps to the
    contig, or the contig id itself when there is none or it is empty.
    """
    # Build the reverse lookup once instead of scanning the mapping for every
    # contig. setdefault keeps the first GFF id seen for each FASTA id, which
    # is what a first-match linear scan would return.
    gff_by_fasta = {}
    for gff_id, fasta_id in contig_mapping.items():
        gff_by_fasta.setdefault(fasta_id, gff_id)

    data = {
        contig_id: {
            "length": length,
            "gff_seqid": gff_by_fasta.get(contig_id) or contig_id,
            "fasta_header": contig_id,
        }
        for contig_id, length in contig_index.items()
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    logger.info(f"Contig index saved: {output_path}")
def compute_hotspot_bins(gff_genes: pd.DataFrame, gene_freq: pd.DataFrame,
                         contig_index: dict, bin_size: int = 100_000) -> pd.DataFrame:
    """
    Bin genes along contigs and compute variability scores.

    Genes are inner-joined with their frequency record on ``gene_id`` and
    assigned to the fixed-width bin containing their midpoint
    (``(start + end) // 2``). Only non-empty bins are reported, and
    ``variability_score`` is ``cloud_genes + 0.5 * shell_genes``.

    Args:
        gff_genes: Gene table with gene_id, contig_id, start, end.
        gene_freq: Frequency table with gene_id, freq_pct, core_class.
        contig_index: Accepted for interface compatibility; not used here.
        bin_size: Bin width, in the same units as start/end.

    Raises:
        ValueError: If ``bin_size`` is not positive.

    Output columns: contig_id, bin_start, bin_end, total_genes, core_genes,
                    shell_genes, cloud_genes, mean_freq, variability_score
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size}")

    columns = ["contig_id", "bin_start", "bin_end", "total_genes", "core_genes",
               "shell_genes", "cloud_genes", "mean_freq", "variability_score"]

    # Join gff with gene frequency
    merged = gff_genes.merge(gene_freq, on="gene_id", how="inner")
    merged["midpoint"] = (merged["start"] + merged["end"]) // 2
    merged["bin_index"] = merged["midpoint"] // bin_size
    # Bins start at position 0, so genes with a negative midpoint are skipped.
    merged = merged[merged["bin_index"] >= 0]

    records = []
    # sort=False keeps contigs in order of first appearance; the inner groupby
    # sorts bins by ascending position and only yields non-empty bins.
    for contig_id, contig_genes in merged.groupby("contig_id", sort=False):
        for bin_index, in_bin in contig_genes.groupby("bin_index"):
            bin_start = int(bin_index) * bin_size
            core_count = int((in_bin["core_class"] == "core").sum())
            shell_count = int((in_bin["core_class"] == "shell").sum())
            cloud_count = int((in_bin["core_class"] == "cloud").sum())
            mean_freq = float(in_bin["freq_pct"].mean())
            variability_score = cloud_count + 0.5 * shell_count
            records.append({
                "contig_id": contig_id,
                "bin_start": bin_start,
                "bin_end": bin_start + bin_size,
                "total_genes": len(in_bin),
                "core_genes": core_count,
                "shell_genes": shell_count,
                "cloud_genes": cloud_count,
                "mean_freq": round(mean_freq, 2),
                "variability_score": round(variability_score, 2),
            })

    # Explicit columns so an empty result still has the full schema; without
    # them the log line below would fail on the missing contig_id column.
    df = pd.DataFrame(records, columns=columns)
    logger.info(f"Hotspot bins computed: {len(df)} bins across {df['contig_id'].nunique()} contigs")
    return df
def compute_cluster_markers(pav: pd.DataFrame, embedding: pd.DataFrame,
                            top_n: int = 50) -> pd.DataFrame:
    """
    Find marker genes for each cluster.

    For each cluster the marker score of a gene is its mean presence among
    the cluster's lines minus its mean presence among the lines of all other
    clusters; the ``top_n`` highest-scoring genes are kept. Clusters with no
    lines in ``pav``, or with no lines outside them, are skipped.

    Args:
        pav: Presence/absence matrix, genes as rows and lines as columns.
        embedding: Table with line_id and cluster_id columns.
        top_n: Number of marker genes to keep per cluster.

    Output columns: cluster_id, gene_id, in_cluster_freq, out_cluster_freq, marker_score
    """
    columns = ["cluster_id", "gene_id", "in_cluster_freq",
               "out_cluster_freq", "marker_score"]
    clusters = embedding[["line_id", "cluster_id"]].copy()

    records = []
    for cid in sorted(clusters["cluster_id"].unique()):
        is_member = clusters["cluster_id"] == cid
        in_lines = set(clusters.loc[is_member, "line_id"])
        out_lines = set(clusters.loc[~is_member, "line_id"])
        in_cols = [c for c in pav.columns if c in in_lines]
        out_cols = [c for c in pav.columns if c in out_lines]
        if not in_cols or not out_cols:
            # A contrast needs lines on both sides.
            continue

        in_freq = pav[in_cols].mean(axis=1)
        out_freq = pav[out_cols].mean(axis=1)
        marker_score = in_freq - out_freq

        for gene_id, score in marker_score.nlargest(top_n).items():
            records.append({
                "cluster_id": int(cid),
                "gene_id": gene_id,
                "in_cluster_freq": round(float(in_freq[gene_id]), 4),
                "out_cluster_freq": round(float(out_freq[gene_id]), 4),
                "marker_score": round(float(score), 4),
            })

    # Explicit columns so the log line below also works when every cluster
    # was skipped (e.g. a single cluster) and no records were produced.
    df = pd.DataFrame(records, columns=columns)
    logger.info(f"Cluster markers: {len(df)} total across {df['cluster_id'].nunique()} clusters")
    return df
| # --------------------------------------------------------------------------- | |
| # New precomputation functions for UI overhaul | |
| # --------------------------------------------------------------------------- | |
def compute_line_embedding_3d(pav: pd.DataFrame,
                              embedding_2d: pd.DataFrame) -> pd.DataFrame:
    """
    Project lines into a 3-component UMAP space.

    Cluster assignments are not recomputed: each line takes the cluster_id it
    has in ``embedding_2d``, or -1 when the line is not listed there.

    Output columns: line_id, umap_x, umap_y, umap_z, cluster_id
    """
    import umap

    line_ids = list(pav.columns)
    # One sample per line, one feature per gene.
    samples = pav.T.values.astype(np.float32)

    coords = umap.UMAP(
        n_components=3,
        metric="jaccard",
        n_neighbors=15,
        min_dist=0.1,
        random_state=42,
    ).fit_transform(samples)

    # Look up the cluster assigned in the 2D embedding; -1 marks "unknown".
    cluster_by_line = dict(zip(embedding_2d["line_id"], embedding_2d["cluster_id"]))
    cluster_ids = []
    for lid in line_ids:
        cluster_ids.append(int(cluster_by_line.get(lid, -1)))

    result = pd.DataFrame({
        "line_id": line_ids,
        "umap_x": coords[:, 0],
        "umap_y": coords[:, 1],
        "umap_z": coords[:, 2],
        "cluster_id": cluster_ids,
    })
    logger.info(f"3D UMAP embedding computed for {len(result)} lines")
    return result
def build_sunburst_data(gene_freq: pd.DataFrame, output_path: str) -> None:
    """
    Write the ids/labels/parents/values arrays for a sunburst chart as JSON.

    The hierarchy has a single root ("total") with three children: core,
    shell and cloud. The root value is the sum of the three class counts, so
    rows with any other core_class value are not counted.
    """
    class_column = gene_freq["core_class"]
    counts = {
        name: int((class_column == name).sum())
        for name in ("core", "shell", "cloud")
    }
    core_count = counts["core"]
    shell_count = counts["shell"]
    cloud_count = counts["cloud"]
    total_count = core_count + shell_count + cloud_count

    hierarchy = {
        "ids": ["total", "core", "shell", "cloud"],
        "labels": ["All Genes", "Core", "Shell", "Cloud"],
        "parents": ["", "total", "total", "total"],
        "values": [total_count, core_count, shell_count, cloud_count],
    }
    with open(output_path, "w") as f:
        json.dump(hierarchy, f, indent=2)

    logger.info(f"Sunburst hierarchy saved: {output_path} "
                f"(total={total_count}, core={core_count}, "
                f"shell={shell_count}, cloud={cloud_count})")
def build_polar_contig_layout(hotspots: pd.DataFrame,
                              contig_index: dict,
                              output_path: str,
                              top_n: int = 20) -> None:
    """
    Assign the top contigs (by gene count) angular sectors for a polar layout.

    The ``top_n`` contigs with the most genes (summed over their hotspot
    bins) share the 360-degree circle in proportion to their length. Each
    bin is placed at the angle of its midpoint within its contig's sector.
    Saves a JSON list with per-contig metadata and per-bin variability.

    Args:
        hotspots: Bin table with contig_id, bin_start, bin_end, total_genes,
            variability_score, core_genes, shell_genes, cloud_genes.
        contig_index: Maps contig id -> length. Contigs missing from it use
            their largest bin_end as length.
        output_path: Destination JSON file.
        top_n: Number of contigs to include.
    """
    # Aggregate gene counts per contig from hotspot bins
    contig_gene_counts = (
        hotspots.groupby("contig_id")["total_genes"]
        .sum()
        .nlargest(top_n)
    )
    top_contigs = list(contig_gene_counts.index)

    # Slice each selected contig's bins once; reused for length and mapping.
    bins_by_contig = {
        cid: hotspots[hotspots["contig_id"] == cid].sort_values("bin_start")
        for cid in top_contigs
    }

    # Contig lengths: prefer contig_index, fall back to the largest bin_end
    # (only computed when actually needed).
    contig_lengths = {}
    for cid in top_contigs:
        if cid in contig_index:
            contig_lengths[cid] = contig_index[cid]
        else:
            contig_lengths[cid] = int(bins_by_contig[cid]["bin_end"].max())
    total_length = sum(contig_lengths.values())

    # Assign angular sectors proportional to contig length
    bin_fields = ["bin_start", "bin_end", "total_genes", "variability_score",
                  "core_genes", "shell_genes", "cloud_genes"]
    sectors = []
    theta_cursor = 0.0
    for cid in top_contigs:
        length = contig_lengths[cid]
        arc = (length / total_length) * 360.0 if total_length > 0 else 0
        theta_start = round(theta_cursor, 4)
        theta_end = round(theta_cursor + arc, 4)

        bins_mapped = []
        for row in bins_by_contig[cid][bin_fields].itertuples(index=False):
            # Bins are fixed-width, so the last one can extend past the end
            # of the contig. Clip it to the contig length and clamp the
            # fraction so the angle never leaves this contig's sector.
            clipped_end = min(row.bin_end, length)
            bin_mid = (row.bin_start + clipped_end) / 2
            frac = min(max(bin_mid / length, 0.0), 1.0) if length > 0 else 0
            theta_bin = theta_start + frac * arc
            bins_mapped.append({
                "theta": round(float(theta_bin), 4),
                "total_genes": int(row.total_genes),
                "variability_score": float(row.variability_score),
                "core_genes": int(row.core_genes),
                "shell_genes": int(row.shell_genes),
                "cloud_genes": int(row.cloud_genes),
            })

        sectors.append({
            "contig_id": cid,
            "theta_start": theta_start,
            "theta_end": theta_end,
            "total_genes": int(contig_gene_counts[cid]),
            "total_length": int(length),
            "bins": bins_mapped,
        })
        theta_cursor += arc

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(sectors, f, indent=2)
    logger.info(f"Polar contig layout saved: {output_path} "
                f"({len(sectors)} contigs, 360-degree arc)")
def compute_radar_axes(protein_index: pd.DataFrame,
                       output_path: str,
                       top_n: int = 10) -> None:
    """
    Pick the radar-chart axes: the amino acids with the highest global mean.

    Each ``composition_summary`` string (e.g. 'L:9.8%, S:7.2%, A:6.5%') is
    split on commas and every 'X:N%' token contributes N to amino acid X.
    The global mean of an amino acid is the sum of its parsed percentages
    divided by the total number of rows in ``protein_index``, so a protein
    whose summary does not mention the amino acid counts as 0.
    Saves: { "axes": [...], "global_mean": {aa: mean_pct, ...} }
    """
    token_pattern = re.compile(r"([A-Z]):(\d+\.?\d*)%")

    # aa -> every percentage parsed for it (at most one per token)
    pct_by_aa = defaultdict(list)
    for summary in protein_index["composition_summary"]:
        if not summary or pd.isna(summary):
            continue
        for raw_token in summary.split(","):
            hit = token_pattern.match(raw_token.strip())
            if hit is None:
                continue
            pct_by_aa[hit.group(1)].append(float(hit.group(2)))

    # Divide by the full protein count, not by the number of occurrences.
    protein_total = len(protein_index)
    mean_by_aa = {
        aa: round(sum(values) / protein_total, 3)
        for aa, values in pct_by_aa.items()
    }

    # Highest mean first; keep the leading top_n entries.
    ranked = sorted(mean_by_aa.items(), key=lambda item: -item[1])[:top_n]
    axes = [aa for aa, _ in ranked]
    payload = {
        "axes": axes,
        "global_mean": dict(ranked),
    }
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)

    logger.info(f"Radar axes saved: {output_path} (top {top_n} AAs: {axes})")