# PeptiVerse / app.py
# Author: ynuozhang
# Commit: 728610a ("update models")
import gradio as gr
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import xgboost as xgb
from transformers import AutoTokenizer, AutoModel, AutoConfig, EsmModel, EsmTokenizer
import plotly.graph_objects as go
from pathlib import Path
import json
import time
from typing import List, Dict, Any, Tuple, Optional
import subprocess
from collections import defaultdict
from huggingface_hub import snapshot_download
from pathlib import Path
import os
from inference import (
PeptiVersePredictor,
read_best_manifest_csv,
BestRow,
canon_model,
)
# Optional dependency: BioPython's ProteinAnalysis gives accurate pI/charge/
# MW/GRAVY values; SequenceAnalyzer below falls back to hand-rolled formulas
# when it is missing.
try:
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False
    print("BioPython not available. Using fallback for pI/charge calculations.")
def pick_assets_root() -> Path:
    """Return a writable directory for model assets and caches.

    Resolution order:
      1. Explicit ``HF_ASSETS_DIR`` override.
      2. HF Spaces container path ``/home/user/assets`` (detected via
         ``SPACE_ID`` or the parent directory existing).
      3. Local fallbacks: ``~/assets``, ``./assets``, ``/tmp/assets``.

    Returns:
        Path to an existing, writable assets directory.

    Raises:
        RuntimeError: if no candidate directory can be created.
    """
    # Bug fix: the manual override used to be consulted only AFTER the
    # Spaces path, which made HF_ASSETS_DIR a no-op inside a Space. It now
    # takes precedence everywhere, as the original comment intended.
    env = os.environ.get("HF_ASSETS_DIR")
    if env:
        p = Path(env)
        p.mkdir(parents=True, exist_ok=True)
        return p
    # HF Spaces container uses /home/user; detect via SPACE_ID or existence
    spaces_root = Path("/home/user/assets")
    if os.environ.get("SPACE_ID") or spaces_root.parent.exists():
        try:
            spaces_root.mkdir(parents=True, exist_ok=True)
            return spaces_root
        except OSError:
            pass  # fall through to local options
    # Local fallbacks
    for p in [Path.home() / "assets", Path.cwd() / "assets", Path("/tmp/assets")]:
        try:
            p.mkdir(parents=True, exist_ok=True)
            return p
        except OSError:
            continue
    raise RuntimeError("No writable assets directory found.")
ASSETS = pick_assets_root()

# Put all caches on the same writable disk
for k, v in {
    "HF_HOME": str(ASSETS / "hf"),
    "HUGGINGFACE_HUB_CACHE": str(ASSETS / "hf" / "cache"),
    "TRANSFORMERS_CACHE": str(ASSETS / "transformers"),
    "HF_DATASETS_CACHE": str(ASSETS / "hf" / "datasets"),
    "XDG_CACHE_HOME": str(ASSETS / "xdg"),
    "TMPDIR": str(ASSETS / "tmp"),
}.items():
    # setdefault preserves values already supplied by the environment; the
    # directory is created either way so downstream libraries can write to it.
    os.environ.setdefault(k, v)
    Path(v).mkdir(parents=True, exist_ok=True)

ASSETS_MODELS = ASSETS / "models"; ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
ASSETS_DATA = ASSETS / "training_data_cleaned"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
MODEL_REPO = "ChatterjeeLab/PeptiVerse"  # model repo
DATASET_REPO = "ChatterjeeLab/PeptiVerse"  # dataset repo
def fetch_models_and_data():
    """Download model weights, tokenizer files and training CSVs from the Hub.

    Everything is materialized (no symlinks) under ASSETS_MODELS; only the
    patterns listed below are fetched.
    """
    snapshot_download(
        repo_id=MODEL_REPO,
        local_dir=str(ASSETS_MODELS),
        local_dir_use_symlinks=False,
        allow_patterns=[
            # Model files
            "training_classifiers/**/best_model*.json",
            "training_classifiers/**/best_model*.pt",
            "training_classifiers/**/best_model*.joblib",
            # Tokenizer files
            "tokenizer/new_vocab.txt",
            "tokenizer/new_splits.txt",
            # Training data for distributions
            "training_data_cleaned/**/*.csv",
        ],
    )
# Fetch assets at import time so everything below can assume they exist.
fetch_models_and_data()

BEST_TXT = Path("best_models.txt")
TRAINING_ROOT = ASSETS_MODELS / "training_classifiers"
TOKENIZER_DIR = ASSETS_MODELS / "tokenizer"
# Banned models that should fall back to XGB
BANNED_MODELS = {"svm", "enet", "svm_gpu", "enet_gpu"}
# "lower is better" exceptions for classification labeling
LOWER_BETTER = {"hemolysis", "toxicity"}
# Property display names and descriptions.
# Schema per entry: 'display' (UI label), 'description', 'direction'
# ('↑' higher-is-better, '↓' lower-is-better, '' for assays with a fixed
# threshold), optional 'pass_label'/'fail_label' for classification output,
# optional 'unit', and optional numeric 'threshold'/'thresholds'.
PROPERTY_INFO = {
    'solubility': {
        'display': 'πŸ’§ Solubility',
        'description': 'Aqueous solubility',
        'direction': '↑',
        'pass_label': 'Soluble',
        'fail_label': 'Insoluble'
    },
    'permeability_penetrance': {
        'display': 'πŸ”¬ Permeability (Penetrance)',
        'description': 'Cell penetration capability',
        'direction': '↑',
        'pass_label': 'Permeable',
        'fail_label': 'Non-permeable'
    },
    'hemolysis': {
        'display': '🩸 Hemolysis',
        'description': 'Red blood cell membrane disruption',
        'direction': '↓',
        'pass_label': 'Non-hemolytic',
        'fail_label': 'Hemolytic'
    },
    'nf': {
        'display': 'πŸ‘― Non-Fouling',
        'description': 'Resistance to protein adsorption',
        'direction': '↑',
        'pass_label': 'Non-fouling',
        'fail_label': 'Fouling'
    },
    'halflife': {
        'display': '⏱️ Half-Life',
        'description': 'Serum stability',
        'direction': '↑',
        'unit': 'hours'
    },
    'toxicity': {
        'display': '☠️ Toxicity',
        'description': 'Cytotoxicity',
        'direction': '↓',
        'pass_label': 'Non-toxic',
        'fail_label': 'Toxic'
    },
    'permeability_pampa': {
        'display': 'πŸͺ£ Permeability (PAMPA)',
        'description': 'PAMPA assay permeability',
        'direction': '',
        'threshold': -6,  # Values > -6 are permeable
        'pass_label': 'Permeable',
        'fail_label': 'Non-permeable'
    },
    'permeability_caco2': {
        'display': 'πŸͺ£ Permeability (Caco-2)',
        'description': 'Caco-2 cell permeability',
        'direction': '',
        'threshold': -6,  # Values > -6 are permeable
        'pass_label': 'Permeable',
        'fail_label': 'Non-permeable'
    },
    'binding_affinity': {
        'display': 'πŸ”— Binding Affinity',
        'description': 'Protein-peptide binding strength',
        'direction': '↑',
        'thresholds': {'tight': 9, 'weak': 7}
    }
}

# Canonical ordering of the property checkboxes in the UI; index positions
# here must match the *checkbox_values varargs in the callbacks below.
PROP_ORDER = [
    'solubility',
    'permeability_penetrance',
    'hemolysis',
    'nf',
    'halflife',
    'toxicity',
    'permeability_pampa',
    'permeability_caco2',
    'binding_affinity',
]

# Distribution-only keys (training-data views with no predictor behind them)
DIST_KEYS = {
    "binding_affinity_wt": "πŸ”— Binding Affinity β€” WT (distribution)",
    "binding_affinity_smiles": "πŸ”— Binding Affinity β€” SMILES (distribution)",
    "binding_affinity_all": "πŸ”— Binding Affinity β€” WT+SMILES (distribution)",
    "halflife_wt": "⏱️ Half-life β€” WT (distribution)",
    "halflife_smiles": "⏱️ Half-life β€” SMILES (distribution)",
    "halflife_all": "⏱️ Half-life β€” WT+SMILES (distribution)",
}
def create_filtered_manifest(manifest_path: Path) -> Dict[str, BestRow]:
    """Read the best-model manifest and replace banned models with XGB.

    Also normalizes the half-life key ('half_life' -> 'halflife') so manifest
    keys line up with PROPERTY_INFO and the loaded model folders.

    Args:
        manifest_path: Path to the best-models manifest CSV.

    Returns:
        Mapping of normalized property key -> BestRow with any model in
        BANNED_MODELS swapped for "XGB"; all other fields pass through.
    """
    # The original had dead elif/else branches that both re-assigned the
    # manifest value, plus tautological conditionals when building BestRow;
    # this helper is the whole surviving rule.
    def _resolve(model_name):
        # canon_model canonicalizes the name; banned families fall back to
        # XGB, anything else keeps the manifest's original spelling.
        return "XGB" if canon_model(model_name) in BANNED_MODELS else model_name

    original = read_best_manifest_csv(manifest_path)
    filtered = {}
    for prop_key, row in original.items():
        # Normalize property key for half-life
        normalized_key = "halflife" if prop_key in ("halflife", "half_life") else prop_key
        filtered[normalized_key] = BestRow(
            property_key=normalized_key,
            best_wt=_resolve(row.best_wt),
            best_smiles=_resolve(row.best_smiles),
            task_type=row.task_type,
            thr_wt=row.thr_wt,
            thr_smiles=row.thr_smiles,
        )
    return filtered
class AppContext:
    """Holds the device, the filtered manifest and the loaded predictor.

    Constructed once (see initialize()); building it downloads/loads all
    best models, so it is expensive.
    """

    def __init__(self):
        # Prefer GPU when available; the predictor is told the same device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Manifest with banned model families swapped out and keys normalized.
        self.best = create_filtered_manifest(BEST_TXT)
        self.predictor = PeptiVersePredictor(
            manifest_path=BEST_TXT,
            classifier_weight_root=ASSETS_MODELS,
            esm_name="facebook/esm2_t33_650M_UR50D",
            clm_name="aaronfeller/PeptideCLM-23M-all",
            smiles_vocab=str(TOKENIZER_DIR / "new_vocab.txt"),
            smiles_splits=str(TOKENIZER_DIR / "new_splits.txt"),
            device=str(self.device),
        )
        # override manifest AND reload models so keys/folders match
        self.predictor.manifest = self.best
        self.predictor.models.clear()
        self.predictor.meta.clear()
        self.predictor._load_all_best_models()
# Module-level singleton; populated lazily by initialize().
CTX: AppContext | None = None

def initialize():
    """Create the AppContext on first call and return it (lazy singleton)."""
    global CTX
    if CTX is None:
        CTX = AppContext()
    return CTX
def get_available_properties(ctx, modality: str) -> Dict[str, bool]:
    """
    Returns dict of property -> bool indicating if available for the modality
    """
    placeholder_names = {"-", "β€”", "NA", "N/A", None}
    mode = "wt" if modality == "Sequence" else "smiles"
    available: Dict[str, bool] = {}
    for prop_key in PROPERTY_INFO:
        row = ctx.best.get(prop_key)
        if row is None:
            # No manifest entry at all for this property.
            available[prop_key] = False
            continue
        model = row.best_wt if modality == "Sequence" else row.best_smiles
        if not model or model in placeholder_names:
            # Manifest lists no usable model for this modality.
            available[prop_key] = False
        else:
            # Only report available if the weights were actually loaded.
            available[prop_key] = (prop_key, mode) in ctx.predictor.models
    return available
def get_threshold(ctx: AppContext, prop: str, modality: str) -> float | None:
    """Return the decision threshold for *prop* in the given modality, if any."""
    row = ctx.best.get(prop)
    if row is None:
        return None
    if modality == "Sequence":
        return row.thr_wt
    return row.thr_smiles
def get_best_models_table(ctx: AppContext) -> pd.DataFrame:
    """Generate a table showing best models and thresholds"""
    def _model(m):
        # Em-dash placeholder for missing model names.
        return m if m else 'β€”'

    def _thr(t):
        # Em-dash placeholder for missing thresholds.
        return 'β€”' if t is None else f"{t:.4f}"

    rows = [
        {
            'Property': PROPERTY_INFO.get(prop_key, {}).get('display', prop_key),
            'Best Model (Sequence)': _model(row.best_wt),
            'Threshold (Sequence)': _thr(row.thr_wt),
            'Best Model (SMILES)': _model(row.best_smiles),
            'Threshold (SMILES)': _thr(row.thr_smiles),
            'Task Type': row.task_type,
        }
        for prop_key, row in ctx.best.items()
    ]
    return pd.DataFrame(rows)
# Optional dependency: RDKit validates/handles SMILES input; without it the
# SMILES modality is disabled.
try:
    from rdkit import Chem
    from rdkit.Chem import Descriptors, AllChem
    RDKIT_AVAILABLE = True
except ImportError:
    RDKIT_AVAILABLE = False
    print("RDKit not available. SMILES input will be disabled.")
import re
# Permissive amino-acid alphabet: the 20 canonical residues plus ambiguity
# codes (B/X/Z/J), the rare U/O, and '-' separators; case-insensitive.
AA_RE = re.compile(r'^[ACDEFGHIKLMNPQRSTVWYBXZJUO\-]+$', re.IGNORECASE)

def is_aa_sequence_like(s: str) -> bool:
    """Heuristic: does *s* look like a plain amino-acid sequence?"""
    cleaned = s.strip().replace(" ", "")
    if not cleaned:
        return False
    # Very lenient: allow AA letters + optional '-' for readability
    if not AA_RE.fullmatch(cleaned):
        return False
    return any(ch.isalpha() for ch in cleaned)

def is_smiles_like(s: str) -> bool:
    """Heuristic: does *s* plausibly look like a SMILES string?"""
    text = s.strip()
    if not text:
        return False
    # Heuristic: SMILES often contains these symbols; also reject if it looks like pure AA
    smiles_symbols = set("=#()[]+\\/-@1234567890")
    has_symbol = any(ch in smiles_symbols for ch in text)
    return (has_symbol or not is_aa_sequence_like(text)) and len(text) >= 2
# ==================== Sequence Analysis ====================
class SequenceAnalyzer:
    """Calculate physicochemical properties of peptide sequences.

    BioPython's ProteinAnalysis is used when available; every method falls
    back to a hand-rolled approximation when it is missing or raises.
    """

    # pKa values for amino acids (termini + ionizable side chains)
    PKA_VALUES = {
        'N_term': 9.6,
        'C_term': 2.3,
        'D': 3.9,   # Aspartic acid
        'E': 4.2,   # Glutamic acid
        'H': 6.0,   # Histidine
        'C': 8.3,   # Cysteine
        'Y': 10.1,  # Tyrosine
        'K': 10.5,  # Lysine
        'R': 12.5,  # Arginine
    }

    @classmethod
    def calculate_net_charge(cls, sequence: str, pH: float = 7.0) -> float:
        """Calculate net charge at given pH using Henderson-Hasselbalch equation."""
        if BIOPYTHON_AVAILABLE:
            try:
                return ProteinAnalysis(sequence).charge_at_pH(pH)
            # Bug fix: was a bare `except:` — would swallow KeyboardInterrupt.
            except Exception:
                pass
        # Fallback calculation
        charge = 0.0
        # N-terminus (positive) and C-terminus (negative)
        charge += 1 / (1 + 10 ** (pH - cls.PKA_VALUES['N_term']))
        charge -= 1 / (1 + 10 ** (cls.PKA_VALUES['C_term'] - pH))
        # Count charged residues
        for aa in sequence:
            if aa in 'KR':  # Positive
                charge += 1 / (1 + 10 ** (pH - cls.PKA_VALUES[aa]))
            elif aa in 'DE':  # Negative
                charge -= 1 / (1 + 10 ** (cls.PKA_VALUES[aa] - pH))
            elif aa == 'H':  # Histidine (positive when protonated)
                charge += 1 / (1 + 10 ** (pH - cls.PKA_VALUES['H']))
            elif aa == 'C':  # Cysteine (negative when deprotonated)
                charge -= 1 / (1 + 10 ** (cls.PKA_VALUES['C'] - pH))
            elif aa == 'Y':  # Tyrosine (negative when deprotonated)
                charge -= 1 / (1 + 10 ** (cls.PKA_VALUES['Y'] - pH))
        return round(charge, 2)

    @classmethod
    def calculate_isoelectric_point(cls, sequence: str) -> float:
        """Calculate theoretical pI using bisection method"""
        if BIOPYTHON_AVAILABLE:
            try:
                return ProteinAnalysis(sequence).isoelectric_point()
            except Exception:
                pass
        # Fallback: bisect on the net-charge curve until it crosses zero.
        pH_min, pH_max = 0.0, 14.0
        epsilon = 0.01
        while (pH_max - pH_min) > epsilon:
            pH_mid = (pH_min + pH_max) / 2
            charge = cls.calculate_net_charge(sequence, pH_mid)
            if abs(charge) < epsilon:
                return round(pH_mid, 2)
            if charge > 0:
                pH_min = pH_mid
            else:
                pH_max = pH_mid
        return round((pH_min + pH_max) / 2, 2)

    @classmethod
    def calculate_molecular_weight(cls, sequence: str) -> float:
        """Calculate molecular weight (Da)."""
        if BIOPYTHON_AVAILABLE:
            try:
                return ProteinAnalysis(sequence).molecular_weight()
            except Exception:
                pass
        # Bug fix: the old fallback returned +18.0 for an empty sequence
        # because len(sequence) - 1 == -1 in the water subtraction below.
        if not sequence:
            return 0.0
        # Fallback: approximate from average residue masses.
        weights = {
            'A': 89.1, 'C': 121.2, 'D': 133.1, 'E': 147.1, 'F': 165.2,
            'G': 75.1, 'H': 155.2, 'I': 131.2, 'K': 146.2, 'L': 131.2,
            'M': 149.2, 'N': 132.1, 'P': 115.1, 'Q': 146.2, 'R': 174.2,
            'S': 105.1, 'T': 119.1, 'V': 117.1, 'W': 204.2, 'Y': 181.2
        }
        mw = sum(weights.get(aa, 0) for aa in sequence)
        # Subtract one water per peptide bond
        mw -= 18.0 * (len(sequence) - 1)
        return round(mw, 1)

    @classmethod
    def calculate_hydrophobicity(cls, sequence: str) -> float:
        """Calculate GRAVY (grand average of hydropathy)"""
        if BIOPYTHON_AVAILABLE:
            try:
                return ProteinAnalysis(sequence).gravy()
            except Exception:
                pass
        # Kyte-Doolittle scale
        hydrophobicity = {
            'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
            'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
            'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
            'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3
        }
        if len(sequence) == 0:
            return 0
        total = sum(hydrophobicity.get(aa, 0) for aa in sequence)
        return round(total / len(sequence), 2)
# ==================== Data Management ====================
class TrainingDataManager:
    """Loads training-set CSVs and serves distribution plots / summary stats.

    On construction it resolves a data directory, loads every mapped CSV it
    can find, and caches per-property value arrays in ``self.statistics``.
    """

    def __init__(self, data_dir=None):
        # NOTE(review): the `data_dir` parameter is accepted but never used —
        # the directory is always resolved from the candidates below. Confirm
        # intent before wiring it through or removing it.
        possible_dirs = [
            ASSETS_MODELS / "training_data_cleaned",  # In HF downloaded location
            Path("training_data_cleaned"),  # Local relative path
            ASSETS_DATA,  # Original location
        ]
        self.data_dir = None
        # First existing candidate wins.
        for d in possible_dirs:
            if d.exists():
                self.data_dir = d
                print(f"Using data directory: {d}")
                break
        if self.data_dir is None:
            print(f"WARNING: No data directory found. Tried: {possible_dirs}")
            self.data_dir = ASSETS_DATA  # Fallback
            self.data_dir.mkdir(exist_ok=True)
        # Eagerly load all distributions once; see load_statistics().
        self.statistics = self.load_statistics()

    def load_csv_data(self, filepath: Path, value_column, is_binary: bool = False) -> Optional[Dict]:
        """Load data from a CSV file.
        value_column can be a string OR a list/tuple of candidate column names.

        Returns {"values": np.ndarray, "n_samples": int} or None when the
        file/column is missing or contains no numeric values.
        """
        if not filepath.exists():
            print(f"File not found: {filepath}")
            return None
        try:
            df = pd.read_csv(filepath, encoding="utf-8", on_bad_lines="skip")
            print(f"Columns in {filepath.name}: {df.columns.tolist()[:5]}...")
            # Case-insensitive column map
            col_lower = {col.lower(): col for col in df.columns}
            # allow list/tuple of candidates
            if isinstance(value_column, (list, tuple)):
                chosen = None
                for c in value_column:
                    if c is None:
                        continue
                    c_l = str(c).lower()
                    if c_l in col_lower:
                        chosen = col_lower[c_l]
                        break
                if chosen is None:
                    print(f"None of candidate columns {value_column} found. Available: {list(df.columns)[:10]}")
                    return None
                value_column = chosen
            else:
                # keep original behavior, but safe-cast to str
                vc_l = str(value_column).lower()
                if vc_l not in col_lower:
                    # Known alias spellings for the common column names.
                    alternatives = {
                        'label': ['label', 'labels', 'y', 'target'],
                        'affinity': ['affinity', 'pkd', 'pki', 'binding_affinity'],
                        'pampa': ['pampa', 'pampa_value', 'permeability'],
                        'caco2': ['caco2', 'caco-2', 'caco_2'],
                        'log_hour': ['log_hour', 'loghour', 'log_hours', 'loghours'],
                        'half_life_hours': ['half_life_hours', 'halflife_hours', 'hours'],
                        'half_life_seconds': ['half_life_seconds', 'halflife_seconds', 'seconds'],
                    }
                    found = False
                    for alt in alternatives.get(vc_l, []):
                        if alt.lower() in col_lower:
                            value_column = col_lower[alt.lower()]
                            found = True
                            break
                    if not found:
                        print(f"Column {value_column} not found. Available: {list(df.columns)[:10]}")
                        return None
                else:
                    value_column = col_lower[vc_l]
            # Coerce to numeric; unparsable entries are dropped.
            vals = pd.to_numeric(df[value_column], errors="coerce").dropna().to_numpy()
            if len(vals) == 0:
                print(f"No valid values found in column {value_column}")
                return None
            print(f"Loaded {len(vals)} values from {filepath.name}")
            if is_binary:
                # Binarize at 0.5 anything that is not already strictly {0, 1}.
                unique_vals = np.unique(vals)
                if not set(unique_vals).issubset({0, 1, 0.0, 1.0}):
                    vals = (vals > 0.5).astype(int)
            return {"values": vals, "n_samples": len(vals)}
        except Exception as e:
            print(f"Error loading {filepath}: {e}")
            import traceback
            traceback.print_exc()
            return None

    def load_statistics(self):
        """Load pre-computed statistics for each property from actual data files"""
        stats = {}
        # Map properties to their data files and value columns
        data_mappings = {
            'hemolysis': {
                'files': [
                    'hemolysis/hemo_meta_with_split.csv',
                    'hemolysis/hemolysis_meta_with_split.csv',
                ],
                'column': 'label',
                'is_binary': True
            },
            'solubility': {
                'files': [
                    'solubility/sol_meta_with_split.csv',
                    'solubility/solubility_meta_with_split.csv',
                ],
                'column': 'label',
                'is_binary': True
            },
            "binding_affinity_wt": {
                "files": ["binding_affinity/binding_affinity_wt_meta_with_split.csv"],
                "column": "affinity",
                "is_binary": False
            },
            "binding_affinity_smiles": {
                "files": ["binding_affinity/binding_affinity_smiles_meta_with_split.csv"],
                "column": "affinity",
                "is_binary": False
            },
            "binding_affinity_all": {
                "files": [
                    "binding_affinity/binding_affinity_wt_meta_with_split.csv",
                    "binding_affinity/binding_affinity_smiles_meta_with_split.csv",
                ],
                "column": "affinity",
                "is_binary": False
            },
            "halflife_wt": {
                "files": [
                    "half_life/halflife_with_split.csv",
                    "half_life/halflife_meta_with_split.csv",
                ],
                "column": ["half_life_hours", "log_hour", "log_hours"],
                "is_binary": False
            },
            "halflife_smiles": {
                "files": [
                    "half_life/halflife_smiles_with_split.csv",
                    "half_life/halflife_smiles_with_splits.csv",
                    "half_life/halflife_smiles_meta_with_split.csv",
                ],
                "column": ["half_life_hours", "log_hour", "log_hours"],
                "is_binary": False
            },
            "halflife_all": {
                "files": [
                    "half_life/halflife_with_split.csv",
                    "half_life/halflife_meta_with_split.csv",
                    "half_life/halflife_smiles_with_split.csv",
                    "half_life/halflife_smiles_with_splits.csv",
                    "half_life/halflife_smiles_meta_with_split.csv",
                ],
                "column": ["half_life_hours", "log_hour", "log_hours"],
                "is_binary": False
            },
            'nf': {
                'files': [
                    'nonfouling/nf_meta_with_split.csv',
                    'nf/nf_meta_with_split.csv',
                ],
                'column': 'label',
                'is_binary': True
            },
            'permeability_penetrance': {
                'files': [
                    'permeability/perm_meta_with_split.csv',
                    'permeability_penetrance/permeability_meta_with_split.csv',
                ],
                'column': 'label',
                'is_binary': True
            },
            'permeability_pampa': {
                'files': [
                    'permeability_pampa/pampa_meta_with_split.csv',
                    'pampa/pampa_meta_with_split.csv',
                ],
                'column': 'PAMPA',
                'is_binary': False
            },
            'permeability_caco2': {
                'files': [
                    'permeability_caco2/caco2_meta_with_split.csv',
                    'caco2/caco2_meta_with_split.csv',
                ],
                'column': 'Caco2',
                'is_binary': False
            },
            'toxicity': {
                'files': [
                    'toxicity/tox_meta_with_split.csv',
                    'toxicity/toxicity_meta_with_split.csv',
                ],
                'column': 'label',
                'is_binary': True
            }
        }
        # Load actual data
        for prop_key, mapping in data_mappings.items():
            all_vals = []
            loaded_from = []
            # Every existing file contributes; values are concatenated.
            for file_path in mapping['files']:
                filepath = self.data_dir / file_path
                if not filepath.exists():
                    continue
                d = self.load_csv_data(
                    filepath,
                    mapping['column'],
                    mapping.get('is_binary', False)
                )
                if d:
                    all_vals.append(d["values"])
                    loaded_from.append(file_path)
            if all_vals:
                vals = np.concatenate(all_vals, axis=0)
                prop_info = PROPERTY_INFO.get(prop_key, {})
                stats[prop_key] = {
                    "values": vals,
                    "description": prop_info.get("description", ""),
                    "unit": "Probability" if mapping.get("is_binary") else prop_info.get("unit", "Score"),
                    "n_samples": int(vals.shape[0]),
                    "kind": "binary" if mapping.get("is_binary") else "continuous",
                    "loaded_from": loaded_from,  # optional: good for debugging
                }
                # thresholds / unit tweaks
                # NOTE(review): mapping keys are suffixed ("binding_affinity_wt",
                # "halflife_all", ...), so the exact-match branches for
                # "binding_affinity" and "halflife" below appear unreachable;
                # the startswith() branches after them do the real work.
                # Confirm before removing.
                if prop_key == "binding_affinity":
                    stats[prop_key]["threshold"] = 9
                    stats[prop_key]["threshold_secondary"] = 7
                    stats[prop_key]["unit"] = "pKd/pKi"
                elif prop_key in ["permeability_pampa", "permeability_caco2"]:
                    stats[prop_key]["threshold"] = -6
                    stats[prop_key]["unit"] = "log Peff" if prop_key == "permeability_pampa" else "log Papp"
                elif prop_key == "halflife":
                    stats[prop_key]["unit"] = "hours"
                # for distribution plotting
                if prop_key.startswith("binding_affinity"):
                    stats[prop_key]["threshold"] = 9
                    stats[prop_key]["threshold_secondary"] = 7
                    stats[prop_key]["unit"] = "pKd/pKi"
                elif prop_key.startswith("halflife"):
                    stats[prop_key]["unit"] = "hours"
                print(f"βœ“ Loaded {prop_key} from {loaded_from} ({len(vals)} samples)")
                continue
            # fallback synthetic
            # NOTE(review): despite the message below, no synthetic data is
            # generated — the property is simply absent from `stats`.
            print(f"⚠ Using synthetic data for {prop_key}")
        return stats

    def get_distribution_plot(self, property_name, current_value=None):
        """Build a Plotly figure of the training distribution for a property.

        Binary properties render as a two-bar class count; continuous ones as
        a histogram with optional threshold lines and the user's value.
        Returns None when the property has no loaded statistics.
        """
        if property_name not in self.statistics:
            return None
        s = self.statistics[property_name]
        vals = np.asarray(s["values"])
        kind = s.get("kind", "continuous")
        if kind == "binary":
            n0 = int((vals == 0).sum())
            n1 = int((vals == 1).sum())
            total = max(n0 + n1, 1)  # guard against division by zero
            fig = go.Figure()
            prop_info = PROPERTY_INFO.get(property_name, {})
            labels = [
                prop_info.get('fail_label', 'Negative (0)'),
                prop_info.get('pass_label', 'Positive (1)')
            ]
            fig.add_trace(go.Bar(x=labels, y=[n0, n1]))
            fig.update_layout(
                title=f"{prop_info.get('display', property_name)} β€” Class Distribution",
                xaxis_title="Class",
                yaxis_title="Count",
                height=400,
                showlegend=False,
                annotations=[
                    dict(x=labels[0], y=n0, text=f"{n0} ({n0/total:.1%})", showarrow=False, yshift=8),
                    dict(x=labels[1], y=n1, text=f"{n1} ({n1/total:.1%})", showarrow=False, yshift=8),
                ],
            )
            return fig
        # Continuous distribution
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=vals, nbinsx=50, name="Training Data"))
        # Primary threshold (if any)
        if "threshold" in s and s["threshold"] is not None:
            fig.add_vline(
                x=float(s["threshold"]),
                line_dash="dash",
                line_color="purple" if property_name == "binding_affinity" else "red",
                annotation_text=(
                    "Tight threshold: {:.3f}".format(float(s["threshold"]))
                    if property_name == "binding_affinity"
                    else "Threshold: {:.3f}".format(float(s["threshold"]))
                ),
            )
        # Secondary threshold for binding (weak)
        if property_name == "binding_affinity" and "threshold_secondary" in s and s["threshold_secondary"] is not None:
            fig.add_vline(
                x=float(s["threshold_secondary"]),
                line_dash="dash",
                line_color="orange",
                annotation_text="Weak threshold: {:.3f}".format(float(s["threshold_secondary"])),
            )
        # Current value
        if current_value is not None:
            fig.add_vline(
                x=float(current_value),
                line_dash="solid",
                line_color="green",
                line_width=3,
                annotation_text=f"Your Result: {float(current_value):.3f}",
            )
        prop_info = PROPERTY_INFO.get(property_name, {})
        fig.update_layout(
            title=f"{prop_info.get('display', property_name)} Distribution",
            xaxis_title=s.get("unit", ""),
            yaxis_title="Count",
            height=400,
            showlegend=False,
        )
        return fig

    def get_property_info(self, property_name):
        """Return summary statistics for a property, or None if not loaded.

        Binary properties get class counts (n_pos/n_neg); continuous ones get
        percentiles in addition to mean/std/min/max.
        """
        if property_name not in self.statistics:
            return None
        s = self.statistics[property_name]
        vals = np.asarray(s["values"])
        kind = s.get("kind", "continuous")
        info = {
            "description": s.get("description", ""),
            "unit": s.get("unit", ""),
            "n_samples": int(len(vals)),
            "mean": float(np.mean(vals)),
            "std": float(np.std(vals)),
            "min": float(np.min(vals)),
            "max": float(np.max(vals)),
            "percentiles": {},
        }
        if kind == "binary":
            info["n_neg"] = int((vals == 0).sum())
            info["n_pos"] = int((vals == 1).sum())
        else:
            pct = np.percentile(vals, [10, 25, 50, 75, 90])
            info["percentiles"] = {
                "10%": float(pct[0]),
                "25%": float(pct[1]),
                "50% (median)": float(pct[2]),
                "75%": float(pct[3]),
                "90%": float(pct[4]),
            }
        return info
# ==================== Gradio Interface ====================
def predict_properties(
    input_text: str,
    input_type: str,  # "Sequence" or "SMILES"
    protein_text: str,  # For binding affinity
    selected_props: list[str],  # from individual checkboxes
    include_physicochemical: bool,
    pH_value: float,
    progress=gr.Progress()
):
    """Run the selected predictors over every non-empty input line.

    Returns (DataFrame | None, status_message). One result row is emitted per
    (line, property); physicochemical rows are appended per line when
    requested and the modality is Sequence. Returns (None, warning) on any
    input-validation failure.
    """
    if not input_text or not input_text.strip():
        return None, "⚠️ Please provide input."
    lines = [s.strip() for s in input_text.split("\n") if s.strip()]
    # Reject inputs that do not match the declared modality up front.
    if input_type == "Sequence":
        bad = [s for s in lines if not is_aa_sequence_like(s)]
        if bad:
            return None, f"⚠️ Input Type=Sequence but {len(bad)} line(s) don't look like AA sequences. Example: {bad[0][:60]}"
    else:
        bad = [s for s in lines if not is_smiles_like(s)]
        if bad:
            return None, f"⚠️ Input Type=SMILES but {len(bad)} line(s) don't look like SMILES. Example: {bad[0][:60]}"
    ctx = initialize()
    # Debug output: which manifest keys / model weights were actually loaded.
    print("keys in ctx.best:", sorted(ctx.best.keys()))
    print("loaded model keys:", sorted(ctx.predictor.models.keys()))
    print("halflife wt loaded?", ("halflife","wt") in ctx.predictor.models)
    print("halflife smiles loaded?", ("halflife","smiles") in ctx.predictor.models)
    if not selected_props:
        return None, "⚠️ Please select at least one property."
    results = []
    analyzer = SequenceAnalyzer()
    # Check availability
    available = get_available_properties(ctx, input_type)
    unavailable = [p for p in selected_props if not available.get(p, False)]
    if unavailable:
        unavailable_names = [PROPERTY_INFO.get(p, {}).get('display', p) for p in unavailable]
        return None, f"⚠️ These properties are not supported for {input_type}: {', '.join(unavailable_names)}"
    for i, s in enumerate(lines):
        progress((i + 1) / len(lines), f"Processing {i+1}/{len(lines)}")
        # Regular property predictions
        for prop in selected_props:
            if prop == "binding_affinity":
                # Handle binding affinity separately: it needs a protein target.
                if not protein_text or not protein_text.strip():
                    results.append({
                        "Input": s[:30] + "..." if len(s) > 30 else s,
                        "Property": PROPERTY_INFO[prop]['display'],
                        "Prediction": "N/A",
                        "Value": "Requires protein",
                        "Unit": "",
                    })
                    continue
                mode = "wt" if input_type == "Sequence" else "smiles"
                try:
                    result = ctx.predictor.predict_binding_affinity(mode, protein_text.strip(), s)
                    affinity = result["affinity"]
                    # Determine binding class based on thresholds
                    # (tight >= 9 pKd/pKi, medium >= 7, else weak).
                    if affinity >= 9:
                        class_label = "Tight binding"
                    elif affinity >= 7:
                        class_label = "Medium binding"
                    else:
                        class_label = "Weak binding"
                    results.append({
                        "Input": s[:30] + "..." if len(s) > 30 else s,
                        "Property": PROPERTY_INFO[prop]['display'],
                        "Prediction": class_label,
                        "Value": f"{affinity:.3f}",
                        "Unit": "pKd/pKi",
                    })
                except Exception as e:
                    print(f"Error predicting binding affinity: {e}")
                    results.append({
                        "Input": s[:30] + "..." if len(s) > 30 else s,
                        "Property": PROPERTY_INFO[prop]['display'],
                        "Prediction": "Error",
                        "Value": "Failed",
                        "Unit": "",
                    })
                # Skip the regular-property path below for binding affinity.
                continue
            # Regular properties
            mode = "wt" if input_type == "Sequence" else "smiles"
            try:
                result = ctx.predictor.predict_property(prop, mode, s)
                score = result["score"]
                prop_info = PROPERTY_INFO.get(prop, {})
                # Determine label based on property type
                if prop in ['permeability_pampa', 'permeability_caco2']:
                    # Special handling for permeability assays: fixed -6 cutoff.
                    label = prop_info['pass_label'] if score > -6 else prop_info['fail_label']
                    unit = "log Peff" if prop == 'permeability_pampa' else "log Papp"
                elif prop == 'halflife':
                    # Regression task, no pass/fail
                    label = "β€”"
                    unit = prop_info.get('unit', 'hours')
                else:
                    # Classification tasks: compare score to the manifest
                    # threshold, inverting for lower-is-better properties.
                    thr = get_threshold(ctx, prop, input_type)
                    if thr is not None:
                        if prop in LOWER_BETTER:
                            label = prop_info.get('pass_label', 'Pass') if score < thr else prop_info.get('fail_label', 'Fail')
                        else:
                            label = prop_info.get('pass_label', 'Pass') if score >= thr else prop_info.get('fail_label', 'Fail')
                    else:
                        label = "β€”"
                    unit = "Probability"
                results.append({
                    "Input": s[:30] + "..." if len(s) > 30 else s,
                    "Property": prop_info.get('display', prop),
                    "Prediction": label,
                    "Value": f"{score:.3f}",
                    "Unit": unit,
                })
            except Exception as e:
                # Best-effort: a failed property is skipped, not fatal.
                print(f"Error predicting {prop} for {s[:30]}: {e}")
                continue
        # physicochemical only for AA sequence modality
        if input_type == "Sequence" and include_physicochemical:
            analysis = {
                "length": len(s),
                "molecular_weight": analyzer.calculate_molecular_weight(s),
                "net_charge": analyzer.calculate_net_charge(s, pH_value),
                "isoelectric_point": analyzer.calculate_isoelectric_point(s),
                "hydrophobicity": analyzer.calculate_hydrophobicity(s),
            }
            short = s[:30] + "..." if len(s) > 30 else s
            results += [
                {"Input": short, "Property": "πŸ“ Length", "Prediction": "", "Value": str(analysis["length"]), "Unit": "aa"},
                {"Input": short, "Property": "βš–οΈ Molecular Weight", "Prediction": "", "Value": f"{analysis['molecular_weight']:.1f}", "Unit": "Da"},
                {"Input": short, "Property": f"⚑ Net Charge (pH {pH_value})", "Prediction": "", "Value": f"{analysis['net_charge']:.2f}", "Unit": ""},
                {"Input": short, "Property": "🎯 Isoelectric Point", "Prediction": "", "Value": f"{analysis['isoelectric_point']:.2f}", "Unit": "pH"},
                {"Input": short, "Property": "πŸ’¦ Hydrophobicity (GRAVY)", "Prediction": "", "Value": f"{analysis['hydrophobicity']:.2f}", "Unit": "GRAVY"},
            ]
    df = pd.DataFrame(results)
    status = f"βœ… Completed {len(df)} rows ({len(lines)} input(s), {len(selected_props)} selected properties)."
    return df, status
def show_distribution(property_name, predicted_value=None):
    """Show distribution plot + info for selected property."""
    # NOTE(review): a fresh TrainingDataManager re-reads every CSV on each
    # call — consider caching a module-level instance.
    data_manager = TrainingDataManager()
    if not property_name:
        return None, "Select a property to view its distribution."
    # Get the first property if a list was passed
    prop = property_name[0] if isinstance(property_name, list) else property_name
    # Generate the plot
    fig = data_manager.get_distribution_plot(prop, predicted_value)
    # Build info panel
    info = data_manager.get_property_info(prop)
    if not info:
        return fig, "No information available for this property."
    prop_info = PROPERTY_INFO.get(prop, {})
    # Distribution-only keys get their own titles; fall back to display name.
    title = DIST_KEYS.get(prop, PROPERTY_INFO.get(prop, {}).get("display", prop))
    kind = data_manager.statistics.get(prop, {}).get("kind", "continuous")
    if kind == "binary":
        n_pos = info.get("n_pos", 0)
        n_neg = info.get("n_neg", 0)
        total = max(n_pos + n_neg, 1)  # guard against division by zero
        info_text = f"""
    #### {title} Information
    **Description:** {info.get('description','')}
    **Statistics (Binary):**
    - Samples: {info['n_samples']:,}
    - {prop_info.get('pass_label', 'Positive')} (1): {n_pos:,} ({n_pos/total:.1%})
    - {prop_info.get('fail_label', 'Negative')} (0): {n_neg:,} ({n_neg/total:.1%})
    """
    else:
        p = info.get("percentiles", {})
        info_text = f"""
    #### {title} Information
    **Description:** {info.get('description','')}
    **Statistics:**
    - Samples: {info['n_samples']:,}
    - Mean: {info['mean']:.3f} {info['unit']}
    - Std Dev: {info['std']:.3f}
    - Range: [{info['min']:.3f}, {info['max']:.3f}]
    **Percentiles:**
    - 10%: {p.get('10%', float('nan')):.3f}
    - 25%: {p.get('25%', float('nan')):.3f}
    - 50% (median): {p.get('50% (median)', float('nan')):.3f}
    - 75%: {p.get('75%', float('nan')):.3f}
    - 90%: {p.get('90%', float('nan')):.3f}
    """
    return fig, info_text
def load_example(example_name):
    """Return the (binder_input, protein_target) pair for a named example.

    Unknown names fall back to a pair of empty strings.
    """
    t7_peptide = "HAIYPRH"
    insulin_a_chain = "GIVEQCCTSICSLYQLENYCN"
    hbb_fragment = "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLST"
    cyclic_smiles = (
        "CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@H](C)N(C)C(=O)"
        "[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]2CCCN2C1=O"
    )
    presets = {
        "T7 Peptide": (t7_peptide, ""),
        "Protein-Peptide": (insulin_a_chain, hbb_fragment),
        "Cyclic Peptide (SMILES)": (cyclic_smiles, ""),
        "Protein-Cyclic Peptide (SMILES)": (cyclic_smiles, hbb_fragment),
        "None": ("", ""),
    }
    return presets.get(example_name, ("", ""))
def on_example_change(name: str):
    """Populate the binder/protein textboxes from a named example preset.

    The protein box is only shown for the two protein-target examples.
    """
    if not name:
        # Nothing selected: leave both boxes untouched.
        return gr.update(), gr.update()
    binder, protein = load_example(name)
    needs_protein = name in ("Protein-Peptide", "Protein-Cyclic Peptide (SMILES)")
    binder_update = gr.update(value=binder)
    protein_update = gr.update(value=protein, visible=needs_protein)
    return binder_update, protein_update
def on_modality_change(modality, *checkbox_values):
    """Relabel/enable the property checkboxes when the modality changes.

    Returns one gr.update per entry in PROP_ORDER; unsupported properties
    are disabled and unchecked, supported ones keep their previous state.
    """
    ctx = initialize()
    available = get_available_properties(ctx, modality)
    updates = []
    for idx, prop_key in enumerate(PROP_ORDER):
        info = PROPERTY_INFO[prop_key]
        enabled = available.get(prop_key, False)
        label = f"{info['display']} {info.get('direction','')}".rstrip()
        if not enabled:
            label += " (Not supported)"
        elif prop_key == "binding_affinity":
            # Asterisk marks that a protein target is also required.
            label += " *"
        previous = checkbox_values[idx] if idx < len(checkbox_values) else False
        updates.append(gr.update(
            label=label,
            interactive=enabled,
            value=previous if enabled else False,
        ))
    return updates
def collect_selected_properties(*checkbox_values):
    """Map positional checkbox states back to property keys via PROP_ORDER."""
    # zip truncates to the shorter sequence, which matches the original
    # index-guarded loop: missing positions are treated as unchecked.
    return [key for key, checked in zip(PROP_ORDER, checkbox_values) if checked]
# ==================== Gradio App ====================
def load_custom_css():
    """Load CSS styling document"""
    css_path = "peptiverse_styles.css"
    # Minimal fallback CSS used when the stylesheet file is absent.
    fallback_css = """
    .gradio-container {
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
        font-size: 16px !important;
    }
    """
    try:
        with open(css_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        print(f"Warning: CSS file '{css_path}' not found. Using default styles.")
        return fallback_css
    except Exception as e:
        print(f"Error loading CSS: {e}")
        return ""

custom_css = load_custom_css()
def get_title_html():
    """Build the header HTML for the app title.

    Prefers the branded light/dark SVG logo files (swapped via CSS classes
    `logo-light` / `logo-dark`); if neither file exists, falls back to an
    inline gradient-text SVG.
    """
    import base64, os

    def _encode_svg(path):
        # Base64-encode an on-disk SVG, or None when the asset is absent.
        if not os.path.exists(path):
            return None
        with open(path, "rb") as handle:
            return base64.b64encode(handle.read()).decode("utf-8")

    variants = [
        ("logo-light", _encode_svg("peptiverse-light-withlogo.svg")),
        ("logo-dark", _encode_svg("peptiverse-dark-withlogo.svg")),
    ]
    tags = [
        f'''
        <img class="logo {css_class}"
             src="data:image/svg+xml;base64,{encoded}"
             alt="PeptiVerse"
             style="max-height: 200px;" />
        '''
        for css_class, encoded in variants
        if encoded
    ]
    if tags:
        return f'''
        <div class="svg-title-container">
            {''.join(tags)}
        </div>
        '''
    # ---------- Fallback: inline gradient title ----------
    return '''
    <div class="svg-title-container">
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 700 140"
             style="width: 100%; max-width: 700px; height: auto;">
            <defs>
                <linearGradient id="titleGradient" x1="0%" y1="0%" x2="100%" y2="100%">
                    <stop offset="0%" style="stop-color:#667eea"/>
                    <stop offset="100%" style="stop-color:#764ba2"/>
                </linearGradient>
                <filter id="shadow">
                    <feDropShadow dx="0" dy="3" stdDeviation="4" flood-opacity="0.15"/>
                </filter>
            </defs>
            <text x="50%" y="50%"
                  text-anchor="middle"
                  dominant-baseline="middle"
                  style="font-size:72px;font-weight:bold;
                         fill:url(#titleGradient);filter:url(#shadow);">
                🌐 PeptiVerse
            </text>
        </svg>
    </div>
    '''
# ---------------------------------------------------------------------------
# Top-level Gradio UI. Built at import time so `demo` exists for launch().
# NOTE(review): indentation below was reconstructed from the Gradio nesting
# conventions visible in the statements — confirm against the original layout.
# ---------------------------------------------------------------------------
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as demo:
    # Shared predictor context for building the UI (also called again inside
    # handlers — presumably initialize() caches; verify).
    ctx = initialize()

    # Header with SVG title support
    title_html = get_title_html()
    gr.HTML(title_html)
    # Hidden markdown title kept alongside the HTML header.
    gr.Markdown(
        """
        # 🌐 PeptiVerse
        """,
        visible=False
    )

    with gr.Tabs():
        # Main Prediction Tab
        with gr.TabItem("🔬 Predict", elem_classes="predict-tab"):
            with gr.Row():
                # Input Section
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### 📝 Input")
                        input_type = gr.Radio(
                            ["Sequence", "SMILES"],
                            label="Input Type",
                            value="Sequence"
                        )
                        # Load T7 peptide by default
                        input_text = gr.Textbox(
                            label="Peptide Sequence(s) / SMILES",
                            placeholder="Enter amino acid sequence(s) or SMILES, one per line",
                            lines=6,
                            value="HAIYPRH"
                        )
                        # Hidden until the binding-affinity checkbox is ticked
                        # (see update_visibility wiring below).
                        protein_seq = gr.Textbox(
                            label="Protein Sequence (for binding prediction)",
                            placeholder="Enter target protein sequence",
                            lines=3,
                            visible=False
                        )
                        gr.Markdown("**Examples:**")
                        example_dropdown = gr.Dropdown(
                            choices=["None", "T7 Peptide", "Protein-Peptide", "Cyclic Peptide (SMILES)", "Protein-Cyclic Peptide (SMILES)"],
                            label="Load Example",
                            value="T7 Peptide",  # Set T7 as default
                            interactive=True,
                            allow_custom_value=False
                        )
                # Property Selection
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### ⚙️ Select Properties")
                        with gr.Accordion("Physicochemical Properties", open=True, elem_id="acc_phys"):
                            include_physicochemical = gr.Checkbox(
                                label="🧪 Calculate Basic Properties",
                                value=True,
                                info="MW, net charge, pI, hydrophobicity (Sequence only)"
                            )
                            pH_value = gr.Slider(
                                minimum=0,
                                maximum=14,
                                value=7.0,
                                step=0.1,
                                label="pH for Net Charge",
                                info="Physiological pH is ~7.4"
                            )
                        # Create individual checkboxes in fixed order so index i
                        # always maps to PROP_ORDER[i] in the event handlers.
                        with gr.Accordion("Prediction Properties", open=True, elem_id="acc_pred"):
                            property_checkboxes = []
                            available = get_available_properties(ctx, "Sequence")
                            for prop_key in PROP_ORDER:
                                prop_info = PROPERTY_INFO[prop_key]
                                is_available = available.get(prop_key, False)
                                label_text = f"{prop_info['display']} {prop_info.get('direction','')}".rstrip()
                                if not is_available:
                                    label_text += " (Not supported)"
                                if prop_key == "binding_affinity" and is_available:
                                    label_text += " *"
                                default_on = (prop_key in ["solubility", "hemolysis"])  # optional defaults
                                cb = gr.Checkbox(
                                    label=label_text,
                                    value=is_available and default_on,
                                    interactive=is_available,
                                    elem_id=f"checkbox_{prop_key}",
                                )
                                property_checkboxes.append(cb)
                        gr.Markdown("*Requires protein sequence input above", elem_classes="text-sm text-gray-500")

        # Best Models Tab
        with gr.TabItem("📋 Best Models", elem_classes="best-models-tab"):
            gr.Markdown("### Current Best Models Configuration")
            gr.Markdown("This table shows the models and thresholds currently being used for predictions:")
            best_models_df = gr.Dataframe(
                value=get_best_models_table(ctx),
                headers=["Property", "Best Model (Sequence)", "Threshold (Sequence)",
                         "Best Model (SMILES)", "Threshold (SMILES)", "Task Type"],
                interactive=False,
                elem_id="best_models_df"
            )
            gr.Markdown("""
            **Note:** Models marked as SVM, SVR, or ENET are automatically replaced with XGB
            as these models are not currently supported in the deployment environment.
            """)

        # Distribution Analysis Tab
        with gr.TabItem("📊 Distributions", elem_classes="distributions-tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Per-property distributions, excluding the two keys that
                    # have their own dedicated DIST_KEYS entries.
                    base_props = [
                        k for k in PROPERTY_INFO.keys()
                        if k not in {"halflife", "binding_affinity"}
                    ]
                    dist_choices = base_props + list(DIST_KEYS.keys())
                    property_selector = gr.Dropdown(
                        choices=dist_choices,
                        label="Select Property",
                        value="binding_affinity_all"
                    )
                    test_value = gr.Number(label="Test Value among Distribution", value=None)
                    show_dist_btn = gr.Button("Show Distribution")
                with gr.Column(scale=2):
                    dist_plot_tab = gr.Plot(label="Score Distribution")
                    dist_info_tab = gr.Markdown()

        # Data Documentation Tab
        with gr.TabItem("📚 Documentation", elem_classes="documentation-tab"):
            # Load documentation from disk at build time; best-effort fallback
            # text keeps the tab usable when the file is missing.
            doc_file_path = "description.md"
            try:
                with open(doc_file_path, "r", encoding="utf-8") as f:
                    markdown_content = f.read()
            except FileNotFoundError:
                print(f"Warning: Documentation file '{doc_file_path}' not found.")
                markdown_content = """
                # Documentation
                Documentation file not found. Please ensure `description.md` is in the same directory as the app.
                """
            except Exception as e:
                print(f"Error loading documentation: {e}")
                markdown_content = "# Error loading documentation"
            gr.Markdown(markdown_content)

    # Action Buttons (NOTE(review): placed at Blocks level, below the tab
    # strip — components from the Predict tab are still referenced by name).
    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
        predict_btn = gr.Button("🚀 Predict Properties", variant="primary", scale=2)

    # Status
    status_output = gr.Markdown("")

    # Results Section
    with gr.Group():
        gr.Markdown("### 📊 Results")
        results_df = gr.Dataframe(
            headers=["Input", "Property", "Prediction", "Value", "Unit"],
            datatype=["str", "str", "str", "str", "str"],
            interactive=False,
            elem_id="results_df"
        )

    # Footer
    gr.Markdown(
        """
        ---
        <div style='text-align: center; color: #6b7280;'>
            <p>PeptiVerse - A Unified Platform for peptide therapeutic property prediction.</p>
            <p>Please cite our work if you use this tool in your research.</p>
        </div>
        """
    )

    # Event Handlers
    def update_visibility(binding_checked):
        # Mirrors the binding-affinity checkbox into protein_seq visibility.
        return gr.update(visible=binding_checked)

    # Update checkbox states when modality changes; current values are passed
    # in so still-available selections survive the switch.
    input_type.change(
        on_modality_change,
        inputs=[input_type] + property_checkboxes,
        outputs=property_checkboxes
    )

    # Show protein sequence input when binding affinity is selected
    BINDING_IDX = PROP_ORDER.index("binding_affinity")
    property_checkboxes[BINDING_IDX].change(
        update_visibility,
        inputs=[property_checkboxes[BINDING_IDX]],
        outputs=[protein_seq],
    )

    example_dropdown.change(
        on_example_change,
        inputs=[example_dropdown],
        outputs=[input_text, protein_seq]
    )

    # The lambda adapts Gradio's flat input list to predict_properties'
    # signature, collapsing the per-property checkboxes into a key list.
    predict_btn.click(
        lambda input_text, input_type, protein_text, include_physicochemical, pH_value, *checkbox_values:
            predict_properties(
                input_text, input_type, protein_text,
                collect_selected_properties(*checkbox_values),
                include_physicochemical, pH_value
            ),
        inputs=[input_text, input_type, protein_seq, include_physicochemical, pH_value] + property_checkboxes,
        outputs=[results_df, status_output]
    )

    # Clear resets: inputs, dropdown to "None", results table to None,
    # status text to empty, and every property checkbox to unchecked.
    clear_btn.click(
        lambda: ["", "", "None", None, ""] + [False] * len(property_checkboxes),
        outputs=[input_text, protein_seq, example_dropdown, results_df, status_output] + property_checkboxes
    )

    show_dist_btn.click(
        show_distribution,
        inputs=[property_selector, test_value],
        outputs=[dist_plot_tab, dist_info_tab]
    )
if __name__ == "__main__":
    # Warm the predictor before serving so the first request doesn't pay the
    # model-loading cost.
    print("Initializing models...")
    initialize()
    print("Ready!")
    # share=True asks Gradio for a public tunnel link — NOTE(review): likely
    # redundant when running on HF Spaces; confirm intended deployment target.
    demo.launch(share=True)