Spaces:
Sleeping
Sleeping
| import re | |
| import torch | |
| import pandas as pd | |
| import gradio as gr | |
| from io import StringIO | |
| from transformers import EsmForSequenceClassification, EsmTokenizer | |
# --- Load tokenizer & model ---
# Both objects are created once at import time and shared by every request.
# The tokenizer comes from the public ESM-2 checkpoint name; the classifier
# weights are read from the local "model/best_model5" directory.
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", do_lower_case=False)

model = EsmForSequenceClassification.from_pretrained("model/best_model5")
# Switch to evaluation mode before serving predictions.
model.eval()
# --- FASTA Reader ---
def read_fasta(fasta_string, max_length=30):
    """Parse FASTA-formatted text into parallel lists of headers and sequences.

    Blank lines are ignored and surrounding whitespace is stripped from every
    line. A line starting with ">" opens a new record; every other line is
    residue data for the most recent record, and a record's lines are
    concatenated into a single sequence.

    Args:
        fasta_string: The raw FASTA text.
        max_length: Maximum number of residues allowed in one (joined)
            sequence. Defaults to 30.

    Returns:
        A ``(headers, sequences)`` tuple of equally long lists, where
        ``sequences[i]`` belongs to ``headers[i]``. Headers keep their
        leading ">".

    Raises:
        ValueError: If a sequence line contains anything other than the 20
            uppercase natural amino-acid letters, if sequence data appears
            before the first header, if a record exceeds ``max_length``
            residues, or if a header has no sequence.
    """
    headers = []
    # records[i] collects the residue lines that belong to headers[i], so a
    # sequence can never be paired with the wrong header.
    records = []

    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(">"):
            headers.append(line)
            records.append([])
            continue

        if not re.fullmatch(r"[ACDEFGHIKLMNPQRSTVWY]+", line):
            raise ValueError(
                "Invalid FASTA format: Only natural amino acids (ACDEFGHIKLMNPQRSTVWY) allowed."
            )
        if not records:
            # Without this check the orphan residues would silently be
            # attributed to whichever header comes next.
            raise ValueError(
                "FASTA parsing error: Sequence data found before the first header. "
                "Each sequence must be preceded by a '>' header line."
            )

        records[-1].append(line)
        # Enforce the limit on the whole record, not on each physical line,
        # so a sequence wrapped over several lines cannot exceed it.
        sequence_so_far = "".join(records[-1])
        if len(sequence_so_far) > max_length:
            raise ValueError(
                f"Sequence too long: '{sequence_so_far}' "
                f"({len(sequence_so_far)} > {max_length} characters)."
            )

    sequences = ["".join(parts) for parts in records]
    non_empty = sum(1 for sequence in sequences if sequence)
    if non_empty != len(headers):
        raise ValueError(
            f"FASTA parsing error: Found {len(headers)} headers but {non_empty} sequences. "
            "Each header must be followed by a sequence."
        )
    return headers, sequences
def predict_peptide_class(sequences):
    """Score peptide sequences with the module-level model.

    Args:
        sequences: Iterable of peptide sequences; each item is converted
            with ``str()`` before tokenization.

    Returns:
        A ``(probs, classes)`` tuple. ``probs`` is a NumPy array holding the
        sigmoid of the logit at index 1 for every input, and ``classes`` is a
        list with "B3PP" where that value is above 0.5 and "Non-B3PP"
        otherwise.
    """
    # NOTE(review): max_length=30 is a token budget. If the tokenizer adds
    # special tokens, peptides close to 30 residues may be truncated here.
    # Confirm against how the model was trained.
    batch = tokenizer(
        list(map(str, sequences)),
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=30,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**batch)
        scores = torch.sigmoid(output.logits)

    probs = scores[:, 1].cpu().numpy()

    classes = []
    for prob in probs:
        if prob > 0.5:
            classes.append("B3PP")
        else:
            classes.append("Non-B3PP")
    return probs, classes
def predict_from_fasta(fasta_input):
    """Process FASTA input and return predictions as a table plus a CSV file.

    Args:
        fasta_input: FASTA-formatted text, as entered in the UI textbox.

    Returns:
        A ``(dataframe, csv_path)`` tuple. On success the dataframe has the
        columns "Header", "Sequence", "Probability" (formatted to two
        decimals) and "Predicted Class", and ``csv_path`` points to a CSV
        copy of it. On failure the dataframe has a single "Error" column
        with the message and ``csv_path`` is ``None``; exceptions are never
        propagated to the caller.
    """
    import os
    import tempfile

    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]}), None

        probs, classes = predict_peptide_class(sequences)
        df = pd.DataFrame({
            "Header": headers,
            "Sequence": sequences,
            "Probability": [f"{p:.2f}" for p in probs],
            "Predicted Class": classes,
        })

        # Write into a fresh per-request directory: a single shared
        # "predictions.csv" in the working directory would let concurrent
        # requests overwrite each other's results. The file name is kept so
        # the download is still called predictions.csv.
        csv_path = os.path.join(tempfile.mkdtemp(prefix="b3pp_"), "predictions.csv")
        df.to_csv(csv_path, index=False)
        return df, csv_path
    except ValueError as e:
        # Validation problems raised by the FASTA reader: show them verbatim.
        return pd.DataFrame({"Error": [str(e)]}), None
    except Exception as e:
        # UI boundary: report anything else in the table instead of crashing.
        return pd.DataFrame({"Error": [f"Unexpected error: {e}"]}), None
# --- Gradio UI ---
# One multi-line textbox in; a results table and a downloadable CSV out.
# The two outputs match the (dataframe, csv_path) pair returned by
# predict_from_fasta.
iface = gr.Interface(
    title="B3PP Predictor",
    description=(
        "Submit peptide sequences in FASTA format to determine their potential as "
        "blood-brain barrier penetration peptides. Sequences must consist exclusively "
        "of natural amino acids in uppercase letters, with a maximum length of 30 characters."
    ),
    fn=predict_from_fasta,
    inputs=gr.Textbox(lines=10, placeholder="Paste your peptide sequences in FASTA format here"),
    outputs=[gr.Dataframe(label="Predictions"), gr.File(label="Download CSV")],
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()