Spaces:
Runtime error
Runtime error
app
Browse files- .gitattributes +2 -0
- .gitignore +3 -0
- app.py +43 -2
- predict.py +25 -0
- requirements.txt +5 -0
- src/cgr.py +77 -0
- src/fcgr.py +72 -0
- src/model_loader.py +39 -0
- src/models/resnet50_6mers.py +103 -0
- src/pipeline.py +85 -0
- src/preprocessing.py +15 -0
- src/utils.py +25 -0
- trained-models/model-34-0.954.hdf5 +3 -0
- trained-models/preprocessing.json +3 -0
.gitattributes
CHANGED
|
@@ -26,3 +26,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 26 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 28 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 26 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 28 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.hdf5 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
env/
|
| 2 |
+
.vscode/
|
| 3 |
+
__pycache__/
|
app.py
CHANGED
|
@@ -1,4 +1,45 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
st.write(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
from predict import predict_seq
from Bio import SeqIO
from io import StringIO
from src.utils import array2img, clean_seq
from src.fcgr import FCGR

# Sidebar: k-mer size used only for the FCGR image shown to the user.
# The classifier in predict.py builds its own FCGR with k=6.
with st.sidebar:
    st.write("Options")
    kmer = st.slider(
        label="kmer to visualize FCGR",
        min_value=6,
        max_value=9,
        value=8,
    )

# App
st.title('Sars-cov-2 classification with FCGR')

# load fasta file
uploaded_file = st.file_uploader(label="Load fasta file")

if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))

    # Only the first record is classified. Passing a default to next() avoids
    # an unhandled StopIteration when the upload contains no FASTA record.
    fasta = next(SeqIO.parse(stringio, "fasta"), None)

    if fasta is None:
        st.error("No FASTA record found in the uploaded file.")
    else:
        # Clean once and reuse: the FCGR lookup table only knows A/C/G/T
        # k-mers and skips k-mers containing N, so raw lowercase or ambiguous
        # bases would raise KeyError when building the image below.
        sequence = clean_seq(str(fasta.seq))

        with st.spinner("Inference..."):
            prediction, confidence = predict_seq(sequence)
        st.success("Done!")

        st.write("### Results ")
        st.write("Prediction: ", prediction)
        st.write("Confidence: ", confidence)

        # To generate the image to show
        with st.spinner("Plotting FCGR"):
            gen_fcgr = FCGR(kmer)
            img = array2img(gen_fcgr(sequence))

        # Show FCGR
        st.image(
            image=img,
            caption="FCGR",
            use_column_width="auto",
            width=20)
|
predict.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import numpy as np
|
| 3 |
+
from src.fcgr import FCGR
|
| 4 |
+
from src.model_loader import ModelLoader
|
| 5 |
+
from src.preprocessing import Pipeline
|
| 6 |
+
from src.utils import clean_seq
|
| 7 |
+
# Class labels, in the order of the model's output units.
order_output = ['S','L','G','V','GR','GH','GV','GK','GRY','O','GRA']

# FCGR generator for 6-mers (the model below is the "resnet50_6mers" variant).
fcgr = FCGR(k=6)

# Build the network and load the trained weights once, at import time.
loader = ModelLoader()
model = loader(
    "resnet50_6mers",
    len(order_output),  # 11 output classes
    "trained-models/model-34-0.954.hdf5",
)

# Preprocessing steps are stored as JSON next to the weights.
with open("trained-models/preprocessing.json") as fp:
    pipe = json.load(fp)
preprocessing = Pipeline(pipe)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def predict_seq(seq, return_fcgr=False):
    """Classify one DNA sequence with the module-level model.

    Args:
        seq: nucleotide sequence as a string; it is passed through
            ``clean_seq`` before the FCGR is computed.
        return_fcgr: when true, also return the preprocessed FCGR array
            that was fed to the model.

    Returns:
        ``(label, confidence)`` or ``(label, confidence, array)``, where
        ``label`` is the entry of ``order_output`` with the highest score and
        ``confidence`` is that score.
    """
    features = preprocessing(fcgr(clean_seq(seq)))

    # Add a leading batch axis and a trailing channel axis for the model.
    batch = features[np.newaxis, ..., np.newaxis]
    scores = model.predict(batch)[0]

    best = scores.argmax()
    label, confidence = order_output[best], scores[best]

    if not return_fcgr:
        return label, confidence
    return label, confidence, features
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
tensorflow==2.7
|
| 3 |
+
biopython
|
| 4 |
+
numpy
|
| 5 |
+
Pillow
|
src/cgr.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"From original work: CGR for gene structure"
|
| 2 |
+
from typing import Dict, Optional
|
| 3 |
+
from collections import namedtuple
|
| 4 |
+
|
| 5 |
+
# coordinates for x+iy
Coord = namedtuple("Coord", ["x", "y"])

# coordinates for a CGR encoding
CGRCoords = namedtuple("CGRCoords", ["N", "x", "y"])

# coordinates for each nucleotide in the 2d-plane
DEFAULT_COORDS = dict(A=Coord(1, 1), C=Coord(-1, 1), G=Coord(-1, -1), T=Coord(1, -1))


class CGR:
    """Chaos Game Representation for DNA.

    The state is a ``CGRCoords(N, x, y)`` tuple: ``N`` is the number of
    nucleotides consumed so far and ``(x, y)`` the current point. Each
    nucleotide moves the point halfway towards that nucleotide's corner.
    """

    def __init__(self, coords: Optional[Dict[str, Coord]] = None):
        """Use ``coords`` (nucleotide -> corner) or ``DEFAULT_COORDS`` if None."""
        self.nucleotide_coords = DEFAULT_COORDS if coords is None else coords
        self.cgr_coords = CGRCoords(0, 0, 0)

    def nucleotide_by_coords(self, x, y):
        """Return the nucleotide whose corner is ``(x, y)``.

        Raises:
            IndexError: if no nucleotide is mapped to ``(x, y)``.
        """
        target = Coord(x, y)
        for nucleotide, corner in self.nucleotide_coords.items():
            if corner == target:
                return nucleotide
        raise IndexError(f"no nucleotide is mapped to coordinates ({x}, {y})")

    def forward(self, nucleotide: str):
        """Advance the CGR state by one nucleotide."""
        # Single lookup; an unknown nucleotide yields None and fails on the
        # attribute access below with AttributeError.
        corner = self.nucleotide_coords.get(nucleotide)
        x = (self.cgr_coords.x + corner.x) / 2
        y = (self.cgr_coords.y + corner.y) / 2

        # update cgr_coords
        self.cgr_coords = CGRCoords(self.cgr_coords.N + 1, x, y)

    def backward(self,):
        """Undo the last step and return the nucleotide that produced it.

        The last nucleotide is inferred from the quadrant of the current point.
        """
        # get current nucleotide based on coordinates
        n_x, n_y = self.coords_current_nucleotide()
        nucleotide = self.nucleotide_by_coords(n_x, n_y)

        # invert the midpoint rule: previous = 2 * current - corner
        x = 2 * self.cgr_coords.x - n_x
        y = 2 * self.cgr_coords.y - n_y

        # update cgr_coords
        self.cgr_coords = CGRCoords(self.cgr_coords.N - 1, x, y)

        return nucleotide

    def coords_current_nucleotide(self,):
        """Return the (+-1, +-1) corner of the quadrant holding the current point."""
        x = 1 if self.cgr_coords.x > 0 else -1
        y = 1 if self.cgr_coords.y > 0 else -1
        return x, y

    def encode(self, sequence: str):
        """Encode a DNA sequence and return the final ``CGRCoords``."""
        # reset starting position to (0,0,0)
        self.reset_coords()
        for nucleotide in sequence:
            self.forward(nucleotide)
        return self.cgr_coords

    def reset_coords(self,):
        """Reset the state to the origin with zero nucleotides consumed."""
        self.cgr_coords = CGRCoords(0, 0, 0)

    def decode(self, N: int, x: float, y: float) -> str:
        """Recover the DNA sequence of length ``N`` that ends at point ``(x, y)``."""
        self.cgr_coords = CGRCoords(N, x, y)

        # nucleotides are recovered last-to-first
        sequence = []
        while self.cgr_coords.N > 0:
            sequence.append(self.backward())
        return "".join(sequence[::-1])
|
src/fcgr.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .cgr import CGR
|
| 2 |
+
from itertools import product
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
class FCGR(CGR):
    """Frequency matrix CGR.

    Builds a (2**k x 2**k) 2D representation of a sequence:
    - k is the k-mer length,
    - 2**k x 2**k = 4**k is the total number of k-mers,
    - each pixel holds the frequency of one k-mer.
    """

    def __init__(self, k: int,):
        super().__init__()
        self.k = k  # k-mer representation
        self.kmers = ["".join(letters) for letters in product("ACGT", repeat=self.k)]
        self.kmer2pixel = self.kmer2pixel_position()

    def __call__(self, sequence: str):
        """Return the FCGR frequency matrix of ``sequence``."""
        self.count_kmers(sequence)

        # Empty square matrix that receives the k-mer counts
        side = int(2**self.k)
        freq_matrix = np.zeros((side, side))

        # Pixel positions are 1-based, array indices are 0-based
        for kmer, count in self.freq_kmer.items():
            row, col = self.kmer2pixel[kmer]
            freq_matrix[int(row) - 1, int(col) - 1] = count
        return freq_matrix

    def count_kmer(self, kmer):
        """Increment the counter of ``kmer`` unless it contains an 'N'."""
        if "N" in kmer:
            return
        self.freq_kmer[kmer] += 1

    def count_kmers(self, sequence: str):
        """Reset the counters and count every k-length window of ``sequence``."""
        self.freq_kmer = defaultdict(int)
        n_windows = len(sequence) - self.k + 1
        for start in range(n_windows):
            self.count_kmer(sequence[start:start + self.k])

    def kmer_probabilities(self, sequence: str):
        """Store in ``self.probabilities`` each count divided by the window count.

        Reads ``self.freq_kmer`` as it currently is; it does not recount.
        """
        self.probabilities = defaultdict(float)
        n_windows = len(sequence) - self.k + 1
        for kmer, count in self.freq_kmer.items():
            self.probabilities[kmer] = float(count) / n_windows

    def pixel_position(self, kmer: str):
        """Get the (row, column) pixel position of a k-mer, both 1-based."""
        point = self.encode(kmer)
        side = 2**self.k

        # Shift [-1,1]^2 to [0,1]^2, scale to [0,2**k]^2, round up to a cell
        scaled = (np.array([point.x, point.y]) + 1) / 2 * side
        cell_x, cell_y = np.ceil(scaled)

        # The y axis points up but rows grow downwards:
        # row = 2**k - cy + 1, column = cx
        return side - int(cell_y) + 1, int(cell_x)

    def kmer2pixel_position(self,):
        """Map every k-mer in ``self.kmers`` to its pixel position."""
        return {kmer: self.pixel_position(kmer) for kmer in self.kmers}
|
src/model_loader.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load model from /models"""
|
| 2 |
+
import importlib
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from tensorflow.python.eager.context import num_gpus
|
| 9 |
+
|
| 10 |
+
# Names skipped when listing the models directory
OMMIT = {".ipynb_checkpoints","__pycache__","__init__","custom_layers","custom_losses"}
BASE_DIR = Path(__file__).resolve().parent  # directory containing this module
BASE_MODELS = BASE_DIR.joinpath("models")  # models directory


class ModelLoader:
    """Build a keras model from ``src.models`` and optionally load weights."""

    # Every entry of the models directory with its last three characters
    # removed, skipping entries that contain any token in OMMIT.
    AVAILABLE_MODELS = [
        entry[:-3]
        for entry in os.listdir(BASE_MODELS)
        if not any(token in entry for token in OMMIT)
    ]

    def __call__(self, model_name: str, n_outputs: int, weights_path: Optional[Path]=None):
        """Return the model built by ``src.models.<model_name>.get_model``.

        Args:
            model_name: module name inside ``src.models``.
            n_outputs: forwarded to the module's ``get_model``.
            weights_path: if given, weights are loaded from this file.
        """
        # Import the architecture module and fetch its factory
        module = importlib.import_module(f"src.models.{model_name}")
        build = module.get_model

        # Load architecture
        model = build(n_outputs)

        # Load weights to the model from file
        if weights_path is not None:
            print(f"\n **load model weights_path** : {weights_path}")
            model.load_weights(weights_path)

        print("\n**Model created**")

        return model
|
src/models/resnet50_6mers.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# https://github.com/c1ph3rr/Deep-Residual-Learning-for-Image-Recognition/blob/master/Resnet50.py
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from tensorflow.keras.models import Model
|
| 4 |
+
from tensorflow.keras.layers import (
|
| 5 |
+
Input,
|
| 6 |
+
Conv2D,
|
| 7 |
+
Dense,
|
| 8 |
+
MaxPool2D,
|
| 9 |
+
GlobalAveragePooling2D,
|
| 10 |
+
Add,
|
| 11 |
+
Activation,
|
| 12 |
+
BatchNormalization,
|
| 13 |
+
ZeroPadding2D,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# Reference name of model
|
| 17 |
+
MODEL_NAME = str(Path(__file__).resolve().stem)
|
| 18 |
+
|
| 19 |
+
def identity_block(inp, filters, kernel_size, block, layer):
    """Residual block whose shortcut is the unmodified input.

    Three Conv2D + BatchNormalization stages (1x1, kernel_size, 1x1) are
    applied, with ReLU after the first two; the input is then added and a
    final ReLU is applied. ``block`` and ``layer`` are strings used to build
    the layer names.
    """
    n_a, n_b, n_c = filters

    tag = 'b' + block + '_l' + layer
    conv_prefix = 'id_conv_' + tag
    batch_prefix = 'id_batch_' + tag

    # (name suffix, filters, kernel size, apply ReLU afterwards)
    stages = (
        ('_a', n_a, 1, True),
        ('_b', n_b, kernel_size, True),
        ('_c', n_c, 1, False),
    )

    x = inp
    for suffix, n_filters, size, activate in stages:
        x = Conv2D(
            filters=n_filters,
            kernel_size=size,
            padding='same',
            kernel_initializer='he_normal',
            name=conv_prefix + suffix,
        )(x)
        x = BatchNormalization(name=batch_prefix + suffix)(x)
        if activate:
            x = Activation('relu')(x)

    merged = Add()([inp, x])
    return Activation('relu')(merged)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def convolutional_block(inp, filters, kernel_size, block, layer, strides=2):
    """Residual block with a projected (1x1 convolution) shortcut.

    The main path is three Conv2D + BatchNormalization stages (1x1 with
    ``strides``, kernel_size, 1x1), with ReLU after the first two. The shortcut
    is a strided 1x1 convolution of the input followed by batch normalisation.
    Both paths are added and passed through a final ReLU.
    """
    n_a, n_b, n_c = filters

    tag = 'b' + block + '_l' + layer
    conv_prefix = 'res_conv_' + tag
    batch_prefix = 'res_batch_' + tag

    # (name suffix, filters, kernel size, strides, apply ReLU afterwards)
    stages = (
        ('_a', n_a, 1, strides, True),
        ('_b', n_b, kernel_size, 1, True),
        ('_c', n_c, 1, 1, False),
    )

    y = inp
    for suffix, n_filters, size, step, activate in stages:
        y = Conv2D(
            filters=n_filters,
            kernel_size=size,
            padding='same',
            strides=step,
            kernel_initializer='he_normal',
            name=conv_prefix + suffix,
        )(y)
        y = BatchNormalization(name=batch_prefix + suffix)(y)
        if activate:
            y = Activation('relu')(y)

    # Projection shortcut, built after the main path as in the original order
    shortcut = Conv2D(
        filters=n_c,
        kernel_size=1,
        strides=strides,
        kernel_initializer='he_normal',
        name=conv_prefix + '_shortcut',
    )(inp)
    shortcut = BatchNormalization(name=batch_prefix + '_shortcut')(shortcut)

    merged = Add()([shortcut, y])
    return Activation('relu')(merged)
|
| 68 |
+
|
| 69 |
+
def get_model(n_outputs):
    """Build the ResNet50-style classifier for 64x64 single-channel inputs.

    Args:
        n_outputs: number of units in the final softmax layer.

    Returns:
        A keras ``Model`` mapping the input image to ``n_outputs`` scores.
    """
    inp = Input(shape=(64, 64, 1), name='input')

    # Stem: padded 7x7 strided convolution, then padded 3x3 max-pooling
    x = ZeroPadding2D(3)(inp)
    x = Conv2D(64, 7, strides=2, padding='valid', name='conv1')(x)
    x = BatchNormalization(name='batch2')(x)
    x = Activation('relu')(x)
    x = ZeroPadding2D(1)(x)
    x = MaxPool2D(3, 2)(x)

    # (block id, filters, layers in the block, strides of its first layer)
    stages = (
        ('2', [64,64,256], 3, 1),
        ('3', [128,128,512], 4, 2),
        ('4', [256,256,1024], 6, 2),
        ('5', [512,512,2048], 3, 2),
    )
    for block, filters, n_layers, strides in stages:
        # Layer '1' of each block is convolutional, the rest are identity blocks
        x = convolutional_block(x, filters, 3, block, '1', strides=strides)
        for layer in range(2, n_layers + 1):
            x = identity_block(x, filters, 3, block, str(layer))

    pooled = GlobalAveragePooling2D()(x)
    out = Dense(n_outputs, activation='softmax')(pooled)

    return Model(inp, out)
|
src/pipeline.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from collections import OrderedDict
|
| 5 |
+
from typing import List, Tuple, Optional, Union
|
| 6 |
+
|
| 7 |
+
# Registry of functions usable in a pipeline, keyed by function name
FUNCTIONS_PIPELINE = OrderedDict()


def register_in_pipeline(func):
    """Decorator that registers ``func`` in ``FUNCTIONS_PIPELINE`` by name.

    Returns:
        ``func`` unchanged, so the decorated name stays bound to the function
        instead of being rebound to ``None``.

    Raises:
        Exception: if a function with the same name is already registered.
    """
    print(f"Adding {func.__name__}")
    if func.__name__ in FUNCTIONS_PIPELINE:
        raise Exception(f"Duplicated function with name {func.__name__}")
    FUNCTIONS_PIPELINE[func.__name__] = func
    return func
|
| 16 |
+
|
| 17 |
+
class Pipeline:
    """Define a sequence of functions to be applied to one input.

    Each step is ``(func_name, kwargs)`` or ``(func_name, *args, kwargs)``,
    where ``func_name`` is a key of ``FUNCTIONS_PIPELINE``.
    """
    FUNCTIONS_PIPELINE = FUNCTIONS_PIPELINE

    def __init__(self, pipeline: Optional[List[Tuple[str, dict]]] = None):
        self.pipeline = pipeline if pipeline else []

    def __call__(self, x):
        """Apply every step of the pipeline, in order, to the input 'x'."""
        for pipe in self.pipeline:
            # first item is the name, last is the kwargs dict,
            # anything in between is passed positionally
            func_name, *args, kwargs = pipe
            assert isinstance(kwargs, dict), f"Wrong declaration in {func_name!r}. Must be (str, dict) or (str, tuple, dict)"
            # an empty ``args`` expands to nothing, so one call covers both forms
            x = self.apply(x, func_name, *args, **kwargs)
        return x

    @classmethod
    def apply(cls, x, func, *args, **kwargs):
        """Compute func(x, *args, **kwargs) for the registered function ``func``.

        Raises:
            TypeError: if ``func`` is not a registered function name.
        """
        if func not in cls.FUNCTIONS_PIPELINE:
            raise TypeError(f"{func} not available")
        return cls.FUNCTIONS_PIPELINE[func](x, *args, **kwargs)

    def __gt__(self, add_pipe: Union[List,Tuple]):
        """Add a pipe ("func_name", args, kwargs) or ("func_name", kwargs) to the current pipeline.

        Returns ``self`` so additions can be chained.

        Raises:
            NotImplementedError: if the function name is not registered.
        """
        if not self.is_available(add_pipe[0]):
            raise NotImplementedError(f"{add_pipe[0]!r} not available in Pipeline")
        self.pipeline.append(add_pipe)
        return self

    def is_available(self, func_name: str):
        """Return True if the function 'func_name' is available in Pipeline"""
        return func_name in self.FUNCTIONS_PIPELINE

    def asJSON(self, path_save: str =None):
        """Save pipeline configuration as json file (default: "pipeline.json")."""
        path_save = Path(path_save) if path_save else Path("pipeline.json")
        with open(path_save, "w", encoding="utf8") as fp:
            json.dump(self.pipeline, fp, indent=4, ensure_ascii=False)
        print(f"Pipeline configuration saved at {path_save!r}")

    def fromJSON(self, path_pipeline: str):
        """Load pipeline configuration from json file.

        Returns:
            A list with the names of unregistered functions when the file
            references any; the current pipeline is then left untouched.
            Otherwise ``None``, after replacing the current pipeline.
        """
        path_pipeline = Path(path_pipeline)
        with open(path_pipeline, "r", encoding="utf8") as fp:
            pipeline = json.load(fp)

        # Check that every referenced function is registered
        # (dict.fromkeys keeps first-seen order and drops repeated names)
        missing = list(dict.fromkeys(
            pipe[0] for pipe in pipeline if not self.is_available(pipe[0])
        ))

        # TODO: change with the right Exception here
        if missing:
            print(
                "\n"
                "Some functions are not availables.\n"
                "Please use the @register_in_pipeline decorator to include this functions to the Pipeline.\n"
            )
            return missing

        self.pipeline = pipeline
        print(f"Pipeline loaded from {path_pipeline!r}")
|
src/preprocessing.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""All functions that can be applied as preprocessing"""
|
| 2 |
+
from .pipeline import (
|
| 3 |
+
register_in_pipeline, # decorator to make available a function to use with Pipeline class
|
| 4 |
+
Pipeline,
|
| 5 |
+
)
|
| 6 |
+
|
| 7 |
+
@register_in_pipeline
def divide_by_max(npy,):
    """Return ``npy`` divided by its maximum value."""
    peak = npy.max()
    return npy / peak
|
| 11 |
+
|
| 12 |
+
@register_in_pipeline
def divide_by_sum(npy,):
    """Return ``npy`` divided by the sum of its values."""
    total = npy.sum()
    return npy / total
|
src/utils.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def clean_seq(seq):
    """Upper-case ``seq`` and replace every character other than A, C, G, T or N with 'N'.

    This covers ambiguity codes as well as non-letters (gaps '-', '*', digits,
    whitespace), so the result only contains characters the k-mer counting
    can handle. The length of the upper-cased sequence is preserved.
    """
    valid = frozenset("ACGTN")
    return "".join(base if base in valid else "N" for base in seq.upper())
|
| 11 |
+
|
| 12 |
+
def array2img(array):
    """Convert an FCGR array to an inverted 8-bit grayscale PIL image.

    The array is min-max rescaled to [0, 1] and inverted, so the largest value
    becomes black (0) and the smallest white (255). A constant array, which has
    no contrast to show, gives an all-white image.
    """
    max_color = 255
    m, M = array.min(), array.max()
    span = M - m

    # rescale to [0,1]; guard the constant case to avoid 0/0 = NaN
    if span == 0:
        img_rescaled = np.zeros_like(array, dtype=float)
    else:
        img_rescaled = (array - m) / span

    # invert colors black->white
    img_array = np.ceil(max_color - img_rescaled * max_color)
    # mode 'L' is 8-bit unsigned; int8 cannot hold values above 127
    img_array = img_array.astype(np.uint8)

    # convert to Image
    img_pil = Image.fromarray(img_array, 'L')
    return img_pil
|
trained-models/model-34-0.954.hdf5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:168a6bb54b1f3fca3febb5ee62c7ecfa5db762ae9812da180f3c383e98cfc18b
|
| 3 |
+
size 283851536
|
trained-models/preprocessing.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f991c810113366959d350e621ce3b618719326d3c9e638e5f45b47023329adf6
|
| 3 |
+
size 51
|