Spaces:

BIASLab
/

sars-cov-2-classification-fcgr

Runtime error

App Files Files Community

koke committed on Apr 25, 2022

Commit

124bf1a

1 Parent(s): 7d8e80b

app

Browse files

Files changed (14) hide show

.gitattributes +2 -0
.gitignore +3 -0
app.py +43 -2
predict.py +25 -0
requirements.txt +5 -0
src/cgr.py +77 -0
src/fcgr.py +72 -0
src/model_loader.py +39 -0
src/models/resnet50_6mers.py +103 -0
src/pipeline.py +85 -0
src/preprocessing.py +15 -0
src/utils.py +25 -0
trained-models/model-34-0.954.hdf5 +3 -0
trained-models/preprocessing.json +3 -0

.gitattributes CHANGED Viewed

@@ -26,3 +26,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.hdf5 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+env/
+.vscode/
+__pycache__/

app.py CHANGED Viewed

@@ -1,4 +1,45 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

 import streamlit as st
+from predict import predict_seq
+from Bio import SeqIO
+from io import StringIO
+from src.utils import array2img
+from src.fcgr import FCGR
+with st.sidebar:
+    st.write("Options")
+    kmer = st.slider(label="kmer to visualize FCGR",
+                    min_value=6,
+                    max_value=9,
+                    value=8
+                    )
+# App
+st.title('Sars-cov-2 classification with FCGR')
+# load fasta file
+uploaded_file = st.file_uploader(label="Load fasta file")
+if uploaded_file is not None:
+    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
+    record = SeqIO.parse(stringio , "fasta")
+    fasta = next(record)
+    with st.spinner("Inference..."):
+        prediction, confidence, fcgr = predict_seq(str(fasta.seq), return_fcgr=True)
+    st.success("Done!")
+    st.write("### Results ")
+    st.write("Prediction: ", prediction)
+    st.write("Confidence: ", confidence)
+    # To generate the image to show
+    with st.spinner("Plotting FCGR"):
+        gen_fcgr = FCGR(kmer)
+        fcgr = gen_fcgr(fasta.seq)
+        img = array2img(fcgr)
+        # Show FCGR
+        st.image(
+            image=img,
+            caption="FCGR",
+            use_column_width="auto",
+            width=20)

predict.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import json
+import numpy as np
+from src.fcgr import FCGR
+from src.model_loader import ModelLoader
+from src.preprocessing import Pipeline
+from src.utils import clean_seq
# Module-level singletons, built once at import time and reused by predict_seq.

# FCGR encoder for 6-mers
fcgr = FCGR(k=6)

loader = ModelLoader()

# Class label for each index of the model output
order_output = [
    'S', 'L', 'G', 'V', 'GR', 'GH', 'GV', 'GK', 'GRY', 'O', 'GRA',
]

# Architecture "resnet50_6mers" with 11 outputs, weights restored from file
model = loader("resnet50_6mers", 11, "trained-models/model-34-0.954.hdf5")

# Preprocessing steps are read from their JSON description
with open("trained-models/preprocessing.json") as fp:
    pipe = json.load(fp)
preprocessing = Pipeline(pipe)
def predict_seq(seq, return_fcgr=False):
    """Classify one DNA sequence.

    The sequence is cleaned, encoded as an FCGR, preprocessed and fed to the
    model as a batch of one with a trailing channel axis.

    Returns (label, confidence), or (label, confidence, array) when
    `return_fcgr` is True, where `array` is the preprocessed FCGR.
    """
    features = preprocessing(fcgr(clean_seq(seq)))
    # add the batch axis in front and the channel axis at the end
    batch = np.expand_dims(features, axis=(0, -1))
    scores = model.predict(batch)[0]

    best = scores.argmax()
    label, confidence = order_output[best], scores[best]

    if not return_fcgr:
        return label, confidence
    return label, confidence, features

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit
+tensorflow==2.7
+biopython
+numpy
+Pillow

src/cgr.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"From original work: CGR for gene structure"
+from typing import Dict, Optional
+from collections import namedtuple
# coordinates for x+iy
Coord = namedtuple("Coord", ["x","y"])

# coordinates for a CGR encoding: N nucleotides consumed, current point (x, y)
CGRCoords = namedtuple("CGRCoords", ["N","x","y"])

# coordinates for each nucleotide in the 2d-plane
DEFAULT_COORDS = dict(A=Coord(1,1),C=Coord(-1,1),G=Coord(-1,-1),T=Coord(1,-1))


class CGR:
    """Chaos Game Representation for DNA.

    Encoding starts at (0, 0); every nucleotide moves the current point to
    the midpoint between itself and the corner assigned to that nucleotide.
    The triple (N, x, y) reached after N nucleotides is the CGR encoding.
    """

    def __init__(self, coords: Optional[Dict[str, tuple]] = None):
        """
        Args:
            coords: mapping nucleotide -> (x, y) corner. Plain tuples are
                accepted and converted to Coord. Defaults to DEFAULT_COORDS.
        """
        source = DEFAULT_COORDS if coords is None else coords
        # Normalise to Coord so `.x` / `.y` work for plain tuples as well
        self.nucleotide_coords = {
            nucleotide: Coord(*xy) for nucleotide, xy in source.items()
        }
        self.cgr_coords = CGRCoords(0, 0, 0)

    def nucleotide_by_coords(self, x, y):
        "Get nucleotide by coordinates (x,y). Raises IndexError if no nucleotide uses them."
        target = Coord(x, y)
        matches = [
            nucleotide
            for nucleotide, coord in self.nucleotide_coords.items()
            if coord == target
        ]
        return matches[0]

    def forward(self, nucleotide: str):
        """Compute next CGR coordinates.

        Raises:
            ValueError: if `nucleotide` has no assigned coordinates.
        """
        corner = self.nucleotide_coords.get(nucleotide)
        if corner is None:
            raise ValueError(
                f"Unknown nucleotide {nucleotide!r}; "
                f"expected one of {sorted(self.nucleotide_coords)}"
            )
        x = (self.cgr_coords.x + corner.x) / 2
        y = (self.cgr_coords.y + corner.y) / 2

        # update cgr_coords
        self.cgr_coords = CGRCoords(self.cgr_coords.N + 1, x, y)

    def backward(self,):
        "Compute last CGR coordinates. Current nucleotide can be inferred from (x,y)"
        # get current nucleotide based on coordinates
        n_x, n_y = self.coords_current_nucleotide()
        nucleotide = self.nucleotide_by_coords(n_x, n_y)

        # invert the midpoint step: previous = 2*current - corner
        x = 2*self.cgr_coords.x - n_x
        y = 2*self.cgr_coords.y - n_y

        # update cgr_coords
        self.cgr_coords = CGRCoords(self.cgr_coords.N - 1, x, y)

        return nucleotide

    def coords_current_nucleotide(self,):
        "Corner (±1, ±1) of the quadrant that holds the current point."
        # NOTE(review): only meaningful for corner layouts at ±1 such as
        # DEFAULT_COORDS; confirm before using custom coords with decode.
        x = 1 if self.cgr_coords.x > 0 else -1
        y = 1 if self.cgr_coords.y > 0 else -1
        return x, y

    def encode(self, sequence: str):
        "From DNA sequence to CGR. Returns CGRCoords(N, x, y); (0, 0, 0) for an empty sequence."
        # reset starting position to (0,0,0)
        self.reset_coords()
        for nucleotide in sequence:
            self.forward(nucleotide)
        return self.cgr_coords

    def reset_coords(self,):
        "Move back to the starting state (0, 0, 0)."
        self.cgr_coords = CGRCoords(0, 0, 0)

    def decode(self, N: int, x: float, y: float) -> str:
        "From CGR to DNA sequence. (N, x, y) must be the values returned by encode."
        self.cgr_coords = CGRCoords(N, x, y)

        # nucleotides are recovered last-to-first
        sequence = []

        # Recover the entire genome
        while self.cgr_coords.N > 0:
            nucleotide = self.backward()
            sequence.append(nucleotide)
        return "".join(sequence[::-1])

src/fcgr.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from .cgr import CGR
+from itertools import product
+from collections import defaultdict
+import numpy as np
class FCGR(CGR):
    """Frequency matrix CGR
    an (2**k x 2**k) 2D representation will be created for a
    n-long sequence.
    - k represents the k-mer.
    - 2**k x 2**k = 4**k the total number of k-mers (sequences of length k)
    - pixel value correspond to the value of the frequency for each k-mer

    Only k-mers made exclusively of A, C, G and T are counted; any window
    containing another symbol (N, gaps, ambiguity codes...) is ignored.
    """

    def __init__(self, k: int,):
        super().__init__()
        self.k = k # k-mer representation
        self.kmers = ["".join(kmer) for kmer in product("ACGT", repeat=self.k)]
        self.kmer2pixel = self.kmer2pixel_position()
        # filled by count_kmers / kmer_probabilities
        self.freq_kmer = defaultdict(int)
        self.probabilities = defaultdict(float)

    def __call__(self, sequence: str):
        "Given a DNA sequence, returns an array with his frequencies in the same order as FCGR"
        self.count_kmers(sequence)

        # Create an empty array to save the FCGR values
        array_size = int(2**self.k)
        freq_matrix = np.zeros((array_size,array_size))

        # Assign frequency to each box in the matrix
        # (pixel positions are 1-based, hence the -1)
        for kmer, freq in self.freq_kmer.items():
            pos_x, pos_y = self.kmer2pixel[kmer]
            freq_matrix[int(pos_x)-1,int(pos_y)-1] = freq
        return freq_matrix

    def count_kmer(self, kmer):
        "Add one occurrence of `kmer` if it is a valid k-mer over ACGT."
        # Membership in kmer2pixel guarantees __call__ can place it later.
        if kmer in self.kmer2pixel:
            self.freq_kmer[kmer] += 1

    def count_kmers(self, sequence: str):
        "Count every overlapping k-mer of `sequence` into self.freq_kmer."
        # Plain upper-case str: dict keys are str k-mers over "ACGT".
        sequence = str(sequence).upper()
        self.freq_kmer = defaultdict(int)
        # number of windows of length k; not positive for short sequences
        last_j = len(sequence) - self.k + 1
        for i in range(last_j):
            self.count_kmer(sequence[i:(i+self.k)])

    def kmer_probabilities(self, sequence: str):
        """Relative frequency of each counted k-mer of `sequence`.

        Frequencies are divided by the number of windows (len - k + 1).
        The result is stored in self.probabilities and also returned; it is
        empty when the sequence is shorter than k.
        """
        self.count_kmers(sequence)
        self.probabilities = defaultdict(float)
        n_windows = len(str(sequence)) - self.k + 1
        if n_windows > 0:
            for key, value in self.freq_kmer.items():
                self.probabilities[key] = float(value) / n_windows
        return self.probabilities

    def pixel_position(self, kmer: str):
        "Get pixel position in the FCGR matrix for a k-mer"
        coords = self.encode(kmer)
        x, y = coords.x, coords.y

        # Coordinates from [-1,1]² to [1,2**k]²
        np_coords = np.array([(x + 1)/2, (y + 1)/2]) # move coordinates from [-1,1]² to [0,1]²
        np_coords *= 2**self.k # rescale coordinates from [0,1]² to [0,2**k]²
        x, y = np.ceil(np_coords) # round to upper integer

        # Turn coordinates (cx,cy) into pixel (px,py) position
        # px = 2**k-cy+1, py = cx
        return 2**self.k-int(y)+1, int(x)

    def kmer2pixel_position(self,):
        "Map every k-mer in self.kmers to its (row, column) pixel position."
        return {kmer: self.pixel_position(kmer) for kmer in self.kmers}

src/model_loader.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Load model from /models"""
+import importlib
+import os
+from pathlib import Path
+from typing import Optional
+from tensorflow.python.eager.context import num_gpus
# Directory entries containing any of these strings are not listed as models
OMMIT = {
    ".ipynb_checkpoints",
    "__pycache__",
    "__init__",
    "custom_layers",
    "custom_losses",
}
# Directory that contains this file
BASE_DIR = Path(__file__).resolve().parent
# Sub-directory holding one module per model architecture
BASE_MODELS = BASE_DIR / "models"
class ModelLoader:
    "Load models for unsupervised learning using FCGR (grayscale images)"

    # Entries of BASE_MODELS minus the ones matching OMMIT, with the last
    # three characters (".py" for module files) stripped.
    AVAILABLE_MODELS = [
        entry[:-3]
        for entry in os.listdir(BASE_MODELS)
        if not any(skipped in entry for skipped in OMMIT)
    ]

    def __call__(self, model_name: str, n_outputs: int, weights_path: Optional[Path]=None):
        """Get keras model.

        Imports `src.models.<model_name>`, builds the architecture with its
        `get_model(n_outputs)` factory and, when `weights_path` is given,
        restores the weights from that file.
        """
        # Module that defines the requested architecture
        module = importlib.import_module(f"src.models.{model_name}")

        # Load architecture
        model = module.get_model(n_outputs)

        # Load weights to the model from file
        if weights_path is not None:
            print(f"\n **load model weights_path** : {weights_path}")
            model.load_weights(weights_path)

        print("\n**Model created**")

        return model

src/models/resnet50_6mers.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# https://github.com/c1ph3rr/Deep-Residual-Learning-for-Image-Recognition/blob/master/Resnet50.py
+from pathlib import Path
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import (
+    Input,
+    Conv2D,
+    Dense,
+    MaxPool2D,
+    GlobalAveragePooling2D,
+    Add,
+    Activation,
+    BatchNormalization,
+    ZeroPadding2D,
+)
# Reference name of model: this file's name without the extension
MODEL_NAME = Path(__file__).resolve().stem
def identity_block(inp, filters, kernel_size, block, layer):
    """Residual unit whose shortcut is the unchanged input.

    Three Conv2D + BatchNormalization pairs (kernel sizes 1, `kernel_size`, 1;
    `filters` gives their output channels) with ReLU after the first two,
    then the input is added and a final ReLU applied. `block` and `layer`
    are strings used to build the layer names.
    """
    tag = 'b' + block + '_l' + layer
    conv_name = 'id_conv_' + tag
    batch_name = 'id_batch_' + tag

    n_first, n_middle, n_last = filters
    # (output channels, kernel size, name suffix), in creation order
    stack = (
        (n_first, 1, '_a'),
        (n_middle, kernel_size, '_b'),
        (n_last, 1, '_c'),
    )

    x = inp
    for n_filters, size, suffix in stack:
        x = Conv2D(filters=n_filters, kernel_size=size, padding='same', kernel_initializer='he_normal', name=conv_name + suffix)(x)
        x = BatchNormalization(name=batch_name + suffix)(x)
        # the last pair is activated only after the residual addition
        if suffix != '_c':
            x = Activation('relu')(x)

    merged = Add()([inp, x])
    return Activation('relu')(merged)
def convolutional_block(inp, filters, kernel_size, block, layer, strides=2):
    """Residual unit whose shortcut is a strided 1x1 convolution.

    Main path: three Conv2D + BatchNormalization pairs (kernel sizes 1,
    `kernel_size`, 1; `filters` gives their output channels), the first
    convolution using `strides`, with ReLU after the first two pairs.
    Shortcut: 1x1 Conv2D with `strides` and filters[2] channels followed by
    BatchNormalization. Both paths are added and passed through ReLU.
    `block` and `layer` are strings used to build the layer names.
    """
    tag = 'b' + block + '_l' + layer
    conv_name = 'res_conv_' + tag
    batch_name = 'res_batch_' + tag

    n_first, n_middle, n_last = filters
    # (output channels, kernel size, strides, name suffix), in creation order
    stack = (
        (n_first, 1, strides, '_a'),
        (n_middle, kernel_size, 1, '_b'),
        (n_last, 1, 1, '_c'),
    )

    # main path (created before the shortcut, as the layer order matters
    # when weights are restored in order)
    y = inp
    for n_filters, size, step, suffix in stack:
        y = Conv2D(filters=n_filters, kernel_size=size, padding='same', strides=step, kernel_initializer='he_normal', name=conv_name + suffix)(y)
        y = BatchNormalization(name=batch_name + suffix)(y)
        if suffix != '_c':
            y = Activation('relu')(y)

    # projection shortcut
    shortcut = Conv2D(filters=n_last, kernel_size=1, strides=strides, kernel_initializer='he_normal', name=conv_name + '_shortcut')(inp)
    shortcut = BatchNormalization(name=batch_name + '_shortcut')(shortcut)

    merged = Add()([shortcut, y])
    return Activation('relu')(merged)
def get_model(n_outputs):
    """Build the network for 64x64 single-channel inputs.

    Stem (padding, 7x7 strided convolution, batch norm, ReLU, padding, max
    pooling), four groups of residual units, global average pooling and a
    softmax Dense layer with `n_outputs` units. Returns a keras Model.
    """
    inp = Input(shape=(64, 64, 1), name='input')

    # stem
    x = ZeroPadding2D(3)(inp)
    x = Conv2D(64, 7, strides=2, padding='valid', name='conv1')(x)
    x = BatchNormalization(name='batch2')(x)
    x = Activation('relu')(x)
    x = ZeroPadding2D(1)(x)
    x = MaxPool2D(3, 2)(x)

    # (block id, filters, number of units, strides of the first unit)
    groups = (
        ('2', [64, 64, 256], 3, 1),
        ('3', [128, 128, 512], 4, 2),
        ('4', [256, 256, 1024], 6, 2),
        ('5', [512, 512, 2048], 3, 2),
    )
    for block, filters, n_units, first_strides in groups:
        # first unit of a group projects the shortcut, the rest keep it as is
        x = convolutional_block(x, filters, 3, block, '1', strides=first_strides)
        for unit in range(2, n_units + 1):
            x = identity_block(x, filters, 3, block, str(unit))

    # classification head
    pooled = GlobalAveragePooling2D()(x)
    out = Dense(n_outputs, activation='softmax')(pooled)

    return Model(inp, out)

src/pipeline.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import json
+from pathlib import Path
+from collections import OrderedDict
+from typing import List, Tuple, Optional, Union
# Registry of the functions usable in a Pipeline, keyed by function name
FUNCTIONS_PIPELINE = OrderedDict()


def register_in_pipeline(func):
    """Collect functions for the pipeline.

    Meant to be used as a decorator: the function is stored in
    FUNCTIONS_PIPELINE under its own name and returned unchanged, so the
    decorated name still refers to the function.

    Raises:
        Exception: if a function with the same name is already registered.
    """
    print(f"Adding {func.__name__}")
    if func.__name__ in FUNCTIONS_PIPELINE:
        raise Exception(f"Duplicated function with name {func.__name__}")
    FUNCTIONS_PIPELINE[func.__name__] = func
    # Without this the decorator would rebind the decorated name to None.
    return func
class Pipeline:
    """Ordered list of registered functions applied one after another to an input.

    Each step is a sequence whose first item is the name of a function in
    FUNCTIONS_PIPELINE and whose last item is a dict of keyword arguments;
    anything in between is passed as positional arguments.
    """
    FUNCTIONS_PIPELINE = FUNCTIONS_PIPELINE

    def __init__(self, pipeline: Optional[List[Tuple[str, dict]]] = None):
        self.pipeline = pipeline or []

    def __call__(self, x):
        """Run every step on 'x', feeding each result to the next step"""
        for step in self.pipeline:
            func_name, *args, kwargs = step
            assert isinstance(kwargs, dict), f"Wrong declaration in {func_name!r}. Must be (str, dict) or (str, tuple, dict)"

            # *args expands to nothing when the step has no positional part
            x = self.apply(x, func_name, *args, **kwargs)

        return x

    @classmethod
    def apply(cls, x, func, *args, **kwargs):
        """Return func(x, *args, **kwargs) for the registered function named 'func'"""
        if func not in cls.FUNCTIONS_PIPELINE:
            raise TypeError(f"{func} not available")
        return cls.FUNCTIONS_PIPELINE[func](x, *args, **kwargs)

    def __gt__(self, add_pipe: Union[List,Tuple]):
        """Append a step ("func_name", args, kwargs) or ("func_name", kwargs); returns self"""
        if not self.is_available(add_pipe[0]):
            raise NotImplementedError(f"{add_pipe[0]!r} not available in Pipeline")
        self.pipeline.append(add_pipe)
        return self

    def is_available(self, func_name: str):
        """Tell whether 'func_name' is registered in the Pipeline"""
        return func_name in self.FUNCTIONS_PIPELINE

    def asJSON(self, path_save: str =None):
        """Write the steps to a json file ("pipeline.json" when no path is given)"""
        path_save = Path(path_save or "pipeline.json")
        with open(path_save, "w", encoding="utf8") as fp:
            json.dump(self.pipeline, fp, indent=4, ensure_ascii=False)
        print(f"Pipeline configuration saved at {path_save!r}")

    def fromJSON(self, path_pipeline: str):
        """Read the steps from a json file.

        When some function is not registered the pipeline is left untouched
        and the list of missing function names is returned.
        """
        path_pipeline = Path(path_pipeline)
        with open(path_pipeline, "r", encoding="utf8") as fp:
            loaded = json.load(fp)

        # name -> is it registered?
        status = {step[0]: self.is_available(step[0]) for step in loaded}

        # TODO: change with the right Exception here
        if not all(status.values()):
            print("""
            Some functions are not availables.
            Please use the @register_in_pipeline decorator to include this functions to the Pipeline.
            """)
            return [name for name, found in status.items() if name and found is False]

        self.pipeline = loaded
        print(f"Pipeline loaded from {path_pipeline!r}")

src/preprocessing.py ADDED Viewed

	@@ -0,0 +1,15 @@

+"""All functions that can be applied as preprocessing"""
+from .pipeline import (
+    register_in_pipeline, # decorator to make available a function to use with Pipeline class
+    Pipeline,
+)
@register_in_pipeline
def divide_by_max(npy,):
    "Scale the input npy by its largest value"
    largest = npy.max()
    return npy / largest
@register_in_pipeline
def divide_by_sum(npy,):
    "Scale the input npy by the total of its values"
    total = npy.sum()
    return npy / total

src/utils.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from PIL import Image
+import numpy as np
def clean_seq(seq):
    """Upper-case `seq` and replace every symbol other than A, C, G, T, N by "N".

    The replacement keeps the length of the sequence. It applies to any
    character (ambiguity letters, gaps such as "-", digits, whitespace...),
    not only to letters.
    """
    return "".join(base if base in "ACGTN" else "N" for base in seq.upper())
def array2img(array):
    """FCGR array to grayscale image.

    Values are rescaled linearly to [0, 1] and inverted, so the maximum of
    `array` becomes black (0) and the minimum white (255). A constant array
    gives an all-white image.

    Returns a PIL image in mode 'L'.
    """
    max_color = 255
    m, M = array.min(), array.max()

    # rescale to [0,1]; a constant array would otherwise be 0/0
    if M > m:
        img_rescaled = (array - m) / (M - m)
    else:
        img_rescaled = np.zeros_like(array, dtype=float)

    # invert colors black->white
    img_array = np.ceil(max_color - img_rescaled*max_color)
    # unsigned 8 bits: 0..255 does not fit in a signed int8
    img_array = img_array.astype(np.uint8)

    # convert to Image
    img_pil = Image.fromarray(img_array,'L')
    return img_pil

trained-models/model-34-0.954.hdf5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:168a6bb54b1f3fca3febb5ee62c7ecfa5db762ae9812da180f3c383e98cfc18b
+size 283851536

trained-models/preprocessing.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f991c810113366959d350e621ce3b618719326d3c9e638e5f45b47023329adf6
+size 51