Spaces:

gptomics
/

simple-geno-model

Running

App Files Files Community

Domen Jemec committed on Feb 21, 2024

Commit

0df3e88

1 Parent(s): e9f6911

hla model v0.1

Browse files

Files changed (5) hide show

assets/hla/encoder/trimer_bow_hla.json +1 -0
assets/hla/model/rfm_hla.pkl +3 -0
hla_class.py +81 -0
pages/HLA_Type_Prediction.py +7 -4
requirements.txt +4 -1

assets/hla/encoder/trimer_bow_hla.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"AAA": 0, "AAC": 1, "AAG": 2, "AAT": 3, "ACA": 4, "ACC": 5, "ACG": 6, "ACT": 7, "AGA": 8, "AGC": 9, "AGG": 10, "AGT": 11, "ATA": 12, "ATC": 13, "ATG": 14, "ATT": 15, "CAA": 16, "CAC": 17, "CAG": 18, "CAT": 19, "CCA": 20, "CCC": 21, "CCG": 22, "CCT": 23, "CGA": 24, "CGC": 25, "CGG": 26, "CGT": 27, "CTA": 28, "CTC": 29, "CTG": 30, "CTT": 31, "GAA": 32, "GAC": 33, "GAG": 34, "GAT": 35, "GCA": 36, "GCC": 37, "GCG": 38, "GCT": 39, "GGA": 40, "GGC": 41, "GGG": 42, "GGT": 43, "GTA": 44, "GTC": 45, "GTG": 46, "GTT": 47, "TAA": 48, "TAC": 49, "TAG": 50, "TAT": 51, "TCA": 52, "TCC": 53, "TCG": 54, "TCT": 55, "TGA": 56, "TGC": 57, "TGG": 58, "TGT": 59, "TTA": 60, "TTC": 61, "TTG": 62, "TTT": 63, "UNK": 64}

assets/hla/model/rfm_hla.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b91bfdc19c2dc37096dc1be9c5703766db0598ef9a2d2f9c0a941a9cd75d1db
+size 7008153

hla_class.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import json
+import re
+import numpy as np
+import joblib
+asset_home = './assets/hla/'
def tokenize_sequence(sequence, n=3, stride=1):
    """Split a sequence into overlapping n-grams joined by single spaces.

    Every character that is not an ASCII letter (digits, whitespace, line
    breaks, punctuation) is stripped and the remainder is upper-cased before
    a window of length ``n`` is slid across it in steps of ``stride``.

    Args:
        sequence: Raw sequence text.
        n: Window (n-gram) length. Defaults to 3, i.e. trimers.
        stride: Distance between consecutive window starts. Defaults to 1.

    Returns:
        The n-grams, in order of appearance, separated by single spaces.
        An empty string when the cleaned sequence is shorter than ``n``.

    Raises:
        ValueError: If ``n`` or ``stride`` is smaller than 1.
    """
    if n < 1 or stride < 1:
        raise ValueError('n and stride must be positive integers')
    # Keep letters only, then normalise case so 'acg' and 'ACG' match.
    clean_sequence = re.sub(r'[^a-zA-Z]', '', sequence).upper()
    # Last valid window start is len - n; range() is empty if that is < 0.
    stop = len(clean_sequence) - n + 1
    return ' '.join(
        clean_sequence[start:start + n] for start in range(0, stop, stride)
    )
def bow_embedding(sequence):
    """Encode a sequence as a bag-of-words count vector over its tokens.

    The token-to-column mapping is read from
    ``encoder/trimer_bow_hla.json`` under ``asset_home``. The sequence is
    tokenized with ``tokenize_sequence`` and every token increments its
    column; tokens missing from the mapping increment the ``'UNK'`` column.

    Args:
        sequence: Raw sequence text.

    Returns:
        An integer NumPy array of shape ``(1, len(mapping))`` holding the
        token counts. All zeros when the sequence produces no tokens.
    """
    embed_path = asset_home + 'encoder/trimer_bow_hla.json'
    with open(embed_path, 'r', encoding='utf-8') as json_file:
        value_to_index = json.load(json_file)
    unknown_idx = value_to_index['UNK']
    bow_matrix = np.zeros((1, len(value_to_index)), dtype=int)
    # split() without an argument returns [] for an empty token string, so a
    # too-short sequence is no longer counted as a single unknown token
    # (''.split(' ') would yield ['']).
    for token in tokenize_sequence(sequence).split():
        bow_matrix[0, value_to_index.get(token, unknown_idx)] += 1
    return bow_matrix
def predict_class(encoding, conf_thresh=0.1):
    """Return the classes whose predicted probability exceeds a threshold.

    Loads the model stored at ``model/rfm_hla.pkl`` under ``asset_home``,
    calls its ``predict_proba`` on ``encoding`` and, for each row of the
    result, reports every class with probability strictly greater than
    ``conf_thresh``, most probable first.

    Args:
        encoding: Feature matrix passed unchanged to ``model.predict_proba``.
        conf_thresh: Minimum probability (exclusive) for a class to be
            reported. Defaults to 0.1.

    Returns:
        A list of strings. Each reported class contributes
        ``'HLA-<class>: <probability>'`` with four decimals; a row with no
        class above the threshold contributes ``'No class predicted'``.
    """
    model_path = asset_home + 'model/rfm_hla.pkl'
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only ever point this at a trusted model artefact.
    model = joblib.load(model_path)
    class_probabilities = model.predict_proba(encoding)
    prediction = []
    for probs in class_probabilities:
        # argsort is ascending; reverse it to visit the most likely class first.
        row_hits = [
            f'HLA-{model.classes_[class_index]}: {probs[class_index]:.4f}'
            for class_index in np.argsort(probs)[::-1]
            if probs[class_index] > conf_thresh
        ]
        # Add the placeholder for this row only; results already collected
        # for earlier rows must not be discarded.
        prediction.extend(row_hits if row_hits else ['No class predicted'])
    return prediction

pages/HLA_Type_Prediction.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 import numpy
 min_seq_length = 100
 max_seq_length = 10000
@@ -18,10 +19,12 @@ input_sequence = st.text_area('Enter your sequence to analyze',
                              help=f'enter between {min_seq_length} and {max_seq_length} characters without line breaks',
                              placeholder='aactaaaagactgacaaaatttttagtctctcgAATCGGGG...')
-if st.button('Predict', disabled=len(input_sequence)>= min_seq_length):
-    status_text.text('Prediction Complete')
-    gene = 'HLA A'
-    st.markdown(f'The predicted squence is {gene}')
 footer='''<style>

 import streamlit as st
 import numpy
+from hla_class import bow_embedding, predict_class
 min_seq_length = 100
 max_seq_length = 10000
                              help=f'enter between {min_seq_length} and {max_seq_length} characters without line breaks',
                              placeholder='aactaaaagactgacaaaatttttagtctctcgAATCGGGG...')
# The button stays disabled until the input reaches the minimum length.
if st.button('Predict', disabled=len(input_sequence) < min_seq_length):
    enc_seq = bow_embedding(input_sequence)
    prediction = predict_class(enc_seq)
    # Markdown collapses a bare newline into a space, which would run all
    # predictions together; two trailing spaces before the newline force a
    # hard line break so each prediction is rendered on its own line.
    combined_string = '  \n'.join(prediction)
    st.markdown('## HLA Model Prediction')
    st.markdown(combined_string)
 footer='''<style>

requirements.txt CHANGED Viewed

@@ -1,2 +1,5 @@
 numpy
-pandas

 numpy
+pandas
+json
+re
+joblib