File size: 1,293 Bytes
63ec1fc
 
 
 
 
 
 
 
 
 
 
 
6dea890
63ec1fc
 
 
 
 
6dea890
63ec1fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import json
import pandas as pd
import numpy as np

from sqlalchemy import create_engine

from scripts.literature import literature_search
from scripts.uniprot import get_protein_location
from scripts.utils import *



def create_database(db_uri, *, threshold=10):
    """Build the ``proteins`` SQL table from the local UniProt and CSV data.

    Reads the protein records under the ``results`` key of
    ``data/uniprot/mouse.json`` and the table in ``data/test_table.csv``,
    drops rows without an ``Accession``, and adds these columns:

    * ``locations`` -- ``get_location_from_acession(accession, proteins)``
    * ``nucleus`` -- ``is_in_nucleus(locations)``
    * ``transcription_factor`` --
      ``is_transcription_factor(accession, proteins)``
    * ``region`` -- ``'Hippocampus'``, ``'VCN'`` or ``'Cortex'`` when that
      column exceeds *both* other region columns by more than ``threshold``
      times, otherwise ``'inconclusive'``.

    The result is written to the ``proteins`` table, replacing any existing
    table of that name.

    Args:
        db_uri: Database URL passed to ``sqlalchemy.create_engine``.
        threshold: Minimum ratio between one region's value and each of the
            other two for the row to be assigned to that region.

    Returns:
        None.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open('data/uniprot/mouse.json', encoding='utf-8') as f:
        proteins = json.load(f)['results']

    df = pd.read_csv('data/test_table.csv')

    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than on a filtered view of the CSV data
    # (avoids pandas' SettingWithCopyWarning).
    df = df[df['Accession'].notna()].copy()

    # NOTE(review): the three helpers below are not defined in this block;
    # presumably they come from ``scripts.utils`` via the star import.
    df['locations'] = df['Accession'].apply(
        lambda accession: get_location_from_acession(accession, proteins)
    )
    df['nucleus'] = df['locations'].apply(is_in_nucleus)
    df['transcription_factor'] = df['Accession'].apply(
        lambda accession: is_transcription_factor(accession, proteins)
    )

    hippocampus = df['Hippocampus']
    vcn = df['VCN']
    cortex = df['Cortex']

    # np.select returns the first matching choice. The conditions are listed
    # Cortex, VCN, Hippocampus so that, should more than one ever match, the
    # winner is the same as with the original chained np.where calls (where
    # the later assignment overrode the earlier one).
    # NOTE: ratios are compared as-is, so a zero denominator yields inf
    # (passes the comparison) or NaN (fails it).
    df['region'] = np.select(
        [
            (cortex / vcn > threshold) & (cortex / hippocampus > threshold),
            (vcn / hippocampus > threshold) & (vcn / cortex > threshold),
            (hippocampus / vcn > threshold) & (hippocampus / cortex > threshold),
        ],
        ['Cortex', 'VCN', 'Hippocampus'],
        default='inconclusive',
    )

    engine = create_engine(db_uri, echo=False)
    try:
        df.to_sql(name='proteins', con=engine, if_exists='replace')
    finally:
        # Release the engine's connection pool once the table is written.
        engine.dispose()