import json

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from scripts.literature import literature_search
from scripts.uniprot import get_protein_location
# NOTE(review): get_location_from_acession, is_in_nucleus and
# is_transcription_factor are not imported by name anywhere in this file, so
# they presumably come from this star import -- confirm against scripts/utils.
from scripts.utils import *

# Abundance columns compared against each other to pick a dominant region.
# The order matters only if two regions could both qualify: later entries win.
_REGIONS = ('Hippocampus', 'VCN', 'Cortex')


def _label_dominant_region(df, threshold):
    """Fill ``df['region']`` with the region that dominates each row.

    A region dominates a row when its value divided by the value of *each*
    other region is strictly greater than ``threshold``. Rows where no region
    dominates are labelled ``'inconclusive'``. ``df`` is modified in place.
    """
    df['region'] = 'inconclusive'
    for region in _REGIONS:
        dominant = pd.Series(True, index=df.index)
        for other in _REGIONS:
            if other != region:
                # A NaN ratio (e.g. 0/0 or a missing value) compares False,
                # so such rows are never marked as dominated by `region`.
                dominant &= (df[region] / df[other]) > threshold
        df.loc[dominant, 'region'] = region


def create_database(db_uri, threshold=10):
    """Build the ``proteins`` table from the local CSV and UniProt dump.

    Reads ``data/uniprot/mouse.json`` and ``data/test_table.csv`` (both
    relative to the current working directory), drops CSV rows without an
    ``Accession``, and adds the columns ``locations``, ``nucleus``,
    ``transcription_factor`` and ``region``. The result is written to the
    ``proteins`` table, replacing any existing table of that name.

    Args:
        db_uri: Database URL handed to ``sqlalchemy.create_engine``.
        threshold: Minimum ratio by which one region's value must exceed
            both other regions' values for the row to be assigned to that
            region. Defaults to 10.
    """
    with open('data/uniprot/mouse.json', encoding='utf-8') as f:
        proteins = json.load(f)['results']

    df = pd.read_csv('data/test_table.csv')
    # .copy() makes the filtered frame independent of the original so the
    # column assignments below do not hit pandas' chained-assignment warning.
    df = df[df['Accession'].notna()].copy()

    df['locations'] = df['Accession'].apply(
        lambda accession: get_location_from_acession(accession, proteins)
    )
    df['nucleus'] = df['locations'].apply(is_in_nucleus)
    df['transcription_factor'] = df['Accession'].apply(
        lambda accession: is_transcription_factor(accession, proteins)
    )

    _label_dominant_region(df, threshold)

    engine = create_engine(db_uri, echo=False)
    try:
        df.to_sql(name='proteins', con=engine, if_exists='replace')
    finally:
        # Release pooled connections even if the write fails.
        engine.dispose()