mrna-design-studio / demo /create_csv.py
offtargeteffect's picture
Deploy mRNA Design Studio (Docker SDK)
99f834c verified
Raw
History Blame Contribute Delete
8.17 kB
"""
Create CSV version of the demo database for easy import testing.
"""
import csv
import os
# Same sequences as in create_demo_db.py
# Untranslated-region, Kozak and poly-A building blocks referenced by the CSV
# rows below.  Adjacent string literals inside the parentheses are joined by
# Python before the trailing slice is applied.

# 5' UTR: the two joined literals are cut to at most their first 80 characters.
UTR5_BETAGLOBIN = (
"ACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCATCTGAC"
"TCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGT"
)[:80]
# Alternative 5' UTR, used unsliced.  Note that this literal itself ends in
# "GCCACCATG", i.e. it already carries a Kozak-like motif and an ATG.
UTR5_EMCV = "GGGAAATAAGAGAGAAAAGAAGAGTAAGAAGAAATATAAGAGCCACCATG"
# Short motif inserted between the 5' UTR and the start codon when the full
# eGFP transcript is assembled further down.
KOZAK_TEV = "GCCACC"
# 3' UTR: the joined literals are cut to at most their first 100 characters.
UTR3_BETAGLOBIN = (
"GCTCGCTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTA"
"AACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATT"
"TTCATTGCAATGATGTATTTAAATTATTTCTGAATATTTTACTAAAAATAAATGTTTTTTAT"
)[:100]
# Second 3' UTR option, likewise cut to at most 100 characters.
UTR3_ALBUMIN = (
"AATAAAGATCTTTATTTTCATTAGATCTGTGTGTTGGTTTTTTGTGTGAATCGATAGTACTA"
"AATACTTTTCAGACACCAGAAATGCAGAGCAGTTCAGAGGCAGAGCCATCTATTGCTTACAT"
)[:100]
# Poly-A tails of 120 and 60 adenines respectively.
POLYA_120 = "A" * 120
POLYA_60 = "A" * 60
# Coding sequences for the two fluorescent-protein rows.  Each is one long
# literal that starts with ATG, with the "TAA" stop codon kept as a separate
# literal so it is easy to spot; Python joins the two pieces.
CDS_EGFP = (
"ATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAG"
"TAA"
)
CDS_MCHERRY = (
"ATGGTGAGCAAGGGCGAGGAGGATAACATGGCCATCATCAAGGAGTTCATGCGCTTCAAGGTGCACATGGAGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCAGACCGCCAAGCTGAAGGTGACCAAGGGTGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAGTTCATGTACGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCGAGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTCCCTGCAGGACGGCGAGTTCATCTACAAGGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTAATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCTCCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGGGCGAGATCAAGCAGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCTGAGGTCAAGACCACCTACAAGGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAACGTCAACATCAAGTTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAACGCGCCGAGGGCCGCCACTCCACCGGCGGCATGGACGAGCTGTACAAG"
"TAA"
)
# Coding sequence used by the "Luc2-reporter" row.  The joined literals are
# capped at 900 characters by the trailing slice.
CDS_LUC2 = (
    "ATGGAAGATGCCAAAAACATTAAGAAGGGCCCAGCGCCATTCTACCCACTCGAAGACGGGAC"
    "CGCCGGCGAGCAGCTGCACAAAGCCATGAAGCGCTACGCCCTGGTGCCCGGCACCATCGCCT"
    "TTACCGACGCACATATCGAGGTGGACATTACCTACGCCGAGTACTTCGAGATGAGCGTTCGG"
    "CTGGCAGAAGCTATGAAGCGCTATGGGCTGAATACAAACCATCGGATCGTGGTGTGCAGCGA"
    "GAATAGTCTGGAGAAGATCCTGCTGAACAAAGGCCTGCCTGTAGCCGGCCTTTTCCTCCTGG"
    "AAGAGCTGCGGCAGCAGTTCCAGAAGGCCCGGGAGCAGATGTTCACCTTCGTGCTCGATCTG"
    "GAGGAAATGACCGCCGAAGAGGCGATTGAGAATCTGGTATTCGAGCAGTATGGAATCGACCA"
    "TTATCTTGATAACCCACAATGCCTGCATGACCTGGTGCATCTGGAACCCCGAGGTCAATGTG"
    "GAAGAGTTCCTGGAAAAGCTGCTGAAGGACGGTATCATCATGTTCAGCATCCATGGTTATGG"
    "CTACATCCTGGGGCCCGGAACCAACTTCGATCTGGAGCGCATGATCAAGCGCGATGGGGAG"
    "GTGGATATGGCCCTGATTAAGGTGTCGATGGAGCAGGCCGGCATCGACCCCGATGAGGCCGG"
    "AGCCATTCGGCTGTACAAGCTGATGAAGGATAAG"
    "TAA"
)[:900]
# Put the sequence on a codon boundary and keep it ending in a stop codon.
# This replaces a `while len(CDS_LUC2) % 3 != 0` loop that swapped the last
# three characters for "TAA" on every pass: that never changes the length, so
# an out-of-frame sequence made the loop spin forever at import time.  An
# in-frame sequence is left untouched, exactly as before.
if len(CDS_LUC2) % 3:
    CDS_LUC2 = CDS_LUC2[:len(CDS_LUC2) - len(CDS_LUC2) % 3]
    CDS_LUC2 = CDS_LUC2[:-3] + "TAA"
# Coding sequence used by the "SpRBD-v1" row.  The literal carries no stop
# codon of its own, so it is normalised right after the definition.
CDS_SPIKE_RBD = (
    "ATGTTCGTGTTCCTGGTGCTGCTGCCCCTGGTGTCCTCCCAGGTGTGCAACCTGACCACCAG"
    "AACCCAGCTGCCCCCCGCCTACACCAACTCCTTCACCCGGGGCGTGTACTACCCCGACAAGG"
    "TGTTCCGCTCCTCCGTGCTGCACTCCACCCAGGACCTGTTTCTGCCCTTTTTCTCCAACGTG"
    "ACCTGGTTCCACGCCATCCACGTGTCCGGCACCAACGGCACAAAGCGGTTCGACAACCCCGTG"
    "CTGCCCTTCAACGACGGGGTGTACTTTGCCAGCACCGAGAAGTCCAACATCATCCGGGGCTG"
    "GATCTTCGGCACCACCCTGGACTCCAAGACCCAGTCCCTGCTGATCGTGAACAACGCCACCA"
    "ACGTGGTCATCAAGGTGTGCGAGTTCCAGTTCTGCAACGACCCCTTCCTGGGCGTCTACTAC"
    "CACAAGAACAACAAGTCCTGGATGGAGTCCGAGTTCCGGGTGTACTCCTCCGCCAACAACTG"
    "CACCTTTGAGTACGTGTCCCAGCCCTTTCTGATGGACCTGGAGGGCAAACAGGGCAACTTCA"
    "AGAACCTGCGCGAGTTTGTGTTTAAGAACATCGACGGCTACTTCAAGATCTACAGCAAGCAC"
)
# Keep only whole codons, then overwrite the final codon with a TAA stop.
CDS_SPIKE_RBD = CDS_SPIKE_RBD[: (len(CDS_SPIKE_RBD) // 3) * 3]
CDS_SPIKE_RBD = CDS_SPIKE_RBD[:-3] + "TAA"
# Coding sequence whose literal already ends in an explicit "TAA".  It is not
# referenced by any CSV row in this file.
CDS_EPO = (
    "ATGGGGGTGCACGAATGTCCCGCCTGGCTGTGGCTGCTGCTGTCGCTGCCGTTCTCTGTGCT"
    "GCCCGCCCGCGCCGTCCTCACCGTCAACTTCCCGCACCCTGCTTCCACGCCTCAGAGTCCTG"
    "GAGAGGTACCTCTTGGAGGCCAAGGAGGCCGAGAATATCACGACGGGCTGTGCTGAACACTGC"
    "AGCTTGAATGAGAATATCACGGTGCGCTTTCCACGCCTCATTTGCGACAGCTTTGTTCGTGG"
    "TCAGGCCGTGGTCAGCTCCGATGAGGTCTTCAGGGCCCCTGTCCTCCTGCAGCTGGAATCCT"
    "GGCAGCGTCTCAGCCCCTGCAGCCAGCCCTCCCAGCTGCCCTCAGCCACCTGTCCCGCCTGCT"
    "CCAGAGCCTGGAGAACTTCTACCAGCCTCTGGAGCAGCTCCAGGAAGTGATCCAGGAGATGAG"
    "CAAGCTGTCCGCCACGGCCGTGGAGGTCTTGGCCAGTAAGCCGGAG"
    "TAA"
)
# Drop any trailing partial codon; if that cut removed the stop codon,
# overwrite the last remaining codon with TAA.
CDS_EPO = CDS_EPO[: (len(CDS_EPO) // 3) * 3]
if not CDS_EPO.endswith(("TAA", "TAG", "TGA")):
    CDS_EPO = CDS_EPO[:-3] + "TAA"
# Coding sequence without a stop codon in the literal; normalised below.  It
# is not referenced by any CSV row in this file.
CDS_FIX = (
    "ATGCAGCGCGTGAACATGATCATGGCCTCCCTGTGGCTGTGCTTCGTGGCCCTGTGGCAGGC"
    "TGGCAACCCCAGAGAAGTACCTGTTCAAGAACGGCGACCAGCGGCCCAACAAGGAGATCCCCA"
    "AGAGCATCATCCTGGAGGAGTTCAAGGCCTTCTTCTCCACCTTCATCAACCGGAAGATGATCA"
    "AGCAGACCGACAAAGACCAGGTGATCAGCCTGGGCGGCAAGGACCAGGTGCTGATCCAGATGC"
    "AGCCCCAGGTGAGCAAGGACTTTGGCTTCAGCCTGTGCACCTGCCCCTGGGGCCACCCCAGCC"
    "CCTGCAGCAGCACATCCTGTACTTCCTGAACCAGAAGGCCAAACAGTTCCTGCTGCAGGACGAG"
    "AAGGTGAAGGGCATCAACCACTGCAAGGTGCGGGTGGCCCTGGAGCAGGACGGCAGCAAGGTG"
)
# Keep only whole codons, then overwrite the final codon with a TAA stop.
CDS_FIX = CDS_FIX[: (len(CDS_FIX) // 3) * 3]
CDS_FIX = CDS_FIX[:-3] + "TAA"
# Fully assembled transcripts (5' UTR .. poly-A).  Neither is referenced by
# the CSV rows in this file, which leave 'full_mrna' empty.
FULL_MRNA_EGFP = "".join(
    [
        UTR5_BETAGLOBIN,
        KOZAK_TEV,
        "ATG",  # explicit start codon; the CDS's own first codon is skipped
        CDS_EGFP[3:],
        UTR3_BETAGLOBIN,
        POLYA_120,
    ]
)
# NOTE(review): UTR5_EMCV already ends in "ATG" and CDS_MCHERRY starts with
# "ATG", so this concatenation contains two consecutive ATGs - confirm that
# is intended.
FULL_MRNA_MCHERRY = "".join([UTR5_EMCV, CDS_MCHERRY, UTR3_ALBUMIN, POLYA_60])
# Records written to the CSV, one dict per construct.  The keys match the
# column names used by create_csv(); 'full_mrna' is left empty in every row.
CSV_DATA = [
    # 1: eGFP reporter
    {
        "id": 1,
        "gene_name": "eGFP-hBG-UTRs",
        "five_prime_utr": UTR5_BETAGLOBIN,
        "cds": CDS_EGFP,
        "three_prime_utr": UTR3_BETAGLOBIN,
        "poly_a_tail": POLYA_120,
        "full_mrna": "",
        "target_protein": "Enhanced GFP",
        "organism": "Aequorea victoria",
        "expression_system": "HEK293T",
        "gc_target_percent": 52.4,
        "notes": "Human beta-globin UTRs. Classic reporter construct for mRNA transfection.",
    },
    # 2: mCherry reporter
    {
        "id": 2,
        "gene_name": "mCherry-AlbUTR",
        "five_prime_utr": UTR5_BETAGLOBIN,
        "cds": CDS_MCHERRY,
        "three_prime_utr": UTR3_ALBUMIN,
        "poly_a_tail": POLYA_120,
        "full_mrna": "",
        "target_protein": "mCherry red fluorescent protein",
        "organism": "Discosoma sp.",
        "expression_system": "CHO",
        "gc_target_percent": 50.1,
        "notes": "Albumin 3'UTR for extended expression. Good cell viability.",
    },
    # 3: luciferase reporter
    {
        "id": 3,
        "gene_name": "Luc2-reporter",
        "five_prime_utr": UTR5_EMCV,
        "cds": CDS_LUC2,
        "three_prime_utr": UTR3_ALBUMIN,
        "poly_a_tail": POLYA_60,
        "full_mrna": "",
        "target_protein": "Firefly luciferase",
        "organism": "Photinus pyralis",
        "expression_system": "Huh-7",
        "gc_target_percent": 53.8,
        "notes": "Bioluminescence reporter. Used for LNP screening.",
    },
    # 4: spike RBD antigen
    {
        "id": 4,
        "gene_name": "SpRBD-v1",
        "five_prime_utr": UTR5_BETAGLOBIN,
        "cds": CDS_SPIKE_RBD,
        "three_prime_utr": UTR3_ALBUMIN,
        "poly_a_tail": POLYA_120,
        "full_mrna": "",
        "target_protein": "SARS-CoV-2 Spike RBD",
        "organism": "SARS-CoV-2",
        "expression_system": "HEK293T",
        "gc_target_percent": 55.2,
        "notes": "Vaccine antigen candidate. Proline-stabilized RBD.",
    },
]
def create_csv(output_path=None, rows=None):
    """Write mRNA records to a CSV file and return the path written.

    Args:
        output_path: Destination file. When omitted, ``mrna_sequences.csv``
            is created in the directory that contains this script.
        rows: Iterable of dicts keyed by the column names listed below. When
            omitted, the module-level ``CSV_DATA`` is exported.

    Returns:
        The path of the CSV file that was written.

    Raises:
        ValueError: If a row contains a key that is not a known column
            (raised by ``csv.DictWriter``).
        OSError: If the destination cannot be opened for writing.
    """
    if output_path is None:
        output_path = os.path.join(os.path.dirname(__file__), 'mrna_sequences.csv')
    # Materialise once so generators can be both written and counted.
    records = list(CSV_DATA if rows is None else rows)
    fieldnames = ['id', 'gene_name', 'five_prime_utr', 'cds', 'three_prime_utr',
                  'poly_a_tail', 'full_mrna', 'target_protein', 'organism',
                  'expression_system', 'gc_target_percent', 'notes']
    # newline='' lets the csv module control line endings; the explicit
    # encoding keeps the output independent of the platform's locale.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    print(f'✓ Created {output_path}')
    print(f' {len(records)} sequences exported')
    return output_path
# Script entry point: export the demo CSV when this file is run directly.
if __name__ == "__main__":
    create_csv()