Spaces:
Running
Running
§4 Species: anchor prompt to 2nd exon, drop zebrafish
Browse files
- fetch_species.py: also capture per-species exon coords from Ensembl,
translated into 0-based offsets in the trimmed seq
- species.js: new getPromptWindow() — anchors prefixEnd 35 bp into the
2nd exon (like §1's gene completion); slides prefixStart back by the
user-selected prefixLen; falls back to flat slice when exon 2 isn't
visible. Always render the prompt (no more "click run all" placeholder).
Per-row "{N} bp prompt" note.
- species.json: regenerated with exon data; zebrafish dropped (~450 My
drift made the row look like noise next to mammals + bird)
- demo.html §4 lede/takeaway: reflect new framing and dropped species
- assets/js/sections/species.js +35 -15
- data/species.json +48 -24
- demo.html +12 -12
- scripts/fetch_species.py +26 -3
assets/js/sections/species.js
CHANGED
|
@@ -40,6 +40,26 @@
|
|
| 40 |
return Math.max(20, Math.min(blocks, 12) * 10);
|
| 41 |
}
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
function renderRow(s) {
|
| 44 |
const wrap = document.createElement("div");
|
| 45 |
wrap.className = "species-row";
|
|
@@ -47,7 +67,8 @@
|
|
| 47 |
|
| 48 |
const stat = runState[s.species_id] || {};
|
| 49 |
const genText = stat.genText || "";
|
| 50 |
-
const
|
|
|
|
| 51 |
let match = 0, total = 0;
|
| 52 |
for (let i = 0; i < genText.length && i < refSlice.length; i++) {
|
| 53 |
total++;
|
|
@@ -55,12 +76,14 @@
|
|
| 55 |
}
|
| 56 |
const idPct = total > 0 ? `${((match / total) * 100).toFixed(0)}%` : "·";
|
| 57 |
const meanLp = stat.genTokens ? meanLogprob(stat.genTokens) : null;
|
|
|
|
| 58 |
|
| 59 |
wrap.innerHTML = `
|
| 60 |
<div class="species-meta">
|
| 61 |
<div class="species-name" style="border-left-color:${s.color}">${s.common}</div>
|
| 62 |
<div class="species-sub">${s.ortholog_symbol}</div>
|
| 63 |
<div class="species-sub">chr${s.chrom} · strand ${s.strand}</div>
|
|
|
|
| 64 |
<div class="species-stats">
|
| 65 |
<div class="stat-id">${idPct}</div>
|
| 66 |
<div class="stat-sub">${total > 0 ? `${match}/${total} bases` : "not run"}</div>
|
|
@@ -68,7 +91,7 @@
|
|
| 68 |
</div>
|
| 69 |
</div>
|
| 70 |
<div>
|
| 71 |
-
<div class="species-seq" data-role="output">
|
| 72 |
<div class="species-seq" data-role="ref" style="margin-top:4px"></div>
|
| 73 |
</div>
|
| 74 |
`;
|
|
@@ -76,11 +99,7 @@
|
|
| 76 |
const outEl = wrap.querySelector('[data-role="output"]');
|
| 77 |
const refEl = wrap.querySelector('[data-role="ref"]');
|
| 78 |
|
| 79 |
-
if (
|
| 80 |
-
outEl.classList.add("empty");
|
| 81 |
-
outEl.textContent = "click \"run all\" to generate";
|
| 82 |
-
refEl.style.display = "none";
|
| 83 |
-
} else if (stat.status === "error") {
|
| 84 |
outEl.classList.add("empty");
|
| 85 |
outEl.style.color = "#b00020";
|
| 86 |
outEl.textContent = stat.error || "error";
|
|
@@ -88,12 +107,13 @@
|
|
| 88 |
} else {
|
| 89 |
outEl.classList.remove("empty");
|
| 90 |
const bpl = basesPerLine(outEl);
|
| 91 |
-
const
|
|
|
|
| 92 |
const lpRange = stat.genTokens ? lpRangeOf(stat.genTokens) : null;
|
| 93 |
const colorOut = (absIdx) => {
|
| 94 |
-
if (absIdx <
|
| 95 |
const tok = stat.genTokens && stat.genTokenAtBase
|
| 96 |
-
? stat.genTokens[stat.genTokenAtBase[absIdx -
|
| 97 |
: null;
|
| 98 |
const [r, g, b] = logprobRgb(tok ? tok.logprob : null, lpRange);
|
| 99 |
return { style: `color:rgb(${r},${g},${b})` };
|
|
@@ -102,9 +122,8 @@
|
|
| 102 |
|
| 103 |
// Reference (only the generated span)
|
| 104 |
if (genText.length > 0) {
|
| 105 |
-
const
|
| 106 |
-
const
|
| 107 |
-
const refSeq = s.seq.slice(refSpanStart, refSpanEnd);
|
| 108 |
const colorRef = (absIdx, base) => {
|
| 109 |
// absIdx is local to refSeq (starts at 0)
|
| 110 |
const genIdx = absIdx;
|
|
@@ -132,7 +151,8 @@
|
|
| 132 |
}
|
| 133 |
|
| 134 |
async function generateForSpecies(s) {
|
| 135 |
-
const
|
|
|
|
| 136 |
const stat = { genText: "", genTokens: [], genTokenAtBase: [], status: "running" };
|
| 137 |
runState[s.species_id] = stat;
|
| 138 |
renderAll();
|
|
@@ -211,7 +231,7 @@
|
|
| 211 |
entry = SPECIES_DATA.find(x => x.symbol === symbol);
|
| 212 |
if (!entry) return;
|
| 213 |
els.pills.querySelectorAll(".pill").forEach(p => p.classList.toggle("active", p.dataset.gene === symbol));
|
| 214 |
-
els.info.innerHTML = `<strong>${entry.symbol}</strong> · same gene, ${entry.species.length} species · prefix
|
| 215 |
runState = {};
|
| 216 |
renderAll();
|
| 217 |
setStatus("idle");
|
|
|
|
| 40 |
return Math.max(20, Math.min(blocks, 12) * 10);
|
| 41 |
}
|
| 42 |
|
| 43 |
+
// Choose where the prompt sits inside the species seq. The §1 finding is
|
| 44 |
+
// that Carbon is most predictive when it's continuing the 2nd exon with
|
| 45 |
+
// some intron context behind it. We replicate that here: anchor prefixEnd
|
| 46 |
+
// a bit past the start of exon 2 (35 bp of exon context, like §1) and
|
| 47 |
+
// slide prefixStart back by the user-selected `prefixLen`. If exon 2
|
| 48 |
+
// isn't visible in the trimmed seq, fall back to a flat slice from start.
|
| 49 |
+
function getPromptWindow(s, prefixLen) {
|
| 50 |
+
const exons = s.exons || [];
|
| 51 |
+
const exon2 = exons.length >= 2 ? exons[1] : null;
|
| 52 |
+
if (exon2) {
|
| 53 |
+
const EXON_CTX = 35;
|
| 54 |
+
const exonLen = exon2.end - exon2.start;
|
| 55 |
+
const exonCtx = Math.min(EXON_CTX, Math.max(0, exonLen - 30));
|
| 56 |
+
const prefixEnd = Math.min(s.length, exon2.start + exonCtx);
|
| 57 |
+
const prefixStart = Math.max(0, prefixEnd - prefixLen);
|
| 58 |
+
return { prefixStart, prefixEnd };
|
| 59 |
+
}
|
| 60 |
+
return { prefixStart: 0, prefixEnd: Math.min(s.length, prefixLen) };
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
function renderRow(s) {
|
| 64 |
const wrap = document.createElement("div");
|
| 65 |
wrap.className = "species-row";
|
|
|
|
| 67 |
|
| 68 |
const stat = runState[s.species_id] || {};
|
| 69 |
const genText = stat.genText || "";
|
| 70 |
+
const { prefixStart, prefixEnd } = getPromptWindow(s, prefixLen);
|
| 71 |
+
const refSlice = s.seq.slice(prefixEnd, prefixEnd + genLen);
|
| 72 |
let match = 0, total = 0;
|
| 73 |
for (let i = 0; i < genText.length && i < refSlice.length; i++) {
|
| 74 |
total++;
|
|
|
|
| 76 |
}
|
| 77 |
const idPct = total > 0 ? `${((match / total) * 100).toFixed(0)}%` : "·";
|
| 78 |
const meanLp = stat.genTokens ? meanLogprob(stat.genTokens) : null;
|
| 79 |
+
const promptBp = prefixEnd - prefixStart;
|
| 80 |
|
| 81 |
wrap.innerHTML = `
|
| 82 |
<div class="species-meta">
|
| 83 |
<div class="species-name" style="border-left-color:${s.color}">${s.common}</div>
|
| 84 |
<div class="species-sub">${s.ortholog_symbol}</div>
|
| 85 |
<div class="species-sub">chr${s.chrom} · strand ${s.strand}</div>
|
| 86 |
+
<div class="species-sub" style="color:#999">${promptBp} bp prompt</div>
|
| 87 |
<div class="species-stats">
|
| 88 |
<div class="stat-id">${idPct}</div>
|
| 89 |
<div class="stat-sub">${total > 0 ? `${match}/${total} bases` : "not run"}</div>
|
|
|
|
| 91 |
</div>
|
| 92 |
</div>
|
| 93 |
<div>
|
| 94 |
+
<div class="species-seq" data-role="output"></div>
|
| 95 |
<div class="species-seq" data-role="ref" style="margin-top:4px"></div>
|
| 96 |
</div>
|
| 97 |
`;
|
|
|
|
| 99 |
const outEl = wrap.querySelector('[data-role="output"]');
|
| 100 |
const refEl = wrap.querySelector('[data-role="ref"]');
|
| 101 |
|
| 102 |
+
if (stat.status === "error") {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
outEl.classList.add("empty");
|
| 104 |
outEl.style.color = "#b00020";
|
| 105 |
outEl.textContent = stat.error || "error";
|
|
|
|
| 107 |
} else {
|
| 108 |
outEl.classList.remove("empty");
|
| 109 |
const bpl = basesPerLine(outEl);
|
| 110 |
+
const prompt = s.seq.slice(prefixStart, prefixEnd);
|
| 111 |
+
const total = prompt + genText;
|
| 112 |
const lpRange = stat.genTokens ? lpRangeOf(stat.genTokens) : null;
|
| 113 |
const colorOut = (absIdx) => {
|
| 114 |
+
if (absIdx < prompt.length) return { style: `color:rgb(${PROMPT_RGB.join(",")})` };
|
| 115 |
const tok = stat.genTokens && stat.genTokenAtBase
|
| 116 |
+
? stat.genTokens[stat.genTokenAtBase[absIdx - prompt.length]]
|
| 117 |
: null;
|
| 118 |
const [r, g, b] = logprobRgb(tok ? tok.logprob : null, lpRange);
|
| 119 |
return { style: `color:rgb(${r},${g},${b})` };
|
|
|
|
| 122 |
|
| 123 |
// Reference (only the generated span)
|
| 124 |
if (genText.length > 0) {
|
| 125 |
+
const refSpanEnd = Math.min(s.length, prefixEnd + genLen);
|
| 126 |
+
const refSeq = s.seq.slice(prefixEnd, refSpanEnd);
|
|
|
|
| 127 |
const colorRef = (absIdx, base) => {
|
| 128 |
// absIdx is local to refSeq (starts at 0)
|
| 129 |
const genIdx = absIdx;
|
|
|
|
| 151 |
}
|
| 152 |
|
| 153 |
async function generateForSpecies(s) {
|
| 154 |
+
const { prefixStart, prefixEnd } = getPromptWindow(s, prefixLen);
|
| 155 |
+
const prompt = s.seq.slice(prefixStart, prefixEnd);
|
| 156 |
const stat = { genText: "", genTokens: [], genTokenAtBase: [], status: "running" };
|
| 157 |
runState[s.species_id] = stat;
|
| 158 |
renderAll();
|
|
|
|
| 231 |
entry = SPECIES_DATA.find(x => x.symbol === symbol);
|
| 232 |
if (!entry) return;
|
| 233 |
els.pills.querySelectorAll(".pill").forEach(p => p.classList.toggle("active", p.dataset.gene === symbol));
|
| 234 |
+
els.info.innerHTML = `<strong>${entry.symbol}</strong> · same gene, ${entry.species.length} species · prefix anchored to the 2nd exon of each species (intron context, then generate into the exon)`;
|
| 235 |
runState = {};
|
| 236 |
renderAll();
|
| 237 |
setStatus("idle");
|
data/species.json
CHANGED
|
@@ -10,6 +10,16 @@
|
|
| 10 |
"strand": -1,
|
| 11 |
"length": 1200,
|
| 12 |
"seq": "AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGGTCTGTTCCAAGGGCCTTTGCGTCAGGTGGGCTCAGGATTCCAGGGTGGCTGGACCCCAGGCCCCAGCTCTGCAGCAGGGAGGACGTGGCTGGGCTCGTGAAGCATGTGGGGGTGAGCCCAGGGGCCCCAAGGCAGGGCACCTGGCCTTCAGCCTGCCTCAGCCCTGCCTGTCTCCCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCTGCAGGGTGAGCCAACTGCCCATTGCTGCCCCTGGCCGCCCCCAGCCACCCCCTGCTCCTGGCGCTCCCACCCAGCATGGGCAGAAGGGGGCAGGAGGCTGCCACCCAGCAGGGGGTCAGGTGCACTTTTTTAAAAAGAAGTTCTCTTGGTCACGTCCTAAAAGTGACCAGCTCCCTGTGGCCCAGTCAGAATCTCAGCCTGAGGACGGTGTTGGCTTCGGCAGCCCCGAGATACATCAGAGGGTGGGCACGCTCCTCCCTCCACTCGCCCCTCAAACAAATGCCCCGCAGCCCATTTCTCCACCCTCATTTGATGACCGCAGATTCAAGTGTTTTGTTAAGTAAAGTCCTGGGTGACCTGGGGTCACAGGGTGCCCCACGCTGCCTGCCTCTGGGCGAACACCCCATCACGCCCGGAGGAGGGCGTGGCTGCCTGCCTGAGTGGGCCAGACCCCTGTCGCCAGGCCTCACGGCAGCTCCATAGTCAGGAGATGGGGAAGATGCTGGGGACAGGCCCTGGGGAGAAGTACTGGGATCACCTGTTCAGGCTCCCACTGTGACGCTGCCCCGGGGCGGGGGAAGGAGGTGGGACATGTGGGCGTTGGGGCCTGTAGGTCCACACCCAGTGTGGGTGACCCTCCCTCTAACCTGGGTCCAGCCCGGCTGGAGATGGGTGGGAGTGCGACCTAGGGCTGGCGGGCAGGCGGGCACTGTGTCTCCCTGACTGTGTCCTCCTGTGTCCCTCTGCCTCGCCGCTGTTCCGGAACCTGCTCTGCGCGGC",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"species_id": "homo_sapiens",
|
| 14 |
"common": "human",
|
| 15 |
"color": "#1a1a1a"
|
|
@@ -22,6 +32,16 @@
|
|
| 22 |
"strand": 1,
|
| 23 |
"length": 1180,
|
| 24 |
"seq": "ACCAGGCAAGTGTTTGGAAACTGCAGCTTCAGCCCCTCTGGCCATCTGCCTACCCACCCCACCTGGAGACCTTAATGGGCCAAACAGCAAAGTCCAGGGGGCAGAGAGGAGGTACTTTGGACTATAAAGCTGGTGGGCATCCAGTAACCCCCAGCCCTTAGTGACCAGCTATAATCAGAGACCATCAGCAAGCAGGTATGTACTCTCCTCTTTGGGCCTGGCTCCCCAGCCAAGACTCCAGCGACTTTAGGGAGAATGTGGGCTCCTCTCTTACATGGATCTTTTGCTAGCCTCAACCCTGCCTATCTTTCAGGTCATTGTTTCAACATGGCCCTGTTGGTGCACTTCCTACCCCTGCTGGCCCTGCTTGCCCTCTGGGAGCCCAAACCCACCCAGGCTTTTGTCAAACAGCATCTTTGTGGTCCCCACCTGGTAGAGGCTCTCTACCTGGTGTGTGGGGAGCGTGGCTTCTTCTACACACCCAAGTCCCGCCGTGAAGTGGAGGACCCACAAGTGGAACAACTGGAGCTGGGAGGAAGCCCCGGGGACCTTCAGACCTTGGCGTTGGAGGTGGCCCGGCAGAAGCGTGGCATTGTGGATCAGTGCTGCACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAAGGCCCACCTCGACCCGCCCCACCCCTCTGCAATGAATAAAACTTTTGAATAAGCACCAAAAAAAAGAGTTCTATAATGAATGAAAAAGGATTGTGTATATAGACATCTTTTTCTCTGGCATTTATTGTCATGTTAGCATACTATTAAACCATTGTTAGGTTGGATGATTATATAATCATGTATGAAGCTTGTGATAAAACACCAGGAATAATTCAAGTATCTGGAATTCTGCTTCCTGCCCAAGAAGGTAGGCAACCGTGTAAATGCCACTGAAGCTACTAGTCTAAAAGTGAGTTATCTCTGTCTTTGTCTTACCCCCTGATGCTGTGATAAAACCCTGACAAGAGCAACTGACTCCTGAGAGGAAGGTTTATTCTAGCTCACAATTCCAGGTTACAAACAGTCCATCCGTAGCAGGGGAGTCACAGCAACAGGAACCTCAGGGAACTGCTCCTATTATCCCCACAATCAAGAATAGTGACCAATAAATAAGTGGATCTTTTCTCAAAAAAAAAAAAAAAAAAAA",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"species_id": "mus_musculus",
|
| 26 |
"common": "mouse",
|
| 27 |
"color": "#2c5aa0"
|
|
@@ -34,21 +54,15 @@
|
|
| 34 |
"strand": 1,
|
| 35 |
"length": 1200,
|
| 36 |
"seq": "TTGTCTCTTTATGAGAAATCAGCAGGGAGGCCAGCATTGGTGTGAGTGTGTGGATGGAGACAGGCTTCTGGTTATAATTGGTCATTTATTATGACTTTCAAAGCCTGATGAATAAAATATTCCTTTCCTCTTCAGAAGGTCCATTTGCTTCTGTAGTCTTGTTTTCACGTCAAAGGAGCTGAGGGACATAAGATGCCTGATGATAGCTTATTCCTCCCTTGCAACCCCCCCGTGTCTCCTTTGCTTCCTACCTCTAGGCCTCCCCCAGCTCATCATGGCTCTCTGGATCCGATCACTGCCTCTTCTGGCTCTCCTTGTCTTTTCTGGCCCTGGAACCAGCTATGCAGCTGCCAACCAGCACCTCTGTGGCTCCCACTTGGTGGAGGCTCTCTACCTGGTGTGTGGAGAGCGTGGCTTCTTCTACTCCCCCAAAGCCCGACGGGATGTCGAGCAGCCCCTAGGTAAGTCAGTTCGACCATGACTACATTCATATGCTATATGATGCAAAAAGCAACTGTCTATCTTTGATGGTGACACAAGGAATGTCCTTGGTGGGGAATGCCAGGAATACCTTAAACATACCAACAGCATCATATCACCCATGAAAAGATCGTCAGGCTAAAAAGGCAGGTGGGAGGGCAAGCAGGGAAAGGAGATTTATGAGACAGAAGGAATTGTCACAGAAAGCTCCAAATTTTTTGCTACTCTCTTGGTAGAGAGAGGCTGAAAACAGTGTTATTCCAACATTTGCATGGCAATTACTCTCACCTGGGAGTGATCATGAAAAATAAAGGTGGAAGGAACACAAGAAGCCTCTTTCTGCACCTCTTCTTGACCCACACTACCCCAGTCCTGTTCTGTGACCACATCAAGTGTGGTCATAAAAACCTCCTGCCTTCTGAAGCTGTCCATTCTTGTGCTTAAATGACTTTTTCTTGAATGTTTCCTCCTAATATTTAACCCAAATGTATCTTGTCACAAATTAGTCCTCATCTTCTCAGTAGTGGACAAAAGGAACAGTCTATCATTTCTCCTTCATGGGTGATTTTCAAACAGTTTAAAAATTGCTTCCATGTCTTGTTTTTATCTACTGTGAGCTAAAAGCCCTCAACAGCCCCAGAATTCCTTTTTAGGTCACATATTCTAGCTCTCTGTCTACATAAACTGTTCTGCATTTGGCCCATACCATTACGGAATGGT",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"species_id": "gallus_gallus",
|
| 38 |
"common": "chicken",
|
| 39 |
"color": "#c08030"
|
| 40 |
-
},
|
| 41 |
-
{
|
| 42 |
-
"ortholog_symbol": "ins",
|
| 43 |
-
"ensembl_gene": "ENSDARG00000035350",
|
| 44 |
-
"ensembl_transcript": "ENSDART00000051222",
|
| 45 |
-
"chrom": "5",
|
| 46 |
-
"strand": 1,
|
| 47 |
-
"length": 1200,
|
| 48 |
-
"seq": "ATCTCCACCACCATATCCACCATTCCTCGCCTCTGCTTCGAGAACAGGTGAGTGTCGAGCGGGATGGTAAATCTACAGAGAATGCGGAGTGTAGCTTGTGTACATGTTTTTGATTAACAGAGATTGTATGTGTGTGTTTGTGTCAGTGTGACCATGGCAGTGTGGCTTCAGGCTGGTGCTCTGTTGGTCCTGTTGGTCGTGTCCAGTGTAAGCACTAACCCAGGCACACCGCAGCACCTGTGTGGATCTCATCTGGTCGATGCCCTTTATCTGGTCTGTGGCCCAACAGGCTTCTTCTACAACCCCAAGAGAGACGTTGAGCCCCTTCTGGGTAAGAAAGTCAAGTAGAGGTGTTTTGACAGTGGAGTAGTAACAGTGGAGTTTTATCATTTTTAAATCCAATTAGCGATCTCCAACTCTGGCAAGAGCACTTTTAGTTTAGTTTAGCGTAGATCATTGAATTGAATTAGACCATTAGCGTCTCGTAAATTTTTTTTTTTGATAATTTCCCTATTGAGTATATAGCTTGAAAATGATAACTTTACTATTCTTTATTTACATTGTGGACTAAGACCAGTGGAAATGAAATGTTGCTGCTTTCTACACTGTAAAAAGCGATTAGTTGCCTTTACCTAAAAAAAGAGAGTGAACTCGTTGCCTTATAATTAAGCAAACTATGTGCATATTATAAAAGTTAAGTCAATGGGTTTTCTGACTTTTTAAAAGTAAAATCAACGGTCACACTTTATTTTGATGGTCCATTTGGTAAATTGCTTCTACATGCCAACTAACTCTCATTAGATTATATGTAGACAGGTTGGGTTTAGGGTTAGGGTTAGTGTAAGTTGACATGTACTTGCAAAGTTTCTTATAGTCAGTTAAATGTCTGTTGAAGGAGCAGTATCAACAGATAATAAGCAGACAGTCTACTAATACTCAAATGGACCATCAAAATAAAGTGTTACCAAATTAACTTGTTGCTTTTGCGGTTGGTTTACTTACTTTTTTAAAGTAAAGTTACTAATCGATTTTTACAGTGTAGGTTAATATCATTCTAGCATAATAATAAAGTAACTTTGCCACTGTATCATGGCTGCAGCCATGGTTCGACAACAAAGTAACTTGCCTAGTTAGCCTAATTAATCTCCACTGTAAAAATATTTGTTAATTAAAATGTAAAAGTTTTTGGTTGATTTATGA",
|
| 49 |
-
"species_id": "danio_rerio",
|
| 50 |
-
"common": "zebrafish",
|
| 51 |
-
"color": "#2a8a8a"
|
| 52 |
}
|
| 53 |
]
|
| 54 |
},
|
|
@@ -63,6 +77,12 @@
|
|
| 63 |
"strand": -1,
|
| 64 |
"length": 1200,
|
| 65 |
"seq": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGGTAAGCTCCTGACTGAACTTGATGAGTCCTCTCTGAGTCACGGGCTCTCGGCTCCGTGTATTTTCAGCTCGGGAAAATCGCTGGGGCTGGGGGTGGGGCAGTGGGGACTTAGCGAGTTTGGGGGTGAGTGGGATGGAAGCTTGGCTAGAGGGATCATCATAGGAGTTGCATTGTTGGGAGACCTGGGTGTAGATGATGGGGATGTTAGGACCATCCGAACTCAAAGTTGAACGCCTAGGCAGAGGAGTGGAGCTTTGGGGAACCTTGAGCCGGCCTAAAGCGTACTTCTTTGCACATCCACCCGGTGCTGGGCGTAGGGAATCCCTGAAATAAAAGATGCACAAAGCATTGAGGTCTGAGACTTTTGGATCTCGAAACATTGAGAACTCATAGCTGTATATTTTAGAGCCCATGGCATCCTAGTGAAAACTGGGGCTCCATTCCGAAATGATCATTTGGGGGTGATCCGGGGAGCCCAAGCTGCTAAGGTCCCACAACTTCCGGACCTTTGTCCTTCCTGGAGCGATCTTTCCAGGCAGCCCCCGGCTCCGCTAGATGGAGAAAATCCAATTGAAGGCTGTCAGTCGTGGAAGTGAGAAGTGCTAAACCAGGGGTTTGCCCGCCAGGCCGAGGAGGACCGTCGCAATCTGAGAGGCCCGGCAGCCCTGTTATTGTTTGGCTCCACATTTACATTTCTGCCTCTTGCAGCAGCATTTCCGGTTTCTTTTTGCCGGAGCAGCTCACTATTCACCCGATGAGAGGGGAGGAGAGAGAGAGAAAATGTCCTTTAGGCCGGTTCCTCTTACTTGGCAGAGGGAGGCTGCTATTCTCCGCCTGCATTTCTTTTTCTGGATTACTTAGTTATGGCCTTTGCAAAGGCAGGGGTATTTGTTTTGATGCAAACCTCAATCCCTCCCCTTCTTTGAATGGTGTGCCCCACCCCGCGGGTCGCCTGCAACCTAGGCGGACGCTACCATGGCGTGAGACAGGGAGGGAAAGAAGTGTGCAGAAGGCAAGCCCGGAGGTATTTTCAAGAATGAGTATATCTCATCTTCCCGGAGGAAAAAAAAAAAGAATGGGTACGTC",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
"species_id": "homo_sapiens",
|
| 67 |
"common": "human",
|
| 68 |
"color": "#1a1a1a"
|
|
@@ -75,6 +95,12 @@
|
|
| 75 |
"strand": 1,
|
| 76 |
"length": 1200,
|
| 77 |
"seq": "TTTCCCCTCCCACGTGCTCACCCTGGCTAAAGTTCTGTAGCTTCAGTTCATTGGGACCATCCTGGCTGTAGGTAGCGACTACAGTTAGGGGGCACCTAGCATTCAGGCCCTCATCCTCCTCCTTCCCAGCAGGGTGTCACGCTTCTCCGAAGACTGGGTAAGTAATTGATGAGCGTGACGAGACCTCTCGGTCACTGGCTCTCTCCGTTTGCATCCATAAAACTAGAGAAAACCGTGGGGTTTGGGGGTGGGGCAGTGGGGGGACTCAGCGCGATGGAGATGGGCGGAATGGAAGCTTGGCGGGCGGGATGAACGGGAGTGTATATGTCAGATGCTGTAGTGAGGGTAGCTGATGATGATGATGTTAGGACCGACGAGCCTCACTGTCATGCACCTGCAAAGTAGAGCATATAGGGACCACTGAGATGGCCTAAGGGGTTTTCTCTCCGCTACGCGTTGTACACACTTATCTGCCCGCTGCTAGGTGATGGAAGCTCCGGAAATAACATGCACAAAGCACCAGGATTTAAGATTTTTCGAGATTCATAGCTTAAGACTTAAGACCCCCCATAGCATCCTAATGAAACCCTGGGTTCCGTTCCTGGATGAGATCGGGGTGATCCGGGGAGCCTTAGCTGCTAAGGTCCCGCAACTTCCGGACCTTTGTCCCTGGAGTGATTTCTTTTTTTTTCCAGCCGCTTCTCGACCCTGCTAGATGAAGAAAATCCAAGAAAAGCCTGAAGCACTAGCGGTGCTAGCCAGAAGTATTTGCCCTCGGGGCCCGACTCAGCCTCTTGGTCTGAAAGGCCCGCCGGCCCTGTTATTGTTTGGCTCCTTTACGTTTCTGCCGCTTGCAGGAGCATTTCCGGTTTCTTGTTTTCGGAGCAGATCACTGCTCGCCCGGCGACGGGGGAGTAGCGAAAGGGGAGAAATGGATTCTAGGCTGGTTCTGTGGTTTGAGGAGGAAAACTGCTGTCCTCGACATCTTATTTTTCTGGATTACTTGGTTATTGCTTTTGCAAAGGAGGAGGTGTTTATTTAAAAGAGTGCGCCGATAGGTCGTTTCTTCCTGCCGGAAAAGCAAATTACCGAGTATCCGGTTTTAGGGTGAGCCATTCCCTTGCTTAACGCATTCCGCGCGTCCTGAAAGCGGAAGGGGCGGGCTTGGGCCCCGGGCGCCCGGCGGTCCTAAGTGCAGTC",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
"species_id": "mus_musculus",
|
| 79 |
"common": "mouse",
|
| 80 |
"color": "#2c5aa0"
|
|
@@ -87,21 +113,19 @@
|
|
| 87 |
"strand": 1,
|
| 88 |
"length": 1200,
|
| 89 |
"seq": "GGCCTGGCCCCCGCACAGCACCTCATCCGGGTGGAGGGGAACCCCCAGGCGCGTTACCACGACGACGAGACCACCAAACGGCCACAGCGTCGTCGTCCCCTATGAGCCCCCCGAGGTACCCCCAAATCACACCCCCCCCATAACCCCCCCCCCCAGACCCCCCTAAAGACCCCCCCAACCCCAATTCCCCCCCCAAGACCCCATAACCTCCCATTGCCATGACCCCCAATCCCCCCCCCATGACCCCATAACCCCTCAATGACCCCCCCCAATGACCCCCCCAATGCCCCCAGCCCCAACCCCCCCCCCCAGTCCCCCCTAAAGACCCCCCCAACCCCAAGACCCCCCACGACCCCATAAGATCCCAATGCCGTGACCCCCCAATCCCCCCCCATGACCCCATAACCCCTCAATGACCCCCCCCAATGACCCCAACCCCAAAATACCCCCCCAGACCCCCCTAAAGACCCCCCCAACCCCAATTCCCCCCCCACGACCCCATAACCTCCCATTGCCATGACCCCCAATCCCCCCCCATGACCCTATAACCCCTCAATGACCCCCCCCCAATGACCCCCAACCCCAAAATACCCCCCCAGACCCCCCCTACAGACCCCCACAACCCCAAGACCCCCCCCCACGAACCCATAACTTCCCAATGCTGTGACCCCCAATCCCCCCCCATGACCCCATAACCCCCCAATGAACCCCAACAGCCCAATGACCCCCAGCCCCAAACCCCCCCCCCAGTCCCCTTTAAGGACCCCCCCCCAACCCCAATTCCCCCCCCACGACCCCATAACCTCCCATTGCCATGACCCCCCATCCCCCCTCATGACCCCATAACCCCTCAATGACCCCCCCCCAATGACCCCCAACCCCAAAATACCCCCCAGACCCCCCTAAAGACCCCCCCAACCCCAAGACCCCCCCCACGACCCCATAACTTCCCAATGCTGTGACCCCCAATCCCCCCCCCATGACCCCATAACCCTTCACTGACCCCCCCCAATGACCCCCAGCCCCAAACCCCCCCCAGTCCCCCTTAAGGACCCCCCCCCCACCCCATAACCCCTCAATGACACCCCCCCCCAATGACCCCCAACCCCAAAATGCCCCCCCTCAGTCCCCCCTAAAGACCCCCCCAACCCCAAGACCCCCCCCCACGACCCCATAACCTCCCATTGCCGTGACCCCCAA",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"species_id": "gallus_gallus",
|
| 91 |
"common": "chicken",
|
| 92 |
"color": "#c08030"
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"ortholog_symbol": "tp53",
|
| 96 |
-
"ensembl_gene": "ENSDARG00000035559",
|
| 97 |
-
"ensembl_transcript": "ENSDART00000135934",
|
| 98 |
-
"chrom": "5",
|
| 99 |
-
"strand": 1,
|
| 100 |
-
"length": 1200,
|
| 101 |
-
"seq": "CTGTAACTAGGGGAATCCCCAAAACTCCACGCGGATTTGCTTTGTGGATGTCCAATAACCTCCTTGTTTTGGTCATTCTTTAAGTCGATCGACTACATATCCGGGCAATCCGAAAGTCGATAATCGTAGTTTAGTGGAGAGGAGGTCGGCAAAATCAATTCTTGCAAAGGTAACAAAGCGAAACTTTTATTGACTAGCGTTAGTGGTTTGATCGATCATATTTATTAACGTTAACTAGTGTTTGTAGATTCAGCTGTTTGTATATTAAGTAAAAGTCGGGATTTGGTGTGCAACATCATTGGACCGCTTGTGTGTTGTATTATTGTAACAAACTGACGTCGATTCGATAAAAACTGGTAATGAATTAAGAGTGAAACTCGTATTTCGAACTTTAGGCCCGACATCGCTTTAAACGTTACTAAAACGTTGATGTTAACGTTAGCCGCTAAGCCTGTTAGCTAGCTAACATTAGCTGTCGAAAATGCAAACTTAGAGTTTCAGTATAATAGCTTTAGGTGACTTTTAGAAATCTTATGTACAGGTATGCGCATTTTTATTCATTTTTAGCTATATCTACTTGCGCGACAGAACTGTGAAGCTAATAAGCTAACGTTAAGCGGCATCTTTGCGACTCTGAGCGATCGCCTGTTTTTTAAATTTAAAGTGAATGTACATGTTATATAAATATCACACTTTGCATAAGAAACAACATCCCGACTTTTATTTTAACGTTATAGTAATTTTAGTTGTCGTGATATATTATTTCCCCCCAGGTTTTTATGACTTTCGTGTTTATAATTTCACAGCAATGGCGCAAAACGACAGCCAAGAGTTCGCGGAGCTCTGGGAGAAGAATTTGATGTAAGTTCGCAAGGGTCGCACTCCTGATACACAACGCTCCTCTTTTTCCTGCCTCTCTAAATTTCCCCGTATTTTTGTTAGCAGCCATGTCAGGTTGCTATAATGTACCTGTCTTAATATTTTTGGTGTTTGCTTGTTTAAGATGCATCTGTCGAACGTATTGTTTATCTGGAGTTCTTTGTATGAGCTTCAACAGATTAATACTAATTTCTCTCTCTTCCTTTTCAATTGTCTCAGAAGTATTCAGCCCCCAGGTGGTGGCTCTTGCTGGGACATCATTAATGATGAGGTATTTAAAAAAATAATCTCAAACACCCAGAAATCTCTATTTTCCTCTTT",
|
| 102 |
-
"species_id": "danio_rerio",
|
| 103 |
-
"common": "zebrafish",
|
| 104 |
-
"color": "#2a8a8a"
|
| 105 |
}
|
| 106 |
]
|
| 107 |
}
|
|
|
|
| 10 |
"strand": -1,
|
| 11 |
"length": 1200,
|
| 12 |
"seq": "AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGGTCTGTTCCAAGGGCCTTTGCGTCAGGTGGGCTCAGGATTCCAGGGTGGCTGGACCCCAGGCCCCAGCTCTGCAGCAGGGAGGACGTGGCTGGGCTCGTGAAGCATGTGGGGGTGAGCCCAGGGGCCCCAAGGCAGGGCACCTGGCCTTCAGCCTGCCTCAGCCCTGCCTGTCTCCCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCTGCAGGGTGAGCCAACTGCCCATTGCTGCCCCTGGCCGCCCCCAGCCACCCCCTGCTCCTGGCGCTCCCACCCAGCATGGGCAGAAGGGGGCAGGAGGCTGCCACCCAGCAGGGGGTCAGGTGCACTTTTTTAAAAAGAAGTTCTCTTGGTCACGTCCTAAAAGTGACCAGCTCCCTGTGGCCCAGTCAGAATCTCAGCCTGAGGACGGTGTTGGCTTCGGCAGCCCCGAGATACATCAGAGGGTGGGCACGCTCCTCCCTCCACTCGCCCCTCAAACAAATGCCCCGCAGCCCATTTCTCCACCCTCATTTGATGACCGCAGATTCAAGTGTTTTGTTAAGTAAAGTCCTGGGTGACCTGGGGTCACAGGGTGCCCCACGCTGCCTGCCTCTGGGCGAACACCCCATCACGCCCGGAGGAGGGCGTGGCTGCCTGCCTGAGTGGGCCAGACCCCTGTCGCCAGGCCTCACGGCAGCTCCATAGTCAGGAGATGGGGAAGATGCTGGGGACAGGCCCTGGGGAGAAGTACTGGGATCACCTGTTCAGGCTCCCACTGTGACGCTGCCCCGGGGCGGGGGAAGGAGGTGGGACATGTGGGCGTTGGGGCCTGTAGGTCCACACCCAGTGTGGGTGACCCTCCCTCTAACCTGGGTCCAGCCCGGCTGGAGATGGGTGGGAGTGCGACCTAGGGCTGGCGGGCAGGCGGGCACTGTGTCTCCCTGACTGTGTCCTCCTGTGTCCCTCTGCCTCGCCGCTGTTCCGGAACCTGCTCTGCGCGGC",
|
| 13 |
+
"exons": [
|
| 14 |
+
{
|
| 15 |
+
"start": 0,
|
| 16 |
+
"end": 42
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"start": 221,
|
| 20 |
+
"end": 425
|
| 21 |
+
}
|
| 22 |
+
],
|
| 23 |
"species_id": "homo_sapiens",
|
| 24 |
"common": "human",
|
| 25 |
"color": "#1a1a1a"
|
|
|
|
| 32 |
"strand": 1,
|
| 33 |
"length": 1180,
|
| 34 |
"seq": "ACCAGGCAAGTGTTTGGAAACTGCAGCTTCAGCCCCTCTGGCCATCTGCCTACCCACCCCACCTGGAGACCTTAATGGGCCAAACAGCAAAGTCCAGGGGGCAGAGAGGAGGTACTTTGGACTATAAAGCTGGTGGGCATCCAGTAACCCCCAGCCCTTAGTGACCAGCTATAATCAGAGACCATCAGCAAGCAGGTATGTACTCTCCTCTTTGGGCCTGGCTCCCCAGCCAAGACTCCAGCGACTTTAGGGAGAATGTGGGCTCCTCTCTTACATGGATCTTTTGCTAGCCTCAACCCTGCCTATCTTTCAGGTCATTGTTTCAACATGGCCCTGTTGGTGCACTTCCTACCCCTGCTGGCCCTGCTTGCCCTCTGGGAGCCCAAACCCACCCAGGCTTTTGTCAAACAGCATCTTTGTGGTCCCCACCTGGTAGAGGCTCTCTACCTGGTGTGTGGGGAGCGTGGCTTCTTCTACACACCCAAGTCCCGCCGTGAAGTGGAGGACCCACAAGTGGAACAACTGGAGCTGGGAGGAAGCCCCGGGGACCTTCAGACCTTGGCGTTGGAGGTGGCCCGGCAGAAGCGTGGCATTGTGGATCAGTGCTGCACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAAGGCCCACCTCGACCCGCCCCACCCCTCTGCAATGAATAAAACTTTTGAATAAGCACCAAAAAAAAGAGTTCTATAATGAATGAAAAAGGATTGTGTATATAGACATCTTTTTCTCTGGCATTTATTGTCATGTTAGCATACTATTAAACCATTGTTAGGTTGGATGATTATATAATCATGTATGAAGCTTGTGATAAAACACCAGGAATAATTCAAGTATCTGGAATTCTGCTTCCTGCCCAAGAAGGTAGGCAACCGTGTAAATGCCACTGAAGCTACTAGTCTAAAAGTGAGTTATCTCTGTCTTTGTCTTACCCCCTGATGCTGTGATAAAACCCTGACAAGAGCAACTGACTCCTGAGAGGAAGGTTTATTCTAGCTCACAATTCCAGGTTACAAACAGTCCATCCGTAGCAGGGGAGTCACAGCAACAGGAACCTCAGGGAACTGCTCCTATTATCCCCACAATCAAGAATAGTGACCAATAAATAAGTGGATCTTTTCTCAAAAAAAAAAAAAAAAAAAA",
|
| 35 |
+
"exons": [
|
| 36 |
+
{
|
| 37 |
+
"start": 0,
|
| 38 |
+
"end": 195
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"start": 313,
|
| 42 |
+
"end": 1180
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
"species_id": "mus_musculus",
|
| 46 |
"common": "mouse",
|
| 47 |
"color": "#2c5aa0"
|
|
|
|
| 54 |
"strand": 1,
|
| 55 |
"length": 1200,
|
| 56 |
"seq": "TTGTCTCTTTATGAGAAATCAGCAGGGAGGCCAGCATTGGTGTGAGTGTGTGGATGGAGACAGGCTTCTGGTTATAATTGGTCATTTATTATGACTTTCAAAGCCTGATGAATAAAATATTCCTTTCCTCTTCAGAAGGTCCATTTGCTTCTGTAGTCTTGTTTTCACGTCAAAGGAGCTGAGGGACATAAGATGCCTGATGATAGCTTATTCCTCCCTTGCAACCCCCCCGTGTCTCCTTTGCTTCCTACCTCTAGGCCTCCCCCAGCTCATCATGGCTCTCTGGATCCGATCACTGCCTCTTCTGGCTCTCCTTGTCTTTTCTGGCCCTGGAACCAGCTATGCAGCTGCCAACCAGCACCTCTGTGGCTCCCACTTGGTGGAGGCTCTCTACCTGGTGTGTGGAGAGCGTGGCTTCTTCTACTCCCCCAAAGCCCGACGGGATGTCGAGCAGCCCCTAGGTAAGTCAGTTCGACCATGACTACATTCATATGCTATATGATGCAAAAAGCAACTGTCTATCTTTGATGGTGACACAAGGAATGTCCTTGGTGGGGAATGCCAGGAATACCTTAAACATACCAACAGCATCATATCACCCATGAAAAGATCGTCAGGCTAAAAAGGCAGGTGGGAGGGCAAGCAGGGAAAGGAGATTTATGAGACAGAAGGAATTGTCACAGAAAGCTCCAAATTTTTTGCTACTCTCTTGGTAGAGAGAGGCTGAAAACAGTGTTATTCCAACATTTGCATGGCAATTACTCTCACCTGGGAGTGATCATGAAAAATAAAGGTGGAAGGAACACAAGAAGCCTCTTTCTGCACCTCTTCTTGACCCACACTACCCCAGTCCTGTTCTGTGACCACATCAAGTGTGGTCATAAAAACCTCCTGCCTTCTGAAGCTGTCCATTCTTGTGCTTAAATGACTTTTTCTTGAATGTTTCCTCCTAATATTTAACCCAAATGTATCTTGTCACAAATTAGTCCTCATCTTCTCAGTAGTGGACAAAAGGAACAGTCTATCATTTCTCCTTCATGGGTGATTTTCAAACAGTTTAAAAATTGCTTCCATGTCTTGTTTTTATCTACTGTGAGCTAAAAGCCCTCAACAGCCCCAGAATTCCTTTTTAGGTCACATATTCTAGCTCTCTGTCTACATAAACTGTTCTGCATTTGGCCCATACCATTACGGAATGGT",
|
| 57 |
+
"exons": [
|
| 58 |
+
{
|
| 59 |
+
"start": 0,
|
| 60 |
+
"end": 461
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
"species_id": "gallus_gallus",
|
| 64 |
"common": "chicken",
|
| 65 |
"color": "#c08030"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
}
|
| 67 |
]
|
| 68 |
},
|
|
|
|
| 77 |
"strand": -1,
|
| 78 |
"length": 1200,
|
| 79 |
"seq": "CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGGTAAGCTCCTGACTGAACTTGATGAGTCCTCTCTGAGTCACGGGCTCTCGGCTCCGTGTATTTTCAGCTCGGGAAAATCGCTGGGGCTGGGGGTGGGGCAGTGGGGACTTAGCGAGTTTGGGGGTGAGTGGGATGGAAGCTTGGCTAGAGGGATCATCATAGGAGTTGCATTGTTGGGAGACCTGGGTGTAGATGATGGGGATGTTAGGACCATCCGAACTCAAAGTTGAACGCCTAGGCAGAGGAGTGGAGCTTTGGGGAACCTTGAGCCGGCCTAAAGCGTACTTCTTTGCACATCCACCCGGTGCTGGGCGTAGGGAATCCCTGAAATAAAAGATGCACAAAGCATTGAGGTCTGAGACTTTTGGATCTCGAAACATTGAGAACTCATAGCTGTATATTTTAGAGCCCATGGCATCCTAGTGAAAACTGGGGCTCCATTCCGAAATGATCATTTGGGGGTGATCCGGGGAGCCCAAGCTGCTAAGGTCCCACAACTTCCGGACCTTTGTCCTTCCTGGAGCGATCTTTCCAGGCAGCCCCCGGCTCCGCTAGATGGAGAAAATCCAATTGAAGGCTGTCAGTCGTGGAAGTGAGAAGTGCTAAACCAGGGGTTTGCCCGCCAGGCCGAGGAGGACCGTCGCAATCTGAGAGGCCCGGCAGCCCTGTTATTGTTTGGCTCCACATTTACATTTCTGCCTCTTGCAGCAGCATTTCCGGTTTCTTTTTGCCGGAGCAGCTCACTATTCACCCGATGAGAGGGGAGGAGAGAGAGAGAAAATGTCCTTTAGGCCGGTTCCTCTTACTTGGCAGAGGGAGGCTGCTATTCTCCGCCTGCATTTCTTTTTCTGGATTACTTAGTTATGGCCTTTGCAAAGGCAGGGGTATTTGTTTTGATGCAAACCTCAATCCCTCCCCTTCTTTGAATGGTGTGCCCCACCCCGCGGGTCGCCTGCAACCTAGGCGGACGCTACCATGGCGTGAGACAGGGAGGGAAAGAAGTGTGCAGAAGGCAAGCCCGGAGGTATTTTCAAGAATGAGTATATCTCATCTTCCCGGAGGAAAAAAAAAAAGAATGGGTACGTC",
|
| 80 |
+
"exons": [
|
| 81 |
+
{
|
| 82 |
+
"start": 0,
|
| 83 |
+
"end": 114
|
| 84 |
+
}
|
| 85 |
+
],
|
| 86 |
"species_id": "homo_sapiens",
|
| 87 |
"common": "human",
|
| 88 |
"color": "#1a1a1a"
|
|
|
|
| 95 |
"strand": 1,
|
| 96 |
"length": 1200,
|
| 97 |
"seq": "TTTCCCCTCCCACGTGCTCACCCTGGCTAAAGTTCTGTAGCTTCAGTTCATTGGGACCATCCTGGCTGTAGGTAGCGACTACAGTTAGGGGGCACCTAGCATTCAGGCCCTCATCCTCCTCCTTCCCAGCAGGGTGTCACGCTTCTCCGAAGACTGGGTAAGTAATTGATGAGCGTGACGAGACCTCTCGGTCACTGGCTCTCTCCGTTTGCATCCATAAAACTAGAGAAAACCGTGGGGTTTGGGGGTGGGGCAGTGGGGGGACTCAGCGCGATGGAGATGGGCGGAATGGAAGCTTGGCGGGCGGGATGAACGGGAGTGTATATGTCAGATGCTGTAGTGAGGGTAGCTGATGATGATGATGTTAGGACCGACGAGCCTCACTGTCATGCACCTGCAAAGTAGAGCATATAGGGACCACTGAGATGGCCTAAGGGGTTTTCTCTCCGCTACGCGTTGTACACACTTATCTGCCCGCTGCTAGGTGATGGAAGCTCCGGAAATAACATGCACAAAGCACCAGGATTTAAGATTTTTCGAGATTCATAGCTTAAGACTTAAGACCCCCCATAGCATCCTAATGAAACCCTGGGTTCCGTTCCTGGATGAGATCGGGGTGATCCGGGGAGCCTTAGCTGCTAAGGTCCCGCAACTTCCGGACCTTTGTCCCTGGAGTGATTTCTTTTTTTTTCCAGCCGCTTCTCGACCCTGCTAGATGAAGAAAATCCAAGAAAAGCCTGAAGCACTAGCGGTGCTAGCCAGAAGTATTTGCCCTCGGGGCCCGACTCAGCCTCTTGGTCTGAAAGGCCCGCCGGCCCTGTTATTGTTTGGCTCCTTTACGTTTCTGCCGCTTGCAGGAGCATTTCCGGTTTCTTGTTTTCGGAGCAGATCACTGCTCGCCCGGCGACGGGGGAGTAGCGAAAGGGGAGAAATGGATTCTAGGCTGGTTCTGTGGTTTGAGGAGGAAAACTGCTGTCCTCGACATCTTATTTTTCTGGATTACTTGGTTATTGCTTTTGCAAAGGAGGAGGTGTTTATTTAAAAGAGTGCGCCGATAGGTCGTTTCTTCCTGCCGGAAAAGCAAATTACCGAGTATCCGGTTTTAGGGTGAGCCATTCCCTTGCTTAACGCATTCCGCGCGTCCTGAAAGCGGAAGGGGCGGGCTTGGGCCCCGGGCGCCCGGCGGTCCTAAGTGCAGTC",
|
| 98 |
+
"exons": [
|
| 99 |
+
{
|
| 100 |
+
"start": 0,
|
| 101 |
+
"end": 157
|
| 102 |
+
}
|
| 103 |
+
],
|
| 104 |
"species_id": "mus_musculus",
|
| 105 |
"common": "mouse",
|
| 106 |
"color": "#2c5aa0"
|
|
|
|
| 113 |
"strand": 1,
|
| 114 |
"length": 1200,
|
| 115 |
"seq": "GGCCTGGCCCCCGCACAGCACCTCATCCGGGTGGAGGGGAACCCCCAGGCGCGTTACCACGACGACGAGACCACCAAACGGCCACAGCGTCGTCGTCCCCTATGAGCCCCCCGAGGTACCCCCAAATCACACCCCCCCCATAACCCCCCCCCCCAGACCCCCCTAAAGACCCCCCCAACCCCAATTCCCCCCCCAAGACCCCATAACCTCCCATTGCCATGACCCCCAATCCCCCCCCCATGACCCCATAACCCCTCAATGACCCCCCCCAATGACCCCCCCAATGCCCCCAGCCCCAACCCCCCCCCCCAGTCCCCCCTAAAGACCCCCCCAACCCCAAGACCCCCCACGACCCCATAAGATCCCAATGCCGTGACCCCCCAATCCCCCCCCATGACCCCATAACCCCTCAATGACCCCCCCCAATGACCCCAACCCCAAAATACCCCCCCAGACCCCCCTAAAGACCCCCCCAACCCCAATTCCCCCCCCACGACCCCATAACCTCCCATTGCCATGACCCCCAATCCCCCCCCATGACCCTATAACCCCTCAATGACCCCCCCCCAATGACCCCCAACCCCAAAATACCCCCCCAGACCCCCCCTACAGACCCCCACAACCCCAAGACCCCCCCCCACGAACCCATAACTTCCCAATGCTGTGACCCCCAATCCCCCCCCATGACCCCATAACCCCCCAATGAACCCCAACAGCCCAATGACCCCCAGCCCCAAACCCCCCCCCCAGTCCCCTTTAAGGACCCCCCCCCAACCCCAATTCCCCCCCCACGACCCCATAACCTCCCATTGCCATGACCCCCCATCCCCCCTCATGACCCCATAACCCCTCAATGACCCCCCCCCAATGACCCCCAACCCCAAAATACCCCCCAGACCCCCCTAAAGACCCCCCCAACCCCAAGACCCCCCCCACGACCCCATAACTTCCCAATGCTGTGACCCCCAATCCCCCCCCCATGACCCCATAACCCTTCACTGACCCCCCCCAATGACCCCCAGCCCCAAACCCCCCCCAGTCCCCCTTAAGGACCCCCCCCCCACCCCATAACCCCTCAATGACACCCCCCCCCAATGACCCCCAACCCCAAAATGCCCCCCCTCAGTCCCCCCTAAAGACCCCCCCAACCCCAAGACCCCCCCCCACGACCCCATAACCTCCCATTGCCGTGACCCCCAA",
|
| 116 |
+
"exons": [
|
| 117 |
+
{
|
| 118 |
+
"start": 0,
|
| 119 |
+
"end": 81
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"start": 82,
|
| 123 |
+
"end": 115
|
| 124 |
+
}
|
| 125 |
+
],
|
| 126 |
"species_id": "gallus_gallus",
|
| 127 |
"common": "chicken",
|
| 128 |
"color": "#c08030"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
}
|
| 130 |
]
|
| 131 |
}
|
demo.html
CHANGED
|
@@ -804,14 +804,14 @@ print(f"delta = {delta:+.2f} (less likely if negative)")</code></pre></div>
|
|
| 804 |
<div class="section-num">§4 · Species</div>
|
| 805 |
<div class="section-title">It knows who's who</div>
|
| 806 |
<p class="lede">
|
| 807 |
-
The same gene (insulin, p53) exists in mouse
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
to continue. Each continuation should match <em>that species'</em>
|
| 811 |
-
another species' would.
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
</p>
|
| 816 |
</div>
|
| 817 |
|
|
@@ -860,9 +860,9 @@ print(f"delta = {delta:+.2f} (less likely if negative)")</code></pre></div>
|
|
| 860 |
given and continues in that species' style; identity to that species' reference often
|
| 861 |
runs 65–90% on the next 60 bp. Cut the prefix to 200 and the signal collapses to
|
| 862 |
near-random: a few hundred bases is what it takes to "lock in" on a lineage.
|
| 863 |
-
|
| 864 |
-
is enough
|
| 865 |
-
|
| 866 |
</p>
|
| 867 |
</div>
|
| 868 |
|
|
@@ -892,7 +892,7 @@ def continue_species(species_prefix):
|
|
| 892 |
)
|
| 893 |
return r.choices[0].text
|
| 894 |
|
| 895 |
-
# species_prefixes = { "human": ..., "mouse": ..., "chicken": ...
|
| 896 |
with ThreadPoolExecutor() as pool:
|
| 897 |
results = dict(zip(species_prefixes, pool.map(continue_species, species_prefixes.values())))
|
| 898 |
|
|
|
|
| 804 |
<div class="section-num">§4 · Species</div>
|
| 805 |
<div class="section-title">It knows who's who</div>
|
| 806 |
<p class="lede">
|
| 807 |
+
The same gene (insulin, p53) exists in mouse and chicken, but the surrounding sequence
|
| 808 |
+
has accumulated different mutations along each lineage for hundreds of millions of
|
| 809 |
+
years. For each species we hand Carbon up to ~400 bp leading into the 2nd exon and
|
| 810 |
+
ask it to continue inside the exon. Each continuation should match <em>that species'</em>
|
| 811 |
+
real DNA better than another species' would. The model handles closely-related species
|
| 812 |
+
well (mouse, and even chicken at ~300 My from human); the further you go back
|
| 813 |
+
in evolutionary time, the more the surrounding sequence drifts and the harder this
|
| 814 |
+
setup becomes.
|
| 815 |
</p>
|
| 816 |
</div>
|
| 817 |
|
|
|
|
| 860 |
given and continues in that species' style; identity to that species' reference often
|
| 861 |
runs 65–90% on the next 60 bp. Cut the prefix to 200 and the signal collapses to
|
| 862 |
near-random: a few hundred bases is what it takes to "lock in" on a lineage.
|
| 863 |
+
The gap between mouse and chicken is where you can read the evolutionary signal: even
|
| 864 |
+
with 300+ My of drift since the last common ancestor, a 400 bp prefix still locks
|
| 865 |
+
Carbon in, but chicken's per-base identity sits a notch below mouse's.
|
| 866 |
</p>
|
| 867 |
</div>
|
| 868 |
|
|
|
|
| 892 |
)
|
| 893 |
return r.choices[0].text
|
| 894 |
|
| 895 |
+
# species_prefixes = { "human": ..., "mouse": ..., "chicken": ... }
|
| 896 |
with ThreadPoolExecutor() as pool:
|
| 897 |
results = dict(zip(species_prefixes, pool.map(continue_species, species_prefixes.values())))
|
| 898 |
|
scripts/fetch_species.py
CHANGED
|
@@ -23,7 +23,9 @@ SPECIES = [
|
|
| 23 |
{"id": "homo_sapiens", "common": "human", "color": "#1a1a1a"},
|
| 24 |
{"id": "mus_musculus", "common": "mouse", "color": "#2c5aa0"},
|
| 25 |
{"id": "gallus_gallus", "common": "chicken", "color": "#c08030"},
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
]
|
| 28 |
PREFIX_LEN = 1200 # cap on returned seq length per species (only ~200 will be fed as prompt)
|
| 29 |
|
|
@@ -56,9 +58,11 @@ def fetch_for(symbol, species_id):
|
|
| 56 |
|
| 57 |
chrom = g["seq_region_name"]
|
| 58 |
strand = g["strand"]
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
|
|
|
|
| 62 |
# Cap fetched length so the JSON stays small
|
| 63 |
if t_end - t_start + 1 > PREFIX_LEN:
|
| 64 |
if strand == 1:
|
|
@@ -75,6 +79,24 @@ def fetch_for(symbol, species_id):
|
|
| 75 |
else:
|
| 76 |
seq = revcomp(plus_seq)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
return {
|
| 79 |
"ortholog_symbol": g.get("display_name", symbol),
|
| 80 |
"ensembl_gene": g["id"],
|
|
@@ -83,6 +105,7 @@ def fetch_for(symbol, species_id):
|
|
| 83 |
"strand": strand,
|
| 84 |
"length": len(seq),
|
| 85 |
"seq": seq,
|
|
|
|
| 86 |
}
|
| 87 |
|
| 88 |
|
|
|
|
| 23 |
{"id": "homo_sapiens", "common": "human", "color": "#1a1a1a"},
|
| 24 |
{"id": "mus_musculus", "common": "mouse", "color": "#2c5aa0"},
|
| 25 |
{"id": "gallus_gallus", "common": "chicken", "color": "#c08030"},
|
| 26 |
+
# Zebrafish dropped: ~450 My from human, the model usually can't pick up
|
| 27 |
+
# the lineage from ~400 bp of context and the row looks like noise next
|
| 28 |
+
# to mammals + bird.
|
| 29 |
]
|
| 30 |
PREFIX_LEN = 1200 # cap on returned seq length per species (only ~200 will be fed as prompt)
|
| 31 |
|
|
|
|
| 58 |
|
| 59 |
chrom = g["seq_region_name"]
|
| 60 |
strand = g["strand"]
|
| 61 |
+
t_start_full = ct["start"]
|
| 62 |
+
t_end_full = ct["end"]
|
| 63 |
+
exons_genomic = [(e["start"], e["end"]) for e in ct.get("Exon", [])]
|
| 64 |
|
| 65 |
+
t_start, t_end = t_start_full, t_end_full
|
| 66 |
# Cap fetched length so the JSON stays small
|
| 67 |
if t_end - t_start + 1 > PREFIX_LEN:
|
| 68 |
if strand == 1:
|
|
|
|
| 79 |
else:
|
| 80 |
seq = revcomp(plus_seq)
|
| 81 |
|
| 82 |
+
# Translate exons from genomic coords into 0-based [start, end) offsets in
|
| 83 |
+
# `seq`. For + strand seq[i] = plus pos t_start + i. For - strand seq is
|
| 84 |
+
# revcomp(plus_seq), so seq[i] = plus pos t_end - i. Exons that fall
|
| 85 |
+
# outside the trimmed window are clipped or dropped.
|
| 86 |
+
exons_seq = []
|
| 87 |
+
seq_len = len(seq)
|
| 88 |
+
for e_start, e_end in exons_genomic:
|
| 89 |
+
if strand == 1:
|
| 90 |
+
s = e_start - t_start
|
| 91 |
+
e = e_end - t_start + 1
|
| 92 |
+
else:
|
| 93 |
+
s = t_end - e_end
|
| 94 |
+
e = t_end - e_start + 1
|
| 95 |
+
s = max(0, s); e = min(seq_len, e)
|
| 96 |
+
if e > s:
|
| 97 |
+
exons_seq.append({"start": s, "end": e})
|
| 98 |
+
exons_seq.sort(key=lambda x: x["start"])
|
| 99 |
+
|
| 100 |
return {
|
| 101 |
"ortholog_symbol": g.get("display_name", symbol),
|
| 102 |
"ensembl_gene": g["id"],
|
|
|
|
| 105 |
"strand": strand,
|
| 106 |
"length": len(seq),
|
| 107 |
"seq": seq,
|
| 108 |
+
"exons": exons_seq,
|
| 109 |
}
|
| 110 |
|
| 111 |
|