Spaces:
No application file
No application file
/*
 * Lookup table mapping one packed byte to the four bases it encodes.
 *
 * Each base occupies two bits (T = 00, C = 01, A = 10, G = 11); the first
 * base of the group sits in the two most significant bits of the byte, so
 * for example 0x1B (00 01 10 11) decodes to "TCAG".
 *
 * Entries are exactly four characters wide and are NOT NUL-terminated;
 * always copy from them with an explicit length.
 *
 * The 256 entries are generated by nested expansion: every level appends
 * one base (in T, C, A, G order) to the prefix through string-literal
 * concatenation, yielding "TTTT", "TTTC", "TTTA", "TTTG", "TTCT", ...
 */
#define TWOBIT_EXPAND_LAST(prefix) \
    prefix "T", prefix "C", prefix "A", prefix "G"
#define TWOBIT_EXPAND_THIRD(prefix) \
    TWOBIT_EXPAND_LAST(prefix "T"), TWOBIT_EXPAND_LAST(prefix "C"), \
    TWOBIT_EXPAND_LAST(prefix "A"), TWOBIT_EXPAND_LAST(prefix "G")
#define TWOBIT_EXPAND_SECOND(prefix) \
    TWOBIT_EXPAND_THIRD(prefix "T"), TWOBIT_EXPAND_THIRD(prefix "C"), \
    TWOBIT_EXPAND_THIRD(prefix "A"), TWOBIT_EXPAND_THIRD(prefix "G")

static const char bases[][4] = {
    TWOBIT_EXPAND_SECOND("T"), /* 00 xx xx xx */
    TWOBIT_EXPAND_SECOND("C"), /* 01 xx xx xx */
    TWOBIT_EXPAND_SECOND("A"), /* 10 xx xx xx */
    TWOBIT_EXPAND_SECOND("G"), /* 11 xx xx xx */
};

#undef TWOBIT_EXPAND_SECOND
#undef TWOBIT_EXPAND_THIRD
#undef TWOBIT_EXPAND_LAST
| static int | |
| extract(const unsigned char* bytes, uint32_t byteSize, uint32_t start, uint32_t end, char sequence[]) { | |
| uint32_t i; | |
| const uint32_t size = end - start; | |
| const uint32_t byteStart = start / 4; | |
| const uint32_t byteEnd = (end + 3) / 4; | |
| if (byteSize != byteEnd - byteStart) { | |
| PyErr_Format(PyExc_RuntimeError, | |
| "unexpected number of bytes %u (expected %u)", | |
| byteSize, byteEnd - byteStart); | |
| return -1; | |
| } | |
| start -= byteStart * 4; | |
| if (byteStart + 1 == byteEnd) { | |
| /* one byte only */ | |
| memcpy(sequence, &(bases[*bytes][start]), size); | |
| } | |
| else { | |
| end -= byteEnd * 4; | |
| /* end is now a negative number equal to the distance to the byte end */ | |
| memcpy(sequence, &(bases[*bytes][start]), 4 - start); | |
| bytes++; | |
| sequence += (4 - start); | |
| for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4) | |
| memcpy(sequence, bases[*bytes], 4); | |
| memcpy(sequence, bases[*bytes], end + 4); | |
| bytes++; | |
| bytes -= byteSize; | |
| } | |
| return 0; | |
| } | |
| static void | |
| applyNs(char sequence[], uint32_t start, uint32_t end, Py_buffer *nBlocks) | |
| { | |
| const Py_ssize_t nBlockCount = nBlocks->shape[0]; | |
| const uint32_t* const nBlockPositions = nBlocks->buf; | |
| Py_ssize_t i; | |
| for (i = 0; i < nBlockCount; i++) { | |
| uint32_t nBlockStart = nBlockPositions[2*i]; | |
| uint32_t nBlockEnd = nBlockPositions[2*i+1]; | |
| if (nBlockEnd < start) continue; | |
| if (end < nBlockStart) break; | |
| if (nBlockStart < start) nBlockStart = start; | |
| if (end < nBlockEnd) nBlockEnd = end; | |
| memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart); | |
| } | |
| } | |
| static void | |
| applyMask(char sequence[], uint32_t start, uint32_t end, Py_buffer* maskBlocks) | |
| { | |
| const Py_ssize_t maskBlockCount = maskBlocks->shape[0]; | |
| const uint32_t* const maskBlockPositions = maskBlocks->buf; | |
| const char diff = 'a' - 'A'; | |
| Py_ssize_t i; | |
| for (i = 0; i < maskBlockCount; i++) { | |
| uint32_t j; | |
| uint32_t maskBlockStart = maskBlockPositions[2*i]; | |
| uint32_t maskBlockEnd = maskBlockPositions[2*i+1]; | |
| if (maskBlockEnd < start) continue; | |
| if (end < maskBlockStart) break; | |
| if (maskBlockStart < start) maskBlockStart = start; | |
| if (end < maskBlockEnd) maskBlockEnd = end; | |
| for (j = maskBlockStart - start; j < maskBlockEnd - start; j++) | |
| sequence[j] += diff; | |
| } | |
| } | |
| static int | |
| blocks_converter(PyObject* object, void* pointer) | |
| { | |
| const int flag = PyBUF_ND | PyBUF_FORMAT; | |
| Py_buffer *view = pointer; | |
| if (object == NULL) goto exit; | |
| if (PyObject_GetBuffer(object, view, flag) == -1) { | |
| PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format."); | |
| return 0; | |
| } | |
| if (view->itemsize != sizeof(uint32_t) | |
| || (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) { | |
| PyErr_Format(PyExc_RuntimeError, | |
| "blocks have incorrect data type (itemsize %zd, format %s)", | |
| view->itemsize, view->format); | |
| goto exit; | |
| } | |
| if (view->ndim != 2) { | |
| PyErr_Format(PyExc_RuntimeError, | |
| "blocks have incorrect rank %d (expected 2)", view->ndim); | |
| goto exit; | |
| } | |
| if (view->shape[1] != 2) { | |
| PyErr_Format(PyExc_RuntimeError, | |
| "blocks should have two columns (found %zd)", | |
| view->shape[1]); | |
| goto exit; | |
| } | |
| return Py_CLEANUP_SUPPORTED; | |
| exit: | |
| PyBuffer_Release(view); | |
| return 0; | |
| } | |
| static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object"; | |
| static PyObject* | |
| TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords) | |
| { | |
| const unsigned char *data; | |
| Py_ssize_t start; | |
| Py_ssize_t end; | |
| Py_ssize_t step; | |
| Py_ssize_t size; | |
| Py_ssize_t length; | |
| Py_buffer nBlocks; | |
| Py_buffer maskBlocks; | |
| PyObject *object; | |
| char *sequence; | |
| static char* kwlist[] = {"data", "start", "end", "step", | |
| "nBlocks", "maskBlocks", NULL}; | |
| if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist, | |
| &data, &length, &start, &end, &step, | |
| &blocks_converter, &nBlocks, | |
| &blocks_converter, &maskBlocks)) | |
| return NULL; | |
| size = (end - start) / step; | |
| object = PyBytes_FromStringAndSize(NULL, size); | |
| if (!object) goto exit; | |
| sequence = PyBytes_AS_STRING(object); | |
| if (step == 1) { | |
| if (extract(data, length, start, end, sequence) < 0) { | |
| Py_DECREF(object); | |
| object = NULL; | |
| goto exit; | |
| } | |
| applyNs(sequence, start, end, &nBlocks); | |
| applyMask(sequence, start, end, &maskBlocks); | |
| } | |
| else { | |
| Py_ssize_t current, i; | |
| Py_ssize_t full_start, full_end; | |
| char* full_sequence; | |
| if (start <= end) { | |
| full_start = start; | |
| full_end = end; | |
| current = 0; /* first position in sequence */ | |
| } | |
| else { | |
| full_start = end + 1; | |
| full_end = start + 1; | |
| current = start - end - 1; /* last position in sequence */ | |
| } | |
| full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char)); | |
| full_sequence[full_end-full_start] = '\0'; | |
| if (!full_sequence) { | |
| Py_DECREF(object); | |
| object = NULL; | |
| goto exit; | |
| } | |
| if (extract(data, length, full_start, full_end, full_sequence) < 0) { | |
| PyMem_Free(full_sequence); | |
| Py_DECREF(object); | |
| object = NULL; | |
| goto exit; | |
| } | |
| applyNs(full_sequence, full_start, full_end, &nBlocks); | |
| applyMask(full_sequence, full_start, full_end, &maskBlocks); | |
| for (i = 0; i < size; current += step, i++) | |
| sequence[i] = full_sequence[current]; | |
| PyMem_Free(full_sequence); | |
| } | |
| exit: | |
| blocks_converter(NULL, &nBlocks); | |
| blocks_converter(NULL, &maskBlocks); | |
| return object; | |
| } | |
| static struct PyMethodDef _twoBitIO_methods[] = { | |
| {"convert", | |
| (PyCFunction)TwoBit_convert, | |
| METH_VARARGS | METH_KEYWORDS, | |
| TwoBit_convert__doc__ | |
| }, | |
| {NULL, NULL, 0, NULL} /* sentinel */ | |
| }; | |
| static struct PyModuleDef moduledef = { | |
| PyModuleDef_HEAD_INIT, | |
| "_twoBitIO", | |
| "Parser for DNA sequence data in 2bit format", | |
| -1, | |
| _twoBitIO_methods, | |
| NULL, | |
| NULL, | |
| NULL, | |
| NULL | |
| }; | |
| PyObject * | |
| PyInit__twoBitIO(void) | |
| { | |
| return PyModule_Create(&moduledef); | |
| } | |