#define PY_SSIZE_T_CLEAN #include "Python.h" static const char bases[][4] = {"TTTT", /* 00 00 00 00 */ "TTTC", /* 00 00 00 01 */ "TTTA", /* 00 00 00 10 */ "TTTG", /* 00 00 00 11 */ "TTCT", /* 00 00 01 00 */ "TTCC", /* 00 00 01 01 */ "TTCA", /* 00 00 01 10 */ "TTCG", /* 00 00 01 11 */ "TTAT", /* 00 00 10 00 */ "TTAC", /* 00 00 10 01 */ "TTAA", /* 00 00 10 10 */ "TTAG", /* 00 00 10 11 */ "TTGT", /* 00 00 11 00 */ "TTGC", /* 00 00 11 01 */ "TTGA", /* 00 00 11 10 */ "TTGG", /* 00 00 11 11 */ "TCTT", /* 00 01 00 00 */ "TCTC", /* 00 01 00 01 */ "TCTA", /* 00 01 00 10 */ "TCTG", /* 00 01 00 11 */ "TCCT", /* 00 01 01 00 */ "TCCC", /* 00 01 01 01 */ "TCCA", /* 00 01 01 10 */ "TCCG", /* 00 01 01 11 */ "TCAT", /* 00 01 10 00 */ "TCAC", /* 00 01 10 01 */ "TCAA", /* 00 01 10 10 */ "TCAG", /* 00 01 10 11 */ "TCGT", /* 00 01 11 00 */ "TCGC", /* 00 01 11 01 */ "TCGA", /* 00 01 11 10 */ "TCGG", /* 00 01 11 11 */ "TATT", /* 00 10 00 00 */ "TATC", /* 00 10 00 01 */ "TATA", /* 00 10 00 10 */ "TATG", /* 00 10 00 11 */ "TACT", /* 00 10 01 00 */ "TACC", /* 00 10 01 01 */ "TACA", /* 00 10 01 10 */ "TACG", /* 00 10 01 11 */ "TAAT", /* 00 10 10 00 */ "TAAC", /* 00 10 10 01 */ "TAAA", /* 00 10 10 10 */ "TAAG", /* 00 10 10 11 */ "TAGT", /* 00 10 11 00 */ "TAGC", /* 00 10 11 01 */ "TAGA", /* 00 10 11 10 */ "TAGG", /* 00 10 11 11 */ "TGTT", /* 00 11 00 00 */ "TGTC", /* 00 11 00 01 */ "TGTA", /* 00 11 00 10 */ "TGTG", /* 00 11 00 11 */ "TGCT", /* 00 11 01 00 */ "TGCC", /* 00 11 01 01 */ "TGCA", /* 00 11 01 10 */ "TGCG", /* 00 11 01 11 */ "TGAT", /* 00 11 10 00 */ "TGAC", /* 00 11 10 01 */ "TGAA", /* 00 11 10 10 */ "TGAG", /* 00 11 10 11 */ "TGGT", /* 00 11 11 00 */ "TGGC", /* 00 11 11 01 */ "TGGA", /* 00 11 11 10 */ "TGGG", /* 00 11 11 11 */ "CTTT", /* 01 00 00 00 */ "CTTC", /* 01 00 00 01 */ "CTTA", /* 01 00 00 10 */ "CTTG", /* 01 00 00 11 */ "CTCT", /* 01 00 01 00 */ "CTCC", /* 01 00 01 01 */ "CTCA", /* 01 00 01 10 */ "CTCG", /* 01 00 01 11 */ "CTAT", /* 01 00 10 00 */ "CTAC", /* 01 00 10 01 */ "CTAA", /* 01 00 10 10 */ "CTAG", /* 01 00 10 11 */ "CTGT", /* 01 00 11 00 */ "CTGC", /* 01 00 11 01 */ "CTGA", /* 01 00 11 10 */ "CTGG", /* 01 00 11 11 */ "CCTT", /* 01 01 00 00 */ "CCTC", /* 01 01 00 01 */ "CCTA", /* 01 01 00 10 */ "CCTG", /* 01 01 00 11 */ "CCCT", /* 01 01 01 00 */ "CCCC", /* 01 01 01 01 */ "CCCA", /* 01 01 01 10 */ "CCCG", /* 01 01 01 11 */ "CCAT", /* 01 01 10 00 */ "CCAC", /* 01 01 10 01 */ "CCAA", /* 01 01 10 10 */ "CCAG", /* 01 01 10 11 */ "CCGT", /* 01 01 11 00 */ "CCGC", /* 01 01 11 01 */ "CCGA", /* 01 01 11 10 */ "CCGG", /* 01 01 11 11 */ "CATT", /* 01 10 00 00 */ "CATC", /* 01 10 00 01 */ "CATA", /* 01 10 00 10 */ "CATG", /* 01 10 00 11 */ "CACT", /* 01 10 01 00 */ "CACC", /* 01 10 01 01 */ "CACA", /* 01 10 01 10 */ "CACG", /* 01 10 01 11 */ "CAAT", /* 01 10 10 00 */ "CAAC", /* 01 10 10 01 */ "CAAA", /* 01 10 10 10 */ "CAAG", /* 01 10 10 11 */ "CAGT", /* 01 10 11 00 */ "CAGC", /* 01 10 11 01 */ "CAGA", /* 01 10 11 10 */ "CAGG", /* 01 10 11 11 */ "CGTT", /* 01 11 00 00 */ "CGTC", /* 01 11 00 01 */ "CGTA", /* 01 11 00 10 */ "CGTG", /* 01 11 00 11 */ "CGCT", /* 01 11 01 00 */ "CGCC", /* 01 11 01 01 */ "CGCA", /* 01 11 01 10 */ "CGCG", /* 01 11 01 11 */ "CGAT", /* 01 11 10 00 */ "CGAC", /* 01 11 10 01 */ "CGAA", /* 01 11 10 10 */ "CGAG", /* 01 11 10 11 */ "CGGT", /* 01 11 11 00 */ "CGGC", /* 01 11 11 01 */ "CGGA", /* 01 11 11 10 */ "CGGG", /* 01 11 11 11 */ "ATTT", /* 10 00 00 00 */ "ATTC", /* 10 00 00 01 */ "ATTA", /* 10 00 00 10 */ "ATTG", /* 10 00 00 11 */ "ATCT", /* 10 00 01 00 */ "ATCC", /* 10 00 01 01 */ "ATCA", /* 10 00 01 10 */ "ATCG", /* 10 00 01 11 */ "ATAT", /* 10 00 10 00 */ "ATAC", /* 10 00 10 01 */ "ATAA", /* 10 00 10 10 */ "ATAG", /* 10 00 10 11 */ "ATGT", /* 10 00 11 00 */ "ATGC", /* 10 00 11 01 */ "ATGA", /* 10 00 11 10 */ "ATGG", /* 10 00 11 11 */ "ACTT", /* 10 01 00 00 */ "ACTC", /* 10 01 00 01 */ "ACTA", /* 10 01 00 10 */ "ACTG", /* 10 01 00 11 */ "ACCT", /* 10 01 01 00 */ "ACCC", /* 10 01 01 01 */ "ACCA", /* 10 01 01 10 */ "ACCG", /* 10 01 01 11 */ "ACAT", /* 10 01 10 00 */ "ACAC", /* 10 01 10 01 */ "ACAA", /* 10 01 10 10 */ "ACAG", /* 10 01 10 11 */ "ACGT", /* 10 01 11 00 */ "ACGC", /* 10 01 11 01 */ "ACGA", /* 10 01 11 10 */ "ACGG", /* 10 01 11 11 */ "AATT", /* 10 10 00 00 */ "AATC", /* 10 10 00 01 */ "AATA", /* 10 10 00 10 */ "AATG", /* 10 10 00 11 */ "AACT", /* 10 10 01 00 */ "AACC", /* 10 10 01 01 */ "AACA", /* 10 10 01 10 */ "AACG", /* 10 10 01 11 */ "AAAT", /* 10 10 10 00 */ "AAAC", /* 10 10 10 01 */ "AAAA", /* 10 10 10 10 */ "AAAG", /* 10 10 10 11 */ "AAGT", /* 10 10 11 00 */ "AAGC", /* 10 10 11 01 */ "AAGA", /* 10 10 11 10 */ "AAGG", /* 10 10 11 11 */ "AGTT", /* 10 11 00 00 */ "AGTC", /* 10 11 00 01 */ "AGTA", /* 10 11 00 10 */ "AGTG", /* 10 11 00 11 */ "AGCT", /* 10 11 01 00 */ "AGCC", /* 10 11 01 01 */ "AGCA", /* 10 11 01 10 */ "AGCG", /* 10 11 01 11 */ "AGAT", /* 10 11 10 00 */ "AGAC", /* 10 11 10 01 */ "AGAA", /* 10 11 10 10 */ "AGAG", /* 10 11 10 11 */ "AGGT", /* 10 11 11 00 */ "AGGC", /* 10 11 11 01 */ "AGGA", /* 10 11 11 10 */ "AGGG", /* 10 11 11 11 */ "GTTT", /* 11 00 00 00 */ "GTTC", /* 11 00 00 01 */ "GTTA", /* 11 00 00 10 */ "GTTG", /* 11 00 00 11 */ "GTCT", /* 11 00 01 00 */ "GTCC", /* 11 00 01 01 */ "GTCA", /* 11 00 01 10 */ "GTCG", /* 11 00 01 11 */ "GTAT", /* 11 00 10 00 */ "GTAC", /* 11 00 10 01 */ "GTAA", /* 11 00 10 10 */ "GTAG", /* 11 00 10 11 */ "GTGT", /* 11 00 11 00 */ "GTGC", /* 11 00 11 01 */ "GTGA", /* 11 00 11 10 */ "GTGG", /* 11 00 11 11 */ "GCTT", /* 11 01 00 00 */ "GCTC", /* 11 01 00 01 */ "GCTA", /* 11 01 00 10 */ "GCTG", /* 11 01 00 11 */ "GCCT", /* 11 01 01 00 */ "GCCC", /* 11 01 01 01 */ "GCCA", /* 11 01 01 10 */ "GCCG", /* 11 01 01 11 */ "GCAT", /* 11 01 10 00 */ "GCAC", /* 11 01 10 01 */ "GCAA", /* 11 01 10 10 */ "GCAG", /* 11 01 10 11 */ "GCGT", /* 11 01 11 00 */ "GCGC", /* 11 01 11 01 */ "GCGA", /* 11 01 11 10 */ "GCGG", /* 11 01 11 11 */ "GATT", /* 11 10 00 00 */ "GATC", /* 11 10 00 01 */ "GATA", /* 11 10 00 10 */ "GATG", /* 11 10 00 11 */ "GACT", /* 11 10 01 00 */ "GACC", /* 11 10 01 01 */ "GACA", /* 11 10 01 10 */ "GACG", /* 11 10 01 11 */ "GAAT", /* 11 10 10 00 */ "GAAC", /* 11 10 10 01 */ "GAAA", /* 11 10 10 10 */ "GAAG", /* 11 10 10 11 */ "GAGT", /* 11 10 11 00 */ "GAGC", /* 11 10 11 01 */ "GAGA", /* 11 10 11 10 */ "GAGG", /* 11 10 11 11 */ "GGTT", /* 11 11 00 00 */ "GGTC", /* 11 11 00 01 */ "GGTA", /* 11 11 00 10 */ "GGTG", /* 11 11 00 11 */ "GGCT", /* 11 11 01 00 */ "GGCC", /* 11 11 01 01 */ "GGCA", /* 11 11 01 10 */ "GGCG", /* 11 11 01 11 */ "GGAT", /* 11 11 10 00 */ "GGAC", /* 11 11 10 01 */ "GGAA", /* 11 11 10 10 */ "GGAG", /* 11 11 10 11 */ "GGGT", /* 11 11 11 00 */ "GGGC", /* 11 11 11 01 */ "GGGA", /* 11 11 11 10 */ "GGGG", /* 11 11 11 11 */ }; static int extract(const unsigned char* bytes, uint32_t byteSize, uint32_t start, uint32_t end, char sequence[]) { uint32_t i; const uint32_t size = end - start; const uint32_t byteStart = start / 4; const uint32_t byteEnd = (end + 3) / 4; if (byteSize != byteEnd - byteStart) { PyErr_Format(PyExc_RuntimeError, "unexpected number of bytes %u (expected %u)", byteSize, byteEnd - byteStart); return -1; } start -= byteStart * 4; if (byteStart + 1 == byteEnd) { /* one byte only */ memcpy(sequence, &(bases[*bytes][start]), size); } else { end -= byteEnd * 4; /* end is now a negative number equal to the distance to the byte end */ memcpy(sequence, &(bases[*bytes][start]), 4 - start); bytes++; sequence += (4 - start); for (i = byteStart+1; i < byteEnd-1; i++, bytes++, sequence += 4) memcpy(sequence, bases[*bytes], 4); memcpy(sequence, bases[*bytes], end + 4); bytes++; bytes -= byteSize; } return 0; } static void applyNs(char sequence[], uint32_t start, uint32_t end, Py_buffer *nBlocks) { const Py_ssize_t nBlockCount = nBlocks->shape[0]; const uint32_t* const nBlockPositions = nBlocks->buf; Py_ssize_t i; for (i = 0; i < nBlockCount; i++) { uint32_t nBlockStart = nBlockPositions[2*i]; uint32_t nBlockEnd = nBlockPositions[2*i+1]; if (nBlockEnd < start) continue; if (end < nBlockStart) break; if (nBlockStart < start) nBlockStart = start; if (end < nBlockEnd) nBlockEnd = end; memset(sequence + nBlockStart - start, 'N', nBlockEnd - nBlockStart); } } static void applyMask(char sequence[], uint32_t start, uint32_t end, Py_buffer* maskBlocks) { const Py_ssize_t maskBlockCount = maskBlocks->shape[0]; const uint32_t* const maskBlockPositions = maskBlocks->buf; const char diff = 'a' - 'A'; Py_ssize_t i; for (i = 0; i < maskBlockCount; i++) { uint32_t j; uint32_t maskBlockStart = maskBlockPositions[2*i]; uint32_t maskBlockEnd = maskBlockPositions[2*i+1]; if (maskBlockEnd < start) continue; if (end < maskBlockStart) break; if (maskBlockStart < start) maskBlockStart = start; if (end < maskBlockEnd) maskBlockEnd = end; for (j = maskBlockStart - start; j < maskBlockEnd - start; j++) sequence[j] += diff; } } static int blocks_converter(PyObject* object, void* pointer) { const int flag = PyBUF_ND | PyBUF_FORMAT; Py_buffer *view = pointer; if (object == NULL) goto exit; if (PyObject_GetBuffer(object, view, flag) == -1) { PyErr_SetString(PyExc_RuntimeError, "blocks have unexpected format."); return 0; } if (view->itemsize != sizeof(uint32_t) || (strcmp(view->format, "I") != 0 && strcmp(view->format, "L") != 0 )) { PyErr_Format(PyExc_RuntimeError, "blocks have incorrect data type (itemsize %zd, format %s)", view->itemsize, view->format); goto exit; } if (view->ndim != 2) { PyErr_Format(PyExc_RuntimeError, "blocks have incorrect rank %d (expected 2)", view->ndim); goto exit; } if (view->shape[1] != 2) { PyErr_Format(PyExc_RuntimeError, "blocks should have two columns (found %zd)", view->shape[1]); goto exit; } return Py_CLEANUP_SUPPORTED; exit: PyBuffer_Release(view); return 0; } static char TwoBit_convert__doc__[] = "convert twoBit data to the DNA sequence, apply blocks of N's (representing unknown sequences) and masked (lower case) blocks, and return the sequence as a bytes object"; static PyObject* TwoBit_convert(PyObject* self, PyObject* args, PyObject* keywords) { const unsigned char *data; Py_ssize_t start; Py_ssize_t end; Py_ssize_t step; Py_ssize_t size; Py_ssize_t length; Py_buffer nBlocks; Py_buffer maskBlocks; PyObject *object; char *sequence; static char* kwlist[] = {"data", "start", "end", "step", "nBlocks", "maskBlocks", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywords, "y#nnnO&O&", kwlist, &data, &length, &start, &end, &step, &blocks_converter, &nBlocks, &blocks_converter, &maskBlocks)) return NULL; size = (end - start) / step; object = PyBytes_FromStringAndSize(NULL, size); if (!object) goto exit; sequence = PyBytes_AS_STRING(object); if (step == 1) { if (extract(data, length, start, end, sequence) < 0) { Py_DECREF(object); object = NULL; goto exit; } applyNs(sequence, start, end, &nBlocks); applyMask(sequence, start, end, &maskBlocks); } else { Py_ssize_t current, i; Py_ssize_t full_start, full_end; char* full_sequence; if (start <= end) { full_start = start; full_end = end; current = 0; /* first position in sequence */ } else { full_start = end + 1; full_end = start + 1; current = start - end - 1; /* last position in sequence */ } full_sequence = PyMem_Malloc((full_end-full_start+1)*sizeof(char)); full_sequence[full_end-full_start] = '\0'; if (!full_sequence) { Py_DECREF(object); object = NULL; goto exit; } if (extract(data, length, full_start, full_end, full_sequence) < 0) { PyMem_Free(full_sequence); Py_DECREF(object); object = NULL; goto exit; } applyNs(full_sequence, full_start, full_end, &nBlocks); applyMask(full_sequence, full_start, full_end, &maskBlocks); for (i = 0; i < size; current += step, i++) sequence[i] = full_sequence[current]; PyMem_Free(full_sequence); } exit: blocks_converter(NULL, &nBlocks); blocks_converter(NULL, &maskBlocks); return object; } static struct PyMethodDef _twoBitIO_methods[] = { {"convert", (PyCFunction)TwoBit_convert, METH_VARARGS | METH_KEYWORDS, TwoBit_convert__doc__ }, {NULL, NULL, 0, NULL} /* sentinel */ }; static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, "_twoBitIO", "Parser for DNA sequence data in 2bit format", -1, _twoBitIO_methods, NULL, NULL, NULL, NULL }; PyObject * PyInit__twoBitIO(void) { return PyModule_Create(&moduledef); }