Spaces:
No application file
No application file
| # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Local Composition Complexity.""" | |
| import math | |
| def lcc_mult(seq, wsize): | |
| """Calculate Local Composition Complexity (LCC) values over sliding window. | |
| Returns a list of floats, the LCC values for a sliding window over | |
| the sequence. | |
| seq - an unambiguous DNA sequence (a string or Seq object) | |
| wsize - window size, integer | |
| The result is the same as applying lcc_simp multiple times, but this | |
| version is optimized for speed. The optimization works by using the | |
| value of previous window as a base to compute the next one. | |
| """ | |
| l4 = math.log(4) | |
| seq = seq.upper() | |
| tamseq = len(seq) | |
| compone = [0] | |
| lccsal = [] | |
| for i in range(wsize): | |
| compone.append(((i + 1) / wsize) * math.log((i + 1) / wsize) / l4) | |
| window = seq[0:wsize] | |
| cant_a = window.count("A") | |
| cant_c = window.count("C") | |
| cant_t = window.count("T") | |
| cant_g = window.count("G") | |
| term_a = compone[cant_a] | |
| term_c = compone[cant_c] | |
| term_t = compone[cant_t] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| tail = seq[0] | |
| for x in range(tamseq - wsize): | |
| window = seq[x + 1 : wsize + x + 1] | |
| if tail == window[-1]: | |
| lccsal.append(lccsal[-1]) | |
| elif tail == "A": | |
| cant_a -= 1 | |
| if window.endswith("C"): | |
| cant_c += 1 | |
| term_a = compone[cant_a] | |
| term_c = compone[cant_c] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("T"): | |
| cant_t += 1 | |
| term_a = compone[cant_a] | |
| term_t = compone[cant_t] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("G"): | |
| cant_g += 1 | |
| term_a = compone[cant_a] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif tail == "C": | |
| cant_c -= 1 | |
| if window.endswith("A"): | |
| cant_a += 1 | |
| term_a = compone[cant_a] | |
| term_c = compone[cant_c] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("T"): | |
| cant_t += 1 | |
| term_c = compone[cant_c] | |
| term_t = compone[cant_t] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("G"): | |
| cant_g += 1 | |
| term_c = compone[cant_c] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif tail == "T": | |
| cant_t -= 1 | |
| if window.endswith("A"): | |
| cant_a += 1 | |
| term_a = compone[cant_a] | |
| term_t = compone[cant_t] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("C"): | |
| cant_c += 1 | |
| term_c = compone[cant_c] | |
| term_t = compone[cant_t] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("G"): | |
| cant_g += 1 | |
| term_t = compone[cant_t] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif tail == "G": | |
| cant_g -= 1 | |
| if window.endswith("A"): | |
| cant_a += 1 | |
| term_a = compone[cant_a] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("C"): | |
| cant_c += 1 | |
| term_c = compone[cant_c] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| elif window.endswith("T"): | |
| cant_t += 1 | |
| term_t = compone[cant_t] | |
| term_g = compone[cant_g] | |
| lccsal.append(-(term_a + term_c + term_t + term_g)) | |
| tail = window[0] | |
| return lccsal | |
| def lcc_simp(seq): | |
| """Calculate Local Composition Complexity (LCC) for a sequence. | |
| seq - an unambiguous DNA sequence (a string or Seq object) | |
| Returns the Local Composition Complexity (LCC) value for the entire | |
| sequence (as a float). | |
| Reference: | |
| Andrzej K Konopka (2005) Sequence Complexity and Composition | |
| https://doi.org/10.1038/npg.els.0005260 | |
| """ | |
| wsize = len(seq) | |
| seq = seq.upper() | |
| l4 = math.log(4) | |
| # Check to avoid calculating the log of 0. | |
| if "A" not in seq: | |
| term_a = 0 | |
| else: | |
| term_a = (seq.count("A") / wsize) * math.log(seq.count("A") / wsize) / l4 | |
| if "C" not in seq: | |
| term_c = 0 | |
| else: | |
| term_c = (seq.count("C") / wsize) * math.log(seq.count("C") / wsize) / l4 | |
| if "T" not in seq: | |
| term_t = 0 | |
| else: | |
| term_t = (seq.count("T") / wsize) * math.log(seq.count("T") / wsize) / l4 | |
| if "G" not in seq: | |
| term_g = 0 | |
| else: | |
| term_g = (seq.count("G") / wsize) * math.log(seq.count("G") / wsize) / l4 | |
| return -(term_a + term_c + term_t + term_g) | |