Spaces:
No application file
No application file
# Copyright 2010 by Tiago Antao. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Large file parsing of Genepop files.

The standard parser loads the whole file into memory. This parser
provides an iterator over data.

Classes:
 - Record - Holds GenePop data.

Functions:
 - read - Parses a GenePop record (file) into a Record object.
 - get_indiv - Parses the line of one individual into name and alleles.

"""
def get_indiv(line):
    """Get individual's data from line.

    Arguments:
     - line - the record of one individual: a name, a single comma, then
       one genotype per locus separated by spaces and/or tabs.

    Returns a tuple ``(indiv_name, allele_list, marker_len)``.
    ``allele_list`` holds one tuple of ints per locus (two alleles for
    diploid data, one for haploid data) and ``marker_len`` is the number
    of digits used per allele (2 or 3).

    Raises ValueError if the line does not contain exactly one comma or
    if a genotype cannot be parsed even as haploid data, and IndexError
    if no genotype follows the comma.
    """
    indiv_name, comma, marker_line = line.partition(",")
    if not comma or "," in marker_line:
        # Fail with an explicit message instead of a tuple-unpacking error.
        raise ValueError(
            "Expected '<name>, <genotypes>' with exactly one comma, got %r"
            % (line,)
        )
    # Tabs and runs of spaces both act as separators; drop the empty pieces.
    markers = [
        marker for marker in marker_line.replace("\t", " ").split(" ") if marker
    ]
    # The width of the first genotype decides the encoding: 2 or 4
    # characters means 2 digits per allele, anything else means 3.
    marker_len = 2 if len(markers[0]) in (2, 4) else 3
    try:
        allele_list = [
            (int(marker[:marker_len]), int(marker[marker_len:]))
            for marker in markers
        ]
    except ValueError:
        # Haploid: the second half of the genotype is not an integer
        # (typically empty), so only the first allele is kept.
        allele_list = [(int(marker[:marker_len]),) for marker in markers]
    return indiv_name, allele_list, marker_len
def read(handle):
    """Parse a handle containing a GenePop file.

    Only the header (comment line and loci names) and the first individual
    after the first "POP" marker are consumed here. The "POP" marker and
    that individual's line are stored in ``record.stack`` so that
    ``Record.data_generator`` can replay them before reading the rest of
    the handle.

    Arguments:
     - handle is a file-like object that contains a GenePop record.

    Returns a Record with ``comment_line``, ``loci_list`` and
    ``marker_len`` filled in.

    Raises StopIteration if the handle has fewer than two lines, and
    ValueError if no "POP" line is found, if no individual follows it, or
    if that individual's line cannot be parsed.
    """
    record = Record(handle)
    record.comment_line = next(handle).rstrip()
    # We can now have one loci per line or all loci in a single line
    # separated by either space or comma+space...
    # We will remove all commas on loci... that should not be a problem
    sample_loci_line = next(handle).rstrip().replace(",", "")
    record.loci_list.extend(sample_loci_line.split(" "))

    # Every further line up to the first "POP" marker names one locus.
    for raw_line in iter(handle.readline, ""):
        line = raw_line.rstrip()
        if line.upper() == "POP":
            record.stack.append("POP")
            break
        record.loci_list.append(line)
    else:
        # End of file reached without a population marker.
        raise ValueError("No 'POP' line found in GenePop file")

    next_line = handle.readline().rstrip()
    if not next_line:
        raise ValueError("No individual found after the first 'POP' line")
    # Peek at the first individual only to learn the allele width; the line
    # is kept on the stack so it is not lost to the data generator.
    record.marker_len = get_indiv(next_line)[2]
    record.stack.append(next_line)
    return record
class Record:
    """Hold information from a GenePop record.

    Members:
     - marker_len     The marker length (2 or 3 digit code per allele).
     - comment_line   Comment line.
     - loci_list      List of loci names.
     - data_generator Iterates over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    data_generator can either be () - an empty tuple - marking a new
    population or an individual. An individual is something like
    ('Ind1', [(1,1), (3,None), (200,201)]).
    In the case above the individual is called Ind1,
    has three diploid loci. For the second loci, one of the alleles
    is unknown.
    """

    def __init__(self, handle):
        """Initialize the class.

        Arguments:
         - handle - iterable of lines from which data_generator reads the
           population data.
        """
        self.handle = handle
        self.marker_len = 0
        self.comment_line = ""
        self.loci_list = []
        # Not filled in by this class; kept as part of the public attributes.
        self.populations = []
        # Lines that data_generator yields before it reads from the handle.
        self.stack = []

    def data_generator(self):
        """Extract population data.

        Yields () for every "POP" line and ``(indiv_name, allele_list)``
        for every individual, first from ``self.stack`` and then from
        ``self.handle``. Alleles coded as 0 are reported as None. Blank
        lines are skipped.
        """
        for source in (self.stack, self.handle):
            for raw_line in source:
                line = raw_line.rstrip()
                if not line:
                    # A blank line (e.g. at the end of the file) carries no
                    # data and cannot be parsed as an individual.
                    continue
                if line.upper() == "POP":
                    yield ()
                    continue
                indiv_name, allele_list, _ = get_indiv(line)
                yield indiv_name, [
                    tuple(None if allele == 0 else allele for allele in locus)
                    for locus in allele_list
                ]