Spaces:
Sleeping
Sleeping
| from collections import OrderedDict | |
| import re | |
| import json | |
def remove_empty_lines(text):
    """Strip every line and drop all lines that are empty afterwards.

    Parameters
    ----------
    text : non-empty list of str, one entry per line

    Returns
    -------
    list of str
        The stripped lines, in their original order, without any entry that
        is empty (or whitespace-only) after stripping. The input list itself
        is not modified.

    Raises
    ------
    AssertionError
        If ``text`` is empty or is not a ``list``.
    """
    # NOTE: these checks are asserts, so they are skipped when Python runs
    # with -O; they are kept as-is so callers see the same exception type.
    assert (len(text) > 0)
    assert (isinstance(text, list))
    stripped = (line.strip() for line in text)
    # Filter instead of list.remove(""): remove() deletes only the first
    # empty entry, which left later blank lines in the result.
    return [line for line in stripped if line]
class TextGrid(object):
    """Line-oriented parser for TextGrid text made of IntervalTier tiers.

    The constructor consumes the lines in a fixed order: file type, global
    ``xmin``/``xmax``, tier count, then each tier with its intervals.

    Attributes set during parsing
    -----------------------------
    text : list of str
        Input lines after ``remove_empty_lines``.
    line_count : int
        Index into ``text`` of the next line to be parsed.
    file_type : str
        Value captured from the ``File type = "..."`` line.
    xmin, xmax : str
        Global time bounds, kept as the raw captured strings.
    size : int
        Number of tiers.
    tier_list : list of OrderedDict
        One mapping per tier with keys ``idx``, ``class``, ``name``,
        ``xmin``, ``xmax``, ``size`` and ``items``; all captured values are
        strings, and ``items`` is a list of interval mappings.
    """

    def __init__(self, text):
        """Parse ``text`` (a non-empty list of lines) completely.

        Raises
        ------
        ValueError
            If a line does not match the expected pattern, or the input ends
            before parsing is finished.
        NotImplementedError
            If a tier's class is not ``IntervalTier``.
        """
        text = remove_empty_lines(text)
        self.text = text
        self.line_count = 0
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """Match ``pattern`` against the current line and advance the cursor.

        Parameters
        ----------
        pattern : regex with one capture group, matched from the line start
        inc : increment of line count after a successful extraction

        Returns
        -------
        group : the text captured by group 1

        Raises
        ------
        ValueError
            If the current line does not match ``pattern``, or there is no
            current line because the input ended early.
        """
        # A truncated input used to surface as a bare IndexError; report it
        # as the same format error used for every other malformed input.
        if self.line_count >= len(self.text):
            raise ValueError(
                "File format error at line %d: unexpected end of input"
                % self.line_count
            )
        line = self.text[self.line_count]
        match = re.match(pattern, line)
        if match is None:
            raise ValueError(
                "File format error at line %d:%s" % (self.line_count, line)
            )
        self.line_count += inc
        return match.group(1)

    def _get_type(self):
        """Read the file type and skip the line that follows it."""
        # inc=2 steps over one uninspected line (presumably the
        # 'Object class' line - confirm against the input format).
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        """Read the global ``xmin`` and ``xmax`` as strings."""
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        # inc=2 steps over one uninspected line after xmax.
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        """Read the tier count as an int and skip the line that follows it."""
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_interval(self):
        """Parse one interval (index, xmin, xmax, text) at the cursor."""
        item = OrderedDict()
        item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
        item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
        item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
        item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
        return item

    def _get_item_list(self):
        """Parse ``self.size`` tiers and append them to ``self.tier_list``.

        Only supports IntervalTier currently.

        Raises
        ------
        NotImplementedError
            If a tier's class is anything other than ``IntervalTier``.
        ValueError
            On a malformed or truncated tier.
        """
        for _ in range(self.size):
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
            item_list = [self._get_interval() for _ in range(int(tier_size))]

            tier = OrderedDict()
            tier["idx"] = tier_idx
            tier["class"] = tier_class
            tier["name"] = tier_name
            tier["xmin"] = tier_xmin
            tier["xmax"] = tier_xmax
            # Kept as the captured string (unlike self.size) so the JSON
            # output stays the same for existing consumers.
            tier["size"] = tier_size
            tier["items"] = item_list
            self.tier_list.append(tier)

    def toJson(self):
        """Return the parsed data as an indented JSON string.

        Keys are emitted in the order ``file_type``, ``xmin``, ``xmax``,
        ``size``, ``tiers``; non-ASCII characters are written unescaped.
        """
        _json = OrderedDict()
        _json["file_type"] = self.file_type
        _json["xmin"] = self.xmin
        _json["xmax"] = self.xmax
        _json["size"] = self.size
        _json["tiers"] = self.tier_list
        return json.dumps(_json, ensure_ascii=False, indent=2)