Spaces:
Build error
Build error
Validify-testbot-1
/
botbuilder-python
/libraries
/botbuilder-dialogs
/botbuilder
/dialogs
/choices
/tokenizer.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from typing import List, Optional, Union

from .token import Token
class Tokenizer:
    """Provides a default tokenizer implementation."""

    @staticmethod
    def default_tokenizer(  # pylint: disable=unused-argument
        text: str, locale: Optional[str] = None
    ) -> List[Token]:
        """
        Simple tokenizer that breaks on spaces and punctuation.

        The only normalization performed is lowercasing each token's text.

        Parameters
        ----------
        text: The input text.
        locale: (Optional) Identifies the locale of the input text.
            Currently unused; kept for interface compatibility.

        Returns
        -------
        The list of tokens found, each carrying its inclusive start/end
        character offsets within `text`.
        """
        tokens: List[Token] = []
        token: Union[Token, None] = None

        # Treat None/empty text as zero-length input.
        length: int = len(text) if text else 0
        i: int = 0

        while i < length:
            # In Python 3 a str index yields the complete code point
            # (no surrogate pairs), so text[i] is the whole character.
            char = text[i]
            code_point = ord(char)

            if Tokenizer._is_breaking_char(code_point):
                # Separator character (whitespace/punctuation ranges):
                # close out any in-progress token; the separator itself
                # is not emitted.
                Tokenizer._append_token(tokens, token, i - 1)
                token = None
            elif code_point > 0xFFFF:
                # Character is in a Supplementary Unicode Plane. This is
                # where emoji live, so break each such character out as
                # its own single-character token.
                Tokenizer._append_token(tokens, token, i - 1)
                token = None
                tokens.append(Token(start=i, end=i, text=char, normalized=char))
            elif token is None:
                # Start a new token; end is finalized by _append_token.
                token = Token(start=i, end=0, text=char, normalized=None)
            else:
                # Extend the current token.
                token.text += char
            i += 1

        # Flush any trailing token at end of input.
        Tokenizer._append_token(tokens, token, length - 1)
        return tokens

    @staticmethod
    def _is_breaking_char(code_point: int) -> bool:
        """Return True if the code point is in a token-breaking Unicode range

        (Basic Latin punctuation/controls, Latin-1 punctuation, combining
        marks, general punctuation/symbols, and supplemental punctuation).
        """
        return (
            Tokenizer._is_between(code_point, 0x0000, 0x002F)
            or Tokenizer._is_between(code_point, 0x003A, 0x0040)
            or Tokenizer._is_between(code_point, 0x005B, 0x0060)
            or Tokenizer._is_between(code_point, 0x007B, 0x00BF)
            or Tokenizer._is_between(code_point, 0x02B9, 0x036F)
            or Tokenizer._is_between(code_point, 0x2000, 0x2BFF)
            or Tokenizer._is_between(code_point, 0x2E00, 0x2E7F)
        )

    @staticmethod
    def _is_between(value: int, from_val: int, to_val: int) -> bool:
        """Return True if value lies in the inclusive range [from_val, to_val].

        Parameters
        ----------
        value: number value
        from_val: low end of the range (inclusive)
        to_val: high end of the range (inclusive)
        """
        return from_val <= value <= to_val

    @staticmethod
    def _append_token(tokens: List[Token], token: Optional[Token], end: int) -> None:
        """Finalize `token` (set its end offset and lowercase normalization)

        and append it to `tokens`. A None token (nothing in progress) is a no-op.
        """
        if token is not None:
            token.end = end
            token.normalized = token.text.lower()
            tokens.append(token)