Source: Validify-testbot-1 / botbuilder-python — libraries/botbuilder-dialogs/botbuilder/dialogs/choices/find.py (flagged with a build error)
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from typing import Callable, List, Union

from .choice import Choice
from .find_choices_options import FindChoicesOptions, FindValuesOptions
from .found_choice import FoundChoice
from .found_value import FoundValue
from .model_result import ModelResult
from .sorted_value import SortedValue
from .token import Token
from .tokenizer import Tokenizer
class Find:
    """Contains methods for matching user input against a list of choices."""

    @staticmethod
    def find_choices(
        utterance: str,
        choices: [Union[str, Choice]],
        options: FindChoicesOptions = None,
    ):
        """Matches user input against a list of choices.

        :param utterance: The user's input text to search over.
        :param choices: The choices to match against; plain strings are wrapped
            in :class:`Choice` automatically.
        :param options: Optional settings controlling how matching is performed.
        :return: A list of :class:`ModelResult` objects whose ``resolution`` is a
            :class:`FoundChoice`, sorted by position in the utterance.
        :raises TypeError: If ``choices`` is ``None`` or empty.
        """
        if not choices:
            raise TypeError(
                "Find: choices cannot be None. Must be a [str] or [Choice]."
            )

        opt = options if options else FindChoicesOptions()

        # Normalize list of choices: wrap bare strings in Choice objects.
        choices_list = [
            Choice(value=choice) if isinstance(choice, str) else choice
            for choice in choices
        ]

        # Build up full list of synonyms to search over.
        # - Each entry in the list contains the index of the choice it belongs
        #   to, which will later be used to map the search results back to
        #   their choice.
        synonyms: [SortedValue] = []

        for index, choice in enumerate(choices_list):
            if not opt.no_value:
                synonyms.append(SortedValue(value=choice.value, index=index))

            # FIX: the action title was previously gated on `no_value`, which
            # made the `no_action` option ineffective and wrongly dropped
            # action titles whenever `no_value` was set. The action title is
            # controlled by `no_action` (matches the C#/JS SDK behavior).
            if (
                getattr(choice, "action", False)
                and getattr(choice.action, "title", False)
                and not opt.no_action
            ):
                synonyms.append(SortedValue(value=choice.action.title, index=index))

            if choice.synonyms is not None:
                for synonym in choice.synonyms:
                    synonyms.append(SortedValue(value=synonym, index=index))

        def found_choice_constructor(value_model: ModelResult) -> ModelResult:
            # Map a raw FoundValue result back to its originating Choice.
            choice = choices_list[value_model.resolution.index]

            return ModelResult(
                start=value_model.start,
                end=value_model.end,
                type_name="choice",
                text=value_model.text,
                resolution=FoundChoice(
                    value=choice.value,
                    index=value_model.resolution.index,
                    score=value_model.resolution.score,
                    synonym=value_model.resolution.value,
                ),
            )

        # Find synonyms in utterance and map back to their choices_list entry.
        return list(
            map(
                found_choice_constructor, Find.find_values(utterance, synonyms, options)
            )
        )

    @staticmethod
    def find_values(
        utterance: str, values: List[SortedValue], options: FindValuesOptions = None
    ) -> List[ModelResult]:
        """Searches the utterance for occurrences of each candidate value.

        :param utterance: The text to search over.
        :param values: Candidate values, each tagged with the index of the
            choice it belongs to.
        :param options: Optional settings controlling the search.
        :return: Non-overlapping matches, sorted by position in the utterance.
        """
        # Sort values in descending order by length, so that the longest value
        # is searched over first.
        sorted_values = sorted(
            values, key=lambda sorted_val: len(sorted_val.value), reverse=True
        )

        # Search for each value within the utterance.
        matches: [ModelResult] = []
        opt = options if options else FindValuesOptions()
        tokenizer: Callable[[str, str], List[Token]] = (
            opt.tokenizer if opt.tokenizer else Tokenizer.default_tokenizer
        )
        tokens = tokenizer(utterance, opt.locale)
        max_distance = (
            opt.max_token_distance if opt.max_token_distance is not None else 2
        )

        for entry in sorted_values:
            # Find all matches for a value
            # - To match "last one" in "the last time I chose the last one" we need
            #   to re-search the string starting from the end of the previous match.
            # - The start & end position returned for the match are token positions.
            start_pos = 0
            searched_tokens = tokenizer(entry.value.strip(), opt.locale)

            while start_pos < len(tokens):
                match: Union[ModelResult, None] = Find._match_value(
                    tokens,
                    max_distance,
                    opt,
                    entry.index,
                    entry.value,
                    searched_tokens,
                    start_pos,
                )
                if match is not None:
                    start_pos = match.end + 1
                    matches.append(match)
                else:
                    break

        # Sort matches by score descending
        sorted_matches = sorted(
            matches,
            key=lambda model_result: model_result.resolution.score,
            reverse=True,
        )

        # Filter out duplicate matching indexes and overlapping characters.
        # - The start & end positions are token positions and need to be translated to
        #   character positions before returning. We also need to populate the "text"
        #   field as well.
        results: List[ModelResult] = []
        found_indexes = set()
        used_tokens = set()

        for match in sorted_matches:
            # Apply filters: drop matches for an already-matched value index or
            # matches that reuse any token of a higher-scoring match.
            add = match.resolution.index not in found_indexes
            for i in range(match.start, match.end + 1):
                if i in used_tokens:
                    add = False
                    break

            # Add to results
            if add:
                # Update filter info
                found_indexes.add(match.resolution.index)

                for i in range(match.start, match.end + 1):
                    used_tokens.add(i)

                # Translate start & end to character positions and populate
                # the text field from the original utterance.
                match.start = tokens[match.start].start
                match.end = tokens[match.end].end
                match.text = utterance[match.start : match.end + 1]
                results.append(match)

        # Return the results sorted by position in the utterance
        return sorted(results, key=lambda model_result: model_result.start)

    @staticmethod
    def _match_value(
        source_tokens: List[Token],
        max_distance: int,
        options: FindValuesOptions,
        index: int,
        value: str,
        searched_tokens: List[Token],
        start_pos: int,
    ) -> Union[ModelResult, None]:
        """Attempts to match a single value against the utterance tokens.

        :param source_tokens: Tokenized utterance being searched.
        :param max_distance: Maximum number of tokens allowed between matches.
        :param options: Search options (``allow_partial_matches`` is consulted).
        :param index: Index of the choice this value belongs to.
        :param value: The raw value being searched for.
        :param searched_tokens: Tokenized form of ``value``.
        :param start_pos: Token position to begin searching from.
        :return: A ``ModelResult`` with a ``FoundValue`` resolution and token
            (not character) start/end positions, or ``None`` if no match.
        """
        # Match value to utterance and calculate total deviation.
        # - The tokens are matched in order so "second last" will match in
        #   "the second from last one" but not in "the last from the second one".
        # - The total deviation is a count of the number of tokens skipped in the
        #   match so for the example above the number of tokens matched would be
        #   2 and the total deviation would be 1.
        matched = 0
        total_deviation = 0
        start = -1
        end = -1

        for token in searched_tokens:
            # Find the position of the token in the utterance.
            pos = Find._index_of_token(source_tokens, token, start_pos)
            if pos >= 0:
                # Calculate the distance between the current token's position
                # and the previous token's position.
                distance = pos - start_pos if matched > 0 else 0
                if distance <= max_distance:
                    # Update count of tokens matched and move start pointer to
                    # search for the next token after the current token.
                    matched += 1
                    total_deviation += distance
                    start_pos = pos + 1

                    # Update start & end position that will track the span of
                    # the utterance that's matched.
                    if start < 0:
                        start = pos
                    end = pos

        # Calculate score and format result
        # - The start & end positions and the results text field will be corrected by the caller.
        result: ModelResult = None

        if matched > 0 and (
            matched == len(searched_tokens) or options.allow_partial_matches
        ):
            # Percentage of tokens matched. If matching "second last" in
            # "the second from last one" the completeness would be 1.0 since
            # all tokens were found.
            completeness = matched / len(searched_tokens)

            # Accuracy of the match. The accuracy is reduced by additional tokens
            # occurring in the value that weren't in the utterance. So an utterance
            # of "second last" matched against a value of "second from last" would
            # result in an accuracy of 0.5.
            accuracy = float(matched) / (matched + total_deviation)

            # The final score is simply the completeness multiplied by the accuracy.
            score = completeness * accuracy

            # Format result
            result = ModelResult(
                text="",
                start=start,
                end=end,
                type_name="value",
                resolution=FoundValue(value=value, index=index, score=score),
            )

        return result

    @staticmethod
    def _index_of_token(tokens: List[Token], token: Token, start_pos: int) -> int:
        """Returns the index of the first token at or after ``start_pos`` whose
        normalized form equals ``token``'s, or -1 if not found."""
        for i in range(start_pos, len(tokens)):
            if tokens[i].normalized == token.normalized:
                return i

        return -1