Spaces:

adirathor07
/

AutoEval

Sleeping

App Files Files Community

AutoEval / doctr /models /recognition /utils.py

adirathor07

added doctr folder

153628e over 1 year ago

raw

history blame contribute delete

3.55 kB

	# Copyright (C) 2021-2024, Mindee.

	# This program is licensed under the Apache License 2.0.
	# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

	from typing import List

	from rapidfuzz.distance import Levenshtein

	__all__ = ["merge_strings", "merge_multi_strings"]


	def merge_strings(a: str, b: str, dil_factor: float) -> str:
	"""Merges 2 character sequences in the best way to maximize the alignment of their overlapping characters.

	Args:
	----
	a: first char seq, suffix should be similar to b's prefix.
	b: second char seq, prefix should be similar to a's suffix.
	dil_factor: dilation factor of the boxes to overlap, should be > 1. This parameter is
	only used when the mother sequence is splitted on a character repetition

	Returns:
	-------
	A merged character sequence.

	Example::
	>>> from doctr.model.recognition.utils import merge_sequences
	>>> merge_sequences('abcd', 'cdefgh', 1.4)
	'abcdefgh'
	>>> merge_sequences('abcdi', 'cdefgh', 1.4)
	'abcdefgh'
	"""
	seq_len = min(len(a), len(b))
	if seq_len == 0: # One sequence is empty, return the other
	return b if len(a) == 0 else a

	# Initialize merging index and corresponding score (mean Levenstein)
	min_score, index = 1.0, 0 # No overlap, just concatenate

	scores = [Levenshtein.distance(a[-i:], b[:i], processor=None) / i for i in range(1, seq_len + 1)]

	# Edge case (split in the middle of char repetitions): if it starts with 2 or more 0
	if len(scores) > 1 and (scores[0], scores[1]) == (0, 0):
	# Compute n_overlap (number of overlapping chars, geometrically determined)
	n_overlap = round(len(b) * (dil_factor - 1) / dil_factor)
	# Find the number of consecutive zeros in the scores list
	# Impossible to have a zero after a non-zero score in that case
	n_zeros = sum(val == 0 for val in scores)
	# Index is bounded by the geometrical overlap to avoid collapsing repetitions
	min_score, index = 0, min(n_zeros, n_overlap)

	else: # Common case: choose the min score index
	for i, score in enumerate(scores):
	if score < min_score:
	min_score, index = score, i + 1 # Add one because first index is an overlap of 1 char

	# Merge with correct overlap
	if index == 0:
	return a + b
	return a[:-1] + b[index - 1 :]


	def merge_multi_strings(seq_list: List[str], dil_factor: float) -> str:
	"""Recursively merges consecutive string sequences with overlapping characters.

	Args:
	----
	seq_list: list of sequences to merge. Sequences need to be ordered from left to right.
	dil_factor: dilation factor of the boxes to overlap, should be > 1. This parameter is
	only used when the mother sequence is splitted on a character repetition

	Returns:
	-------
	A merged character sequence

	Example::
	>>> from doctr.model.recognition.utils import merge_multi_sequences
	>>> merge_multi_sequences(['abc', 'bcdef', 'difghi', 'aijkl'], 1.4)
	'abcdefghijkl'
	"""

	def _recursive_merge(a: str, seq_list: List[str], dil_factor: float) -> str:
	# Recursive version of compute_overlap
	if len(seq_list) == 1:
	return merge_strings(a, seq_list[0], dil_factor)
	return _recursive_merge(merge_strings(a, seq_list[0], dil_factor), seq_list[1:], dil_factor)

	return _recursive_merge("", seq_list, dil_factor)