Spaces:
Running
Running
| import fasttext | |
| import re | |
| from collections import Counter | |
# Mapping from fastText class ids (the suffix after "__label__") to
# ISO 639-3 language codes: Southern Sami, Northern Sami, Lule Sami,
# Finnish, Estonian, English.
languages = {
    "0": "sma",
    "1": "sme",
    "2": "smj",
    "3": "fin",
    "4": "est",
    "5": "eng"
}
class WordLid:
    """Per-character language identification using a word-level fastText model.

    Every word in a text gets a language prediction; the most common
    prediction becomes the text's "main" language.  A word is tagged with
    its own best language only when that prediction's probability beats the
    main language's probability *for that word* by at least ``threshold``.
    """

    def __init__(self, model_path):
        """Load the fastText model stored at ``model_path``."""
        self.model = fasttext.load_model(model_path)
        # Margin a word's best language must have over the main language
        # before the word is tagged differently (see get_lang_array).
        self.threshold = 0.5

    def set_threshold(self, threshold):
        """Set the per-word override margin used by ``get_lang_array``."""
        self.threshold = threshold

    def _clean_word(self, word):
        """Lowercase ``word``; strip punctuation, digits and extra whitespace."""
        word = word.lower()
        word = re.sub(r'[^\w\s]', '', word)  # drop punctuation/symbols
        word = re.sub(r'\s+', ' ', word)     # collapse whitespace runs
        word = re.sub(r'\d', '', word)       # drop digits (\w keeps them)
        return word.strip()

    def _predict_all_languages(self, word):
        """Return ``{label: probability}`` over all model classes for ``word``.

        Keys are the raw fastText class ids with the ``__label__`` prefix
        removed (i.e. the keys of the module-level ``languages`` table).
        """
        cleaned_word = self._clean_word(word)
        # Bug fix: removed leftover debug print statements that dumped every
        # prediction to stdout.
        labels, probabilities = self.model.predict(cleaned_word, k=-1)
        return {label.replace('__label__', ''): prob
                for label, prob in zip(labels, probabilities)}

    def _get_main_language(self, text):
        """Return the most frequent per-word language id in ``text``, or None."""
        language_counts = Counter()
        for word in text.split():
            cleaned = self._clean_word(word)
            if not cleaned:
                # Robustness: skip words that clean down to nothing instead
                # of asking the model to predict an empty string.
                continue
            # Bug fix: predict once per word (the original called
            # _predict_all_languages three times per word, and cleaned twice).
            predictions = self._predict_all_languages(cleaned)
            if predictions:
                language_counts[max(predictions, key=predictions.get)] += 1
        return language_counts.most_common(1)[0][0] if language_counts else None

    def get_lang_array(self, text):
        """Return one language id (int) per character of ``text``.

        Characters default to the text's main language; the characters of a
        word are overridden with that word's best language when its
        probability exceeds the main language's probability for the word by
        at least ``self.threshold``.  Returns ``['unk'] * len(text)`` when no
        main language can be determined.
        """
        main_language = self._get_main_language(text)
        if main_language is None:
            return ['unk'] * len(text)
        lang_array = [main_language] * len(text)
        word_start_index = 0
        for word in text.split():
            # Locate this occurrence in the original text so character
            # offsets line up even with repeated words / odd whitespace.
            word_start_index = text.find(word, word_start_index)
            cleaned_word = self._clean_word(word)
            if not cleaned_word:
                word_start_index += len(word)
                continue
            predictions = self._predict_all_languages(cleaned_word)
            if not predictions:
                word_start_index += len(word)
                continue
            best_word_lang = max(predictions, key=predictions.get)
            main_lang_prob_for_word = predictions.get(main_language, 0.0)
            # Bug fix: honour the configurable threshold.  The original
            # hard-coded 0.5 here, which made set_threshold() a no-op even
            # though the demo code calls it expecting an effect.
            if predictions[best_word_lang] >= main_lang_prob_for_word + self.threshold:
                for i in range(len(word)):
                    lang_array[word_start_index + i] = best_word_lang
            word_start_index += len(word)
        # Labels are the numeric fastText class ids ("0".."5"), so they
        # convert cleanly to ints.
        return [int(x) for x in lang_array]
if __name__ == '__main__':
    # Demo: build an identifier and run it over a sample text.
    lid = WordLid('lang_id_model_q.bin')

    samples = [
        "Mumenvákki ođđasamos badji ii gávdno vuos sámegillii. Áigumuššan lea goit dubbet dan maiddái sámegielaide, lohká Yle Draama hoavda Jarmo Lampela."
    ]
    for sample in samples:
        langs = lid.get_lang_array(sample)
        print(f"\nText: '{sample}'")
        print(f"Language Array: {langs}")
        # The array must map one entry to each character of the input.
        assert len(langs) == len(sample), "Length mismatch!"

    # Example of changing the threshold:
    lid.set_threshold(0.8)
    langs = lid.get_lang_array("Bonjour le monde!")
    print(f"\nText: 'Bonjour le monde!' (with threshold 0.8)")
    print(f"Language Array: {langs}")