Update app.py
Browse files
app.py
CHANGED
|
@@ -57,15 +57,7 @@ def check_voice_files():
|
|
| 57 |
else:
|
| 58 |
return "**All voice files are present.** 🎉"
|
| 59 |
|
| 60 |
-
# Initialize Hindi tokenizer
|
| 61 |
-
def load_hindi_tokenizer():
|
| 62 |
-
"""
|
| 63 |
-
Loads a pre-trained Hindi tokenizer from Hugging Face.
|
| 64 |
-
"""
|
| 65 |
-
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=True)
|
| 66 |
-
return tokenizer
|
| 67 |
|
| 68 |
-
hindi_tokenizer = load_hindi_tokenizer()
|
| 69 |
|
| 70 |
# New function to split text into chunks of 100 tokens using the Hindi tokenizer
|
| 71 |
def split_text_into_chunks(text, max_tokens=100, language="en"):
|
|
@@ -74,26 +66,16 @@ def split_text_into_chunks(text, max_tokens=100, language="en"):
|
|
| 74 |
Inserts a newline after each chunk.
|
| 75 |
Uses a specialized tokenizer for Hindi language.
|
| 76 |
"""
|
| 77 |
-
if language == "hi":
|
| 78 |
-
tokens = hindi_tokenizer.tokenize(text)
|
| 79 |
-
else:
|
| 80 |
-
tokens = text.split() # Fallback to simple splitting for other languages
|
| 81 |
|
| 82 |
chunks = []
|
| 83 |
for i in range(0, len(tokens), max_tokens):
|
| 84 |
-
|
| 85 |
-
# Convert tokens back to string for Hindi
|
| 86 |
-
chunk = hindi_tokenizer.convert_tokens_to_string(tokens[i:i + max_tokens])
|
| 87 |
-
else:
|
| 88 |
-
chunk = ' '.join(tokens[i:i + max_tokens])
|
| 89 |
chunks.append(chunk)
|
| 90 |
return '\n'.join(chunks)
|
| 91 |
|
| 92 |
@spaces.GPU(duration=120)
|
| 93 |
def tts_generate(text, voice, language):
|
| 94 |
# Check for Hindi language and split text if necessary
|
| 95 |
-
if language == "hi":
|
| 96 |
-
text = split_text_into_chunks(text, max_tokens=100, language=language)
|
| 97 |
|
| 98 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
| 99 |
temp_audio_path = temp_audio.name
|
|
|
|
| 57 |
else:
|
| 58 |
return "**All voice files are present.** 🎉"
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
|
|
|
| 61 |
|
| 62 |
# New function to split text into chunks of 100 tokens using the Hindi tokenizer
|
| 63 |
def split_text_into_chunks(text, max_tokens=100, language="en"):
|
|
|
|
| 66 |
Inserts a newline after each chunk.
|
| 67 |
Uses a specialized tokenizer for Hindi language.
|
| 68 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
chunks = []
|
| 71 |
for i in range(0, len(tokens), max_tokens):
|
| 72 |
+
chunk = ' '.join(tokens[i:i + max_tokens])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
chunks.append(chunk)
|
| 74 |
return '\n'.join(chunks)
|
| 75 |
|
| 76 |
@spaces.GPU(duration=120)
|
| 77 |
def tts_generate(text, voice, language):
|
| 78 |
# Check for Hindi language and split text if necessary
|
|
|
|
|
|
|
| 79 |
|
| 80 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
|
| 81 |
temp_audio_path = temp_audio.name
|