File size: 2,690 Bytes
9157a3b
 
 
 
 
 
8e03dad
9157a3b
 
8e03dad
9157a3b
8e03dad
4287e46
9157a3b
4287e46
243b86d
8e03dad
4287e46
243b86d
 
 
 
4287e46
 
9157a3b
4287e46
9157a3b
 
243b86d
4287e46
243b86d
4287e46
243b86d
4287e46
243b86d
 
 
 
 
 
 
 
 
 
 
 
 
 
4287e46
 
 
8e03dad
 
 
 
4287e46
8e03dad
 
4287e46
 
8e03dad
4287e46
8e03dad
9157a3b
4287e46
9157a3b
 
 
 
 
 
 
 
 
 
 
 
 
 
4287e46
9157a3b
243b86d
9157a3b
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Third-party dependencies: Gradio (web UI), OpenAI Whisper (speech-to-text),
# deep-translator (Google Translate wrapper), and NLTK (tokenization).
import gradio as gr
import whisper
from deep_translator import GoogleTranslator
import nltk
# Fetch the Punkt tokenizer models required by nltk.word_tokenize at startup.
nltk.download('punkt')

def transcribe_audio(audio, model_name):
    """Run Whisper speech-to-text on an audio file.

    Args:
        audio: Path to the audio file to transcribe.
        model_name: Name of the Whisper checkpoint to load (e.g. "base").

    Returns:
        The transcribed text as a string.
    """
    # Load the requested checkpoint and transcribe in a single pass.
    transcription = whisper.load_model(model_name).transcribe(audio)
    return transcription["text"]

def translate_transcript(transcript_text, target_language, max_chunk_length=5000):
    """Translate a transcript into ``target_language`` via Google Translate.

    The text is split into pieces of at most ``max_chunk_length`` characters
    so each request stays within the translation service's size limit; the
    translated pieces are then rejoined with single spaces.

    Args:
        transcript_text: The full text to translate.
        target_language: Target language code (e.g. "es").
        max_chunk_length: Character budget per translation request.

    Returns:
        The translated text as one string.
    """
    print("Translating into", target_language)
    translator = GoogleTranslator(source='auto', target=target_language)

    # Translate each chunk independently, then stitch the results together.
    translated_pieces = [
        translator.translate(chunk.strip())
        for chunk in split_text_into_chunks(transcript_text, max_chunk_length)
    ]
    return ' '.join(translated_pieces)

def split_text_into_chunks(text, max_chunk_length):
    """Split ``text`` into chunks of at most ~``max_chunk_length`` characters.

    Splits on whitespace only, so each word — including any punctuation
    attached to it — is preserved intact.  The previous implementation
    tokenized with ``nltk.word_tokenize``, which separates punctuation and
    contractions into their own tokens ("don't" -> "do", "n't"); rejoining
    those tokens with spaces mangled the text before it reached the
    translator.  Plain whitespace splitting keeps every word byte-identical.

    Args:
        text: The text to split.
        max_chunk_length: Soft upper bound on chunk size in characters.  A
            single word longer than this still becomes its own chunk.

    Returns:
        A list of non-empty chunk strings; ``[]`` for empty or
        whitespace-only input.
    """
    chunks = []
    current_chunk = ""

    for word in text.split():
        # The length check mirrors the original: the word plus its trailing
        # separator space must keep the chunk under the budget.
        if len(current_chunk) + len(word) < max_chunk_length:
            current_chunk += word + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = word + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

# Main pipeline wired into the Gradio interface below.
def transcribe_and_translate(audio, target_language):
    """Transcribe an audio file with Whisper and translate the transcript.

    Args:
        audio: Path to the audio file (as supplied by the Gradio widget).
        target_language: Display name of the target language (e.g. "Spanish").
            Falls back to English when empty/None.

    Returns:
        The translated transcript text.
    """
    if not target_language:
        target_language = "English"
    # Fall back to English for any name missing from the mapping instead of
    # raising KeyError, which is what the previous direct lookup did.
    target_language_code = lang_name_to_code.get(target_language, "en")

    # Speech-to-text with the small "base" Whisper model.
    transcript_text = transcribe_audio(audio, model_name="base")

    # Translate the transcript into the requested language.
    return translate_transcript(transcript_text, target_language=target_language_code)

# The ten most widely spoken languages as (display name, ISO 639-1 code) pairs.
top_languages = [
    ("English", "en"),
    ("Chinese", "zh"),
    ("Spanish", "es"),
    ("Hindi", "hi"),
    ("Arabic", "ar"),
    ("Portuguese", "pt"),
    ("Bengali", "bn"),
    ("Russian", "ru"),
    ("Japanese", "ja"),
    ("Punjabi", "pa"),
]
# Lookup table from display name to language code.
lang_name_to_code = dict(top_languages)

# Gradio UI: audio upload + target-language dropdown -> translated transcript.
language_choices = [name for name, _ in top_languages]
demo = gr.Interface(
    fn=transcribe_and_translate,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Dropdown(choices=language_choices, label="Language"),
    ],
    outputs="textbox",
)
demo.launch()