ranjeetsps committed on
Commit
243b86d
·
verified ·
1 Parent(s): 4287e46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -29
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import whisper
3
  from deep_translator import GoogleTranslator
4
  import nltk
5
- from nltk import sent_tokenize
6
  nltk.download('punkt')
7
 
8
  def transcribe_audio(audio, model_name, output_file):
@@ -17,22 +16,13 @@ def translate_transcript(transcript_file, target_language, output_file, max_chun
17
  with open(transcript_file, 'r', encoding='utf-8') as file:
18
  content = file.read()
19
 
20
- sentences = sent_tokenize(content)
21
- translated_chunks = []
22
- current_chunk = ""
23
-
24
- for sentence in sentences:
25
- if len(current_chunk) + len(sentence) < max_chunk_length:
26
- current_chunk += sentence + " "
27
- else:
28
- # Translate the current chunk
29
- translated_chunks.extend(translate_large_text(current_chunk, translator))
30
- # Start a new chunk with the current sentence
31
- current_chunk = sentence + " "
32
 
33
- # Translate the last chunk if it exists
34
- if current_chunk:
35
- translated_chunks.extend(translate_large_text(current_chunk, translator))
 
36
 
37
  # Join all translated chunks into a single string
38
  translated_text = ' '.join(translated_chunks)
@@ -43,26 +33,34 @@ def translate_transcript(transcript_file, target_language, output_file, max_chun
43
 
44
  return translated_text
45
 
46
def translate_large_text(text, translator, max_chunk_length=5000):
    """
    Translate *text* by splitting it into chunks of at most
    ``max_chunk_length`` characters, preferring sentence boundaries.

    Parameters
    ----------
    text : str
        The text to translate.
    translator : object
        Any object exposing ``translate(str) -> str`` (e.g. a
        deep_translator GoogleTranslator instance).
    max_chunk_length : int, optional
        Maximum characters per translation request. Default 5000,
        presumably matching the translation service's size limit —
        TODO confirm against the provider's documentation.

    Returns
    -------
    list[str]
        Translated chunks, in original order.
    """
    chunks = []
    while len(text) > max_chunk_length:
        # Prefer to split at the last sentence boundary inside the window.
        split_at = text[:max_chunk_length].rfind('.')
        if split_at == -1:
            # No period found: fall back to a hard split at the limit
            # instead of raising, so period-free text is still handled.
            split_at = max_chunk_length - 1
        chunk = text[:split_at + 1]
        chunks.append(translator.translate(chunk.strip()))
        text = text[split_at + 1:]
    # Translate the remainder (possibly the whole text when it was short).
    chunks.append(translator.translate(text.strip()))
    return chunks
61
 
62
  # Example usage function
63
- def transcribe(audio, target_language):
64
  transcript_file = "transcript.txt"
65
  translated_file = "translated_file.txt"
 
 
66
  target_language = lang_name_to_code[target_language]
67
 
68
  # Transcribe audio and save the transcript
@@ -90,7 +88,7 @@ lang_name_to_code = {name: code for name, code in top_languages}
90
 
91
  # Gradio interface
92
  demo = gr.Interface(
93
- fn=transcribe,
94
  inputs=[
95
  gr.Audio(type="filepath"),
96
  gr.Dropdown(choices=[lang[0] for lang in top_languages], label="Language")
 
2
  import whisper
3
  from deep_translator import GoogleTranslator
4
  import nltk
 
5
  nltk.download('punkt')
6
 
7
  def transcribe_audio(audio, model_name, output_file):
 
16
  with open(transcript_file, 'r', encoding='utf-8') as file:
17
  content = file.read()
18
 
19
+ # Split content into chunks that attempt to maintain context
20
+ chunks = split_text_into_chunks(content, max_chunk_length)
 
 
 
 
 
 
 
 
 
 
21
 
22
+ translated_chunks = []
23
+ for chunk in chunks:
24
+ # Translate each chunk
25
+ translated_chunks.append(translator.translate(chunk.strip()))
26
 
27
  # Join all translated chunks into a single string
28
  translated_text = ' '.join(translated_chunks)
 
33
 
34
  return translated_text
35
 
36
def split_text_into_chunks(text, max_chunk_length):
    """
    Split *text* into chunks of at most ``max_chunk_length`` characters,
    breaking only at whitespace so words stay intact.

    Splitting on whitespace (rather than ``nltk.word_tokenize``) keeps
    punctuation attached to its word; word_tokenize detaches it
    ("don't" -> "do", "n't"), so rejoining tokens with spaces corrupts
    the text sent to the translator ("hello , world .").

    Parameters
    ----------
    text : str
        The text to split.
    max_chunk_length : int
        Maximum number of characters per chunk. A single word longer
        than the limit still becomes its own (oversized) chunk, as in
        the original implementation.

    Returns
    -------
    list[str]
        Non-empty chunks in original order; empty list for blank input.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) < max_chunk_length:
            current_chunk += word + " "
        else:
            # Current chunk is full: flush it and start a new one.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = word + " "
    # Flush the trailing partial chunk, if any.
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
57
 
58
  # Example usage function
59
+ def transcribe_and_translate(audio, target_language ):
60
  transcript_file = "transcript.txt"
61
  translated_file = "translated_file.txt"
62
+ if not target_language :
63
+ target_language ="English"
64
  target_language = lang_name_to_code[target_language]
65
 
66
  # Transcribe audio and save the transcript
 
88
 
89
  # Gradio interface
90
  demo = gr.Interface(
91
+ fn=transcribe_and_translate,
92
  inputs=[
93
  gr.Audio(type="filepath"),
94
  gr.Dropdown(choices=[lang[0] for lang in top_languages], label="Language")