ranjeetsps committed on
Commit
243b86d
·
verified ·
1 Parent(s): 4287e46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -29
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import whisper
3
  from deep_translator import GoogleTranslator
4
  import nltk
5
- from nltk import sent_tokenize
6
  nltk.download('punkt')
7
 
8
  def transcribe_audio(audio, model_name, output_file):
@@ -17,22 +16,13 @@ def translate_transcript(transcript_file, target_language, output_file, max_chun
17
  with open(transcript_file, 'r', encoding='utf-8') as file:
18
  content = file.read()
19
 
20
- sentences = sent_tokenize(content)
21
- translated_chunks = []
22
- current_chunk = ""
23
-
24
- for sentence in sentences:
25
- if len(current_chunk) + len(sentence) < max_chunk_length:
26
- current_chunk += sentence + " "
27
- else:
28
- # Translate the current chunk
29
- translated_chunks.extend(translate_large_text(current_chunk, translator))
30
- # Start a new chunk with the current sentence
31
- current_chunk = sentence + " "
32
 
33
- # Translate the last chunk if it exists
34
- if current_chunk:
35
- translated_chunks.extend(translate_large_text(current_chunk, translator))
 
36
 
37
  # Join all translated chunks into a single string
38
  translated_text = ' '.join(translated_chunks)
@@ -43,26 +33,34 @@ def translate_transcript(transcript_file, target_language, output_file, max_chun
43
 
44
  return translated_text
45
 
46
def translate_large_text(text, translator, max_chunk_length=5000):
    """
    Translate *text* by splitting it into chunks of at most
    ``max_chunk_length`` characters, preferring sentence boundaries.

    Parameters
    ----------
    text : str
        The text to translate.
    translator : object
        Any object exposing ``translate(str) -> str`` (e.g. a
        deep_translator GoogleTranslator instance).
    max_chunk_length : int, optional
        Maximum characters per translation request. Default 5000,
        presumably matching the translation service's size limit —
        TODO confirm against the provider's documentation.

    Returns
    -------
    list[str]
        Translated chunks, in original order.
    """
    chunks = []
    while len(text) > max_chunk_length:
        # Prefer to split at the last sentence boundary inside the window.
        split_at = text[:max_chunk_length].rfind('.')
        if split_at == -1:
            # No period found: fall back to a hard split at the limit
            # instead of raising, so period-free text is still handled.
            split_at = max_chunk_length - 1
        chunk = text[:split_at + 1]
        chunks.append(translator.translate(chunk.strip()))
        text = text[split_at + 1:]
    # Translate the remainder (possibly the whole text when it was short).
    chunks.append(translator.translate(text.strip()))
    return chunks
61
 
62
  # Example usage function
63
- def transcribe(audio, target_language):
64
  transcript_file = "transcript.txt"
65
  translated_file = "translated_file.txt"
 
 
66
  target_language = lang_name_to_code[target_language]
67
 
68
  # Transcribe audio and save the transcript
@@ -90,7 +88,7 @@ lang_name_to_code = {name: code for name, code in top_languages}
90
 
91
  # Gradio interface
92
  demo = gr.Interface(
93
- fn=transcribe,
94
  inputs=[
95
  gr.Audio(type="filepath"),
96
  gr.Dropdown(choices=[lang[0] for lang in top_languages], label="Language")
 
2
  import whisper
3
  from deep_translator import GoogleTranslator
4
  import nltk
 
5
  nltk.download('punkt')
6
 
7
  def transcribe_audio(audio, model_name, output_file):
 
16
  with open(transcript_file, 'r', encoding='utf-8') as file:
17
  content = file.read()
18
 
19
+ # Split content into chunks that attempt to maintain context
20
+ chunks = split_text_into_chunks(content, max_chunk_length)
 
 
 
 
 
 
 
 
 
 
21
 
22
+ translated_chunks = []
23
+ for chunk in chunks:
24
+ # Translate each chunk
25
+ translated_chunks.append(translator.translate(chunk.strip()))
26
 
27
  # Join all translated chunks into a single string
28
  translated_text = ' '.join(translated_chunks)
 
33
 
34
  return translated_text
35
 
36
def split_text_into_chunks(text, max_chunk_length):
    """
    Split *text* into chunks of at most ``max_chunk_length`` characters,
    breaking only at whitespace so words stay intact.

    Splitting on whitespace (rather than ``nltk.word_tokenize``) keeps
    punctuation attached to its word; word_tokenize detaches it
    ("don't" -> "do", "n't"), so rejoining tokens with spaces corrupts
    the text sent to the translator ("hello , world .").

    Parameters
    ----------
    text : str
        The text to split.
    max_chunk_length : int
        Maximum number of characters per chunk. A single word longer
        than the limit still becomes its own (oversized) chunk, as in
        the original implementation.

    Returns
    -------
    list[str]
        Non-empty chunks in original order; empty list for blank input.
    """
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) < max_chunk_length:
            current_chunk += word + " "
        else:
            # Current chunk is full: flush it and start a new one.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = word + " "
    # Flush the trailing partial chunk, if any.
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
57
 
58
  # Example usage function
59
+ def transcribe_and_translate(audio, target_language ):
60
  transcript_file = "transcript.txt"
61
  translated_file = "translated_file.txt"
62
+ if not target_language :
63
+ target_language ="English"
64
  target_language = lang_name_to_code[target_language]
65
 
66
  # Transcribe audio and save the transcript
 
88
 
89
  # Gradio interface
90
  demo = gr.Interface(
91
+ fn=transcribe_and_translate,
92
  inputs=[
93
  gr.Audio(type="filepath"),
94
  gr.Dropdown(choices=[lang[0] for lang in top_languages], label="Language")