sreepathi-ravikumar committed on
Commit
97fa939
·
verified ·
1 Parent(s): 950be3e

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +208 -56
video2.py CHANGED
@@ -16,6 +16,18 @@ import asyncio
16
  import cv2
17
  import numpy as np
18
  import subprocess, shlex, os, time
 
 
 
 
 
 
 
 
 
 
 
 
19
  # from IPython.display import Video, display, HTML # Commented out for Hugging Face Spaces compatibility
20
  import math
21
  # Use /app/data which we created with proper permissions
@@ -27,70 +39,210 @@ CLIPS_DIR = os.path.join(BASE_DIR, "video")
27
  # Create directories (no chmod needed)
28
  for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
29
  Path(path).mkdir(parents=True, exist_ok=True)
30
- async def generate_tts(id,lines,lang):
31
- voice={
32
- "English": "en-US-JennyNeural",
33
- "Tamil": "ta-IN-PallaviNeural",
34
- "Hindi": "hi-IN-SwaraNeural",
35
- "Malayalam": "ml-IN-SobhanaNeural",
36
- "Kannada": "kn-IN-SapnaNeural",
37
- "Telugu": "te-IN-ShrutiNeural",
38
- "Bengali": "bn-IN-TanishaaNeural",
39
- "Marathi": "mr-IN-AarohiNeural",
40
- "Gujarati": "gu-IN-DhwaniNeural",
41
- "Punjabi": "pa-IN-VaaniNeural",
42
- "Urdu": "ur-IN-GulNeural",
43
- "French": "fr-FR-DeniseNeural",
44
- "German": "de-DE-KatjaNeural",
45
- "Spanish": "es-ES-ElviraNeural",
46
- "Italian": "it-IT-IsabellaNeural",
47
- "Russian": "ru-RU-SvetlanaNeural",
48
- "Japanese": "ja-JP-NanamiNeural",
49
- "Korean": "ko-KR-SunHiNeural",
50
- "Chinese": "zh-CN-XiaoxiaoNeural",
51
- "Arabic": "ar-SA-ZariyahNeural",
52
- "Portuguese": "pt-BR-FranciscaNeural",
53
- "Dutch": "nl-NL-FennaNeural",
54
- "Greek": "el-GR-AthinaNeural",
55
- "Hebrew": "he-IL-HilaNeural",
56
- "Turkish": "tr-TR-EmelNeural",
57
- "Polish": "pl-PL-AgnieszkaNeural",
58
- "Thai": "th-TH-AcharaNeural",
59
- "Vietnamese": "vi-VN-HoaiMyNeural",
60
- "Swedish": "sv-SE-SofieNeural",
61
- "Finnish": "fi-FI-NooraNeural",
62
- "Czech": "cs-CZ-VlastaNeural",
63
- "Hungarian": "hu-HU-NoemiNeural"
64
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  audio_name = f"audio{id}.mp3"
66
  audio_path = os.path.join(AUDIO_DIR, audio_name)
67
- if len(lang)>1:
68
  listf = lang.split("&&&")
69
  text = listf[0].strip()
70
- langvoice = voice[listf[1].strip()]
 
71
  else:
72
- text=lines[id]
73
- communicate = edge_tts.Communicate(text=text, voice=langvoice, rate="+0%")
74
- await communicate.save(audio_path)
 
75
  if os.path.exists(audio_path):
76
  audio = MP3(audio_path)
77
  duration = audio.info.length
78
  return duration, audio_path
79
  return None, None
80
- def audio_func(id,lines,lang):
81
- return asyncio.run(generate_tts(id,lines,lang))
82
- def video_func(id, lines,lang):
83
- duration, audio_path = audio_func(id,lines,lang)
 
 
 
 
 
 
 
84
  if not duration or not audio_path:
85
  print("Failed to generate audio.")
86
  return None
87
- #listf = lines.split("&&&")
88
- #TEXT = listf[0].strip()
89
- TEXT=lines[id]
90
  print("-----------------------------------------------------------------------------")
91
  print(TEXT)
92
  SKIP_SPACES = False
93
-
94
  FPS = 30 # Increased for smoother animation
95
  ANIMATION_FRAMES_PER_CHAR = 3 # Number of sub-frames for pen movement per character
96
  WIDTH, HEIGHT = 1280, 720 # Keep as is
@@ -113,7 +265,7 @@ def video_func(id, lines,lang):
113
  PEN_BASE_ANGLE = 45 # Base angle of pen (degrees)
114
  PEN_MOVEMENT_AMPLITUDE = 10 # How much the pen moves up/down (pixels)
115
  # ===================================
116
-
117
  # Helper: wrap text by pixel width using cv2.getTextSize
118
  def wrap_text_cv(text, font, font_scale, thickness, max_width):
119
  wrapped_lines = []
@@ -161,8 +313,8 @@ def video_func(id, lines,lang):
161
  if SKIP_SPACES:
162
  visible_indices = [i for i, ch in enumerate(full_text) if (ch != ' ' and ch != '\n' and ch != '\t')]
163
  else:
164
- visible_indices = list(range(len(full_text)))
165
-
166
  total_glyphs = len(visible_indices)
167
  print(f"Wrapped lines: {len(wrapped_lines)} lines, total glyphs (counted): {total_glyphs}")
168
  if total_glyphs == 0:
@@ -178,7 +330,7 @@ def video_func(id, lines,lang):
178
  (w, h), baseline = cv2.getTextSize("Ay", FONT, FONT_SCALE, THICKNESS)
179
  else:
180
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
181
- line_heights.append(h + baseline + LINE_SPACING)
182
  y_positions = []
183
  y = MARGIN_Y
184
  for lh in line_heights:
@@ -193,7 +345,7 @@ def video_func(id, lines,lang):
193
  f'{silent_video_path}'
194
  )
195
  print("FFMPEG CMD:", ffmpeg_cmd)
196
-
197
  proc = subprocess.Popen(shlex.split(ffmpeg_cmd), stdin=subprocess.PIPE, bufsize=10**8)
198
  # Render function, modified: if pen_x <= 0, no pen
199
  def render_frame(visible_text, pen_x, pen_y, anim_offset):
@@ -203,9 +355,9 @@ def video_func(id, lines,lang):
203
  x = MARGIN_X
204
  y = y_positions[idx]
205
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
206
- y_draw = y + h
207
  if line != "":
208
- cv2.putText(img, line, (x, y_draw), FONT, FONT_SCALE, TEXT_COLOR, THICKNESS, lineType=cv2.LINE_AA)
209
  if pen_x > 0: # Only draw pen if pen_x > 0
210
  offset_y = int(PEN_MOVEMENT_AMPLITUDE * math.sin(anim_offset * math.pi))
211
  pen_tip_y = pen_y + offset_y
@@ -215,7 +367,7 @@ def video_func(id, lines,lang):
215
  cv2.line(img, (pen_x, pen_tip_y), (pen_end_x, pen_end_y), PEN_COLOR, PEN_THICKNESS)
216
  cv2.circle(img, (pen_x, pen_tip_y), PEN_TIP_RADIUS, PEN_COLOR, -1)
217
  return img
218
-
219
  t0 = time.time()
220
  frames_sent = 0
221
  prev_visible_sub = ""
 
16
  import cv2
17
  import numpy as np
18
  import subprocess, shlex, os, time
19
+ import asyncio
20
+ import nest_asyncio
21
+ from IPython.display import Audio, display
22
+ import edge_tts
23
+ import re
24
+ import html
25
+ import unicodedata
26
+ from pydub import AudioSegment
27
+ from pydub.effects import normalize
28
+ import tempfile
29
+ import os
30
+ import warnings
31
  # from IPython.display import Video, display, HTML # Commented out for Hugging Face Spaces compatibility
32
  import math
33
  # Use /app/data which we created with proper permissions
 
39
  # Create directories (no chmod needed)
40
  for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
41
  Path(path).mkdir(parents=True, exist_ok=True)
42
+ warnings.filterwarnings('ignore')
43
+ nest_asyncio.apply()
44
+ VOICE_EN = "en-IN-NeerjaNeural"
45
+ def clean_text_for_tts(text):
46
+ """Cleans text before TTS so only the spoken words are read."""
47
+ if not text:
48
+ return ""
49
+ text = str(text).strip()
50
+ text = html.unescape(text)
51
+ # Remove URLs
52
+ text = re.sub(r'https?://[^\s<>"\']+', '', text)
53
+ text = re.sub(r'www\.[^\s<>"\']+', '', text)
54
+ # Remove XML/HTML/SSML tags
55
+ text = re.sub(r'<[^>]*>', '', text)
56
+ text = re.sub(r'[<>]', '', text)
57
+ text = re.sub(r'[\{\}\[\]]', '', text)
58
+ # Remove problematic special characters
59
+ text = re.sub(r'[#@$%^&*_+=|\\`~]', '', text)
60
+ # Replace escape sequences
61
+ text = text.replace('\\n', ' ').replace('\\t', ' ').replace('\\r', ' ')
62
+ # Remove unwanted SSML keywords
63
+ for keyword in ['voice', 'speak', 'prosody', 'ssml', 'xmlns']:
64
+ text = re.sub(f'\\b{keyword}\\b', '', text, flags=re.IGNORECASE)
65
+ # Unicode normalization and spacing
66
+ text = unicodedata.normalize('NFKD', text)
67
+ text = re.sub(r'\s+', ' ', text)
68
+ return text.strip()
69
async def generate_safe_audio(text, voice):
    """Synthesize *text* with edge-tts into a temp MP3.

    Returns the temp-file path on success, None when the cleaned text is
    empty or synthesis fails.
    """
    spoken = clean_text_for_tts(text)
    if not spoken:
        return None
    # Reserve a temp file name; edge-tts writes to it after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as handle:
        out_path = handle.name
    try:
        await edge_tts.Communicate(spoken, voice=voice).save(out_path)
    except Exception as e:
        print(f"Error generating audio: {e}")
        return None
    return out_path
84
def smart_text_chunking(text, max_chars=80):
    """Split *text* into natural-length chunks of at most ~max_chars for TTS.

    Splits on sentence endings first, then clause punctuation (,;:),
    and finally packs words greedily for anything still too long.
    """
    text = clean_text_for_tts(text)
    if not text:
        return []

    def split_words(fragment):
        # Greedy word packing: flush the buffer whenever adding the next
        # word would exceed max_chars.
        pieces, buffer = [], ""
        for word in fragment.split():
            if len(buffer + " " + word) <= max_chars:
                buffer = f"{buffer} {word}" if buffer else word
            else:
                if buffer:
                    pieces.append(buffer.strip())
                buffer = word
        if buffer:
            pieces.append(buffer.strip())
        return pieces

    chunks = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_chars:
            chunks.append(sentence)
            continue
        for clause in re.split(r'(?<=[,;:])\s+', sentence):
            clause = clause.strip()
            if not clause:
                continue
            if len(clause) <= max_chars:
                chunks.append(clause)
            else:
                chunks.extend(split_words(clause))
    return [chunk for chunk in chunks if chunk.strip()]
117
async def bilingual_tts_fixed(text, output_file="audio0.mp3", VOICE_TA=None):
    """Generate a single MP3 from *text*, switching voices per chunk.

    When VOICE_TA is a Tamil ("ta-IN") voice, Tamil-script chunks use it
    and all other chunks fall back to VOICE_EN; otherwise every chunk is
    spoken with VOICE_TA (or VOICE_EN when no voice was supplied, so
    edge-tts is never handed a None voice).

    Returns output_file on success, None on failure.
    """
    print("Starting fixed bilingual TTS processing...")
    try:
        chunks = smart_text_chunking(text)
        if not chunks:
            print("Error: No valid text chunks after cleaning")
            return None
        print(f"Processing {len(chunks)} text chunks...")
        audio_files = []
        merged_audio = None
        is_bilingual_tamil = VOICE_TA is not None and "ta-IN" in VOICE_TA
        for i, chunk in enumerate(chunks):
            # Tamil detection: any code point in the Tamil Unicode block.
            is_tamil = any('\u0B80' <= char <= '\u0BFF' for char in chunk)
            if is_bilingual_tamil:
                voice = VOICE_TA if is_tamil else VOICE_EN
            else:
                # Fix: fall back to the English voice when VOICE_TA is None,
                # instead of passing None through to edge-tts.
                voice = VOICE_TA or VOICE_EN
            lang_label = "Tamil" if is_tamil else "English"
            print(f"Chunk {i+1}/{len(chunks)} ({lang_label}): {chunk[:40]}...")
            audio_file = await generate_safe_audio(chunk, voice)
            if not audio_file:
                continue
            audio_files.append(audio_file)
            try:
                segment = AudioSegment.from_file(audio_file)
                segment = normalize(segment)
                # Only strip silence if segment is reasonably long
                if len(segment) > 200:
                    try:
                        segment = segment.strip_silence(silence_len=50, silence_thresh=-40)
                    except Exception as e:
                        print(f" (Info) Skipped strip_silence: {e}")
                if merged_audio is None:
                    merged_audio = segment
                else:
                    # 200 ms pause between chunks for natural pacing.
                    pause = AudioSegment.silent(duration=200)
                    merged_audio += pause + segment
            except Exception as audio_error:
                print(f"Warning: Error processing audio for chunk {i+1}: {audio_error}")
                continue
        if merged_audio is None:
            print("Error: No audio was successfully generated")
            return None
        merged_audio.export(output_file, format="mp3", bitrate="128k")
        print(f"✅ Audio successfully generated: {output_file}")
        # Best-effort cleanup of per-chunk temp files; only filesystem
        # errors are expected here (fix: was a bare except).
        for temp_file in audio_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except OSError:
                pass
        return output_file
    except Exception as main_error:
        print(f"Main error in bilingual TTS: {main_error}")
        return None
172
+ # USAGE EXAMPLE
173
async def run_fixed_tts(text_input, output_file, lang):
    """Thin async entry point for the bilingual TTS pipeline.

    Fix: propagate the result of bilingual_tts_fixed (the output path, or
    None on failure) instead of discarding it, so callers can detect
    whether synthesis actually succeeded.
    """
    return await bilingual_tts_fixed(text_input, output_file, lang)
175
+
176
async def generate_tts(id, lines, lang):
    """Generate TTS audio for segment *id* and return (duration_seconds, path).

    *lang* is either a plain language name (the text is taken from
    lines[id]) or an inline "text&&&Language" pair that carries its own
    text. Raises KeyError for an unrecognized language name.
    Returns (None, None) when no audio file was produced.
    """
    voice = {
        "English": "en-US-JennyNeural",
        "Tamil": "ta-IN-PallaviNeural",
        "Hindi": "hi-IN-SwaraNeural",
        "Malayalam": "ml-IN-SobhanaNeural",
        "Kannada": "kn-IN-SapnaNeural",
        "Telugu": "te-IN-ShrutiNeural",
        "Bengali": "bn-IN-TanishaaNeural",
        "Marathi": "mr-IN-AarohiNeural",
        "Gujarati": "gu-IN-DhwaniNeural",
        "Punjabi": "pa-IN-VaaniNeural",
        "Urdu": "ur-IN-GulNeural",
        "French": "fr-FR-DeniseNeural",
        "German": "de-DE-KatjaNeural",
        "Spanish": "es-ES-ElviraNeural",
        "Italian": "it-IT-IsabellaNeural",
        "Russian": "ru-RU-SvetlanaNeural",
        "Japanese": "ja-JP-NanamiNeural",
        "Korean": "ko-KR-SunHiNeural",
        "Chinese": "zh-CN-XiaoxiaoNeural",
        "Arabic": "ar-SA-ZariyahNeural",
        "Portuguese": "pt-BR-FranciscaNeural",
        "Dutch": "nl-NL-FennaNeural",
        "Greek": "el-GR-AthinaNeural",
        "Hebrew": "he-IL-HilaNeural",
        "Turkish": "tr-TR-EmelNeural",
        "Polish": "pl-PL-AgnieszkaNeural",
        "Thai": "th-TH-AcharaNeural",
        "Vietnamese": "vi-VN-HoaiMyNeural",
        "Swedish": "sv-SE-SofieNeural",
        "Finnish": "fi-FI-NooraNeural",
        "Czech": "cs-CZ-VlastaNeural",
        "Hungarian": "hu-HU-NoemiNeural"
    }
    audio_name = f"audio{id}.mp3"
    audio_path = os.path.join(AUDIO_DIR, audio_name)
    if "&&&" in lang:
        # Inline override form: "<text>&&&<Language name>"
        listf = lang.split("&&&")
        text = listf[0].strip()
        lang_name = listf[1].strip()
        voice_to_use = voice[lang_name]
    else:
        text = lines[id]
        voice_to_use = voice[lang]
    # Fix: await directly instead of loop.run_until_complete(). We are
    # already inside the event loop started by asyncio.run() in
    # audio_func; nesting run_until_complete only worked because of the
    # nest_asyncio patch and is unnecessary here.
    await run_fixed_tts(text, audio_path, voice_to_use)
    if os.path.exists(audio_path):
        audio = MP3(audio_path)
        duration = audio.info.length
        return duration, audio_path
    return None, None
228
def audio_func(id, lines, lang):
    """Blocking entry point: drive the async TTS pipeline to completion."""
    coroutine = generate_tts(id, lines, lang)
    return asyncio.run(coroutine)
230
+ #-----------------------------
231
+ #---------------------------------
232
+ def video_func(id, lines, lang):
233
+ if "&&&" in lang:
234
+ listf = lang.split("&&&")
235
+ TEXT = listf[0].strip()
236
+ else:
237
+ TEXT = lines[id]
238
+ duration, audio_path = audio_func(id, lines, lang)
239
  if not duration or not audio_path:
240
  print("Failed to generate audio.")
241
  return None
 
 
 
242
  print("-----------------------------------------------------------------------------")
243
  print(TEXT)
244
  SKIP_SPACES = False
245
+
246
  FPS = 30 # Increased for smoother animation
247
  ANIMATION_FRAMES_PER_CHAR = 3 # Number of sub-frames for pen movement per character
248
  WIDTH, HEIGHT = 1280, 720 # Keep as is
 
265
  PEN_BASE_ANGLE = 45 # Base angle of pen (degrees)
266
  PEN_MOVEMENT_AMPLITUDE = 10 # How much the pen moves up/down (pixels)
267
  # ===================================
268
+
269
  # Helper: wrap text by pixel width using cv2.getTextSize
270
  def wrap_text_cv(text, font, font_scale, thickness, max_width):
271
  wrapped_lines = []
 
313
  if SKIP_SPACES:
314
  visible_indices = [i for i, ch in enumerate(full_text) if (ch != ' ' and ch != '\n' and ch != '\t')]
315
  else:
316
+ visible_indices = [i for i, ch in enumerate(full_text) if ch != '\n']
317
+
318
  total_glyphs = len(visible_indices)
319
  print(f"Wrapped lines: {len(wrapped_lines)} lines, total glyphs (counted): {total_glyphs}")
320
  if total_glyphs == 0:
 
330
  (w, h), baseline = cv2.getTextSize("Ay", FONT, FONT_SCALE, THICKNESS)
331
  else:
332
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
333
+ line_heights.append(h + LINE_SPACING)
334
  y_positions = []
335
  y = MARGIN_Y
336
  for lh in line_heights:
 
345
  f'{silent_video_path}'
346
  )
347
  print("FFMPEG CMD:", ffmpeg_cmd)
348
+
349
  proc = subprocess.Popen(shlex.split(ffmpeg_cmd), stdin=subprocess.PIPE, bufsize=10**8)
350
  # Render function, modified: if pen_x <= 0, no pen
351
  def render_frame(visible_text, pen_x, pen_y, anim_offset):
 
355
  x = MARGIN_X
356
  y = y_positions[idx]
357
  (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
358
+ y_draw = y + h - baseline
359
  if line != "":
360
+ cv2.putText(img, line, (x, int(y_draw)), FONT, FONT_SCALE, TEXT_COLOR, THICKNESS, lineType=cv2.LINE_AA)
361
  if pen_x > 0: # Only draw pen if pen_x > 0
362
  offset_y = int(PEN_MOVEMENT_AMPLITUDE * math.sin(anim_offset * math.pi))
363
  pen_tip_y = pen_y + offset_y
 
367
  cv2.line(img, (pen_x, pen_tip_y), (pen_end_x, pen_end_y), PEN_COLOR, PEN_THICKNESS)
368
  cv2.circle(img, (pen_x, pen_tip_y), PEN_TIP_RADIUS, PEN_COLOR, -1)
369
  return img
370
+
371
  t0 = time.time()
372
  frames_sent = 0
373
  prev_visible_sub = ""