sreepathi-ravikumar committed on
Commit
eb0f122
·
verified ·
1 Parent(s): cb8ee6b

Update video2.py

Browse files
Files changed (1) hide show
  1. video2.py +237 -41
video2.py CHANGED
@@ -12,6 +12,11 @@ import rust_highlight
12
  import rust_combiner
13
  import shutil
14
  import asyncio
 
 
 
 
 
15
 
16
  # Use /app/data which we created with proper permissions
17
  BASE_DIR = "/app/data"
@@ -24,11 +29,13 @@ CLIPS_DIR = os.path.join(BASE_DIR, "video")
24
  for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
25
  Path(path).mkdir(parents=True, exist_ok=True)
26
 
27
- async def generate_tts(id,lines):
28
- voice = "en-US-GuyNeural"
29
  audio_name = f"audio{id}.mp3"
30
  audio_path = os.path.join(AUDIO_DIR, audio_name)
31
-
 
 
32
  communicate = edge_tts.Communicate(text=lines[id], voice=voice, rate="+0%")
33
  await communicate.save(audio_path)
34
 
@@ -38,44 +45,233 @@ async def generate_tts(id,lines):
38
  return duration, audio_path
39
  return None, None
40
 
41
- def audio_func(id,lines):
42
- return asyncio.run(generate_tts(id,lines))
43
-
44
- # --- CONFIGURATION ---
45
  def video_func(id, lines):
46
  duration, audio_path = audio_func(id, lines)
47
- image_path = os.path.join(IMAGE_DIR, f"slide{id}.png")
48
- img = Image.open(image_path)
49
- data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
50
-
51
- words = []
52
- for i in range(len(data['text'])):
53
- txt = data['text'][i].strip()
54
- if txt and int(data['conf'][i]) > 60:
55
- box = (
56
- data['left'][i],
57
- data['top'][i],
58
- data['width'][i],
59
- data['height'][i],
60
- )
61
- words.append((txt, box))
62
-
63
- clip_file = rust_highlight.render_video(
64
- id=id,
65
- image_path=image_path,
66
- audio_path=audio_path,
67
- duration=duration,
68
- words=words,
69
- output_dir=CLIPS_DIR # Add your output directory here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  )
71
- print(f"Created {clip_file}")
72
-
73
- def video_com(lines):
74
- video_path = f"/tmp/video_{uuid.uuid4().hex}.mp4"
75
- clips = []
76
- for id in range(len(lines)):
77
- clip = f"/app/data/video/clip{id}.mp4"
78
- clips.append(clip)
79
-
80
- video_path = rust_combiner.combine_clips(clips)
81
- return video_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  import rust_combiner
13
  import shutil
14
  import asyncio
15
+ import cv2
16
+ import numpy as np
17
+ import subprocess, shlex, os, time
18
+ # from IPython.display import Video, display, HTML # Commented out for Hugging Face Spaces compatibility
19
+ import math
20
 
21
  # Use /app/data which we created with proper permissions
22
  BASE_DIR = "/app/data"
 
29
  for path in [BASE_DIR, AUDIO_DIR, CLIPS_DIR]:
30
  Path(path).mkdir(parents=True, exist_ok=True)
31
 
32
+ async def generate_tts(id, lines):
33
+ voice = "en-US-JennyNeural"
34
  audio_name = f"audio{id}.mp3"
35
  audio_path = os.path.join(AUDIO_DIR, audio_name)
36
+ #listf = lines.split("&&&")
37
+ #text = listf[0].strip()
38
+ #lang = listf[1].strip()
39
  communicate = edge_tts.Communicate(text=lines[id], voice=voice, rate="+0%")
40
  await communicate.save(audio_path)
41
 
 
45
  return duration, audio_path
46
  return None, None
47
 
48
def audio_func(id, lines):
    """Blocking wrapper around the async TTS generator.

    Drives the `generate_tts` coroutine to completion on a fresh event
    loop and returns its `(duration, audio_path)` result.
    """
    tts_coro = generate_tts(id, lines)
    return asyncio.run(tts_coro)
50
+
 
51
def video_func(id, lines):
    """Render a typewriter-style "handwriting" video for item `id`, then mux it with TTS audio.

    Pipeline:
      1. Generate narration audio via `audio_func` (edge-tts).
      2. Wrap the text to the canvas width, then stream one raw BGR frame per
         animation step into an ffmpeg subprocess (silent video).
      3. Combine the silent video with the audio using MoviePy.

    Returns the path of the final mp4 on success, or None on failure.
    """
    duration, audio_path = audio_func(id, lines)
    if not duration or not audio_path:
        print("Failed to generate audio.")
        return None

    # NOTE(review): `lines.split("&&&")` implies `lines` is a string here, but
    # generate_tts indexes it as `lines[id]` — confirm the caller's actual type.
    listf = lines.split("&&&")
    TEXT = listf[0].strip()

    # When True, whitespace characters would not count as animated glyphs.
    SKIP_SPACES = False

    # --- Rendering configuration ---
    FPS = 30  # Increased for smoother animation
    ANIMATION_FRAMES_PER_CHAR = 3  # Number of sub-frames for pen movement per character
    WIDTH, HEIGHT = 1280, 720  # Keep as is
    MARGIN_X, MARGIN_Y = 40, 60
    LINE_SPACING = 8  # additional px between lines
    FONT = cv2.FONT_HERSHEY_SIMPLEX
    FONT_SCALE = 1.0  # tweak for desired size
    THICKNESS = 2
    TEXT_COLOR = (0, 0, 0)  # BGR
    BG_COLOR = (255, 255, 255)  # BGR
    silent_video_name = f"silent_video{id}.mp4"
    silent_video_path = os.path.join(CLIPS_DIR, silent_video_name)
    FFMPEG_PRESET = "ultrafast"  # fastest encode
    CRF = 23  # For faster encoding
    # Pen settings
    PEN_COLOR = (0, 0, 255)  # Red pen for visibility (BGR)
    PEN_TIP_RADIUS = 5  # Size of pen tip circle
    PEN_LENGTH = 20  # Length of pen line
    PEN_THICKNESS = 2  # Thickness of pen line
    PEN_BASE_ANGLE = 45  # Base angle of pen (degrees)
    PEN_MOVEMENT_AMPLITUDE = 10  # How much the pen moves up/down (pixels)
    # ===================================

    # Helper: wrap text by pixel width using cv2.getTextSize.
    # Preserves blank paragraphs; words wider than a line are broken per character.
    def wrap_text_cv(text, font, font_scale, thickness, max_width):
        wrapped_lines = []
        for para in text.splitlines():
            if para == "":
                wrapped_lines.append("")  # preserve blank line
                continue
            words = para.split(" ")
            cur = ""
            for w in words:
                candidate = w if cur == "" else cur + " " + w
                (w_w, w_h), _ = cv2.getTextSize(candidate, font, font_scale, thickness)
                if w_w <= max_width:
                    cur = candidate
                else:
                    # Current line is full: flush it, then place `w`.
                    if cur != "":
                        wrapped_lines.append(cur)
                    (single_w, _), _ = cv2.getTextSize(w, font, font_scale, thickness)
                    if single_w > max_width:
                        # Word alone exceeds the line: split it character by character.
                        chunk = ""
                        for ch in w:
                            cand2 = chunk + ch
                            (c_w, _), _ = cv2.getTextSize(cand2, font, font_scale, thickness)
                            if c_w <= max_width:
                                chunk = cand2
                            else:
                                wrapped_lines.append(chunk)
                                chunk = ch
                        # The trailing chunk starts the next line.
                        if chunk:
                            cur = chunk
                        else:
                            cur = ""
                    else:
                        cur = w
            if cur != "":
                wrapped_lines.append(cur)
        return wrapped_lines

    # Pre-wrap text
    text_area_width = WIDTH - 2 * MARGIN_X
    wrapped_lines = wrap_text_cv(TEXT, FONT, FONT_SCALE, THICKNESS, text_area_width)
    full_text = "\n".join(wrapped_lines)
    if not full_text:
        full_text = ""  # redundant guard (already falsy-empty), kept as-is

    # Visible indices: which character positions of full_text get an animation step.
    if SKIP_SPACES:
        visible_indices = [i for i, ch in enumerate(full_text) if (ch != ' ' and ch != '\n' and ch != '\t')]
    else:
        visible_indices = list(range(len(full_text)))

    total_glyphs = len(visible_indices)
    print(f"Wrapped lines: {len(wrapped_lines)} lines, total glyphs (counted): {total_glyphs}")

    if total_glyphs == 0:
        print("No text to animate.")
        return None

    # Calculate REPEAT_FRAMES_PER_CHAR to approximate audio duration:
    # stretch the typing animation so total frames ≈ duration * FPS.
    desired_frames = math.ceil(duration * FPS)
    min_frames = total_glyphs * ANIMATION_FRAMES_PER_CHAR
    extra_frames = desired_frames - min_frames
    if extra_frames > 0:
        REPEAT_FRAMES_PER_CHAR = math.floor(extra_frames / total_glyphs)
        remaining_frames = extra_frames % total_glyphs
    else:
        # Animation already at least as long as the audio; no hold frames needed.
        REPEAT_FRAMES_PER_CHAR = 0
        remaining_frames = 0

    # But we'll add remaining as hold at end if needed, but since later we use subclip, it's ok.

    # Pre-calc line heights and y_positions (top y of each wrapped line).
    line_heights = []
    for line in wrapped_lines:
        if line == "":
            # Use a representative glyph pair to give blank lines a nonzero height.
            (w, h), baseline = cv2.getTextSize("Ay", FONT, FONT_SCALE, THICKNESS)
        else:
            (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
        line_heights.append(h + baseline + LINE_SPACING)

    y_positions = []
    y = MARGIN_Y
    for lh in line_heights:
        y_positions.append(y)
        y += lh
    # NOTE(review): no check that the last line stays within HEIGHT — very long
    # text will be drawn off-canvas; confirm inputs are short enough.

    # Prepare ffmpeg: consume raw BGR frames on stdin, encode to H.264.
    # NOTE(review): paths are interpolated into a shell-style string and re-split
    # with shlex — safe only while CLIPS_DIR contains no spaces/quotes.
    ffmpeg_cmd = (
        f'ffmpeg -y '
        f'-f rawvideo -pix_fmt bgr24 -s {WIDTH}x{HEIGHT} -r {FPS} -i - '
        f'-an '
        f'-c:v libx264 -preset {FFMPEG_PRESET} -crf {CRF} -pix_fmt yuv420p '
        f'{silent_video_path}'
    )
    print("FFMPEG CMD:", ffmpeg_cmd)

    proc = subprocess.Popen(shlex.split(ffmpeg_cmd), stdin=subprocess.PIPE, bufsize=10**8)

    # Render function, modified: if pen_x <= 0, no pen.
    # Draws the currently-visible text and (optionally) an animated pen whose tip
    # bobs vertically by a sine of anim_offset in [0, 1).
    def render_frame(visible_text, pen_x, pen_y, anim_offset):
        img = np.full((HEIGHT, WIDTH, 3), BG_COLOR, dtype=np.uint8)
        lines = visible_text.split("\n")  # NOTE: shadows the outer `lines` parameter
        for idx, line in enumerate(lines):
            x = MARGIN_X
            y = y_positions[idx]
            (w, h), baseline = cv2.getTextSize(line, FONT, FONT_SCALE, THICKNESS)
            y_draw = y + h  # putText expects the text baseline, not the top
            if line != "":
                cv2.putText(img, line, (x, y_draw), FONT, FONT_SCALE, TEXT_COLOR, THICKNESS, lineType=cv2.LINE_AA)

        if pen_x > 0:  # Only draw pen if pen_x > 0
            offset_y = int(PEN_MOVEMENT_AMPLITUDE * math.sin(anim_offset * math.pi))
            pen_tip_y = pen_y + offset_y
            angle_rad = math.radians(PEN_BASE_ANGLE)
            pen_end_x = pen_x + int(PEN_LENGTH * math.cos(angle_rad))
            pen_end_y = pen_tip_y - int(PEN_LENGTH * math.sin(angle_rad))
            cv2.line(img, (pen_x, pen_tip_y), (pen_end_x, pen_end_y), PEN_COLOR, PEN_THICKNESS)
            cv2.circle(img, (pen_x, pen_tip_y), PEN_TIP_RADIUS, PEN_COLOR, -1)

        return img

    # --- Main animation loop: one prefix of full_text per visible glyph ---
    t0 = time.time()
    frames_sent = 0
    prev_visible_sub = ""
    last_pen_x = 0
    last_pen_y = 0
    for rank, idx_in_full in enumerate(visible_indices):
        visible_sub = full_text[:idx_in_full + 1]

        if visible_sub != prev_visible_sub:
            # Recompute the pen position: just right of the last visible character.
            lines = visible_sub.split("\n")  # NOTE: shadows the outer `lines` parameter
            last_line = lines[-1]
            line_idx = len(lines) - 1
            (w, h), baseline = cv2.getTextSize(last_line, FONT, FONT_SCALE, THICKNESS)
            pen_x = MARGIN_X + w + 5
            pen_y = y_positions[line_idx] + h // 2
            last_pen_x = pen_x
            last_pen_y = pen_y

        # Sub-frames animating the pen bob for this character.
        for anim_step in range(ANIMATION_FRAMES_PER_CHAR):
            frame_img = render_frame(visible_sub, pen_x, pen_y, anim_step / ANIMATION_FRAMES_PER_CHAR)
            proc.stdin.write(frame_img.tobytes())
            frames_sent += 1

        prev_visible_sub = visible_sub

        # Static hold frames that stretch the animation toward the audio length.
        for r in range(REPEAT_FRAMES_PER_CHAR):
            frame_img = render_frame(visible_sub, pen_x, pen_y, 0)
            proc.stdin.write(frame_img.tobytes())
            frames_sent += 1

    # Add remaining frames as hold with pen (or without, but keep consistent)
    for _ in range(remaining_frames):
        frame_img = render_frame(full_text, last_pen_x, last_pen_y, 0)
        proc.stdin.write(frame_img.tobytes())
        frames_sent += 1

    # To pad if still short (but shouldn't be), but we can skip since approximate.

    # Close stdin so ffmpeg finishes encoding, then wait for it to exit.
    proc.stdin.close()
    proc.wait()
    elapsed = time.time() - t0
    print(f"Frames sent: {frames_sent}, elapsed time: {elapsed:.3f} seconds")

    if not os.path.exists(silent_video_path):
        print("Silent video generation failed.")
        return None

    # Now combine with audio using MoviePy
    final_video_name = f"final_video{id}.mp4"
    final_video_path = os.path.join(CLIPS_DIR, final_video_name)

    # NOTE(review): these clips are never .close()d, leaking ffmpeg reader
    # processes; also set_duration/set_audio is the MoviePy 1.x API — confirm
    # the installed MoviePy version.
    video_clip = VideoFileClip(silent_video_path)
    audio_clip = AudioFileClip(audio_path)

    # Set video duration to exactly match audio (speed up/slow down if necessary, but since we adjusted, should be close)
    # If video longer, subclip to audio duration; if shorter, it will pad silence but since we padded, likely longer or equal.
    final_clip = video_clip.set_duration(duration).set_audio(audio_clip)

    # Write final video
    final_clip.write_videofile(final_video_path, codec='libx264', audio_codec='aac', preset='ultrafast')

    # Print the final video file name
    print(f"Final video saved at: {final_video_path}")

    # For notebook display (comment out if not needed in HF Spaces)
    # if os.path.exists(final_video_path):
    #     display(Video(final_video_path, embed=True, width=WIDTH, height=HEIGHT))

    # Clean up silent video if not needed
    os.remove(silent_video_path)

    return final_video_path