Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
os.environ["IMAGE_MAGICK_BINARY"] = "/usr/bin/convert"
|
| 3 |
-
import json
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
from faster_whisper import WhisperModel
|
|
@@ -8,59 +7,56 @@ from moviepy import VideoFileClip, TextClip, CompositeVideoClip
|
|
| 8 |
from arabic_reshaper import reshape
|
| 9 |
from bidi.algorithm import get_display
|
| 10 |
|
| 11 |
-
# --- الإعدادات
|
| 12 |
-
FONT_PATH = "arialbd.ttf"
|
| 13 |
model = WhisperModel("large-v3", device="cpu", compute_type="int8")
|
| 14 |
|
| 15 |
-
def process_arabic_text(text
|
| 16 |
-
|
| 17 |
-
reshaped_text = reshape(text)
|
| 18 |
-
# ملاحظة: MoviePy لا يدعم تلوين جزء من النص داخل TextClip واحد بسهولة عبر التاغات
|
| 19 |
-
# لذلك سنعتمد على دمج الكلمات بشكل احترافي
|
| 20 |
-
return reshaped_text
|
| 21 |
|
| 22 |
def step_1_extract_words(video_path):
|
| 23 |
if not video_path:
|
| 24 |
-
return None, "الرجاء رفع
|
| 25 |
-
|
|
|
|
| 26 |
segments, _ = model.transcribe(video_path, word_timestamps=True, language="ar")
|
| 27 |
words_data = []
|
| 28 |
for segment in segments:
|
| 29 |
for word in segment.words:
|
| 30 |
words_data.append([word.word.strip(), round(word.start, 2), round(word.end, 2)])
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 33 |
|
| 34 |
def step_2_render_video(video_path, df_edited):
|
| 35 |
if video_path is None or df_edited is None or df_edited.empty:
|
| 36 |
-
return None, "
|
| 37 |
|
| 38 |
-
output_path = "
|
| 39 |
video = VideoFileClip(video_path)
|
| 40 |
w, h = int(video.w), int(video.h)
|
| 41 |
clips = [video]
|
| 42 |
words_list = df_edited.values.tolist()
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
chunk_size = 3
|
| 46 |
|
| 47 |
for i in range(len(words_list)):
|
| 48 |
-
# تحديد
|
| 49 |
start_chunk = (i // chunk_size) * chunk_size
|
| 50 |
end_chunk = min(start_chunk + chunk_size, len(words_list))
|
| 51 |
current_chunk = words_list[start_chunk:end_chunk]
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
clean_sentence =
|
| 56 |
|
| 57 |
-
# توقيت
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
| 61 |
-
# طبقة الجملة
|
| 62 |
-
|
| 63 |
-
text=clean_sentence
|
| 64 |
font_size=80,
|
| 65 |
color='yellow',
|
| 66 |
stroke_color='black',
|
|
@@ -69,14 +65,13 @@ def step_2_render_video(video_path, df_edited):
|
|
| 69 |
font=FONT_PATH,
|
| 70 |
size=(int(w * 0.9), None),
|
| 71 |
text_align='center'
|
| 72 |
-
).with_start(
|
| 73 |
-
|
| 74 |
-
# 2. طبقة الكلمة النشطة (أبيض) ت
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
text=clean_sentence + "\n ",
|
| 78 |
font_size=80,
|
| 79 |
-
color='white',
|
| 80 |
stroke_color='black',
|
| 81 |
stroke_width=2,
|
| 82 |
method='caption',
|
|
@@ -85,23 +80,29 @@ def step_2_render_video(video_path, df_edited):
|
|
| 85 |
text_align='center'
|
| 86 |
).with_start(float(words_list[i][1])).with_duration(max(0.1, float(words_list[i][2]) - float(words_list[i][1]))).with_position(('center', int(h * 0.75)))
|
| 87 |
|
| 88 |
-
clips.append(
|
| 89 |
-
clips.append(
|
| 90 |
|
| 91 |
final_video = CompositeVideoClip(clips, size=(w, h))
|
| 92 |
final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", fps=video.fps)
|
| 93 |
-
return output_path, "
|
| 94 |
|
| 95 |
-
# --- الواجهة (
|
| 96 |
-
with gr.Blocks(
|
| 97 |
-
gr.Markdown("# 🎬
|
|
|
|
| 98 |
with gr.Row():
|
| 99 |
-
video_in = gr.Video()
|
| 100 |
-
video_out = gr.Video()
|
|
|
|
|
|
|
| 101 |
table = gr.Dataframe(headers=["الكلمة", "البداية", "النهاية"], interactive=True)
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
app.launch()
|
|
|
|
| 1 |
import os
|
| 2 |
os.environ["IMAGE_MAGICK_BINARY"] = "/usr/bin/convert"
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
from faster_whisper import WhisperModel
|
|
|
|
| 7 |
from arabic_reshaper import reshape
|
| 8 |
from bidi.algorithm import get_display
|
| 9 |
|
| 10 |
# --- Settings ---
FONT_PATH = "arialbd.ttf"  # make sure this font file is uploaded to the Hugging Face Space
# Whisper large-v3 on CPU with int8 quantization (lower memory at the cost of speed).
model = WhisperModel("large-v3", device="cpu", compute_type="int8")
| 13 |
|
def process_arabic_text(text):
    """Prepare Arabic text for rendering inside a MoviePy ``TextClip``.

    ``reshape`` joins the Arabic letters into their contextual
    (presentation) forms, and ``get_display`` then reorders the result
    into visual order so an LTR text renderer draws it correctly.
    The file imports ``get_display`` but the previous version never
    applied it, which leaves the shaped text in logical order and makes
    it render reversed.

    The trailing ``"\n "`` is kept so the caption box layout stays
    identical between the two stacked text layers built by
    ``step_2_render_video``.
    """
    return get_display(reshape(text)) + "\n "
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
def step_1_extract_words(video_path):
    """Transcribe the uploaded video and return word-level timestamps.

    Returns a ``(DataFrame, status_message)`` pair. The DataFrame holds
    one row per word: ``[word, start, end]`` with times rounded to two
    decimals, ready for manual editing in the Gradio table.
    """
    if not video_path:
        return None, "الرجاء رفع فيديو أولاً."

    # Word-level Arabic transcription with the large-v3 model.
    segments, _ = model.transcribe(video_path, word_timestamps=True, language="ar")

    rows = [
        [w.word.strip(), round(w.start, 2), round(w.end, 2)]
        for seg in segments
        for w in seg.words
    ]

    df = pd.DataFrame(rows, columns=["الكلمة", "البداية", "النهاية"])
    return df, "تم الاستخراج بنجاح! راجع الجدول ثم اضغط إنتاج."
| 30 |
|
def step_2_render_video(video_path, df_edited):
    """Burn grouped 3-word captions into the video.

    For every word ``i`` two caption layers are composited over the video:

    1. the word's 3-word group rendered in yellow for the whole group's
       duration (background sentence layer);
    2. the same text rendered in white for only word ``i``'s own time
       window (the "active word" highlight).

    Returns ``(output_file_path, status_message)``, or ``(None, error)``
    when the inputs are missing.

    Fix over the previous version: the ``VideoFileClip`` and the final
    ``CompositeVideoClip`` are now closed in a ``finally`` block, so
    repeated renders from the Gradio UI no longer leak ffmpeg reader
    processes / file handles.
    """
    if video_path is None or df_edited is None or df_edited.empty:
        return None, "بيانات ناقصة."

    output_path = "output_pro.mp4"
    video = VideoFileClip(video_path)
    final_video = None
    try:
        w, h = int(video.w), int(video.h)
        clips = [video]
        words_list = df_edited.values.tolist()

        chunk_size = 3  # words per caption group

        for i in range(len(words_list)):
            # Locate the 3-word group (chunk) this word belongs to.
            start_chunk = (i // chunk_size) * chunk_size
            end_chunk = min(start_chunk + chunk_size, len(words_list))
            current_chunk = words_list[start_chunk:end_chunk]

            # Build the group's sentence and shape it for Arabic display.
            sentence = " ".join([str(r[0]) for r in current_chunk])
            clean_sentence = process_arabic_text(sentence)

            # Timing of the whole group.
            c_start = float(current_chunk[0][1])
            c_end = float(current_chunk[-1][2])

            # 1. Sentence layer (yellow) — shown for the group's full duration.
            # NOTE(review): stroke_width/method were hidden in the diff view;
            # reconstructed to mirror the active layer below — confirm.
            bg_clip = TextClip(
                text=clean_sentence,
                font_size=80,
                color='yellow',
                stroke_color='black',
                stroke_width=2,
                method='caption',
                font=FONT_PATH,
                size=(int(w * 0.9), None),
                text_align='center'
            ).with_start(c_start).with_duration(c_end - c_start).with_position(('center', int(h * 0.75)))

            # 2. "Active word" layer (white) — lights up over the sentence
            #    during this word's own time window (min 0.1 s so zero-length
            #    word timings still render).
            active_clip = TextClip(
                text=clean_sentence,
                font_size=80,
                color='white',
                stroke_color='black',
                stroke_width=2,
                method='caption',
                font=FONT_PATH,
                size=(int(w * 0.9), None),
                text_align='center'
            ).with_start(float(words_list[i][1])).with_duration(
                max(0.1, float(words_list[i][2]) - float(words_list[i][1]))
            ).with_position(('center', int(h * 0.75)))

            clips.append(bg_clip)
            clips.append(active_clip)

        final_video = CompositeVideoClip(clips, size=(w, h))
        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac", fps=video.fps)
        return output_path, "تم إنتاج الفيديو بنمط احترافي!"
    finally:
        # Release ffmpeg readers so repeated Gradio calls don't leak
        # subprocesses / file handles.
        if final_video is not None:
            final_video.close()
        video.close()
|
| 89 |
|
# --- Building the UI (fixed the output wiring) ---
with gr.Blocks() as app:
    gr.Markdown("# 🎬 Caption Pro - 3 Words Style")

    with gr.Row():
        video_in = gr.Video(label="Input")
        video_out = gr.Video(label="Output")

    status = gr.Textbox(label="Status")
    table = gr.Dataframe(headers=["الكلمة", "البداية", "النهاية"], interactive=True)

    btn_ex = gr.Button("1. استخراج الكلمات", variant="primary")
    btn_re = gr.Button("2. إنتاج الفيديو", variant="secondary")

    # Wiring note: the `outputs` lists follow the order of each handler's
    # return values — step_1 returns (df, msg), step_2 returns (path, msg).
    btn_ex.click(step_1_extract_words, inputs=[video_in], outputs=[table, status])
    btn_re.click(step_2_render_video, inputs=[video_in, table], outputs=[video_out, status])

app.launch()
|