Ernie4.5-VL-Video2Coder

Sleeping

App Files Files Community

zhijun.li committed on Nov 21, 2025

Commit

35da597

1 Parent(s): 5f6df4d

Cancel mandatory video agreement

Browse files

Files changed (1) hide show

app.py +33 -27

app.py CHANGED Viewed

@@ -96,8 +96,8 @@ def aggregate_and_generate_webpage(client, summaries):
     return content
 def main_process(video_file, progress=gr.Progress()):
-    # Clean progress bar logic: explicitly call progress()
-    progress(0, desc="Starting...")
     api_key = os.environ.get("ERNIE_API_KEY")
     if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
@@ -114,30 +114,34 @@ def main_process(video_file, progress=gr.Progress()):
     client = OpenAI(api_key=api_key, base_url=BASE_URL)
-    progress(0.1, desc="Step 1/3: Extracting frames...")
     chunks = extract_frames(video_file)
     if not chunks: raise gr.Error("Frame extraction failed.")
-    progress(0.3, desc="Step 2/3: ERNIE Analyzing content...")
     chunk_summaries = {}
-    # Using ThreadPool without tqdm to avoid UI glitches
     with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
         future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
-        total_chunks = len(chunks)
         completed = 0
         for future in as_completed(future_to_chunk):
             idx, summary = future.result()
             if summary: chunk_summaries[idx] = summary
             completed += 1
-            # Smooth progress update from 0.3 to 0.8
-            current_progress = 0.3 + (0.5 * (completed / total_chunks))
-            progress(current_progress, desc=f"Step 2/3: Analyzing segment {completed}/{total_chunks}")
-    progress(0.85, desc="Step 3/3: Synthesizing final code...")
     html_code = aggregate_and_generate_webpage(client, chunk_summaries)
     output_path = "generated_website.html"
@@ -149,46 +153,47 @@ def main_process(video_file, progress=gr.Progress()):
     data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
     iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
-    progress(1.0, desc="Completed!")
-    return iframe_html, output_path, html_code
 # --- UI ---
 with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
-    # --- Header & Description (Goal 3) ---
     gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
-    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=False):
         gr.Markdown("""
         This application is powered by **Baidu ERNIE 4.5 **, a state-of-the-art foundation model with specific enhancements for video understanding:
         *   **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
-        *   **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information, allowing precise understanding of event sequences in videos.
-        *   **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios, ensuring fine-grained details (like small code font on screen) are captured accurately.
-        *   **🚀 Long Context Window**: Supports up to 128k context length, enabling the analysis of longer tutorials and complex logic flows.
         """)
-    gr.Markdown("Upload a frontend coding tutorial video (or try the example below). The AI will watch it, understand the code, and render the result instantly.")
     with gr.Row():
         with gr.Column(scale=1):
-            # --- Input Section ---
-            video_input = gr.Video(label="Upload Video", format="mp4", height=320)
-            # --- Goal 1: Examples Component ---
-            # 用户点击这里的视频，会自动填充到上面的 video_input 中
             gr.Examples(
-                examples=[["sample_demo.mp4"]], # ⚠️ 确保你上传了名为 sample_demo.mp4 的文件
                 inputs=[video_input],
                 label="▶️ Or try this example video:",
-                cache_examples=False # 关闭缓存以节省空间
             )
             submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
         with gr.Column(scale=2):
-            # --- Output Section ---
             with gr.Tabs():
                 with gr.TabItem("🌐 Live Preview (Result)"):
                     html_preview = gr.HTML(label="Rendered Page")
@@ -199,10 +204,11 @@ with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
                 with gr.TabItem("⬇️ Download"):
                      file_download = gr.File(label="Download .html File")
     submit_btn.click(
         fn=main_process,
         inputs=[video_input],
-        outputs=[html_preview, file_download, code_output]
     )
 if __name__ == "__main__":

     return content
 def main_process(video_file, progress=gr.Progress()):
+    # 1. 初始化状态
+    yield "⏳ Initializing...", None, None, None
     api_key = os.environ.get("ERNIE_API_KEY")
     if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
     client = OpenAI(api_key=api_key, base_url=BASE_URL)
+    # 2. 抽帧阶段
+    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
+    progress(0.1, desc="Extracting frames...")
     chunks = extract_frames(video_file)
     if not chunks: raise gr.Error("Frame extraction failed.")
+    # 3. 分析阶段
+    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
+    progress(0.3, desc="Analyzing content...")
     chunk_summaries = {}
     with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
         future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
         completed = 0
+        total = len(chunks)
         for future in as_completed(future_to_chunk):
             idx, summary = future.result()
             if summary: chunk_summaries[idx] = summary
             completed += 1
+            # 实时更新状态文字
+            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None
+    # 4. 生成代码阶段
+    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
+    progress(0.85, desc="Synthesizing code...")
     html_code = aggregate_and_generate_webpage(client, chunk_summaries)
     output_path = "generated_website.html"
     data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
     iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
+    progress(1.0, desc="Done")
+    # 5. 完成，返回所有结果
+    yield "✅ Generation Complete!", iframe_html, output_path, html_code
 # --- UI ---
 with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
+    # 修复2：将 open 设置为 True，默认展开
+    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
         gr.Markdown("""
         This application is powered by **Baidu ERNIE 4.5 **, a state-of-the-art foundation model with specific enhancements for video understanding:
         *   **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
+        *   **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
+        *   **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
+        *   **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
         """)
+    gr.Markdown("Upload a frontend coding tutorial video. The AI will watch it, understand the code, and render the result instantly.")
     with gr.Row():
         with gr.Column(scale=1):
+            # 修复1：去掉了 height 参数，让它自适应高度，不会再“扁扁的”
+            video_input = gr.Video(label="Upload Video", format="mp4")
             gr.Examples(
+                examples=[["sample_demo.mp4"]],
                 inputs=[video_input],
                 label="▶️ Or try this example video:",
+                cache_examples=False
             )
             submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
+            # 修复3：新增一个状态文本框，直接显示在这里，不用滚轮找进度条了
+            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
         with gr.Column(scale=2):
             with gr.Tabs():
                 with gr.TabItem("🌐 Live Preview (Result)"):
                     html_preview = gr.HTML(label="Rendered Page")
                 with gr.TabItem("⬇️ Download"):
                      file_download = gr.File(label="Download .html File")
+    # 绑定事件：outputs 增加了 status_output
     submit_btn.click(
         fn=main_process,
         inputs=[video_input],
+        outputs=[status_output, html_preview, file_download, code_output]
     )
 if __name__ == "__main__":