import base64
import logging
import os
import re
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
import gradio as gr
from openai import OpenAI
| |
|
| | |
# Endpoint handed to the OpenAI-compatible client as its base_url.
BASE_URL = "https://aistudio.baidu.com/llm/lmapi/v3"
# Model identifier sent with every chat-completion request in this file.
MODEL_NAME = "ernie-4.5-turbo-vl"
# Size of the thread pool that analyzes video chunks in parallel.
MAX_CONCURRENT_REQUESTS = 4
# Uploads longer than this many seconds (1800 s = 30 min) are rejected.
MAX_VIDEO_DURATION_SEC = 1800
| |
|
def extract_frames(video_path, interval_sec=1):
    """Sample frames from a video and return them as chunks of base64 JPEGs.

    One frame is kept every ``interval_sec`` seconds, based on the frame rate
    OpenCV reports for the file. Each kept frame is resized to a height of
    512 pixels (aspect ratio preserved), JPEG-encoded at quality 60 and
    base64-encoded. Encoded frames are grouped into lists of at most 30.

    Args:
        video_path: Path of the video file to read.
        interval_sec: Seconds between two sampled frames. Values that would
            yield a sampling step below one frame fall back to sampling
            every frame.

    Returns:
        A list of chunks, each chunk being a list of up to 30 base64 strings.
        The list is empty when the video cannot be opened or has no frames.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return []

        fps = cap.get(cv2.CAP_PROP_FPS)
        # Covers 0, negative and NaN readings; fall back to 30 fps.
        if not fps > 0:
            fps = 30

        # Clamp to at least 1: a step of 0 would make the modulo below raise
        # ZeroDivisionError (e.g. interval_sec=0.01 on a 30 fps video).
        step = max(1, int(fps * interval_sec))

        chunks = []
        chunk_frames = []
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % step == 0:
                height, width = frame.shape[:2]
                scale = 512 / height
                resized_frame = cv2.resize(frame, (int(width * scale), 512))
                encoded, buffer = cv2.imencode(
                    '.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), 60]
                )
                # Skip a frame that failed to encode instead of sending
                # a broken image to the model.
                if encoded:
                    chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
                if len(chunk_frames) == 30:
                    chunks.append(chunk_frames)
                    chunk_frames = []
            frame_count += 1

        # Keep the trailing, partially filled chunk.
        if chunk_frames:
            chunks.append(chunk_frames)
        return chunks
    finally:
        # Release the capture on every path, including early return and errors.
        cap.release()
| |
|
def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Ask the vision model to describe one chunk of frames, with retries.

    Builds a single user message made of a fixed text prompt followed by
    every frame as a base64 ``data:`` URL image, and sends it through
    ``client.chat.completions.create``. Any exception triggers another
    attempt, up to ``max_retries`` attempts in total, with a 2 second pause
    between attempts.

    Args:
        client: OpenAI-compatible client object.
        chunk_index: Position of this chunk in the video; returned unchanged
            so the caller can match results that complete out of order.
        frames_b64: Base64-encoded JPEG frames belonging to the chunk.
        max_retries: Maximum number of attempts.

    Returns:
        A ``(chunk_index, summary)`` tuple. ``summary`` is an empty string
        when every attempt failed or the model returned no text.
    """
    prompt = (
        "This is a segment from a frontend web development video tutorial (screenshots taken every second). "
        "Please focus intently on the code shown on the screen and the resulting web page style. "
        "Describe in detail the HTML structure, CSS styling rules, or JavaScript logic presented in this segment. "
        "Ignore unrelated video elements."
    )
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}}
        for f in frames_b64
    )

    logger = logging.getLogger(__name__)
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1, max_tokens=1024
            )
            # Normalise a missing message body to "" so callers always get a str.
            return chunk_index, response.choices[0].message.content or ""
        except Exception:
            # Best-effort boundary: any failure is retried, but it is logged
            # so that a chunk silently dropped from the result can be traced.
            logger.warning(
                "Chunk %s: attempt %d/%d failed",
                chunk_index, attempt + 1, max_retries, exc_info=True,
            )
            # Pause only when another attempt will follow.
            if attempt + 1 < max_retries:
                time.sleep(2)
    return chunk_index, ""
| |
|
def aggregate_and_generate_webpage(client, summaries):
    """Turn the per-segment summaries into one HTML document.

    The non-empty summaries are joined in ascending segment order and sent
    to the model with instructions to answer with raw HTML only. The reply
    is then cleaned: markdown code fences are removed and, when present,
    only the ``<!DOCTYPE html> ... </html>`` span is kept.

    Args:
        client: OpenAI-compatible client object.
        summaries: Mapping of segment index to summary text.

    Returns:
        The cleaned HTML text. Empty string when the model returned no
        content.
    """
    full_summary = "\n".join(
        f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s
    )
    final_prompt = f"""
    You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.

    **Summaries:**
    {full_summary}

    **Strict Output Instructions:**
    1. Return ONLY the raw HTML code.
    2. Start directly with `<!DOCTYPE html>`.
    3. End directly with `</html>`.
    4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2, top_p=0.8
    )
    # The message body can be None; treat that as empty text rather than
    # failing with AttributeError on .replace() below.
    content = response.choices[0].message.content or ""

    # The model may wrap its answer in markdown fences despite the prompt.
    content = content.replace("```html", "").replace("```", "").strip()

    # Preferred case: a complete document; drop any text around it.
    match = re.search(r'(<!DOCTYPE html>.*</html>)', content, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1)

    # Fallback: no closing tag (e.g. truncated reply); at least drop any
    # introduction text placed before the doctype.
    start_match = re.search(r'<!DOCTYPE html>', content, re.IGNORECASE)
    if start_match:
        content = content[start_match.start():]

    return content
| |
|
def main_process(video_file, progress=gr.Progress()):
    """Run the video-to-HTML pipeline as a Gradio generator handler.

    Steps: validate the request, extract frames, summarise each chunk of
    frames concurrently, ask the model for the final HTML, then write it to
    a file and build an inline preview.

    Args:
        video_file: Path of the uploaded video.
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how the handler requests one.

    Yields:
        4-tuples ``(status_text, iframe_html, output_path, html_code)``.
        The last three items are ``None`` until the final yield.

    Raises:
        gr.Error: If the API key or the video is missing, the video is
            longer than ``MAX_VIDEO_DURATION_SEC``, no frames could be
            extracted, no segment could be analysed, or the model produced
            no HTML.
    """
    yield "⏳ Initializing...", None, None, None

    api_key = os.environ.get("ERNIE_API_KEY")
    if not api_key:
        raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file:
        raise gr.Error("Please upload a video.")

    # Reject over-long videos before doing any expensive work.
    cap = cv2.VideoCapture(video_file)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        cap.release()
    duration = count / fps if fps > 0 else 0
    if duration > MAX_VIDEO_DURATION_SEC:
        # Derive the limit from the constant so the message cannot drift.
        raise gr.Error(
            f"Video too long ({duration/60:.1f}m). "
            f"Limit is {MAX_VIDEO_DURATION_SEC // 60}m."
        )

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks:
        raise gr.Error("Frame extraction failed.")

    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}

    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {
            executor.submit(process_chunk_with_retry, client, i, chunk): i
            for i, chunk in enumerate(chunks)
        }

        total = len(chunks)
        # Results arrive in completion order; the returned index restores
        # the segment order later.
        for completed, future in enumerate(as_completed(future_to_chunk), start=1):
            idx, summary = future.result()
            if summary:
                chunk_summaries[idx] = summary

            # Move the bar through the 0.3-0.8 range as segments finish.
            progress(0.3 + 0.5 * completed / total, desc="Analyzing content...")
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None

    # Without any summary the model would be asked to write a page from
    # nothing; fail loudly instead.
    if not chunk_summaries:
        raise gr.Error("Video analysis failed: no segment could be analyzed.")

    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)
    if not html_code:
        raise gr.Error("Code generation failed: the model returned no HTML.")

    # A per-request temporary file: a fixed name in the working directory
    # would be overwritten by other sessions running at the same time.
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", prefix="generated_website_", suffix=".html", delete=False
    ) as f:
        f.write(html_code)
        output_path = f.name

    # Embed the page as a base64 data URI so the preview needs no file serving.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""

    progress(1.0, desc="Done")

    yield "✅ Generation Complete!", iframe_html, output_path, html_code
| |
|
| | |
| |
|
# UI definition. Components appear on the page in the order they are created
# inside the nested layout context managers below.
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:

    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")

    # Static description of the model, shown in an accordion that starts open.
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
        This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:

        * **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
        * **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
        * **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
        * **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
        """)

    gr.Markdown("Upload a frontend coding tutorial video. The AI will watch it, understand the code, and render the result instantly.")

    with gr.Row():
        # Left column: input video, example, trigger button and status text.
        with gr.Column(scale=1):

            video_input = gr.Video(label="Upload Video", format="mp4")

            # Example entry that fills video_input with sample_demo.mp4.
            gr.Examples(
                examples=[["sample_demo.mp4"]],
                inputs=[video_input],
                label="▶️ Or try this example video:",
                cache_examples=False
            )

            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")

            # Read-only box that receives the status strings yielded by main_process.
            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)

        # Right column: one tab each for the preview, the source and the download.
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")

                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")

                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")

    # The outputs order matches the 4-tuples yielded by main_process:
    # (status text, preview iframe, output file path, HTML source).
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output]
    )


# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()