Spaces:

ZhijunLStudio
/

Ernie4.5-VL-Video2Coder

Sleeping

zhijun.li

Adjusting annotations

850ce03 3 months ago

9.01 kB

	import base64
	import logging
	import os
	import re
	import tempfile
	import time
	from concurrent.futures import ThreadPoolExecutor, as_completed

	import cv2
	import gradio as gr
	from openai import OpenAI

	# --- Configuration ---
	# Endpoint and model name handed to the OpenAI client for every request.
	BASE_URL = "https://aistudio.baidu.com/llm/lmapi/v3"
	MODEL_NAME = "ernie-4.5-turbo-vl"
	# Size of the thread pool that analyzes video segments in parallel.
	MAX_CONCURRENT_REQUESTS = 4
	# Longest accepted video, in seconds (30 minutes).
	MAX_VIDEO_DURATION_SEC = 30 * 60

	def _encode_frame_b64(frame, target_height, encode_params):
	    """Resize *frame* to ``target_height`` and return it as base64 JPEG text.

	    Returns ``None`` when the frame has no pixels or JPEG encoding fails.
	    """
	    height, width = frame.shape[:2]
	    if height <= 0 or width <= 0:
	        return None
	    scale = target_height / height
	    # Clamp so an extremely narrow frame never yields a zero-width image.
	    new_width = max(1, int(width * scale))
	    resized = cv2.resize(frame, (new_width, target_height))
	    ok, buffer = cv2.imencode('.jpg', resized, encode_params)
	    if not ok:
	        return None
	    return base64.b64encode(buffer).decode('utf-8')


	def extract_frames(video_path, interval_sec=1, frames_per_chunk=30,
	                   target_height=512, jpeg_quality=60):
	    """Sample frames from a video and group them into chunks of base64 JPEGs.

	    One frame is kept every ``interval_sec`` seconds of video. Each kept frame
	    is resized to ``target_height`` pixels high (aspect ratio preserved),
	    JPEG-encoded and base64-encoded to text.

	    Args:
	        video_path: Path passed to ``cv2.VideoCapture``.
	        interval_sec: Seconds of video between two sampled frames.
	        frames_per_chunk: Maximum number of frames in each returned chunk.
	        target_height: Height in pixels of every resized frame.
	        jpeg_quality: Value passed as ``cv2.IMWRITE_JPEG_QUALITY``.

	    Returns:
	        A list of chunks, each a list of base64 strings; the final chunk may
	        be shorter. An empty list when the video cannot be opened or yields
	        no frames.
	    """
	    cap = cv2.VideoCapture(video_path)
	    try:
	        if not cap.isOpened():
	            return []

	        fps = cap.get(cv2.CAP_PROP_FPS)
	        # Fall back to 30 fps when the reported rate is unusable
	        # (0, negative or NaN all fail this comparison).
	        if not fps > 0:
	            fps = 30

	        # Computed once, and clamped to 1 so a short interval or a low frame
	        # rate can never cause a modulo-by-zero below.
	        step = max(1, int(fps * interval_sec))
	        chunk_size = max(1, int(frames_per_chunk))
	        height_px = int(target_height)
	        encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), int(jpeg_quality)]

	        chunks, current = [], []
	        frame_index = 0
	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break
	            sampled = frame_index % step == 0
	            frame_index += 1
	            if not sampled:
	                continue

	            encoded = _encode_frame_b64(frame, height_px, encode_params)
	            if encoded is None:
	                # Skip frames that could not be encoded instead of sending
	                # a corrupt image downstream.
	                continue
	            current.append(encoded)
	            if len(current) == chunk_size:
	                chunks.append(current)
	                current = []

	        if current:
	            chunks.append(current)
	        return chunks
	    finally:
	        # Release the capture even if resizing or encoding raised.
	        cap.release()

	def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3,
	                             retry_delay_sec=2):
	    """Ask the vision model to describe one chunk of frames, retrying on error.

	    Args:
	        client: Object exposing ``chat.completions.create`` (the OpenAI client
	            in this file).
	        chunk_index: Position of the chunk; returned unchanged so results that
	            complete out of order can be re-associated by the caller.
	        frames_b64: Iterable of base64-encoded JPEG strings.
	        max_retries: Maximum number of request attempts.
	        retry_delay_sec: Seconds to wait between two attempts.

	    Returns:
	        ``(chunk_index, summary)``. ``summary`` is an empty string when every
	        attempt failed or the model returned no text.
	    """
	    prompt = (
	        "This is a segment from a frontend web development video tutorial (screenshots taken every second). "
	        "Please focus intently on the code shown on the screen and the resulting web page style. "
	        "Describe in detail the HTML structure, CSS styling rules, or JavaScript logic presented in this segment. "
	        "Ignore unrelated video elements."
	    )
	    content = [{"type": "text", "text": prompt}]
	    content.extend(
	        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}}
	        for f in frames_b64
	    )

	    logger = logging.getLogger(__name__)
	    for attempt in range(1, max_retries + 1):
	        try:
	            response = client.chat.completions.create(
	                model=MODEL_NAME,
	                messages=[{"role": "user", "content": content}],
	                temperature=0.1, max_tokens=1024
	            )
	            # The message content can be None; normalize to "" so callers
	            # always receive a string.
	            return chunk_index, response.choices[0].message.content or ""
	        except Exception as exc:
	            # Best-effort by design: one failed segment must not abort the
	            # whole run, but the failure is recorded instead of being hidden.
	            logger.warning(
	                "Segment %s: attempt %d/%d failed: %s",
	                chunk_index, attempt, max_retries, exc,
	            )
	            # No point in waiting once there is no attempt left.
	            if attempt < max_retries:
	                time.sleep(retry_delay_sec)
	    return chunk_index, ""

	def _extract_html_document(raw):
	    """Strip chat wrapping (code fences, intro/outro text) from a model reply."""
	    text = (raw or "").strip()

	    # Preferred case: a complete document, from the doctype to the last
	    # closing </html>. Taken as-is, so backticks that belong to the page
	    # itself are preserved.
	    full_doc = re.search(
	        r'<!DOCTYPE\s+html[^>]*>.*</html\s*>', text, re.DOTALL | re.IGNORECASE
	    )
	    if full_doc:
	        return full_doc.group(0)

	    # Reply without a closing tag: drop whatever precedes the doctype.
	    doctype = re.search(r'<!DOCTYPE\s+html', text, re.IGNORECASE)
	    if doctype:
	        text = text[doctype.start():]

	    # Only in this degraded case remove markdown fences wholesale.
	    return text.replace("```html", "").replace("```", "").strip()


	def aggregate_and_generate_webpage(client, summaries):
	    """Turn the per-segment summaries into one complete HTML document.

	    Args:
	        client: Object exposing ``chat.completions.create`` (the OpenAI client
	            in this file).
	        summaries: Mapping of segment index to summary text. Empty summaries
	            are skipped; the rest are listed in index order, numbered from 1.

	    Returns:
	        The HTML source with markdown fences and surrounding chatter removed.
	        An empty string when the model returned no text.
	    """
	    full_summary = "\n".join(
	        f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s
	    )
	    # Built line by line so that source-code indentation never leaks into
	    # the prompt text.
	    final_prompt = "\n".join([
	        "",
	        "You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.",
	        "",
	        "Summaries:",
	        full_summary,
	        "",
	        "Strict Output Instructions:",
	        "1. Return ONLY the raw HTML code.",
	        "2. Start directly with `<!DOCTYPE html>`.",
	        "3. End directly with `</html>`.",
	        "4. NO introduction text, NO markdown backticks (```), NO explanations after the code.",
	        "",
	    ])
	    response = client.chat.completions.create(
	        model=MODEL_NAME,
	        messages=[{"role": "user", "content": final_prompt}],
	        temperature=0.2, top_p=0.8
	    )
	    # The model does not always obey the output rules above, so clean up.
	    return _extract_html_document(response.choices[0].message.content)

	def main_process(video_file, progress=gr.Progress()):
	    """Run the video-to-HTML pipeline as a Gradio generator handler.

	    Stages: validate the input, extract frames, summarize each chunk of frames
	    in a thread pool, then ask the model for the final HTML.

	    Args:
	        video_file: Path of the uploaded video.
	        progress: Gradio progress tracker, called with a fraction and a label.

	    Yields:
	        4-tuples ``(status text, iframe HTML, output file path, HTML source)``.
	        The last three are ``None`` until the final yield.

	    Raises:
	        gr.Error: API key missing, no video supplied, video too long, frame
	            extraction failed, no segment could be analyzed, or the model
	            returned no HTML.
	    """
	    # 1. Initial status
	    yield "⏳ Initializing...", None, None, None

	    api_key = os.environ.get("ERNIE_API_KEY")
	    if not api_key:
	        raise gr.Error("Server Config Error: API KEY missing.")
	    if not video_file:
	        raise gr.Error("Please upload a video.")

	    # Check duration
	    cap = cv2.VideoCapture(video_file)
	    try:
	        fps = cap.get(cv2.CAP_PROP_FPS)
	        count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
	    finally:
	        cap.release()
	    # When the frame rate is unknown, assume 30 fps (the same fallback that
	    # extract_frames uses) rather than silently disabling the limit.
	    duration = count / (fps if fps > 0 else 30)
	    if duration > MAX_VIDEO_DURATION_SEC:
	        raise gr.Error(
	            f"Video too long ({duration/60:.1f}m). "
	            f"Limit is {MAX_VIDEO_DURATION_SEC / 60:.0f}m."
	        )

	    client = OpenAI(api_key=api_key, base_url=BASE_URL)

	    # 2. Frame extraction stage
	    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
	    progress(0.1, desc="Extracting frames...")
	    chunks = extract_frames(video_file)
	    if not chunks:
	        raise gr.Error("Frame extraction failed.")

	    # 3. Analysis stage
	    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
	    progress(0.3, desc="Analyzing content...")
	    chunk_summaries = {}
	    total = len(chunks)

	    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
	        futures = [
	            executor.submit(process_chunk_with_retry, client, i, chunk)
	            for i, chunk in enumerate(chunks)
	        ]
	        for completed, future in enumerate(as_completed(futures), start=1):
	            idx, summary = future.result()
	            if summary:
	                chunk_summaries[idx] = summary

	            # Update the status text as each segment completes
	            progress(0.3 + 0.5 * completed / total, desc="Analyzing content...")
	            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None

	    # Without any summary the model would have nothing to base the page on.
	    if not chunk_summaries:
	        raise gr.Error("Analysis failed: no video segment could be summarized.")

	    # 4. Code generation stage
	    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
	    progress(0.85, desc="Synthesizing code...")
	    html_code = aggregate_and_generate_webpage(client, chunk_summaries)
	    if not html_code:
	        raise gr.Error("Code generation failed: the model returned no HTML.")

	    # A unique temp file per run: a fixed file name would be overwritten by
	    # other sessions running at the same time.
	    with tempfile.NamedTemporaryFile(
	        "w", suffix=".html", prefix="generated_website_",
	        delete=False, encoding="utf-8",
	    ) as f:
	        f.write(html_code)
	        output_path = f.name

	    # Create Iframe: the page is embedded as a base64 data URI
	    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
	    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
	    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""

	    progress(1.0, desc="Done")
	    # 5. Done: return all results
	    yield "✅ Generation Complete!", iframe_html, output_path, html_code

	# --- UI ---
	# Page layout: a title and capability notes on top, then a row with the
	# upload/controls column on the left and the tabbed results on the right.

	with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:

	    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")

	    # Fix 2: set open to True so the accordion is expanded by default
	    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
	        gr.Markdown("""
	This application is powered by Baidu ERNIE 4.5, a state-of-the-art foundation model with specific enhancements for video understanding:

	* 👁️ Multimodal Heterogeneous MoE: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
	* ⏳ 3D-RoPE Temporal Modeling: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
	* 📐 Adaptive Resolution: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
	* 🚀 Long Context Window: Supports up to 128k context length for analyzing long tutorials.
	""")

	    gr.Markdown("Upload a frontend coding tutorial video. The AI will watch it, understand the code, and render the result instantly.")

	    with gr.Row():
	        with gr.Column(scale=1):
	            # Fix 1: removed the height parameter so the component sizes itself and no longer looks squashed
	            video_input = gr.Video(label="Upload Video", format="mp4")

	            gr.Examples(
	                examples=[["sample_demo.mp4"]],
	                inputs=[video_input],
	                label="▶️ Or try this example video:",
	                cache_examples=False
	            )

	            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")

	            # Fix 3: added a status textbox shown right here, so there is no need to scroll to find the progress bar
	            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)

	        with gr.Column(scale=2):
	            with gr.Tabs():
	                with gr.TabItem("🌐 Live Preview (Result)"):
	                    html_preview = gr.HTML(label="Rendered Page")

	                with gr.TabItem("📝 Source Code"):
	                    code_output = gr.Code(language="html", label="HTML Source")

	                with gr.TabItem("⬇️ Download"):
	                    file_download = gr.File(label="Download .html File")

	    # Bind the click event; outputs now include status_output.
	    # The order must match the 4-tuples yielded by main_process.
	    submit_btn.click(
	        fn=main_process,
	        inputs=[video_input],
	        outputs=[status_output, html_preview, file_download, code_output]
	    )

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()