zhijun.li committed on
Commit
e6bc9f9
·
1 Parent(s): 25ed9e1

Add app.py, requirements and improved README

Browse files
Files changed (4) hide show
  1. .gitignore +6 -0
  2. README.md +34 -6
  3. app.py +192 -0
  4. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ .DS_Store
5
+ venv/
6
+ .ipynb_checkpoints/
README.md CHANGED
@@ -1,14 +1,42 @@
1
  ---
2
- title: Video To Code Ernie
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Generate HTML/CSS code from videos using ERNIE 4.5-VL
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Ernie 4.5 Video2Code
3
+ emoji: 🎬
4
+ colorFrom: blue
5
+ colorTo: cyan
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Turn UI/UX video tutorials into executable HTML code instantly.
12
  ---
13
 
14
+ # ERNIE 4.5-VL Video-to-Code Agent
15
+
16
+ **Watch the video, write the code.**
17
+
18
+ This AI Agent uses **Baidu ERNIE 4.5-VL (Vision-Language Model)** to analyze frontend coding tutorials frame-by-frame and reconstruct the final webpage structure, styling, and logic automatically.
19
+
20
+
21
+ ## ✨ Key Features
22
+
23
+ * **👁️ Visual Perception**: The AI "watches" the video, identifying HTML structures, CSS layouts, and interactive elements shown on screen.
24
+ * **🛡️ Sandbox Rendering**: Generated code is rendered inside a secure **Iframe**, allowing you to see the live result immediately without style conflicts.
25
+ * **🧹 Clean Output**: Automatically filters out conversational text to provide pure, ready-to-run HTML/CSS/JS code.
26
+ * **📦 Single-File Download**: Get a standalone `.html` file containing all dependencies.
27
+
28
+ ## 🚀 How to Use
29
+
30
+ 1. **Upload**: Drop an MP4 video file (Frontend tutorials, CSS effects, UI demos).
31
+ * *Constraint: Max video duration is **30 minutes**.*
32
+ 2. **Generate**: Click **"🚀 Generate & Render"**.
33
+ 3. **Preview**:
34
+ * **Live Preview**: See the code running instantly in the browser.
35
+ * **Source Code**: Inspect the generated HTML syntax.
36
+ 4. **Download**: Save the result to your local machine.
37
+
38
+ ## ⚙️ How It Works
39
+
40
+ 1. **Frame Extraction**: The video is processed using OpenCV to capture high-quality keyframes.
41
+ 2. **Parallel Analysis**: ERNIE 4.5-VL processes video segments in parallel to understand the coding progression and visual outcome.
42
+ 3. **Logic Synthesis**: The agent acts as a Senior Frontend Engineer, aggregating the visual insights to write functional code.
app.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import time
4
+ import base64
5
+ import gradio as gr
6
+ from openai import OpenAI
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ import re
9
+
10
+
11
# --- Configuration ---
# OpenAI-compatible endpoint for Baidu AI Studio's ERNIE model API.
BASE_URL = "https://aistudio.baidu.com/llm/lmapi/v3"
# Vision-language model used for both frame analysis and final code synthesis.
MODEL_NAME = "ernie-4.5-turbo-vl"
# Worker count for the ThreadPoolExecutor that analyzes video chunks in parallel.
MAX_CONCURRENT_REQUESTS = 4
# Reject uploads longer than this many seconds (30 minutes).
MAX_VIDEO_DURATION_SEC = 1800
16
+
17
def extract_frames(video_path, interval_sec=1):
    """Sample the video about once per `interval_sec` seconds.

    Each sampled frame is resized to 512 px height (aspect ratio preserved)
    and JPEG-encoded at quality 60 to keep API payloads small, then
    base64-encoded. Frames are grouped into chunks of up to 30.

    Args:
        video_path: path to a video file readable by OpenCV.
        interval_sec: sampling interval in seconds.

    Returns:
        List of chunks (each a list of base64 JPEG strings); empty list if
        the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            fps = 30  # some containers report 0/invalid FPS; assume a sane default
        # Guard the modulus: int(fps * interval_sec) is 0 when fps*interval < 1,
        # which would raise ZeroDivisionError. Sample at least every frame.
        step = max(1, int(fps * interval_sec))
        chunk_frames, chunks, frame_count = [], [], 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % step == 0:
                height, width = frame.shape[:2]
                scale = 512 / height
                resized_frame = cv2.resize(frame, (int(width * scale), 512))
                _, buffer = cv2.imencode('.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), 60])
                chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
                if len(chunk_frames) == 30:
                    chunks.append(chunk_frames)
                    chunk_frames = []
            frame_count += 1
        if chunk_frames:
            chunks.append(chunk_frames)
    finally:
        # Release the capture even if decoding/encoding raises mid-loop.
        cap.release()
    return chunks
41
+
42
def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Describe one chunk of video frames with the VL model, with retries.

    Args:
        client: OpenAI-compatible client.
        chunk_index: position of this chunk in the video; returned alongside
            the summary so the caller can re-order results from parallel workers.
        frames_b64: list of base64-encoded JPEG frames for this chunk.
        max_retries: number of attempts before giving up.

    Returns:
        (chunk_index, summary) — summary is "" when every attempt failed,
        letting the caller simply skip empty results.
    """
    prompt = (
        "This is a segment from a frontend web development video tutorial (screenshots taken every second). "
        "Please focus intently on the code shown on the screen and the resulting web page style. "
        "Describe in detail the HTML structure, CSS styling rules, or JavaScript logic presented in this segment. "
        "Ignore unrelated video elements."
    )
    content = [{"type": "text", "text": prompt}]
    for f in frames_b64:
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}})
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1, max_tokens=1024
            )
            return chunk_index, response.choices[0].message.content
        except Exception:
            # Back off before retrying, but don't waste 2 s after the
            # final attempt — just fall through to the empty result.
            if attempt < max_retries - 1:
                time.sleep(2)
    return chunk_index, ""
64
+
65
def aggregate_and_generate_webpage(client, summaries):
    """Merge per-chunk summaries and ask the model for one complete HTML file.

    Args:
        client: OpenAI-compatible client.
        summaries: dict mapping chunk index -> summary text; iterated in index
            order so the narrative follows the video timeline.

    Returns:
        The generated HTML document as a string, best-effort trimmed to the
        `<!DOCTYPE html> ... </html>` span.
    """
    full_summary = "\n".join([f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s])

    # Slightly forceful wording so the model returns raw HTML only.
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.

**Summaries:**
{full_summary}

**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2, top_p=0.8
    )

    content = response.choices[0].message.content.strip()

    # Strip markdown code fences only at the edges. A blanket
    # content.replace("```", "") would corrupt legitimate backticks inside the
    # generated code (e.g. JavaScript template literals).
    content = re.sub(r'^\s*```(?:html)?\s*', '', content, flags=re.IGNORECASE)
    content = re.sub(r'\s*```\s*$', '', content)

    # Prefer the exact <!DOCTYPE html> ... </html> span; if the closing tag is
    # missing, fall back to dropping any preamble before <!DOCTYPE html>.
    match = re.search(r'(<!DOCTYPE html>.*</html>)', content, re.DOTALL | re.IGNORECASE)
    if match:
        content = match.group(1)
    else:
        start_match = re.search(r'<!DOCTYPE html>', content, re.IGNORECASE)
        if start_match:
            content = content[start_match.start():]

    return content
102
+
103
+
104
def main_process(video_file, progress=gr.Progress()):
    """Full pipeline behind the "Generate & Render" button.

    Validates the upload, extracts frames, summarizes chunks in parallel with
    the VL model, synthesizes a standalone HTML file, and returns
    (iframe preview markup, saved file path, raw HTML source) for the three
    output components.

    NOTE(review): `progress=gr.Progress()` is the Gradio-documented injection
    pattern, not an accidental mutable default — Gradio replaces it per call.
    """
    api_key = os.environ.get("ERNIE_API_KEY")
    if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file: raise gr.Error("Please upload a video.")

    # check duration
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        raise gr.Error(f"Video too long ({duration/60:.1f}m). Limit is 30m.")

    client = OpenAI(api_key=api_key, base_url=BASE_URL)

    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks: raise gr.Error("Frame extraction failed.")

    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
    # Fan out chunk analysis; results arrive out of order, so summaries are
    # keyed by chunk index and re-sorted during aggregation.
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
        for i, future in enumerate(as_completed(future_to_chunk)):
            idx, summary = future.result()
            if summary: chunk_summaries[idx] = summary
            progress(0.3 + 0.5 * ((i+1)/len(chunks)), desc=f"Analyzed {i+1}/{len(chunks)}")

    progress(0.8, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)

    # Save file
    output_path = "generated_website.html"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_code)

    # --- Key change: build a Data-URI iframe ---
    # Encode the HTML as Base64 and put it in the iframe's src. This gives
    # clean sandbox isolation: styles cannot clash with the Gradio page and
    # the generated JS still runs.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"

    # Build an HTML snippet containing a single iframe for the preview pane.
    iframe_html = f"""
    <iframe
        src="{data_uri}"
        width="100%"
        height="600px"
        style="border: 1px solid #ccc; border-radius: 8px; background-color: white;">
    </iframe>
    """

    progress(1.0, desc="Done!")
    # Return the iframe markup to the HTML component, the path to the
    # download component, and the source to the Code component.
    return iframe_html, output_path, html_code
160
+
161
# --- UI ---
# Gradio front end: video upload on the left; tabbed preview / source /
# download on the right, all fed by a single click handler.

with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎬 Ernie 4.5-VL: Video to Code Agent")
    gr.Markdown("Upload a frontend video tutorial. The AI will generate and **render** the code instantly.")

    with gr.Row():
        with gr.Column(scale=1):
            video_input = gr.Video(label="Upload Video", format="mp4", height=300)
            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")

        with gr.Column(scale=2):
            # Show the rendered preview up front as the first (default) tab
            # instead of hiding it behind the source/download tabs.
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    # This component receives the iframe markup string.
                    html_preview = gr.HTML(label="Rendered Page")

                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")

                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")

    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[html_preview, file_download, code_output]
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ opencv-python-headless
2
+ openai
3
+ tqdm
4
+ gradio
5
+ numpy