Spaces:
Running
Running
add text to video and fix input image issue for image to video
Browse files
app.py
CHANGED
|
@@ -1315,7 +1315,7 @@ def generate_video_from_image(input_image_data, prompt: str, session_id: Optiona
|
|
| 1315 |
)
|
| 1316 |
print(f"[Image2Video] InferenceClient initialized (provider=auto)")
|
| 1317 |
|
| 1318 |
-
# Normalize input image to bytes
|
| 1319 |
import io
|
| 1320 |
from PIL import Image
|
| 1321 |
try:
|
|
@@ -1323,19 +1323,18 @@ def generate_video_from_image(input_image_data, prompt: str, session_id: Optiona
|
|
| 1323 |
except Exception:
|
| 1324 |
np = None
|
| 1325 |
|
| 1326 |
-
|
| 1327 |
-
|
| 1328 |
-
|
| 1329 |
-
|
| 1330 |
-
|
| 1331 |
-
|
| 1332 |
-
|
| 1333 |
-
|
| 1334 |
-
|
| 1335 |
-
|
| 1336 |
-
|
| 1337 |
-
|
| 1338 |
-
|
| 1339 |
if pil_image.mode != 'RGB':
|
| 1340 |
pil_image = pil_image.convert('RGB')
|
| 1341 |
try:
|
|
@@ -1343,9 +1342,35 @@ def generate_video_from_image(input_image_data, prompt: str, session_id: Optiona
|
|
| 1343 |
except Exception:
|
| 1344 |
pass
|
| 1345 |
|
| 1346 |
-
|
| 1347 |
-
|
| 1348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1349 |
|
| 1350 |
# Call image-to-video; require method support
|
| 1351 |
model_id = "Lightricks/LTX-Video-0.9.8-13B-distilled"
|
|
@@ -1402,7 +1427,7 @@ def generate_video_from_image(input_image_data, prompt: str, session_id: Optiona
|
|
| 1402 |
|
| 1403 |
if file_url:
|
| 1404 |
video_html = (
|
| 1405 |
-
f"<video controls style=\"max-width: 100%; height: auto; border-radius: 8px; margin: 10px 0;\">"
|
| 1406 |
f"<source src=\"{file_url}\" type=\"video/mp4\" />"
|
| 1407 |
f"Your browser does not support the video tag."
|
| 1408 |
f"</video>"
|
|
@@ -1419,6 +1444,86 @@ def generate_video_from_image(input_image_data, prompt: str, session_id: Optiona
|
|
| 1419 |
print(f"Image-to-video generation error: {str(e)}")
|
| 1420 |
return f"Error generating video (image-to-video): {str(e)}"
|
| 1421 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1422 |
def extract_image_prompts_from_text(text: str, num_images_needed: int = 1) -> list:
|
| 1423 |
"""Extract image generation prompts from the full text based on number of images needed"""
|
| 1424 |
# Use the entire text as the base prompt for image generation
|
|
@@ -1638,6 +1743,79 @@ def create_image_replacement_blocks_text_to_image_single(html_content: str, prom
|
|
| 1638 |
# If no <body>, just append
|
| 1639 |
return f"{SEARCH_START}\n\n{DIVIDER}\n{image_html}\n{REPLACE_END}"
|
| 1640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1641 |
def create_image_replacement_blocks_from_input_image(html_content: str, user_prompt: str, input_image_data, max_images: int = 1) -> str:
|
| 1642 |
"""Create search/replace blocks using image-to-image generation with a provided input image.
|
| 1643 |
|
|
@@ -1810,7 +1988,7 @@ def create_video_replacement_blocks_from_input_image(html_content: str, user_pro
|
|
| 1810 |
print("[Image2Video] No <body> tag; appending video via replacement block")
|
| 1811 |
return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
|
| 1812 |
|
| 1813 |
-
def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None, enable_image_to_video: bool = False, image_to_video_prompt: str | None = None, session_id: Optional[str] = None) -> str:
|
| 1814 |
"""Apply text-to-image and/or image-to-image replacements to HTML content.
|
| 1815 |
|
| 1816 |
If both toggles are enabled, text-to-image replacements run first, then image-to-image.
|
|
@@ -1845,6 +2023,18 @@ def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_t
|
|
| 1845 |
print("[MediaApply] No i2v replacement blocks generated")
|
| 1846 |
return result
|
| 1847 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1848 |
# If an input image is provided and image-to-image is enabled, we only replace one image
|
| 1849 |
# and skip text-to-image to satisfy the requirement to replace exactly the number of uploaded images.
|
| 1850 |
if enable_image_to_image and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
|
|
@@ -2693,7 +2883,7 @@ The HTML code above contains the complete original website structure with all im
|
|
| 2693 |
stop_generation = False
|
| 2694 |
|
| 2695 |
|
| 2696 |
-
def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None, enable_image_to_video: bool = False, image_to_video_prompt: Optional[str] = None):
|
| 2697 |
if query is None:
|
| 2698 |
query = ''
|
| 2699 |
if _history is None:
|
|
@@ -2845,6 +3035,8 @@ This will help me create a better design for you."""
|
|
| 2845 |
enable_image_to_video=enable_image_to_video,
|
| 2846 |
image_to_video_prompt=image_to_video_prompt,
|
| 2847 |
session_id=session_id,
|
|
|
|
|
|
|
| 2848 |
)
|
| 2849 |
|
| 2850 |
_history.append([query, final_content])
|
|
@@ -3010,6 +3202,8 @@ This will help me create a better design for you."""
|
|
| 3010 |
enable_image_to_video=enable_image_to_video,
|
| 3011 |
image_to_video_prompt=image_to_video_prompt,
|
| 3012 |
session_id=session_id,
|
|
|
|
|
|
|
| 3013 |
)
|
| 3014 |
|
| 3015 |
yield {
|
|
@@ -3032,6 +3226,8 @@ This will help me create a better design for you."""
|
|
| 3032 |
enable_image_to_video=enable_image_to_video,
|
| 3033 |
image_to_video_prompt=image_to_video_prompt,
|
| 3034 |
session_id=session_id,
|
|
|
|
|
|
|
| 3035 |
)
|
| 3036 |
|
| 3037 |
preview_val = None
|
|
@@ -3432,6 +3628,8 @@ This will help me create a better design for you."""
|
|
| 3432 |
image_to_video_prompt=image_to_video_prompt,
|
| 3433 |
session_id=session_id,
|
| 3434 |
text_to_image_prompt=text_to_image_prompt,
|
|
|
|
|
|
|
| 3435 |
)
|
| 3436 |
|
| 3437 |
# Update history with the cleaned content
|
|
@@ -3459,6 +3657,8 @@ This will help me create a better design for you."""
|
|
| 3459 |
enable_image_to_video=enable_image_to_video,
|
| 3460 |
image_to_video_prompt=image_to_video_prompt,
|
| 3461 |
session_id=session_id,
|
|
|
|
|
|
|
| 3462 |
)
|
| 3463 |
|
| 3464 |
_history.append([query, final_content])
|
|
@@ -4580,6 +4780,20 @@ with gr.Blocks(
|
|
| 4580 |
visible=False
|
| 4581 |
)
|
| 4582 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4583 |
def on_image_to_image_toggle(toggled):
|
| 4584 |
# Show generation image input and its prompt when image-to-image is enabled
|
| 4585 |
return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
|
|
@@ -4605,6 +4819,11 @@ with gr.Blocks(
|
|
| 4605 |
inputs=[image_generation_toggle],
|
| 4606 |
outputs=[text_to_image_prompt]
|
| 4607 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4608 |
model_dropdown = gr.Dropdown(
|
| 4609 |
choices=[model['name'] for model in AVAILABLE_MODELS],
|
| 4610 |
value=DEFAULT_MODEL_NAME,
|
|
@@ -4855,7 +5074,7 @@ with gr.Blocks(
|
|
| 4855 |
show_progress="hidden",
|
| 4856 |
).then(
|
| 4857 |
generation_code,
|
| 4858 |
-
inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt],
|
| 4859 |
outputs=[code_output, history, sandbox, history_output]
|
| 4860 |
).then(
|
| 4861 |
end_generation_ui,
|
|
|
|
| 1315 |
)
|
| 1316 |
print(f"[Image2Video] InferenceClient initialized (provider=auto)")
|
| 1317 |
|
| 1318 |
+
# Normalize input image to bytes, with downscale/compress to cap request size
|
| 1319 |
import io
|
| 1320 |
from PIL import Image
|
| 1321 |
try:
|
|
|
|
| 1323 |
except Exception:
|
| 1324 |
np = None
|
| 1325 |
|
| 1326 |
+
def _load_pil(img_like) -> Image.Image:
|
| 1327 |
+
if hasattr(img_like, 'read'):
|
| 1328 |
+
return Image.open(io.BytesIO(img_like.read()))
|
| 1329 |
+
if hasattr(img_like, 'mode') and hasattr(img_like, 'size'):
|
| 1330 |
+
return img_like
|
| 1331 |
+
if np is not None and isinstance(img_like, np.ndarray):
|
| 1332 |
+
return Image.fromarray(img_like)
|
| 1333 |
+
if isinstance(img_like, (bytes, bytearray)):
|
| 1334 |
+
return Image.open(io.BytesIO(img_like))
|
| 1335 |
+
return Image.open(io.BytesIO(bytes(img_like)))
|
| 1336 |
+
|
| 1337 |
+
pil_image = _load_pil(input_image_data)
|
|
|
|
| 1338 |
if pil_image.mode != 'RGB':
|
| 1339 |
pil_image = pil_image.convert('RGB')
|
| 1340 |
try:
|
|
|
|
| 1342 |
except Exception:
|
| 1343 |
pass
|
| 1344 |
|
| 1345 |
+
# Progressive encode to keep payload under ~3.9MB (below 4MB limit)
|
| 1346 |
+
MAX_BYTES = 3_900_000
|
| 1347 |
+
max_dim = 1024 # initial cap on longest edge
|
| 1348 |
+
quality = 90
|
| 1349 |
+
|
| 1350 |
+
def encode_current(pil: Image.Image, q: int) -> bytes:
|
| 1351 |
+
tmp = io.BytesIO()
|
| 1352 |
+
pil.save(tmp, format='JPEG', quality=q, optimize=True)
|
| 1353 |
+
return tmp.getvalue()
|
| 1354 |
+
|
| 1355 |
+
# Downscale while the longest edge exceeds max_dim
|
| 1356 |
+
while max(pil_image.size) > max_dim:
|
| 1357 |
+
ratio = max_dim / float(max(pil_image.size))
|
| 1358 |
+
new_size = (max(1, int(pil_image.size[0] * ratio)), max(1, int(pil_image.size[1] * ratio)))
|
| 1359 |
+
pil_image = pil_image.resize(new_size, Image.Resampling.LANCZOS)
|
| 1360 |
+
|
| 1361 |
+
encoded = encode_current(pil_image, quality)
|
| 1362 |
+
# If still too big, iteratively reduce quality, then dimensions
|
| 1363 |
+
while len(encoded) > MAX_BYTES and (quality > 40 or max(pil_image.size) > 640):
|
| 1364 |
+
if quality > 40:
|
| 1365 |
+
quality -= 10
|
| 1366 |
+
else:
|
| 1367 |
+
# reduce dims by 15% if already at low quality
|
| 1368 |
+
new_w = max(1, int(pil_image.size[0] * 0.85))
|
| 1369 |
+
new_h = max(1, int(pil_image.size[1] * 0.85))
|
| 1370 |
+
pil_image = pil_image.resize((new_w, new_h), Image.Resampling.LANCZOS)
|
| 1371 |
+
encoded = encode_current(pil_image, quality)
|
| 1372 |
+
|
| 1373 |
+
input_bytes = encoded
|
| 1374 |
|
| 1375 |
# Call image-to-video; require method support
|
| 1376 |
model_id = "Lightricks/LTX-Video-0.9.8-13B-distilled"
|
|
|
|
| 1427 |
|
| 1428 |
if file_url:
|
| 1429 |
video_html = (
|
| 1430 |
+
f"<video controls autoplay muted loop playsinline style=\"max-width: 100%; height: auto; border-radius: 8px; margin: 10px 0;\">"
|
| 1431 |
f"<source src=\"{file_url}\" type=\"video/mp4\" />"
|
| 1432 |
f"Your browser does not support the video tag."
|
| 1433 |
f"</video>"
|
|
|
|
| 1444 |
print(f"Image-to-video generation error: {str(e)}")
|
| 1445 |
return f"Error generating video (image-to-video): {str(e)}"
|
| 1446 |
|
| 1447 |
+
def generate_video_from_text(prompt: str, session_id: Optional[str] = None) -> str:
    """Generate a video from a text prompt using Hugging Face InferenceClient.

    The stripped prompt is sent to ``InferenceClient.text_to_video`` with the
    ``Wan-AI/Wan2.2-TI2V-5B`` model. The returned bytes are written to a
    uniquely named ``.mp4`` file under ``VIDEO_TEMP_DIR`` and the file is
    registered through ``_register_video_for_session``.

    Args:
        prompt: Text description of the video. ``None`` is treated as "".
        session_id: Forwarded to ``_register_video_for_session``.

    Returns:
        An HTML ``<video>`` tag whose source points to a local file URL
        (``file://...``) on success. On any failure a human-readable message
        starting with ``"Error"`` is returned instead; exceptions are caught
        and reported, never propagated.
    """
    try:
        print("[Text2Video] Starting video generation from text")
        hf_token = os.getenv('HF_TOKEN')
        if not hf_token:
            print("[Text2Video] Missing HF_TOKEN")
            return "Error: HF_TOKEN environment variable is not set. Please set it to your Hugging Face API token."

        client = InferenceClient(
            provider="auto",
            api_key=hf_token,
            bill_to="huggingface",
        )
        print("[Text2Video] InferenceClient initialized (provider=auto)")

        # Ensure the client has text_to_video (newer huggingface_hub)
        text_to_video_method = getattr(client, "text_to_video", None)
        if not callable(text_to_video_method):
            print("[Text2Video] InferenceClient.text_to_video not available in this huggingface_hub version")
            return (
                "Error generating video (text-to-video): Your installed huggingface_hub version "
                "does not expose InferenceClient.text_to_video. Please upgrade with "
                "`pip install -U huggingface_hub` and try again."
            )

        model_id = "Wan-AI/Wan2.2-TI2V-5B"
        prompt_str = (prompt or "").strip()
        print(f"[Text2Video] Calling text_to_video with model={model_id}, prompt length={len(prompt_str)}")
        video_bytes = text_to_video_method(
            prompt_str,
            model=model_id,
        )
        print(f"[Text2Video] Received video bytes: {len(video_bytes) if hasattr(video_bytes, '__len__') else 'unknown length'}")

        # Persist to a temp .mp4. file_path stays None unless the bytes were
        # actually written, so a failed write can never yield a <video> tag
        # that points at a missing or truncated file.
        file_path = None
        try:
            _ensure_video_dir_exists()
            candidate_path = os.path.join(VIDEO_TEMP_DIR, f"{uuid.uuid4()}.mp4")
            with open(candidate_path, "wb") as f:
                f.write(video_bytes)
            file_path = candidate_path
            # Registration happens after the path is accepted: if it fails,
            # the warning below is printed but the saved video is still used.
            _register_video_for_session(session_id, file_path)
            try:
                file_size = os.path.getsize(file_path)
            except OSError:
                file_size = -1  # size is only used for the log line
            print(f"[Text2Video] Saved video to temp file: {file_path} (size={file_size} bytes)")
        except Exception as save_exc:
            print(f"[Text2Video] Warning: could not persist temp video file: {save_exc}")

        # Build file:// URL
        file_url = None
        if file_path:
            from pathlib import Path
            try:
                file_url = Path(file_path).as_uri()
            except ValueError:
                # as_uri() rejects relative paths
                file_url = None

        if not file_url:
            return "Error generating video (text-to-video): Could not persist video to a local file."

        video_html = (
            "<video controls autoplay muted loop playsinline style=\"max-width: 100%; height: auto; border-radius: 8px; margin: 10px 0;\">"
            f"<source src=\"{file_url}\" type=\"video/mp4\" />"
            "Your browser does not support the video tag."
            "</video>"
        )
        print("[Text2Video] Successfully generated video HTML tag from text")
        return video_html
    except Exception as e:
        import traceback
        print("[Text2Video] Exception during generation:")
        traceback.print_exc()
        print(f"Text-to-video generation error: {str(e)}")
        return f"Error generating video (text-to-video): {str(e)}"
|
| 1526 |
+
|
| 1527 |
def extract_image_prompts_from_text(text: str, num_images_needed: int = 1) -> list:
|
| 1528 |
"""Extract image generation prompts from the full text based on number of images needed"""
|
| 1529 |
# Use the entire text as the base prompt for image generation
|
|
|
|
| 1743 |
# If no <body>, just append
|
| 1744 |
return f"{SEARCH_START}\n\n{DIVIDER}\n{image_html}\n{REPLACE_END}"
|
| 1745 |
|
| 1746 |
+
def _placeholder_search_variations(placeholder: str) -> list[str]:
    """Return distinct textual variants of an <img> tag to use as search strings."""
    import re

    collapsed = re.sub(r'\s+', ' ', placeholder.strip())
    candidates = [
        placeholder,
        collapsed,
        collapsed.replace('"', "'"),
        collapsed.replace("'", '"'),
    ]
    # dict.fromkeys keeps first-seen order while dropping repeats, so no
    # search/replace block is emitted more than once for the same text.
    return list(dict.fromkeys(candidates))


def create_video_replacement_blocks_text_to_video(html_content: str, prompt: str, session_id: Optional[str] = None) -> str:
    """Create search/replace blocks that generate and insert ONLY ONE text-to-video result.

    Replaces the first detected <img> placeholder; if none found, inserts one video near the top of <body>.

    Args:
        html_content: HTML document to scan for an image slot.
        prompt: Text prompt passed to ``generate_video_from_text``.
        session_id: Forwarded to ``generate_video_from_text``.

    Returns:
        One or more blocks built from ``SEARCH_START`` / ``DIVIDER`` /
        ``REPLACE_END``, joined by blank lines. Returns "" when the prompt is
        empty or blank, or when video generation reports an error.
    """
    if not prompt or not prompt.strip():
        return ""

    import re

    # Detect the same placeholders as image counterparts, to replace the first image slot with a video
    placeholder_patterns = [
        r'<img[^>]*src=["\'](?:placeholder|dummy|sample|example)[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']https?://via\.placeholder\.com[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']https?://picsum\.photos[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']https?://dummyimage\.com[^"\']*["\'][^>]*>',
        r'<img[^>]*alt=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
        r'<img[^>]*class=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
        r'<img[^>]*id=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']data:image[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']#["\'][^>]*>',
        r'<img[^>]*src=["\']about:blank["\'][^>]*>',
    ]

    placeholder_images = []
    for pattern in placeholder_patterns:
        placeholder_images.extend(re.findall(pattern, html_content, re.IGNORECASE))

    # No recognizable placeholder: fall back to any <img> tag at all.
    if not placeholder_images:
        placeholder_images = re.findall(r'<img[^>]*>', html_content)

    video_html = generate_video_from_text(prompt, session_id=session_id)
    if video_html.startswith("Error"):
        return ""

    # Replace first placeholder if present
    if placeholder_images:
        blocks = [
            f"{SEARCH_START}\n{variation}\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
            for variation in _placeholder_search_variations(placeholder_images[0])
        ]
        return '\n\n'.join(blocks)

    # Otherwise insert after <body>
    body_start = html_content.find('<body')
    if body_start != -1:
        tag_close = html_content.find('>', body_start)
        # An unterminated "<body" has no usable insertion point; fall through
        # to the append form below instead of searching for a bogus prefix.
        if tag_close != -1:
            insertion_point = html_content[:tag_close + 1] + '\n '
            return (
                f"{SEARCH_START}\n{insertion_point}\n{DIVIDER}\n"
                f"{insertion_point}\n{video_html}\n{REPLACE_END}"
            )

    # If no <body>, just append
    return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
|
| 1818 |
+
|
| 1819 |
def create_image_replacement_blocks_from_input_image(html_content: str, user_prompt: str, input_image_data, max_images: int = 1) -> str:
|
| 1820 |
"""Create search/replace blocks using image-to-image generation with a provided input image.
|
| 1821 |
|
|
|
|
| 1988 |
print("[Image2Video] No <body> tag; appending video via replacement block")
|
| 1989 |
return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
|
| 1990 |
|
| 1991 |
+
def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None, enable_image_to_video: bool = False, image_to_video_prompt: str | None = None, session_id: Optional[str] = None, enable_text_to_video: bool = False, text_to_video_prompt: str | None = None) -> str:
|
| 1992 |
"""Apply text-to-image and/or image-to-image replacements to HTML content.
|
| 1993 |
|
| 1994 |
If both toggles are enabled, text-to-image replacements run first, then image-to-image.
|
|
|
|
| 2023 |
print("[MediaApply] No i2v replacement blocks generated")
|
| 2024 |
return result
|
| 2025 |
|
| 2026 |
+
# If text-to-video is enabled, insert a generated video (no input image required) and return.
|
| 2027 |
+
if enable_text_to_video and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
|
| 2028 |
+
t2v_prompt = (text_to_video_prompt or user_prompt or "").strip()
|
| 2029 |
+
print(f"[MediaApply] Running text-to-video with prompt len={len(t2v_prompt)}")
|
| 2030 |
+
blocks_tv = create_video_replacement_blocks_text_to_video(result, t2v_prompt, session_id=session_id)
|
| 2031 |
+
if blocks_tv:
|
| 2032 |
+
print("[MediaApply] Applying text-to-video replacement blocks")
|
| 2033 |
+
result = apply_search_replace_changes(result, blocks_tv)
|
| 2034 |
+
else:
|
| 2035 |
+
print("[MediaApply] No t2v replacement blocks generated")
|
| 2036 |
+
return result
|
| 2037 |
+
|
| 2038 |
# If an input image is provided and image-to-image is enabled, we only replace one image
|
| 2039 |
# and skip text-to-image to satisfy the requirement to replace exactly the number of uploaded images.
|
| 2040 |
if enable_image_to_image and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
|
|
|
|
| 2883 |
stop_generation = False
|
| 2884 |
|
| 2885 |
|
| 2886 |
+
def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None, enable_image_to_video: bool = False, image_to_video_prompt: Optional[str] = None, enable_text_to_video: bool = False, text_to_video_prompt: Optional[str] = None):
|
| 2887 |
if query is None:
|
| 2888 |
query = ''
|
| 2889 |
if _history is None:
|
|
|
|
| 3035 |
enable_image_to_video=enable_image_to_video,
|
| 3036 |
image_to_video_prompt=image_to_video_prompt,
|
| 3037 |
session_id=session_id,
|
| 3038 |
+
enable_text_to_video=enable_text_to_video,
|
| 3039 |
+
text_to_video_prompt=text_to_video_prompt,
|
| 3040 |
)
|
| 3041 |
|
| 3042 |
_history.append([query, final_content])
|
|
|
|
| 3202 |
enable_image_to_video=enable_image_to_video,
|
| 3203 |
image_to_video_prompt=image_to_video_prompt,
|
| 3204 |
session_id=session_id,
|
| 3205 |
+
enable_text_to_video=enable_text_to_video,
|
| 3206 |
+
text_to_video_prompt=text_to_video_prompt,
|
| 3207 |
)
|
| 3208 |
|
| 3209 |
yield {
|
|
|
|
| 3226 |
enable_image_to_video=enable_image_to_video,
|
| 3227 |
image_to_video_prompt=image_to_video_prompt,
|
| 3228 |
session_id=session_id,
|
| 3229 |
+
enable_text_to_video=enable_text_to_video,
|
| 3230 |
+
text_to_video_prompt=text_to_video_prompt,
|
| 3231 |
)
|
| 3232 |
|
| 3233 |
preview_val = None
|
|
|
|
| 3628 |
image_to_video_prompt=image_to_video_prompt,
|
| 3629 |
session_id=session_id,
|
| 3630 |
text_to_image_prompt=text_to_image_prompt,
|
| 3631 |
+
enable_text_to_video=enable_text_to_video,
|
| 3632 |
+
text_to_video_prompt=text_to_video_prompt,
|
| 3633 |
)
|
| 3634 |
|
| 3635 |
# Update history with the cleaned content
|
|
|
|
| 3657 |
enable_image_to_video=enable_image_to_video,
|
| 3658 |
image_to_video_prompt=image_to_video_prompt,
|
| 3659 |
session_id=session_id,
|
| 3660 |
+
enable_text_to_video=enable_text_to_video,
|
| 3661 |
+
text_to_video_prompt=text_to_video_prompt,
|
| 3662 |
)
|
| 3663 |
|
| 3664 |
_history.append([query, final_content])
|
|
|
|
| 4780 |
visible=False
|
| 4781 |
)
|
| 4782 |
|
| 4783 |
+
# Text-to-Video
|
| 4784 |
+
text_to_video_toggle = gr.Checkbox(
|
| 4785 |
+
label="📹 Generate Video (text → video)",
|
| 4786 |
+
value=False,
|
| 4787 |
+
visible=True,
|
| 4788 |
+
info="Generate a short video directly from your prompt using Wan-AI/Wan2.2-TI2V-5B"
|
| 4789 |
+
)
|
| 4790 |
+
text_to_video_prompt = gr.Textbox(
|
| 4791 |
+
label="Text-to-Video Prompt",
|
| 4792 |
+
placeholder="Describe the video to generate (e.g., 'A young man walking on the street')",
|
| 4793 |
+
lines=2,
|
| 4794 |
+
visible=False
|
| 4795 |
+
)
|
| 4796 |
+
|
| 4797 |
def on_image_to_image_toggle(toggled):
|
| 4798 |
# Show generation image input and its prompt when image-to-image is enabled
|
| 4799 |
return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
|
|
|
|
| 4819 |
inputs=[image_generation_toggle],
|
| 4820 |
outputs=[text_to_image_prompt]
|
| 4821 |
)
|
| 4822 |
+
text_to_video_toggle.change(
|
| 4823 |
+
on_text_to_image_toggle,
|
| 4824 |
+
inputs=[text_to_video_toggle],
|
| 4825 |
+
outputs=[text_to_video_prompt]
|
| 4826 |
+
)
|
| 4827 |
model_dropdown = gr.Dropdown(
|
| 4828 |
choices=[model['name'] for model in AVAILABLE_MODELS],
|
| 4829 |
value=DEFAULT_MODEL_NAME,
|
|
|
|
| 5074 |
show_progress="hidden",
|
| 5075 |
).then(
|
| 5076 |
generation_code,
|
| 5077 |
+
inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt, text_to_video_toggle, text_to_video_prompt],
|
| 5078 |
outputs=[code_output, history, sandbox, history_output]
|
| 5079 |
).then(
|
| 5080 |
end_generation_ui,
|