# Playwright / app.py
# rafael1994s's picture
# Update app.py
# 6bf082d verified
import gradio as gr
from playwright.sync_api import sync_playwright
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import os
import subprocess
import spaces
# Download the Chromium build that Playwright drives. check=True makes the
# app fail at startup if the install command exits with a non-zero status.
subprocess.run(["playwright", "install", "chromium"], check=True)

model_id = "allenai/Molmo-7B-D-0924"

# The processor and the model are loaded once, at import time, and shared by
# every request. trust_remote_code=True is passed to both loaders, so code
# shipped in the model repository is executed.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()  # inference only: switch to evaluation mode right after loading
def _normalize_url(url):
    """Return *url* without surrounding whitespace and with an HTTP(S) scheme.

    ``https://`` is prepended when the value does not already start with
    ``http://`` or ``https://`` (case-insensitive).

    Raises:
        ValueError: if *url* is ``None``, empty or only whitespace.
    """
    cleaned = (url or "").strip()
    if not cleaned:
        raise ValueError("URL не указан")
    # Match the full scheme prefix: a bare "http" check would also accept
    # host names such as "httpbin.org" and leave them without a scheme.
    if cleaned.lower().startswith(("http://", "https://")):
        return cleaned
    return f"https://{cleaned}"


def _capture_screenshot(target_url):
    """Open *target_url* in headless Chromium and return a screenshot as an RGB PIL image."""
    import io

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
        )
        try:
            context = browser.new_context(viewport={"width": 1280, "height": 720})
            page = context.new_page()
            page.goto(target_url, timeout=60000)
            # Fixed 3 s pause after navigation before taking the screenshot.
            page.wait_for_timeout(3000)
            # Keep the PNG in memory instead of a fixed /tmp path so that
            # concurrent requests cannot overwrite each other's screenshot.
            png_bytes = page.screenshot()
        finally:
            # Single cleanup point for both the success and the error path;
            # the browser is also closed before the model starts generating.
            browser.close()

    return Image.open(io.BytesIO(png_bytes)).convert("RGB")


def _query_model(image, prompt):
    """Run the module-level Molmo model on *image* with *prompt* and return the decoded answer."""
    from transformers import GenerationConfig

    inputs = processor.process(images=[image], text=prompt)
    # Move tensors to the model's device and add a batch dimension of size 1,
    # as in the Molmo model card usage example.
    inputs = {
        k: v.to(model.device).unsqueeze(0) if torch.is_tensor(v) else v
        for k, v in inputs.items()
    }
    with torch.no_grad():
        output = model.generate_from_batch(
            inputs,
            GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
            # generate() needs the tokenizer to be able to apply stop_strings.
            tokenizer=processor.tokenizer,
        )
    # The returned sequence starts with the prompt tokens; decode only the
    # tokens produced after them.
    generated_tokens = output[0, inputs["input_ids"].size(1):]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)


@spaces.GPU(duration=120)
def run_agent(url, prompt):
    """Screenshot a web page and ask the Molmo model about it.

    Args:
        url: Page address; ``https://`` is added when no HTTP(S) scheme is given.
        prompt: Text instruction passed to the model together with the screenshot.

    Returns:
        ``(image, generated_text)`` on success, where ``image`` is the RGB
        screenshot as a PIL image. On any failure ``(None, "Ошибка: <details>")``
        is returned so the UI shows the message instead of raising.
    """
    try:
        image = _capture_screenshot(_normalize_url(url))
        generated_text = _query_model(image, prompt)
    except Exception as e:
        # UI boundary: report every failure as text in the answer box.
        return None, f"Ошибка: {str(e)}"
    return image, generated_text
# Gradio UI: URL and prompt inputs with a run button, followed by the
# screenshot and the model's answer.
# NOTE(review): the layout nesting below is reconstructed from a source whose
# indentation was lost — confirm it against the deployed app.
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Molmo AI Web Agent")

    # Input controls live in a single column inside the first row.
    with gr.Row(), gr.Column():
        url_input = gr.Textbox(value="google.com", label="URL")
        prompt_input = gr.Textbox(value="Point to the search bar", label="Запрос")
        btn = gr.Button("Запустить")

    # Outputs: the captured screenshot and the generated text, side by side.
    with gr.Row():
        out_img = gr.Image(type="pil", label="Скриншот")
        out_txt = gr.Textbox(label="Ответ")

    # run_agent returns (image, text), matching the order of the outputs list.
    btn.click(
        run_agent,
        inputs=[url_input, prompt_input],
        outputs=[out_img, out_txt],
    )

demo.launch()