"""Gradio demo: screenshot a web page with Playwright and query Molmo about it."""

import io
import os
import subprocess

import gradio as gr
import spaces
import torch
from PIL import Image
from playwright.sync_api import sync_playwright
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Install the Chromium build Playwright drives; fails fast if it cannot.
subprocess.run(["playwright", "install", "chromium"], check=True)

model_id = "allenai/Molmo-7B-D-0924"
# Single source of truth for the weight dtype: used when loading the model
# and when casting the image tensor so both sides of the matmul agree.
MODEL_DTYPE = torch.bfloat16

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Load the model once at start-up so every request reuses the same weights.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=MODEL_DTYPE,
    low_cpu_mem_usage=True,
    device_map="auto",
)
model.eval()


def _normalize_url(url):
    """Return *url* stripped, with ``https://`` prepended when no scheme is given."""
    cleaned = (url or "").strip()
    if not cleaned:
        raise ValueError("URL не указан")
    # Match the full scheme prefix: a bare "http" test would treat hosts such
    # as "httpbin.org" as already having a scheme.
    if cleaned.lower().startswith(("http://", "https://")):
        return cleaned
    return f"https://{cleaned}"


def _capture_screenshot(target_url):
    """Open *target_url* in headless Chromium and return a screenshot as an RGB image."""
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-dev-shm-usage",
            ],
        )
        try:
            context = browser.new_context(viewport={"width": 1280, "height": 720})
            page = context.new_page()
            page.goto(target_url, timeout=60000)
            # Fixed pause after navigation before taking the screenshot.
            page.wait_for_timeout(3000)
            # Keep the PNG in memory: a fixed path on disk would be shared
            # (and overwritten) by concurrent requests.
            png_bytes = page.screenshot()
        finally:
            # Always release the browser, on success and on failure alike.
            browser.close()

    image = Image.open(io.BytesIO(png_bytes)).convert("RGB")
    return image


def _ask_model(image, prompt):
    """Run Molmo on *image* with *prompt* and return only the generated text."""
    inputs = processor.process(images=[image], text=prompt)
    # Move tensors to the model's device and add the batch dimension that
    # generate_from_batch expects (a batch of size 1).
    batch = {
        key: value.to(model.device).unsqueeze(0) if torch.is_tensor(value) else value
        for key, value in inputs.items()
    }
    # The weights are bfloat16, so the pixel tensor must be cast to match.
    if torch.is_tensor(batch.get("images")):
        batch["images"] = batch["images"].to(MODEL_DTYPE)

    with torch.no_grad():
        output = model.generate_from_batch(
            batch,
            GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
            # stop_strings can only be applied when the tokenizer is supplied.
            tokenizer=processor.tokenizer,
        )

    # The output holds prompt + completion; slice the prompt off so the
    # answer does not echo the user's request.
    prompt_length = batch["input_ids"].size(1)
    generated_tokens = output[0, prompt_length:]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)


@spaces.GPU(duration=120)
def run_agent(url, prompt):
    """Screenshot *url* and ask the model *prompt* about the screenshot.

    Args:
        url: Page address; ``https://`` is assumed when no scheme is given.
        prompt: Instruction passed to the model together with the screenshot.

    Returns:
        A ``(image, text)`` tuple. On success ``image`` is the screenshot as a
        PIL image and ``text`` is the model's answer. On any failure ``image``
        is ``None`` and ``text`` is an error message prefixed with "Ошибка: ".
    """
    try:
        target_url = _normalize_url(url)
        # The browser is closed before inference starts, so Chromium is not
        # held open while the model runs.
        image = _capture_screenshot(target_url)
        generated_text = _ask_model(image, prompt)
        return image, generated_text
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return None, f"Ошибка: {e}"


with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Molmo AI Web Agent")
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="URL", value="google.com")
            prompt_input = gr.Textbox(label="Запрос", value="Point to the search bar")
            btn = gr.Button("Запустить")
    with gr.Row():
        out_img = gr.Image(label="Скриншот", type="pil")
        out_txt = gr.Textbox(label="Ответ")
    btn.click(
        fn=run_agent,
        inputs=[url_input, prompt_input],
        outputs=[out_img, out_txt],
    )

demo.launch()