Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,6 +11,48 @@ import numpy as np
|
|
| 11 |
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
|
| 12 |
CPU_DEVICE = "cpu"
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# --- Model and Processor Loading ---
|
| 15 |
print("Loading model and processor...")
|
| 16 |
try:
|
|
@@ -112,7 +154,7 @@ def process_and_generate(image_input, text_prompt, processing_size=512):
|
|
| 112 |
if model.config.pad_token_id is None:
|
| 113 |
model.config.pad_token_id = model.config.eos_token_id
|
| 114 |
|
| 115 |
-
generated_ids = model.generate(**inputs, max_new_tokens=
|
| 116 |
|
| 117 |
generated_ids_trimmed = [
|
| 118 |
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
|
@@ -139,12 +181,18 @@ with gr.Blocks() as demo:
|
|
| 139 |
"""
|
| 140 |
)
|
| 141 |
|
| 142 |
-
with gr.Accordion("
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
max_dim_slider = gr.Slider(
|
| 149 |
minimum=512,
|
| 150 |
maximum=2048,
|
|
@@ -167,10 +215,10 @@ with gr.Blocks() as demo:
|
|
| 167 |
screenshot_button = gr.Button("Capture Screenshot")
|
| 168 |
|
| 169 |
with gr.Row():
|
| 170 |
-
with gr.Column():
|
| 171 |
image_output = gr.Image(type="numpy", label="Screenshot")
|
| 172 |
-
with gr.Column():
|
| 173 |
-
text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.", value="Describe this
|
| 174 |
submit_button = gr.Button("Generate Markdown")
|
| 175 |
|
| 176 |
with gr.Row():
|
|
@@ -183,16 +231,21 @@ with gr.Blocks() as demo:
|
|
| 183 |
return screenshot_path
|
| 184 |
else:
|
| 185 |
raise gr.Error(screenshot_path)
|
| 186 |
-
|
| 187 |
# --- Function to handle the loading UI and processing ---
|
| 188 |
-
def generate_markdown_with_loading(image,
|
| 189 |
# Return a dictionary of updates to show the loading state
|
| 190 |
yield {
|
| 191 |
-
output_text: "Processing, please wait...",
|
| 192 |
submit_button: gr.update(interactive=False)
|
| 193 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
# Process the data
|
| 195 |
-
result = process_and_generate(image,
|
|
|
|
| 196 |
# Return a dictionary of updates with the final result
|
| 197 |
yield {
|
| 198 |
output_text: result,
|
|
@@ -208,7 +261,7 @@ with gr.Blocks() as demo:
|
|
| 208 |
|
| 209 |
submit_button.click(
|
| 210 |
fn=generate_markdown_with_loading,
|
| 211 |
-
inputs=[image_output, text_prompt, processing_size_slider],
|
| 212 |
outputs=[output_text, submit_button]
|
| 213 |
)
|
| 214 |
|
|
|
|
| 11 |
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
|
| 12 |
CPU_DEVICE = "cpu"
|
| 13 |
|
| 14 |
+
|
| 15 |
+
# --- DETAILED PROMPT TEMPLATE ---
|
| 16 |
+
DETAILED_ANALYSIS_PROMPT = """
|
| 17 |
+
Analyze the provided webpage screenshot and generate a detailed markdown document. Break down the page into logical sections. For each section, identify and list all significant elements such as headers, navigation links, buttons, images, forms, and footer content. Your analysis should be structured as follows:
|
| 18 |
+
|
| 19 |
+
# [Page Title or a General Description]
|
| 20 |
+
|
| 21 |
+
## 1. Overall Summary
|
| 22 |
+
Provide a brief, high-level overview of the webpage's purpose, primary content, and general layout.
|
| 23 |
+
|
| 24 |
+
## 2. Header / Navigation Bar
|
| 25 |
+
- **Logo:** Describe the logo if present.
|
| 26 |
+
- **Navigation Links:** List all visible navigation links (e.g., Home, About, Services, Contact).
|
| 27 |
+
- **Buttons / CTAs:** List any buttons, such as "Sign In," "Register," or "Get Started."
|
| 28 |
+
- **Search Bar:** Note if a search bar is present.
|
| 29 |
+
|
| 30 |
+
## 3. Main Content / Body
|
| 31 |
+
This is the primary section of the page. Break it down logically.
|
| 32 |
+
- **Hero Section (if applicable):** Describe the main banner, headline, subheading, and primary call-to-action (CTA).
|
| 33 |
+
- **Key Sections:** Identify and describe each subsequent section. For each, list:
|
| 34 |
+
- **Headings and Subheadings.**
|
| 35 |
+
- **Text/Paragraphs:** Briefly summarize the content.
|
| 36 |
+
- **Images:** Describe the images and their purpose.
|
| 37 |
+
- **Buttons and Links:** List all interactive elements and their text.
|
| 38 |
+
- **Forms:** Describe any input fields, labels, and submission buttons.
|
| 39 |
+
|
| 40 |
+
## 4. Sidebar (if present)
|
| 41 |
+
Describe the content of the sidebar, including any navigation, filters, or advertisements.
|
| 42 |
+
|
| 43 |
+
## 5. Footer
|
| 44 |
+
- **Footer Links:** Create a comprehensive list of all links, often categorized under headings like "Company," "Resources," or "Legal."
|
| 45 |
+
- **Social Media Icons:** List any social media platforms linked.
|
| 46 |
+
- **Contact Information:** Note any address, phone number, or email.
|
| 47 |
+
- **Copyright Information:** State the copyright notice.
|
| 48 |
+
|
| 49 |
+
## 6. General UI/UX Observations
|
| 50 |
+
- **Color Scheme:** Describe the primary colors used.
|
| 51 |
+
- **Typography:** Comment on the font styles (headings vs. body).
|
| 52 |
+
- **Layout:** Describe the overall structure (e.g., single-column, grid-based, etc.).
|
| 53 |
+
"""
|
| 54 |
+
|
| 55 |
+
|
| 56 |
# --- Model and Processor Loading ---
|
| 57 |
print("Loading model and processor...")
|
| 58 |
try:
|
|
|
|
| 154 |
if model.config.pad_token_id is None:
|
| 155 |
model.config.pad_token_id = model.config.eos_token_id
|
| 156 |
|
| 157 |
+
generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=True, top_p=0.8, temperature=0.7) # Increased tokens for detailed prompt
|
| 158 |
|
| 159 |
generated_ids_trimmed = [
|
| 160 |
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
|
|
|
| 181 |
"""
|
| 182 |
)
|
| 183 |
|
| 184 |
+
with gr.Accordion("Controls", open=True):
|
| 185 |
+
with gr.Row():
|
| 186 |
+
use_template_checkbox = gr.Checkbox(
|
| 187 |
+
value=True,
|
| 188 |
+
label="Use Detailed Analysis Template",
|
| 189 |
+
info="If checked, uses a comprehensive prompt to dissect the page into sections. If unchecked, uses the prompt in the textbox below."
|
| 190 |
+
)
|
| 191 |
+
full_page_checkbox = gr.Checkbox(
|
| 192 |
+
value=True,
|
| 193 |
+
label="Enable Full Height Page Capture",
|
| 194 |
+
info="If checked, captures the entire scrollable webpage. If unchecked, captures only the visible part."
|
| 195 |
+
)
|
| 196 |
max_dim_slider = gr.Slider(
|
| 197 |
minimum=512,
|
| 198 |
maximum=2048,
|
|
|
|
| 215 |
screenshot_button = gr.Button("Capture Screenshot")
|
| 216 |
|
| 217 |
with gr.Row():
|
| 218 |
+
with gr.Column(scale=1):
|
| 219 |
image_output = gr.Image(type="numpy", label="Screenshot")
|
| 220 |
+
with gr.Column(scale=1):
|
| 221 |
+
text_prompt = gr.Textbox(label="Custom Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.", value="Describe this page's color scheme.")
|
| 222 |
submit_button = gr.Button("Generate Markdown")
|
| 223 |
|
| 224 |
with gr.Row():
|
|
|
|
| 231 |
return screenshot_path
|
| 232 |
else:
|
| 233 |
raise gr.Error(screenshot_path)
|
| 234 |
+
|
| 235 |
# --- Function to handle the loading UI and processing ---
|
| 236 |
+
def generate_markdown_with_loading(image, user_prompt, processing_size, use_template):
|
| 237 |
# Return a dictionary of updates to show the loading state
|
| 238 |
yield {
|
| 239 |
+
output_text: "## Processing, please wait... ⏳",
|
| 240 |
submit_button: gr.update(interactive=False)
|
| 241 |
}
|
| 242 |
+
|
| 243 |
+
# Determine which prompt to use
|
| 244 |
+
final_prompt = DETAILED_ANALYSIS_PROMPT if use_template else user_prompt
|
| 245 |
+
|
| 246 |
# Process the data
|
| 247 |
+
result = process_and_generate(image, final_prompt, processing_size)
|
| 248 |
+
|
| 249 |
# Return a dictionary of updates with the final result
|
| 250 |
yield {
|
| 251 |
output_text: result,
|
|
|
|
| 261 |
|
| 262 |
submit_button.click(
|
| 263 |
fn=generate_markdown_with_loading,
|
| 264 |
+
inputs=[image_output, text_prompt, processing_size_slider, use_template_checkbox],
|
| 265 |
outputs=[output_text, submit_button]
|
| 266 |
)
|
| 267 |
|