broadfield-dev committed on
Commit
274aee4
·
verified ·
1 Parent(s): 8b307ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -15
app.py CHANGED
@@ -11,6 +11,48 @@ import numpy as np
11
  MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
12
  CPU_DEVICE = "cpu"
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # --- Model and Processor Loading ---
15
  print("Loading model and processor...")
16
  try:
@@ -112,7 +154,7 @@ def process_and_generate(image_input, text_prompt, processing_size=512):
112
  if model.config.pad_token_id is None:
113
  model.config.pad_token_id = model.config.eos_token_id
114
 
115
- generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=True, top_p=0.8, temperature=0.7)
116
 
117
  generated_ids_trimmed = [
118
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -139,12 +181,18 @@ with gr.Blocks() as demo:
139
  """
140
  )
141
 
142
- with gr.Accordion("Performance Controls", open=True):
143
- full_page_checkbox = gr.Checkbox(
144
- value=True,
145
- label="Enable Full Height Page Capture",
146
- info="If checked, captures the entire scrollable webpage. If unchecked, captures only the visible part."
147
- )
 
 
 
 
 
 
148
  max_dim_slider = gr.Slider(
149
  minimum=512,
150
  maximum=2048,
@@ -167,10 +215,10 @@ with gr.Blocks() as demo:
167
  screenshot_button = gr.Button("Capture Screenshot")
168
 
169
  with gr.Row():
170
- with gr.Column():
171
  image_output = gr.Image(type="numpy", label="Screenshot")
172
- with gr.Column():
173
- text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.", value="Describe this webpage in detail as a markdown document.")
174
  submit_button = gr.Button("Generate Markdown")
175
 
176
  with gr.Row():
@@ -183,16 +231,21 @@ with gr.Blocks() as demo:
183
  return screenshot_path
184
  else:
185
  raise gr.Error(screenshot_path)
186
-
187
  # --- Function to handle the loading UI and processing ---
188
- def generate_markdown_with_loading(image, prompt, size):
189
  # Return a dictionary of updates to show the loading state
190
  yield {
191
- output_text: "Processing, please wait...",
192
  submit_button: gr.update(interactive=False)
193
  }
 
 
 
 
194
  # Process the data
195
- result = process_and_generate(image, prompt, size)
 
196
  # Return a dictionary of updates with the final result
197
  yield {
198
  output_text: result,
@@ -208,7 +261,7 @@ with gr.Blocks() as demo:
208
 
209
  submit_button.click(
210
  fn=generate_markdown_with_loading,
211
- inputs=[image_output, text_prompt, processing_size_slider],
212
  outputs=[output_text, submit_button]
213
  )
214
 
 
11
  MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
12
  CPU_DEVICE = "cpu"
13
 
14
+
15
+ # --- DETAILED PROMPT TEMPLATE ---
16
+ DETAILED_ANALYSIS_PROMPT = """
17
+ Analyze the provided webpage screenshot and generate a detailed markdown document. Break down the page into logical sections. For each section, identify and list all significant elements such as headers, navigation links, buttons, images, forms, and footer content. Your analysis should be structured as follows:
18
+
19
+ # [Page Title or a General Description]
20
+
21
+ ## 1. Overall Summary
22
+ Provide a brief, high-level overview of the webpage's purpose, primary content, and general layout.
23
+
24
+ ## 2. Header / Navigation Bar
25
+ - **Logo:** Describe the logo if present.
26
+ - **Navigation Links:** List all visible navigation links (e.g., Home, About, Services, Contact).
27
+ - **Buttons / CTAs:** List any buttons, such as "Sign In," "Register," or "Get Started."
28
+ - **Search Bar:** Note if a search bar is present.
29
+
30
+ ## 3. Main Content / Body
31
+ This is the primary section of the page. Break it down logically.
32
+ - **Hero Section (if applicable):** Describe the main banner, headline, subheading, and primary call-to-action (CTA).
33
+ - **Key Sections:** Identify and describe each subsequent section. For each, list:
34
+ - **Headings and Subheadings.**
35
+ - **Text/Paragraphs:** Briefly summarize the content.
36
+ - **Images:** Describe the images and their purpose.
37
+ - **Buttons and Links:** List all interactive elements and their text.
38
+ - **Forms:** Describe any input fields, labels, and submission buttons.
39
+
40
+ ## 4. Sidebar (if present)
41
+ Describe the content of the sidebar, including any navigation, filters, or advertisements.
42
+
43
+ ## 5. Footer
44
+ - **Footer Links:** Create a comprehensive list of all links, often categorized under headings like "Company," "Resources," or "Legal."
45
+ - **Social Media Icons:** List any social media platforms linked.
46
+ - **Contact Information:** Note any address, phone number, or email.
47
+ - **Copyright Information:** State the copyright notice.
48
+
49
+ ## 6. General UI/UX Observations
50
+ - **Color Scheme:** Describe the primary colors used.
51
+ - **Typography:** Comment on the font styles (headings vs. body).
52
+ - **Layout:** Describe the overall structure (e.g., single-column, grid-based, etc.).
53
+ """
54
+
55
+
56
  # --- Model and Processor Loading ---
57
  print("Loading model and processor...")
58
  try:
 
154
  if model.config.pad_token_id is None:
155
  model.config.pad_token_id = model.config.eos_token_id
156
 
157
+ generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=True, top_p=0.8, temperature=0.7) # Increased tokens for detailed prompt
158
 
159
  generated_ids_trimmed = [
160
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
181
  """
182
  )
183
 
184
+ with gr.Accordion("Controls", open=True):
185
+ with gr.Row():
186
+ use_template_checkbox = gr.Checkbox(
187
+ value=True,
188
+ label="Use Detailed Analysis Template",
189
+ info="If checked, uses a comprehensive prompt to dissect the page into sections. If unchecked, uses the prompt in the textbox below."
190
+ )
191
+ full_page_checkbox = gr.Checkbox(
192
+ value=True,
193
+ label="Enable Full Height Page Capture",
194
+ info="If checked, captures the entire scrollable webpage. If unchecked, captures only the visible part."
195
+ )
196
  max_dim_slider = gr.Slider(
197
  minimum=512,
198
  maximum=2048,
 
215
  screenshot_button = gr.Button("Capture Screenshot")
216
 
217
  with gr.Row():
218
+ with gr.Column(scale=1):
219
  image_output = gr.Image(type="numpy", label="Screenshot")
220
+ with gr.Column(scale=1):
221
+ text_prompt = gr.Textbox(label="Custom Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.", value="Describe this page's color scheme.")
222
  submit_button = gr.Button("Generate Markdown")
223
 
224
  with gr.Row():
 
231
  return screenshot_path
232
  else:
233
  raise gr.Error(screenshot_path)
234
+
235
  # --- Function to handle the loading UI and processing ---
236
+ def generate_markdown_with_loading(image, user_prompt, processing_size, use_template):
237
  # Return a dictionary of updates to show the loading state
238
  yield {
239
+ output_text: "## Processing, please wait...",
240
  submit_button: gr.update(interactive=False)
241
  }
242
+
243
+ # Determine which prompt to use
244
+ final_prompt = DETAILED_ANALYSIS_PROMPT if use_template else user_prompt
245
+
246
  # Process the data
247
+ result = process_and_generate(image, final_prompt, processing_size)
248
+
249
  # Return a dictionary of updates with the final result
250
  yield {
251
  output_text: result,
 
261
 
262
  submit_button.click(
263
  fn=generate_markdown_with_loading,
264
+ inputs=[image_output, text_prompt, processing_size_slider, use_template_checkbox],
265
  outputs=[output_text, submit_button]
266
  )
267