Spaces:
Running
on
Zero
Running
on
Zero
update app
Browse files
Implemented an image scaling feature by adding a slider to the user interface and integrating the scaling logic into the document processing function. This allows you to upscale the image before it is sent to the model, which can significantly improve the accuracy of text extraction, especially for documents with small characters.
app.py
CHANGED
|
@@ -129,7 +129,8 @@ def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: i
|
|
| 129 |
@spaces.GPU
|
| 130 |
def process_document_stream(
|
| 131 |
image: Image.Image,
|
| 132 |
-
prompt_input: str,
|
|
|
|
| 133 |
max_new_tokens: int,
|
| 134 |
temperature: float,
|
| 135 |
top_p: float,
|
|
@@ -146,6 +147,21 @@ def process_document_stream(
|
|
| 146 |
yield "Please enter a prompt.", ""
|
| 147 |
return
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
temp_image_path = None
|
| 150 |
try:
|
| 151 |
# --- FIX: Save the PIL Image to a temporary file ---
|
|
@@ -230,6 +246,16 @@ def create_gradio_interface():
|
|
| 230 |
image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
|
| 231 |
|
| 232 |
with gr.Accordion("Advanced Settings", open=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
|
| 234 |
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
|
| 235 |
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
|
|
@@ -276,7 +302,8 @@ def create_gradio_interface():
|
|
| 276 |
|
| 277 |
process_btn.click(
|
| 278 |
fn=process_document_stream,
|
| 279 |
-
|
|
|
|
| 280 |
outputs=[raw_output_stream, markdown_output]
|
| 281 |
)
|
| 282 |
|
|
|
|
| 129 |
@spaces.GPU
|
| 130 |
def process_document_stream(
|
| 131 |
image: Image.Image,
|
| 132 |
+
prompt_input: str,
|
| 133 |
+
image_scale_factor: float, # New parameter for image scaling
|
| 134 |
max_new_tokens: int,
|
| 135 |
temperature: float,
|
| 136 |
top_p: float,
|
|
|
|
| 147 |
yield "Please enter a prompt.", ""
|
| 148 |
return
|
| 149 |
|
| 150 |
+
# --- IMPLEMENTATION: Image Scaling based on user input ---
|
| 151 |
+
if image_scale_factor > 1.0:
|
| 152 |
+
try:
|
| 153 |
+
original_width, original_height = image.size
|
| 154 |
+
new_width = int(original_width * image_scale_factor)
|
| 155 |
+
new_height = int(original_height * image_scale_factor)
|
| 156 |
+
print(f"Scaling image from {image.size} to ({new_width}, {new_height}) with factor {image_scale_factor}.")
|
| 157 |
+
# Use a high-quality resampling filter for better results
|
| 158 |
+
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
| 159 |
+
except Exception as e:
|
| 160 |
+
print(f"Error during image scaling: {e}")
|
| 161 |
+
# Continue with the original image if scaling fails
|
| 162 |
+
pass
|
| 163 |
+
# --- END IMPLEMENTATION ---
|
| 164 |
+
|
| 165 |
temp_image_path = None
|
| 166 |
try:
|
| 167 |
# --- FIX: Save the PIL Image to a temporary file ---
|
|
|
|
| 246 |
image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
|
| 247 |
|
| 248 |
with gr.Accordion("Advanced Settings", open=False):
|
| 249 |
+
# --- NEW UI ELEMENT: Image Scaling Slider ---
|
| 250 |
+
image_scale_factor = gr.Slider(
|
| 251 |
+
minimum=1.0,
|
| 252 |
+
maximum=3.0,
|
| 253 |
+
value=1.0,
|
| 254 |
+
step=0.1,
|
| 255 |
+
label="Image Upscale Factor",
|
| 256 |
+
info="Increases image size before processing. Can improve OCR on small text. Default: 1.0 (no change)."
|
| 257 |
+
)
|
| 258 |
+
# --- END NEW UI ELEMENT ---
|
| 259 |
max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
|
| 260 |
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
|
| 261 |
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
|
|
|
|
| 302 |
|
| 303 |
process_btn.click(
|
| 304 |
fn=process_document_stream,
|
| 305 |
+
# --- UPDATE: Add the new slider to the inputs list ---
|
| 306 |
+
inputs=[image_input, prompt_input, image_scale_factor, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
| 307 |
outputs=[raw_output_stream, markdown_output]
|
| 308 |
)
|
| 309 |
|