Spaces:

coderprabhat
/

olmOCR

Runtime error

App Files Files Community

coderprabhat committed on Nov 4

Commit

55a0a6c

1 Parent(s): 1c1eb03

fix : bugs

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +26 -73
requirements.txt +1 -2

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 📄
 colorFrom: blue
 colorTo: green
 sdk: gradio
-sdk_version: 4.0.0
 app_file: app.py
 python_version: 3.11
 pinned: false

 colorFrom: blue
 colorTo: green
 sdk: gradio
+sdk_version: 5.49.1
 app_file: app.py
 python_version: 3.11
 pinned: false

app.py CHANGED Viewed

@@ -3,55 +3,45 @@ import base64
 import gradio as gr
 from io import BytesIO
 from PIL import Image
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
 import warnings
 warnings.filterwarnings('ignore')
-# Initialize the model with CPU optimizations
-print("Loading model... This may take a few minutes on CPU")
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "allenai/olmOCR-2-7B-1025",
-    torch_dtype=torch.float32,  # Use float32 for CPU
-    low_cpu_mem_usage=True,     # Optimize memory usage
 ).eval()
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-device = torch.device("cpu")
-model.to(device)
 print("Model loaded successfully")
 def process_document(file, page_number, max_tokens):
-    """
-    Process a PDF or image file and extract text using olmOCR
-    Args:
-        file: Uploaded file (PDF, PNG, or JPEG)
-        page_number: Page number to process (for PDFs)
-        max_tokens: Maximum number of tokens to generate
-    Returns:
-        Extracted text output and processed image
-    """
     if file is None:
         return "Please upload a file first.", None
     try:
         # Handle different file types
         if file.name.endswith('.pdf'):
-            # Render PDF page to base64 image with smaller size for CPU
             image_base64 = render_pdf_to_base64png(
                 file.name,
                 page_number,
-                target_longest_image_dim=1024  # Reduced from 1288 for CPU
             )
             main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
         else:
-            # Handle image files directly
             main_image = Image.open(file.name)
-            # Resize large images for CPU efficiency
-            max_size = 1024
             if max(main_image.size) > max_size:
                 main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
@@ -59,7 +49,6 @@ def process_document(file, page_number, max_tokens):
             main_image.save(buffered, format="PNG")
             image_base64 = base64.b64encode(buffered.getvalue()).decode()
-        # Build the full prompt
         messages = [
             {
                 "role": "user",
@@ -70,7 +59,6 @@ def process_document(file, page_number, max_tokens):
             }
         ]
-        # Apply the chat template and processor
         text = processor.apply_chat_template(
             messages,
             tokenize=False,
@@ -83,20 +71,17 @@ def process_document(file, page_number, max_tokens):
             padding=True,
             return_tensors="pt",
         )
-        inputs = {key: value.to(device) for (key, value) in inputs.items()}
-        # Generate with CPU-optimized settings
-        with torch.no_grad():  # Disable gradient computation for inference
             output = model.generate(
                 **inputs,
                 temperature=0.1,
-                max_new_tokens=max_tokens,
                 num_return_sequences=1,
-                do_sample=False,  # Greedy decoding is faster on CPU
-                num_beams=1,      # No beam search for speed
             )
-        # Decode the output
         prompt_length = inputs["input_ids"].shape[1]
         new_tokens = output[:, prompt_length:]
         text_output = processor.tokenizer.batch_decode(
@@ -106,16 +91,12 @@ def process_document(file, page_number, max_tokens):
         return text_output[0], main_image
     except Exception as e:
-        return f"Error processing file: {str(e)}", None
-# Create Gradio interface
 with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
-    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
-    gr.Markdown("""
-    Upload a PDF or image file to extract text using the olmOCR model.
-    ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
-    """)
     with gr.Row():
         with gr.Column():
@@ -123,35 +104,12 @@ with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
                 label="Upload Document (PDF, PNG, or JPEG)",
                 file_types=[".pdf", ".png", ".jpg", ".jpeg"]
             )
-            page_number = gr.Slider(
-                minimum=1,
-                maximum=50,
-                value=1,
-                step=1,
-                label="Page Number (for PDFs)"
-            )
-            max_tokens = gr.Slider(
-                minimum=100,
-                maximum=1024,  # Reduced max for CPU
-                value=512,
-                step=50,
-                label="Max Tokens"
-            )
             process_btn = gr.Button("Extract Text", variant="primary")
-            gr.Markdown("""
-            ### Tips for CPU Usage:
-            - Smaller images process faster
-            - First run may be slower (model loading)
-            - Reduce max tokens for faster results
-            """)
         with gr.Column():
-            output_text = gr.Textbox(
-                label="Extracted Text",
-                lines=20,
-                placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds."
-            )
             output_image = gr.Image(label="Processed Image")
     process_btn.click(
@@ -159,12 +117,7 @@ with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
         inputs=[file_input, page_number, max_tokens],
         outputs=[output_text, output_image]
     )
-    gr.Examples(
-        examples=[],
-        inputs=[file_input]
-    )
 if __name__ == "__main__":
-    demo.queue(max_size=3)  # Limit queue to prevent overload
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 from io import BytesIO
 from PIL import Image
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
 import warnings
 warnings.filterwarnings('ignore')
+# Configure 8-bit quantization to reduce memory
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_enable_fp32_cpu_offload=True
+)
+print("Loading model with 8-bit quantization...")
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "allenai/olmOCR-2-7B-1025",
+    quantization_config=quantization_config,
+    device_map="auto",
+    low_cpu_mem_usage=True,
 ).eval()
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 print("Model loaded successfully")
 def process_document(file, page_number, max_tokens):
     if file is None:
         return "Please upload a file first.", None
     try:
         # Handle different file types
         if file.name.endswith('.pdf'):
             image_base64 = render_pdf_to_base64png(
                 file.name,
                 page_number,
+                target_longest_image_dim=896  # Further reduced for memory
             )
             main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
         else:
             main_image = Image.open(file.name)
+            max_size = 896  # Reduced image size
             if max(main_image.size) > max_size:
                 main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
             main_image.save(buffered, format="PNG")
             image_base64 = base64.b64encode(buffered.getvalue()).decode()
         messages = [
             {
                 "role": "user",
             }
         ]
         text = processor.apply_chat_template(
             messages,
             tokenize=False,
             padding=True,
             return_tensors="pt",
         )
+        # Generate with memory optimization
+        with torch.no_grad():
             output = model.generate(
                 **inputs,
                 temperature=0.1,
+                max_new_tokens=min(max_tokens, 256),  # Limit tokens
                 num_return_sequences=1,
+                do_sample=False,
             )
         prompt_length = inputs["input_ids"].shape[1]
         new_tokens = output[:, prompt_length:]
         text_output = processor.tokenizer.batch_decode(
         return text_output[0], main_image
     except Exception as e:
+        return f"Error: {str(e)}", None
+# Create Gradio interface (same as before, but update max_tokens)
 with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
+    gr.Markdown("# olmOCR: Document OCR (Quantized)")
+    gr.Markdown("⚠️ **Note**: Using 8-bit quantization for CPU compatibility. Processing may take 60-120 seconds.")
     with gr.Row():
         with gr.Column():
                 label="Upload Document (PDF, PNG, or JPEG)",
                 file_types=[".pdf", ".png", ".jpg", ".jpeg"]
             )
+            page_number = gr.Slider(1, 20, value=1, step=1, label="Page Number")
+            max_tokens = gr.Slider(50, 256, value=128, step=16, label="Max Tokens")
             process_btn = gr.Button("Extract Text", variant="primary")
         with gr.Column():
+            output_text = gr.Textbox(label="Extracted Text", lines=20)
             output_image = gr.Image(label="Processed Image")
     process_btn.click(
         inputs=[file_input, page_number, max_tokens],
         outputs=[output_text, output_image]
     )
 if __name__ == "__main__":
+    demo.queue(max_size=2)
     demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt CHANGED Viewed

@@ -1,10 +1,9 @@
 torch
-torchvision
 transformers>=4.40.0
 gradio
 pillow
 olmocr
 accelerate
 sentencepiece
 qwen-vl-utils
-poppler-utils

 torch
 transformers>=4.40.0
 gradio
 pillow
 olmocr
 accelerate
+bitsandbytes
 sentencepiece
 qwen-vl-utils