coderprabhat committed
Commit 322bbf8 · 1 Parent(s): 49129f9

Add olmOCR Gradio app for Hugging Face Spaces deployment

Files changed (2):
  1. app.py +169 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,169 @@
+import torch
+import base64
+import gradio as gr
+from io import BytesIO
+from PIL import Image
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
+import warnings
+warnings.filterwarnings('ignore')
+
+# Initialize the model with CPU optimizations
+print("Loading model... This may take a few minutes on CPU")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "allenai/olmOCR-2-7B-1025",
+    torch_dtype=torch.float32,  # Use float32 for CPU
+    low_cpu_mem_usage=True,  # Optimize memory usage
+).eval()
+
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+device = torch.device("cpu")
+model.to(device)
+print("Model loaded successfully")
+
+def process_document(file, page_number, max_tokens):
+    """
+    Process a PDF or image file and extract text using olmOCR
+
+    Args:
+        file: Uploaded file (PDF, PNG, or JPEG)
+        page_number: Page number to process (for PDFs)
+        max_tokens: Maximum number of tokens to generate
+
+    Returns:
+        Extracted text output and processed image
+    """
+    if file is None:
+        return "Please upload a file first.", None
+
+    try:
+        # Handle different file types (case-insensitive extension check)
+        if file.name.lower().endswith('.pdf'):
+            # Render PDF page to base64 image with smaller size for CPU
+            image_base64 = render_pdf_to_base64png(
+                file.name,
+                page_number,
+                target_longest_image_dim=1024  # Reduced from 1288 for CPU
+            )
+            main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+        else:
+            # Handle image files directly
+            main_image = Image.open(file.name)
+            # Resize large images for CPU efficiency
+            max_size = 1024
+            if max(main_image.size) > max_size:
+                main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+
+            buffered = BytesIO()
+            main_image.save(buffered, format="PNG")
+            image_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+        # Build the full prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                ],
+            }
+        ]
+
+        # Apply the chat template and processor
+        text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        inputs = processor(
+            text=[text],
+            images=[main_image],
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+        # Generate with CPU-optimized settings
+        with torch.no_grad():  # Disable gradient computation for inference
+            output = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                num_return_sequences=1,
+                do_sample=False,  # Greedy decoding is faster on CPU
+                num_beams=1,  # No beam search for speed
+            )
+
+        # Decode the output
+        prompt_length = inputs["input_ids"].shape[1]
+        new_tokens = output[:, prompt_length:]
+        text_output = processor.tokenizer.batch_decode(
+            new_tokens, skip_special_tokens=True
+        )
+
+        return text_output[0], main_image
+
+    except Exception as e:
+        return f"Error processing file: {str(e)}", None
+
+# Create Gradio interface
+with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
+    gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
+    gr.Markdown("""
+    Upload a PDF or image file to extract text using the olmOCR model.
+
+    ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(
+                label="Upload Document (PDF, PNG, or JPEG)",
+                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
+            )
+            page_number = gr.Slider(
+                minimum=1,
+                maximum=50,
+                value=1,
+                step=1,
+                label="Page Number (for PDFs)"
+            )
+            max_tokens = gr.Slider(
+                minimum=100,
+                maximum=1024,  # Reduced max for CPU
+                value=512,
+                step=50,
+                label="Max Tokens"
+            )
+            process_btn = gr.Button("Extract Text", variant="primary")
+
+            gr.Markdown("""
+            ### Tips for CPU Usage:
+            - Smaller images process faster
+            - First run may be slower (model loading)
+            - Reduce max tokens for faster results
+            """)
+
+        with gr.Column():
+            output_text = gr.Textbox(
+                label="Extracted Text",
+                lines=20,
+                placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds."
+            )
+            output_image = gr.Image(label="Processed Image")
+
+    process_btn.click(
+        fn=process_document,
+        inputs=[file_input, page_number, max_tokens],
+        outputs=[output_text, output_image]
+    )
+
+    gr.Examples(
+        examples=[],
+        inputs=[file_input]
+    )
+
+if __name__ == "__main__":
+    demo.queue(max_size=3)  # Limit queue to prevent overload
+    demo.launch(server_name="0.0.0.0", server_port=7860)
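
For a quick local check outside the Gradio UI, process_document can be called directly. Below is a minimal sketch (not part of this commit): "sample.pdf" is a placeholder path, and SimpleNamespace stands in for Gradio's upload object, of which process_document only reads the .name attribute. Note that importing app loads the 7B model in float32, which needs roughly 28 GB of RAM for the weights alone and several minutes on CPU.

# smoke_test.py - illustrative sketch, not part of this commit
from types import SimpleNamespace

from app import process_document  # loads the model at import time

fake_upload = SimpleNamespace(name="sample.pdf")  # placeholder path
text, image = process_document(fake_upload, page_number=1, max_tokens=256)
print(text)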
requirements.txt ADDED
@@ -0,0 +1 @@
+olmocr>=0.4.0
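
Note that requirements.txt lists only olmocr: torch and transformers come in transitively as olmocr dependencies, and a Space built on the Gradio SDK preinstalls gradio itself. Running the app outside Spaces would additionally require installing gradio.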