coderprabhat committed on
Commit
55a0a6c
·
1 Parent(s): 1c1eb03

fix : bugs

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +26 -73
  3. requirements.txt +1 -2
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📄
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.0.0
8
  app_file: app.py
9
  python_version: 3.11
10
  pinned: false
 
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.49.1
8
  app_file: app.py
9
  python_version: 3.11
10
  pinned: false
app.py CHANGED
@@ -3,55 +3,45 @@ import base64
3
  import gradio as gr
4
  from io import BytesIO
5
  from PIL import Image
6
- from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
7
  from olmocr.data.renderpdf import render_pdf_to_base64png
8
  from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
9
  import warnings
10
  warnings.filterwarnings('ignore')
11
 
12
- # Initialize the model with CPU optimizations
13
- print("Loading model... This may take a few minutes on CPU")
 
 
 
 
 
14
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
15
- "allenai/olmOCR-2-7B-1025",
16
- torch_dtype=torch.float32, # Use float32 for CPU
17
- low_cpu_mem_usage=True, # Optimize memory usage
 
18
  ).eval()
19
 
20
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
21
- device = torch.device("cpu")
22
- model.to(device)
23
  print("Model loaded successfully")
24
 
25
  def process_document(file, page_number, max_tokens):
26
- """
27
- Process a PDF or image file and extract text using olmOCR
28
-
29
- Args:
30
- file: Uploaded file (PDF, PNG, or JPEG)
31
- page_number: Page number to process (for PDFs)
32
- max_tokens: Maximum number of tokens to generate
33
-
34
- Returns:
35
- Extracted text output and processed image
36
- """
37
  if file is None:
38
  return "Please upload a file first.", None
39
 
40
  try:
41
  # Handle different file types
42
  if file.name.endswith('.pdf'):
43
- # Render PDF page to base64 image with smaller size for CPU
44
  image_base64 = render_pdf_to_base64png(
45
  file.name,
46
  page_number,
47
- target_longest_image_dim=1024 # Reduced from 1288 for CPU
48
  )
49
  main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
50
  else:
51
- # Handle image files directly
52
  main_image = Image.open(file.name)
53
- # Resize large images for CPU efficiency
54
- max_size = 1024
55
  if max(main_image.size) > max_size:
56
  main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
57
 
@@ -59,7 +49,6 @@ def process_document(file, page_number, max_tokens):
59
  main_image.save(buffered, format="PNG")
60
  image_base64 = base64.b64encode(buffered.getvalue()).decode()
61
 
62
- # Build the full prompt
63
  messages = [
64
  {
65
  "role": "user",
@@ -70,7 +59,6 @@ def process_document(file, page_number, max_tokens):
70
  }
71
  ]
72
 
73
- # Apply the chat template and processor
74
  text = processor.apply_chat_template(
75
  messages,
76
  tokenize=False,
@@ -83,20 +71,17 @@ def process_document(file, page_number, max_tokens):
83
  padding=True,
84
  return_tensors="pt",
85
  )
86
- inputs = {key: value.to(device) for (key, value) in inputs.items()}
87
 
88
- # Generate with CPU-optimized settings
89
- with torch.no_grad(): # Disable gradient computation for inference
90
  output = model.generate(
91
  **inputs,
92
  temperature=0.1,
93
- max_new_tokens=max_tokens,
94
  num_return_sequences=1,
95
- do_sample=False, # Greedy decoding is faster on CPU
96
- num_beams=1, # No beam search for speed
97
  )
98
 
99
- # Decode the output
100
  prompt_length = inputs["input_ids"].shape[1]
101
  new_tokens = output[:, prompt_length:]
102
  text_output = processor.tokenizer.batch_decode(
@@ -106,16 +91,12 @@ def process_document(file, page_number, max_tokens):
106
  return text_output[0], main_image
107
 
108
  except Exception as e:
109
- return f"Error processing file: {str(e)}", None
110
 
111
- # Create Gradio interface
112
  with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
113
- gr.Markdown("# olmOCR: Document OCR with Vision Language Models")
114
- gr.Markdown("""
115
- Upload a PDF or image file to extract text using the olmOCR model.
116
-
117
- ⚠️ **Note**: Running on CPU - processing may take 30-90 seconds per page.
118
- """)
119
 
120
  with gr.Row():
121
  with gr.Column():
@@ -123,35 +104,12 @@ with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
123
  label="Upload Document (PDF, PNG, or JPEG)",
124
  file_types=[".pdf", ".png", ".jpg", ".jpeg"]
125
  )
126
- page_number = gr.Slider(
127
- minimum=1,
128
- maximum=50,
129
- value=1,
130
- step=1,
131
- label="Page Number (for PDFs)"
132
- )
133
- max_tokens = gr.Slider(
134
- minimum=100,
135
- maximum=1024, # Reduced max for CPU
136
- value=512,
137
- step=50,
138
- label="Max Tokens"
139
- )
140
  process_btn = gr.Button("Extract Text", variant="primary")
141
-
142
- gr.Markdown("""
143
- ### Tips for CPU Usage:
144
- - Smaller images process faster
145
- - First run may be slower (model loading)
146
- - Reduce max tokens for faster results
147
- """)
148
 
149
  with gr.Column():
150
- output_text = gr.Textbox(
151
- label="Extracted Text",
152
- lines=20,
153
- placeholder="Extracted text will appear here...\n\nProcessing on CPU may take 30-90 seconds."
154
- )
155
  output_image = gr.Image(label="Processed Image")
156
 
157
  process_btn.click(
@@ -159,12 +117,7 @@ with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
159
  inputs=[file_input, page_number, max_tokens],
160
  outputs=[output_text, output_image]
161
  )
162
-
163
- gr.Examples(
164
- examples=[],
165
- inputs=[file_input]
166
- )
167
 
168
  if __name__ == "__main__":
169
- demo.queue(max_size=3) # Limit queue to prevent overload
170
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import gradio as gr
4
  from io import BytesIO
5
  from PIL import Image
6
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, BitsAndBytesConfig
7
  from olmocr.data.renderpdf import render_pdf_to_base64png
8
  from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
9
  import warnings
10
  warnings.filterwarnings('ignore')
11
 
12
+ # Configure 8-bit quantization to reduce memory
13
+ quantization_config = BitsAndBytesConfig(
14
+ load_in_8bit=True,
15
+ llm_int8_enable_fp32_cpu_offload=True
16
+ )
17
+
18
+ print("Loading model with 8-bit quantization...")
19
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
20
+ "allenai/olmOCR-2-7B-1025",
21
+ quantization_config=quantization_config,
22
+ device_map="auto",
23
+ low_cpu_mem_usage=True,
24
  ).eval()
25
 
26
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 
 
27
  print("Model loaded successfully")
28
 
29
  def process_document(file, page_number, max_tokens):
 
 
 
 
 
 
 
 
 
 
 
30
  if file is None:
31
  return "Please upload a file first.", None
32
 
33
  try:
34
  # Handle different file types
35
  if file.name.endswith('.pdf'):
 
36
  image_base64 = render_pdf_to_base64png(
37
  file.name,
38
  page_number,
39
+ target_longest_image_dim=896 # Further reduced for memory
40
  )
41
  main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
42
  else:
 
43
  main_image = Image.open(file.name)
44
+ max_size = 896 # Reduced image size
 
45
  if max(main_image.size) > max_size:
46
  main_image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
47
 
 
49
  main_image.save(buffered, format="PNG")
50
  image_base64 = base64.b64encode(buffered.getvalue()).decode()
51
 
 
52
  messages = [
53
  {
54
  "role": "user",
 
59
  }
60
  ]
61
 
 
62
  text = processor.apply_chat_template(
63
  messages,
64
  tokenize=False,
 
71
  padding=True,
72
  return_tensors="pt",
73
  )
 
74
 
75
+ # Generate with memory optimization
76
+ with torch.no_grad():
77
  output = model.generate(
78
  **inputs,
79
  temperature=0.1,
80
+ max_new_tokens=min(max_tokens, 256), # Limit tokens
81
  num_return_sequences=1,
82
+ do_sample=False,
 
83
  )
84
 
 
85
  prompt_length = inputs["input_ids"].shape[1]
86
  new_tokens = output[:, prompt_length:]
87
  text_output = processor.tokenizer.batch_decode(
 
91
  return text_output[0], main_image
92
 
93
  except Exception as e:
94
+ return f"Error: {str(e)}", None
95
 
96
+ # Create Gradio interface; the max_tokens slider is capped at 256 to match the generation limit
97
  with gr.Blocks(title="olmOCR - Document OCR (CPU)") as demo:
98
+ gr.Markdown("# olmOCR: Document OCR (Quantized)")
99
+ gr.Markdown("⚠️ **Note**: Using 8-bit quantization for CPU compatibility. Processing may take 60-120 seconds.")
 
 
 
 
100
 
101
  with gr.Row():
102
  with gr.Column():
 
104
  label="Upload Document (PDF, PNG, or JPEG)",
105
  file_types=[".pdf", ".png", ".jpg", ".jpeg"]
106
  )
107
+ page_number = gr.Slider(1, 20, value=1, step=1, label="Page Number")
108
+ max_tokens = gr.Slider(50, 256, value=128, step=16, label="Max Tokens")
 
 
 
 
 
 
 
 
 
 
 
 
109
  process_btn = gr.Button("Extract Text", variant="primary")
 
 
 
 
 
 
 
110
 
111
  with gr.Column():
112
+ output_text = gr.Textbox(label="Extracted Text", lines=20)
 
 
 
 
113
  output_image = gr.Image(label="Processed Image")
114
 
115
  process_btn.click(
 
117
  inputs=[file_input, page_number, max_tokens],
118
  outputs=[output_text, output_image]
119
  )
 
 
 
 
 
120
 
121
  if __name__ == "__main__":
122
+ demo.queue(max_size=2)
123
  demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,10 +1,9 @@
1
  torch
2
- torchvision
3
  transformers>=4.40.0
4
  gradio
5
  pillow
6
  olmocr
7
  accelerate
 
8
  sentencepiece
9
  qwen-vl-utils
10
- poppler-utils
 
1
  torch
 
2
  transformers>=4.40.0
3
  gradio
4
  pillow
5
  olmocr
6
  accelerate
7
+ bitsandbytes
8
  sentencepiece
9
  qwen-vl-utils