api-olmocr-api

Sleeping

App Files Files Community

abinash73 committed on Nov 15, 2025

Commit

b3ff38b

verified ·

1 Parent(s): 1ca9281

Add main application file

Browse files

Files changed (1) hide show

app.py +241 -0

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+import torch
+import base64
+import gradio as gr
+from io import BytesIO
+from PIL import Image
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
+# Load the OlmOCR weights and the matching processor once, at import time
+print("Loading OlmOCR model...")
+# Prefer the GPU when one is visible; otherwise stay on the CPU
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "allenai/olmOCR-2-7B-1025",
+    torch_dtype=torch.bfloat16,
+)
+# Inference only: switch off training-mode layers before moving to the device
+model = model.eval()
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+model.to(device)
+print(f"Model loaded successfully on {device}")
+def _run_ocr(image_base64, page_image, max_new_tokens, temperature):
+    """
+    Run the OlmOCR model on a single image and return the decoded text
+    Args:
+        image_base64: Base64-encoded image, embedded in the prompt as a data URL
+        page_image: PIL Image handed to the processor
+        max_new_tokens: Maximum tokens to generate
+        temperature: Sampling temperature; None or <= 0 selects greedy decoding
+    Returns:
+        Decoded text of the newly generated tokens, or "No text extracted"
+    """
+    # Build the prompt
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+            ],
+        }
+    ]
+    # Process inputs
+    prompt_text = processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    inputs = processor(
+        text=[prompt_text],
+        images=[page_image],
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = {key: value.to(device) for key, value in inputs.items()}
+    # UI widgets may deliver numbers as floats, so normalise before generating
+    generation_kwargs = {
+        "max_new_tokens": int(max_new_tokens),
+        "num_return_sequences": 1,
+    }
+    if temperature is not None and temperature > 0:
+        generation_kwargs["do_sample"] = True
+        generation_kwargs["temperature"] = float(temperature)
+    else:
+        # Sampling needs a strictly positive temperature; the sliders allow
+        # 0.0, so treat that as a request for deterministic (greedy) decoding
+        generation_kwargs["do_sample"] = False
+    # Generate output
+    output = model.generate(**inputs, **generation_kwargs)
+    # Decode only the tokens produced after the prompt
+    prompt_length = inputs["input_ids"].shape[1]
+    new_tokens = output[:, prompt_length:]
+    text_output = processor.tokenizer.batch_decode(
+        new_tokens,
+        skip_special_tokens=True,
+    )
+    return text_output[0] if text_output else "No text extracted"
+
+
+def process_pdf(pdf_file, page_number=1, max_new_tokens=50, temperature=0.1):
+    """
+    Process a PDF file and extract text using OlmOCR
+    Args:
+        pdf_file: Path to uploaded PDF file (or an object exposing the path as .name)
+        page_number: Page number to extract (default: 1)
+        max_new_tokens: Maximum tokens to generate
+        temperature: Sampling temperature; 0 selects greedy decoding
+    Returns:
+        Extracted text from the PDF, or a string starting with
+        "Error processing PDF:" when anything fails
+    """
+    if pdf_file is None:
+        return "Error processing PDF: no PDF file was provided"
+    try:
+        # Accept either a plain path or an upload object carrying the path in
+        # .name -- presumably what some Gradio versions pass; verify on upgrade
+        pdf_path = pdf_file if isinstance(pdf_file, str) else getattr(pdf_file, "name", pdf_file)
+        # Render PDF page to base64 image
+        image_base64 = render_pdf_to_base64png(
+            pdf_path,
+            int(page_number),
+            target_longest_image_dim=1288,
+        )
+        main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+        return _run_ocr(image_base64, main_image, max_new_tokens, temperature)
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of raising
+        return f"Error processing PDF: {str(e)}"
+
+
+def process_image(image_file, max_new_tokens=50, temperature=0.1):
+    """
+    Process an image file directly using OlmOCR
+    Args:
+        image_file: PIL Image or path to image file
+        max_new_tokens: Maximum tokens to generate
+        temperature: Sampling temperature; 0 selects greedy decoding
+    Returns:
+        Extracted text from the image, or a string starting with
+        "Error processing image:" when anything fails
+    """
+    if image_file is None:
+        return "Error processing image: no image was provided"
+    try:
+        # Convert image to raw bytes: read a path as-is, re-encode a PIL image as PNG
+        if isinstance(image_file, str):
+            with open(image_file, 'rb') as f:
+                image_bytes = f.read()
+        else:
+            buffered = BytesIO()
+            image_file.save(buffered, format="PNG")
+            image_bytes = buffered.getvalue()
+        image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+        main_image = Image.open(BytesIO(image_bytes))
+        return _run_ocr(image_base64, main_image, max_new_tokens, temperature)
+    except Exception as e:
+        # UI boundary: report the failure in the output box instead of raising
+        return f"Error processing image: {str(e)}"
+# Create Gradio interface with tabs: one tab per input kind, each wired to its
+# own handler. Endpoint names are pinned with api_name so the names shown in
+# the usage notes below do not depend on the installed Gradio version.
+with gr.Blocks(title="OlmOCR API") as demo:
+    gr.Markdown("# OlmOCR - PDF & Image Text Extraction")
+    gr.Markdown("Extract text from PDFs and images using the OlmOCR model")
+    with gr.Tab("PDF Processing"):
+        with gr.Row():
+            with gr.Column():
+                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+                pdf_page = gr.Number(label="Page Number", value=1, precision=0)
+                pdf_tokens = gr.Slider(label="Max New Tokens", minimum=10, maximum=500, value=50, step=10)
+                pdf_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.1, step=0.1)
+                pdf_button = gr.Button("Extract Text from PDF", variant="primary")
+            with gr.Column():
+                pdf_output = gr.Textbox(label="Extracted Text", lines=15)
+        pdf_button.click(
+            fn=process_pdf,
+            inputs=[pdf_input, pdf_page, pdf_tokens, pdf_temp],
+            outputs=pdf_output,
+            api_name="process_pdf",
+        )
+    with gr.Tab("Image Processing"):
+        with gr.Row():
+            with gr.Column():
+                image_input = gr.Image(label="Upload Image", type="pil")
+                image_tokens = gr.Slider(label="Max New Tokens", minimum=10, maximum=500, value=50, step=10)
+                image_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.1, step=0.1)
+                image_button = gr.Button("Extract Text from Image", variant="primary")
+            with gr.Column():
+                image_output = gr.Textbox(label="Extracted Text", lines=15)
+        image_button.click(
+            fn=process_image,
+            inputs=[image_input, image_tokens, image_temp],
+            outputs=image_output,
+            api_name="process_image",
+        )
+    gr.Markdown("""
+    ### API Usage
+    Once running, you can access the API at:
+    - **Web Interface**: http://localhost:7860
+    - **API Endpoints**: `/process_pdf` and `/process_image` (see the "Use via API" link in the page footer for the exact request format)
+    ### Python API Client Example:
+    ```python
+    from gradio_client import Client, handle_file
+    client = Client("http://localhost:7860")
+    # For PDF
+    result = client.predict(
+        pdf_file=handle_file("path/to/file.pdf"),
+        page_number=1,
+        max_new_tokens=50,
+        temperature=0.1,
+        api_name="/process_pdf"
+    )
+    # For Image
+    result = client.predict(
+        image_file=handle_file("path/to/image.png"),
+        max_new_tokens=50,
+        temperature=0.1,
+        api_name="/process_image"
+    )
+    ```
+    """)
+# Start the web server only when this file is run as a script
+if __name__ == "__main__":
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "share": False,  # Set to True to create a public link
+        "show_api": True,  # Enable API documentation
+    }
+    demo.launch(**launch_options)