Spaces:

abinash73
/

api-olmocr-api

Running

App Files Files Community

abinash73 committed on Nov 15, 2025

Commit

bc87f48

verified ·

1 Parent(s): dbd1f1f

Update app.py

Browse files

Files changed (1) hide show

app.py +320 -210

app.py CHANGED Viewed

@@ -1,241 +1,351 @@
-import torch
-import base64
 import gradio as gr
-from io import BytesIO
 from PIL import Image
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
-# Initialize the model
-print("Loading OlmOCR model...")
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "allenai/olmOCR-2-7B-1025",
-    torch_dtype=torch.bfloat16
-).eval()
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-print(f"Model loaded successfully on {device}")
def process_pdf(pdf_file, page_number=1, max_new_tokens=50, temperature=0.1):
    """
    Extract the text of one PDF page using OlmOCR.

    Args:
        pdf_file: Path to the uploaded PDF file, or an upload object that
            exposes the path as ``.name``.
        page_number: Page number to extract (default: 1). Coerced to ``int``
            because numeric UI widgets may deliver floats.
        max_new_tokens: Maximum tokens to generate (coerced to ``int``).
        temperature: Sampling temperature. A value of 0 (or less) switches to
            greedy decoding instead of sampling.

    Returns:
        The extracted text, ``"No text extracted"`` when decoding yields
        nothing, or a string starting with ``"Error processing PDF:"`` on
        failure.
    """
    try:
        if pdf_file is None:
            return "Error processing PDF: no PDF file provided"

        # Accept both a plain path and an upload wrapper carrying `.name`.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        page_number = int(page_number)
        max_new_tokens = int(max_new_tokens)

        # Render PDF page to base64 image
        image_base64 = render_pdf_to_base64png(
            pdf_path,
            page_number,
            target_longest_image_dim=1288
        )

        # Build the prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]

        # Process inputs
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for (key, value) in inputs.items()}

        # The UI slider allows temperature 0.0, which is not a valid sampling
        # temperature; fall back to greedy decoding in that case.
        sampling = temperature is not None and temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "num_return_sequences": 1,
            "do_sample": sampling,
        }
        if sampling:
            generation_kwargs["temperature"] = temperature

        # Generate output; gradients are never needed for inference.
        with torch.no_grad():
            output = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens (drop the prompt prefix)
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens,
            skip_special_tokens=True
        )
        return text_output[0] if text_output else "No text extracted"
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing PDF: {str(e)}"
def process_image(image_file, max_new_tokens=50, temperature=0.1):
    """
    Extract text from a single image using OlmOCR.

    Args:
        image_file: PIL Image or path to an image file.
        max_new_tokens: Maximum tokens to generate (coerced to ``int``
            because slider widgets may deliver floats).
        temperature: Sampling temperature. A value of 0 (or less) switches to
            greedy decoding instead of sampling.

    Returns:
        The extracted text, ``"No text extracted"`` when decoding yields
        nothing, or a string starting with ``"Error processing image:"`` on
        failure.
    """
    try:
        if image_file is None:
            return "Error processing image: no image provided"

        max_new_tokens = int(max_new_tokens)

        # Convert image to base64
        if isinstance(image_file, str):
            with open(image_file, 'rb') as f:
                image_bytes = f.read()
        else:
            buffered = BytesIO()
            image_file.save(buffered, format="PNG")
            image_bytes = buffered.getvalue()
        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        # Build the prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                ],
            }
        ]

        # Process inputs
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        main_image = Image.open(BytesIO(image_bytes))
        inputs = processor(
            text=[text],
            images=[main_image],
            padding=True,
            return_tensors="pt",
        )
        inputs = {key: value.to(device) for (key, value) in inputs.items()}

        # The UI slider allows temperature 0.0, which is not a valid sampling
        # temperature; fall back to greedy decoding in that case.
        sampling = temperature is not None and temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "num_return_sequences": 1,
            "do_sample": sampling,
        }
        if sampling:
            generation_kwargs["temperature"] = temperature

        # Generate output; gradients are never needed for inference.
        with torch.no_grad():
            output = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens (drop the prompt prefix)
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = output[:, prompt_length:]
        text_output = processor.tokenizer.batch_decode(
            new_tokens,
            skip_special_tokens=True
        )
        return text_output[0] if text_output else "No text extracted"
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing image: {str(e)}"
# Create Gradio interface with tabs: one tab per input kind (PDF / image).
with gr.Blocks(title="OlmOCR API") as demo:
    gr.Markdown("# OlmOCR - PDF & Image Text Extraction")
    gr.Markdown("Extract text from PDFs and images using the OlmOCR model")
    with gr.Tab("PDF Processing"):
        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                pdf_page = gr.Number(label="Page Number", value=1, precision=0)
                pdf_tokens = gr.Slider(label="Max New Tokens", minimum=10, maximum=500, value=50, step=10)
                pdf_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.1, step=0.1)
                pdf_button = gr.Button("Extract Text from PDF", variant="primary")
            with gr.Column():
                pdf_output = gr.Textbox(label="Extracted Text", lines=15)
        # Name the endpoint explicitly so it matches the "/predict" route
        # documented in the usage notes below.
        pdf_button.click(
            fn=process_pdf,
            inputs=[pdf_input, pdf_page, pdf_tokens, pdf_temp],
            outputs=pdf_output,
            api_name="predict"
        )
    with gr.Tab("Image Processing"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label="Upload Image", type="pil")
                image_tokens = gr.Slider(label="Max New Tokens", minimum=10, maximum=500, value=50, step=10)
                image_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.1, step=0.1)
                image_button = gr.Button("Extract Text from Image", variant="primary")
            with gr.Column():
                image_output = gr.Textbox(label="Extracted Text", lines=15)
        # Explicit name so the documented "/predict_1" route exists.
        image_button.click(
            fn=process_image,
            inputs=[image_input, image_tokens, image_temp],
            outputs=image_output,
            api_name="predict_1"
        )
    gr.Markdown("""
    ### API Usage
    Once running, you can access the API at:
    - **Web Interface**: http://localhost:7860
    - **API Endpoint**: http://localhost:7860/api/predict
    ### Python API Client Example:
    ```python
    from gradio_client import Client
    client = Client("http://localhost:7860")
    # For PDF
    result = client.predict(
        pdf_file="path/to/file.pdf",
        page_number=1,
        max_new_tokens=50,
        temperature=0.1,
        api_name="/predict"
    )
    # For Image
    result = client.predict(
        image_file="path/to/image.png",
        max_new_tokens=50,
        temperature=0.1,
        api_name="/predict_1"
    )
    ```
    """)
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,    # True would create a public link
        "show_api": True,  # expose the API documentation
    }
    demo.launch(**launch_options)

 import gradio as gr
+import json
+import re
+from datetime import datetime
+from paddleocr import PaddleOCR
 from PIL import Image
+import pdf2image
+import numpy as np
# Initialize PaddleOCR once at import time; the same engine object is reused
# by every call to extract_text_from_image.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)


def extract_text_from_image(image):
    """Extract text from an image using PaddleOCR.

    Args:
        image: A PIL image (converted to a NumPy array first) or any other
            input that ``ocr.ocr`` accepts directly.

    Returns:
        A list of dicts, one per recognised text line, with keys ``text``,
        ``x`` / ``y`` (centre of the detected box, computed from its first
        and third corner points) and ``confidence``. The list is empty when
        nothing was recognised.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    result = ocr.ocr(image, cls=True)

    # The result holds one entry per page; that entry can be None (rather
    # than an empty list) when no text was detected, so guard before
    # iterating.
    if not result or not result[0]:
        return []

    text_blocks = []
    for bbox, (text, confidence) in result[0]:
        # Centre point of the box, used later for positional heuristics.
        text_blocks.append({
            'text': text,
            'y': (bbox[0][1] + bbox[2][1]) / 2,
            'x': (bbox[0][0] + bbox[2][0]) / 2,
            'confidence': confidence
        })
    return text_blocks
def pdf_to_images(pdf_file):
    """Render the PDF at path ``pdf_file`` via pdf2image and return the result."""
    return pdf2image.convert_from_path(pdf_file)
# 15-character GSTIN shape: 2 digits, 5 letters, 4 digits, 1 letter,
# 1 alphanumeric, the literal 'Z', 1 alphanumeric. The lookarounds stop the
# pattern from matching in the middle of a longer alphanumeric token.
_GSTIN_RE = re.compile(
    r'(?<![A-Z0-9])\d{2}[A-Z]{5}\d{4}[A-Z]{1}[A-Z\d]{1}[Z]{1}[A-Z\d]{1}(?![A-Z0-9])'
)


def extract_gstin(text):
    """Extract a GSTIN using pattern matching.

    Args:
        text: Text to search; ``None`` or an empty string is allowed.

    Returns:
        The first standalone GSTIN-shaped token, or ``None`` if there is none.
    """
    if not text:
        return None
    match = _GSTIN_RE.search(text)
    return match.group(0) if match else None
# A PIN code that follows an explicit "PIN" / "PIN code" label.
_LABELLED_PIN_RE = re.compile(r'\bpin(?:\s*code)?\W{0,3}([1-9]\d{5})\b', re.IGNORECASE)
# Any standalone 6-digit number that does not start with 0.
_BARE_PIN_RE = re.compile(r'\b[1-9]\d{5}\b')


def extract_pincode(text):
    """Extract a 6-digit PIN code.

    A number that follows a "PIN" label is preferred; otherwise the first
    standalone 6-digit number is used. Numbers with a leading zero are
    rejected.

    Args:
        text: Text to search; ``None`` or an empty string is allowed.

    Returns:
        The PIN code as a string, or ``None`` if none is found.
    """
    if not text:
        return None
    labelled = _LABELLED_PIN_RE.search(text)
    if labelled:
        return labelled.group(1)
    bare = _BARE_PIN_RE.search(text)
    return bare.group(0) if bare else None
# Ten digits starting with 6-9, optionally preceded by a "+91" / "91" / "0"
# prefix. The lookarounds keep the match from starting or ending inside a
# longer word or number.
_MOBILE_RE = re.compile(r'(?<!\w)(?:\+?91[\s-]?|0)?([6-9]\d{9})(?!\w)')


def extract_mobile(text):
    """Extract a mobile number.

    Args:
        text: Text to search; ``None`` or an empty string is allowed.

    Returns:
        The 10-digit number without any "+91" / "91" / "0" prefix, or
        ``None`` if no number is found.
    """
    if not text:
        return None
    match = _MOBILE_RE.search(text)
    return match.group(1) if match else None
# Tried in priority order. The digit lookarounds keep a pattern from matching
# a fragment of a longer date; without them the dd-mm-yy pattern would turn
# "2025-11-15" into "25-11-15".
_DATE_PATTERNS = tuple(
    re.compile(r'(?<!\d)' + body + r'(?!\d)')
    for body in (
        r'\d{2}[-/]\d{2}[-/]\d{4}',
        r'\d{2}[-/]\d{2}[-/]\d{2}',
        r'\d{4}[-/]\d{2}[-/]\d{2}',
    )
)


def extract_date(text):
    """Extract a date in dd-mm-yyyy, dd-mm-yy or yyyy-mm-dd form.

    Either ``-`` or ``/`` is accepted as the separator.

    Args:
        text: Text to search; ``None`` or an empty string is allowed.

    Returns:
        The matched date string exactly as written, or ``None``.
    """
    if not text:
        return None
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(0)
    return None
# "invoice" / "bill" followed by "no", "number" or "#"; "no" must not run on
# into another letter.
_INVOICE_LABEL_RE = re.compile(r'(?:invoice|bill)\s*(?:number|no(?![a-z])|#)', re.IGNORECASE)
# Candidate token: letters, digits, '/' and '-', starting with a letter/digit.
_INVOICE_TOKEN_RE = re.compile(r'[A-Za-z0-9][A-Za-z0-9/-]*')


def _first_token_with_digit(text):
    """Return the first token in ``text`` that contains a digit, else None."""
    for token in _INVOICE_TOKEN_RE.findall(text):
        if any(ch.isdigit() for ch in token):
            return token.rstrip('/-')
    return None


def extract_invoice_number(text_blocks):
    """Extract the invoice/bill number.

    The value is searched for *after* the label inside the same block; if the
    label block holds no value, the block that follows it in ``text_blocks``
    is used instead.

    Args:
        text_blocks: Sequence of dicts that each carry a ``'text'`` string.

    Returns:
        The invoice number as a string, or ``None`` if no labelled number is
        found.
    """
    for index, block in enumerate(text_blocks):
        text = block['text']
        label = _INVOICE_LABEL_RE.search(text)
        if not label:
            continue
        # Only look past the label, otherwise the label itself would match.
        candidate = _first_token_with_digit(text[label.end():])
        if candidate is None and index + 1 < len(text_blocks):
            candidate = _first_token_with_digit(text_blocks[index + 1]['text'])
        if candidate:
            return candidate
    return None
# Digits with optional comma groups and an optional decimal part of any
# length, so "12.5" stays one number instead of splitting into 12 and 5.
_AMOUNT_RE = re.compile(r'\d+(?:,\d+)*(?:\.\d+)?')


def extract_amounts(text):
    """Extract monetary amounts.

    Args:
        text: Text to search; ``None`` or an empty string is allowed.

    Returns:
        A list of floats, one per number found in order of appearance, with
        thousands separators removed. Empty when nothing is found.
    """
    if not text:
        return []
    return [float(raw.replace(',', '')) for raw in _AMOUNT_RE.findall(text)]
def find_header_info(text_blocks):
    """Build the invoice header dict (supplier and invoice identifiers).

    The supplier name is taken from the five blocks with the smallest ``y``:
    the first one whose stripped text is longer than 3 characters and is not
    made up solely of digits, whitespace, dots and commas. All other fields
    are pattern-matched against the space-joined text of every block.
    """
    joined_text = ' '.join(entry['text'] for entry in text_blocks)

    # Topmost blocks first; the name is assumed to sit near the top.
    uppermost = sorted(text_blocks, key=lambda entry: entry['y'])[:5]
    stripped_texts = [entry['text'].strip() for entry in uppermost]
    name_candidates = [
        candidate
        for candidate in stripped_texts
        if len(candidate) > 3 and not re.match(r'^[\d\s.,]+$', candidate)
    ]

    return {
        "supplier_name": name_candidates[0] if name_candidates else None,
        "supplier_pincode": extract_pincode(joined_text),
        "gstin": extract_gstin(joined_text),
        "contact_no": extract_mobile(joined_text),
        "invoice_no": extract_invoice_number(text_blocks),
        "invoice_date": extract_date(joined_text)
    }
_ITEM_UNIT_WORDS = ('pcs', 'nos', 'kg', 'ltr', 'box', 'unit')
_ITEM_CURRENCY_WORDS = ('rs', 'rs.', 'inr')
_ITEM_NUMBER_RE = re.compile(r'^\d+(?:,\d+)*(?:\.\d+)?$')
_ITEM_QTY_UNIT_RE = re.compile(r'^(\d+(?:\.\d+)?)(pcs|nos|kg|ltr|box|unit)$', re.IGNORECASE)
_ITEM_PERCENT_RE = re.compile(r'^(\d+(?:\.\d+)?)%$')


def _group_blocks_into_rows(text_blocks, row_tolerance):
    """Group blocks whose ``y`` lies within ``row_tolerance`` of a row's first block."""
    rows = []
    for block in sorted(text_blocks, key=lambda b: b['y']):
        if rows and abs(block['y'] - rows[-1][0]['y']) <= row_tolerance:
            rows[-1].append(block)
        else:
            rows.append([block])
    # Left-to-right reading order inside each row.
    return [sorted(row, key=lambda b: b.get('x', 0)) for row in rows]


def _parse_item_row(row):
    """Parse one visual row into a line-item dict, or None if it is not an item row."""
    row_text = ' '.join(str(block['text']) for block in row).replace('₹', ' ')
    tokens = [tok for tok in (raw.strip(',;:') for raw in row_text.split()) if tok]

    name_words = []
    hsn = ''
    qty = rate = gst_percent = None
    unit = 'Nos'
    skip_next = False

    for pos, token in enumerate(tokens):
        if skip_next:
            # This token was consumed as the unit / percent sign of the previous number.
            skip_next = False
            continue
        following = tokens[pos + 1].lower().rstrip('.') if pos + 1 < len(tokens) else ''

        percent = _ITEM_PERCENT_RE.match(token)
        if percent:
            if gst_percent is None:
                gst_percent = float(percent.group(1))
            continue

        qty_with_unit = _ITEM_QTY_UNIT_RE.match(token)
        if qty_with_unit:
            if qty is None:
                qty = float(qty_with_unit.group(1))
                unit = qty_with_unit.group(2).lower()
            continue

        if not _ITEM_NUMBER_RE.match(token):
            if token.lower() not in _ITEM_CURRENCY_WORDS:
                name_words.append(token)
            continue

        # From here on the token is a plain number.
        if following == '%':
            if gst_percent is None:
                gst_percent = float(token.replace(',', ''))
            skip_next = True
            continue
        if following in _ITEM_UNIT_WORDS:
            if qty is None:
                qty = float(token.replace(',', ''))
                unit = following
            skip_next = True
            continue
        if token.isdigit():
            if not name_words:
                # An integer ahead of the description is treated as a serial number.
                continue
            if not hsn and qty is None and len(token) in (4, 6, 8):
                # First 4/6/8-digit integer before the quantity is taken as the HSN code.
                hsn = token
                continue

        value = float(token.replace(',', ''))
        if qty is None:
            qty = value
        elif rate is None:
            rate = value
        # Any further numbers (e.g. amount columns) are ignored.

    # Header, total and address rows lack a description plus quantity and rate.
    if not name_words or qty is None or rate is None:
        return None

    return {
        'item_name': ' '.join(name_words)[:50],
        'hsn': hsn,
        'qty': qty,
        'unit': unit,
        'rate': rate,
        'discount': 0,
        'gst_percent': gst_percent if gst_percent is not None else 0
    }


def find_line_items(text_blocks, row_tolerance=10):
    """Extract line items from invoice text blocks.

    Blocks are first grouped into visual rows by their ``y`` coordinate, then
    each row is parsed left to right. A row becomes a line item only when it
    has a description, a quantity and a rate, so rows that contain a single
    number no longer produce bogus items.

    This is a heuristic: a row with no HSN column whose quantity is a 4-, 6-
    or 8-digit integer will have that integer read as the HSN code.

    Args:
        text_blocks: Sequence of dicts with ``'text'`` and ``'y'`` (and
            optionally ``'x'``) keys.
        row_tolerance: Maximum ``y`` distance, in the same units as ``'y'``,
            from a row's first block for another block to join that row.

    Returns:
        A list of dicts with keys ``item_name``, ``hsn``, ``qty``, ``unit``,
        ``rate``, ``discount`` and ``gst_percent``.
    """
    items = []
    for row in _group_blocks_into_rows(text_blocks, row_tolerance):
        item = _parse_item_row(row)
        if item is not None:
            items.append(item)
    return items
def calculate_totals(items):
    """Calculate per-item amounts and invoice totals from line items.

    Each item dict is updated in place with ``gross_amount``,
    ``taxable_amount``, ``gst_amount`` and ``total_amount`` (rounded to 2
    decimals). Missing or ``None`` values for ``qty``, ``rate``, ``discount``
    and ``gst_percent`` are treated as 0.

    Args:
        items: List of line-item dicts.

    Returns:
        A dict with ``total_gross``, ``total_taxable``, ``total_gst`` and
        ``grand_total``, each rounded to 2 decimals. Totals are accumulated
        from the unrounded per-item values.
    """
    total_gross = 0
    total_taxable = 0
    total_gst = 0
    for item in items:
        # `or 0` also covers keys that are present but hold None.
        qty = item.get('qty') or 0
        rate = item.get('rate') or 0
        discount = item.get('discount') or 0
        gst_percent = item.get('gst_percent') or 0

        gross = qty * rate
        taxable = gross - discount
        gst_amount = (taxable * gst_percent) / 100

        item['gross_amount'] = round(gross, 2)
        item['taxable_amount'] = round(taxable, 2)
        item['gst_amount'] = round(gst_amount, 2)
        item['total_amount'] = round(taxable + gst_amount, 2)

        total_gross += gross
        total_taxable += taxable
        total_gst += gst_amount
    return {
        'total_gross': round(total_gross, 2),
        'total_taxable': round(total_taxable, 2),
        'total_gst': round(total_gst, 2),
        'grand_total': round(total_taxable + total_gst, 2)
    }
def extract_invoice_data(file):
    """Main function to extract all invoice data.

    Args:
        file: The uploaded invoice, given either as a filesystem path or as
            an upload object that exposes the path as ``.name``. PDFs are
            rendered and only their first page is processed; anything else
            is opened as an image.

    Returns:
        A JSON string with ``header``, ``details`` and ``footer`` sections,
        or a JSON object with ``error`` and ``message`` keys when processing
        fails.
    """
    try:
        if file is None:
            raise ValueError("No file was uploaded")

        # Accept both a plain path and an upload wrapper carrying `.name`.
        path = getattr(file, 'name', file)

        # Convert PDF to image if needed
        if str(path).lower().endswith('.pdf'):
            images = pdf_to_images(path)
            if not images:
                raise ValueError("The PDF contains no pages")
            image = images[0]  # Process first page
        else:
            image = Image.open(path)

        # Extract text with OCR
        text_blocks = extract_text_from_image(image)

        # Extract different sections; calculate_totals also fills in the
        # per-item amount fields of `details`.
        header = find_header_info(text_blocks)
        details = find_line_items(text_blocks)
        footer = calculate_totals(details)

        # Build final JSON structure
        result = {
            "header": header,
            "details": details,
            "footer": footer
        }
        return json.dumps(result, indent=2, ensure_ascii=False)
    except Exception as e:
        # UI boundary: report the failure as JSON instead of raising.
        return json.dumps({
            "error": str(e),
            "message": "Failed to process invoice"
        }, indent=2)
# Create Gradio Interface: upload on the left, extracted JSON on the right.
with gr.Blocks(title="Purchase Invoice Data Extraction", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧾 Purchase Invoice Data Extraction API
    Upload purchase invoices (PDF or Image) to automatically extract structured data including:
    - Supplier details (Name, PIN, GSTIN, Contact)
    - Invoice information (Number, Date)
    - Line items (Name, HSN, Qty, Rate, Discounts, GST%)
    - Calculated totals (Gross, Taxable, Tax, Grand Total)
    """)
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Invoice (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"]
            )
            extract_btn = gr.Button("Extract Data", variant="primary", size="lg")
            # The OCR engine is configured with lang='en', so only English
            # text is advertised here.
            gr.Markdown("""
            ### Supported Formats:
            - PDF documents
            - PNG, JPG, JPEG images
            - English text
            """)
        with gr.Column():
            output_json = gr.Code(
                label="Extracted Data (JSON)",
                language="json",
                lines=25
            )
    gr.Markdown("""
    ### Output Structure:
    ```json
    {
      "header": {
        "supplier_name": "...",
        "supplier_pincode": "...",
        "gstin": "...",
        "contact_no": "...",
        "invoice_no": "...",
        "invoice_date": "..."
      },
      "details": [
        {
          "item_name": "...",
          "hsn": "...",
          "qty": 0,
          "unit": "...",
          "rate": 0,
          "discount": 0,
          "gst_percent": 0,
          "gross_amount": 0,
          "taxable_amount": 0,
          "gst_amount": 0,
          "total_amount": 0
        }
      ],
      "footer": {
        "total_gross": 0,
        "total_taxable": 0,
        "total_gst": 0,
        "grand_total": 0
      }
    }
    ```
    ---
    ### API Usage:
    **Python Client:**
    ```python
    from gradio_client import Client
    client = Client("http://localhost:7860")
    result = client.predict(
        file="path/to/invoice.pdf",
        api_name="/predict"
    )
    print(result)
    ```
    **cURL:**
    ```bash
    curl -X POST http://localhost:7860/api/predict \\
      -F "file=@invoice.pdf"
    ```
    """)
    # Name the endpoint explicitly so it matches the "/predict" route shown
    # in the usage notes above.
    extract_btn.click(
        fn=extract_invoice_data,
        inputs=[file_input],
        outputs=[output_json],
        api_name="predict"
    )
    # Example usage (no bundled example files yet)
    gr.Examples(
        examples=[],
        inputs=[file_input],
        outputs=[output_json],
        fn=extract_invoice_data,
        cache_examples=False
    )
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=False,
+        show_api=True
     )