# Hugging Face Space: rks28042003/donut (status: Sleeping)
# File size: 7,577 Bytes
# Revision: 4c57130

import gradio as gr
import torch
from PIL import Image
import json
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
import os

# Module-level cache for the model and its processor; both start out unset
# and are populated by load_model() on first use.
model = processor = None

# Load model and processor (cached in module globals)
def load_model():
    """Return the shared ``(model, processor)`` pair, loading it if needed.

    Both objects live in module-level globals. When both are already set
    they are returned as-is; when either is still ``None`` the checkpoint
    is loaded again for both. The model is moved to CUDA when a GPU is
    available and otherwise left where ``from_pretrained`` put it.
    """
    global model, processor

    # Fast path: everything is already cached.
    if model is not None and processor is not None:
        return model, processor

    print("Loading model and processor...")
    checkpoint = "sabaridsnfuji/Japanese-Receipt-VL-3B-JSON"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint)
    processor = AutoProcessor.from_pretrained(checkpoint)

    if not torch.cuda.is_available():
        print("Model loaded on CPU")
    else:
        model = model.to("cuda")
        print("Model loaded on GPU")

    return model, processor

# Preprocess image
def preprocess_image(image, target_width=640, target_height=896):
    """Resize a receipt image for the model while keeping its aspect ratio.

    Landscape images (wider than tall) are scaled to ``target_width`` pixels
    wide; portrait and square images are scaled to ``target_height`` pixels
    tall. The other side is derived from the aspect ratio.

    Args:
        image: PIL image to resize; only ``size`` and ``resize`` are used.
        target_width: Output width used for landscape input.
        target_height: Output height used for portrait or square input.

    Returns:
        The image returned by ``image.resize`` with LANCZOS resampling.

    Raises:
        ValueError: If the image or a target dimension is not positive.
    """
    width, height = image.size
    if width <= 0 or height <= 0:
        # Without this check a zero height surfaces as a bare ZeroDivisionError.
        raise ValueError(f"Cannot resize an image of size {width}x{height}.")
    if target_width <= 0 or target_height <= 0:
        raise ValueError(
            f"Target size must be positive, got {target_width}x{target_height}."
        )

    aspect = width / height

    if aspect > 1:  # Landscape
        new_width = target_width
        new_height = int(new_width / aspect)
    else:  # Portrait (or square)
        new_height = target_height
        new_width = int(new_height * aspect)

    # int() truncates the short side to 0 for extreme aspect ratios
    # (e.g. 10000x1), and resize() rejects zero-sized output; keep >= 1px.
    new_size = (max(1, new_width), max(1, new_height))

    return image.resize(new_size, Image.LANCZOS)

# Pull the JSON object out of the model's free-form text
def _extract_json_payload(text):
    """Parse the span from the first ``{`` to the last ``}`` in *text*.

    Returns:
        The parsed JSON value, or ``None`` when *text* has no such span.

    Raises:
        json.JSONDecodeError: If the span is not valid JSON.
    """
    start = text.find("{")
    end = text.rfind("}") + 1
    if start < 0 or end <= start:
        return None
    return json.loads(text[start:end])


# Process receipt image
def process_receipt(image):
    """Run the receipt model on *image* and return the extracted data.

    Args:
        image: PIL image of a receipt, or ``None`` when nothing was uploaded.

    Returns:
        The parsed JSON produced by the model on success. On failure, a dict
        with an ``"error"`` message; when the model produced text that could
        not be turned into JSON, the dict also carries that text under
        ``"raw_response"``. This function does not raise for ordinary
        failures: they are reported through the returned dict.
    """
    if image is None:
        return {"error": "Please upload a receipt image."}

    try:
        # Load model and processor
        model, processor = load_model()

        # Preprocess image
        image = preprocess_image(image)

        # Optimized instruction prompt for Japanese receipt extraction
        instruct_prompt = """You are an intelligent document parser. Read the following Japanese receipt and extract every piece of information exactly as it appears, and present it in a well-structured JSON format using Japanese keys and values. Please strictly follow these rules: Only extract information that is actually present on the receipt. Do not include any missing, blank, or inferred fields. Do not summarize, omit, translate, or modify any part of the receipt. Every character, number, symbol, and line must be retained exactly as printed. Extract all available content including but not limited to: store details, receipt number, date, time, cashier name, product list, prices, tax breakdowns, payment details, receipt bags, barcodes, notices, and any footer messages. Preserve original formatting such as line breaks, symbols, and full-width characters (hiragana, katakana, kanji, numbers, etc.). Do not perform any translation, correction, interpretation, or reformatting of content. Use only what is present. Output the result in JSON format, using Japanese field names as keys."""

        # Prepare input
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": instruct_prompt},
                ],
            }
        ]

        # Process
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(text=[text], images=[image], return_tensors="pt")

        # Follow the model's actual device instead of re-checking for CUDA,
        # so inputs and weights can never end up on different devices.
        inputs = inputs.to(model.device)

        # Generate output
        outputs = model.generate(**inputs, max_new_tokens=512)

        # generate() returns prompt + completion. Decode only the completion
        # so the echoed prompt never takes part in the brace search below and
        # does not end up in "raw_response".
        prompt_length = inputs["input_ids"].shape[1]
        result = processor.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens=True
        )[0]
    except Exception as e:
        # UI boundary: report the failure in the output panel instead of crashing.
        return {"error": f"Error processing image: {str(e)}"}

    # Extract JSON part from the response
    try:
        parsed_json = _extract_json_payload(result)
    except (json.JSONDecodeError, RecursionError) as e:
        return {"error": f"Error parsing JSON: {str(e)}", "raw_response": result}

    if parsed_json is None:
        # Include the raw text here too, matching the parse-error branch.
        return {
            "error": "No valid JSON found in the response.",
            "raw_response": result,
        }
    return parsed_json

# Create Gradio interface
# Layout: a title and intro text, then one row with two columns (image
# upload + button + tips on the left, extracted data on the right), followed
# by a collapsed "Model Information" section.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧾 Japanese Receipt OCR & JSON Extraction")
    
    gr.Markdown("""
    ## Upload a Japanese receipt image to extract structured data in JSON format
    
    This app uses the [Japanese-Receipt-VL-3B-JSON](https://huggingface.co/sabaridsnfuji/Japanese-Receipt-VL-3B-JSON) model to:
    - Extract store information, itemized purchases, tax calculations, and payment details
    - Preserve original Japanese text exactly as printed
    - Output structured JSON with Japanese keys
    
    The model works best with mobile phone-captured images of Japanese receipts.
    """)
    
    with gr.Row():
        # Left column: input controls.
        with gr.Column():
            # type="pil" asks Gradio for a PIL image, which is what
            # process_receipt passes on to preprocess_image.
            input_image = gr.Image(type="pil", label="Upload Receipt Image")
            submit_btn = gr.Button("Extract Receipt Data", variant="primary")
            
            with gr.Accordion("Processing Tips", open=False):
                gr.Markdown("""
                **For Best Results:**
                - Use clear, well-lit photos
                - Capture the entire receipt
                - Avoid shadows and glare
                - Optimal resolution: 640-896px (portrait) or 896-640px (landscape)
                - Images are automatically resized to optimal dimensions
                """)
        
        # Right column: results, as a JSON viewer plus a plain-text copy.
        with gr.Column():
            output_json = gr.JSON(label="Extracted Data (JSON)")
            
            with gr.Accordion("JSON Text", open=False):
                output_text = gr.TextArea(label="JSON Text (Copy/Paste)", interactive=False)
    
    # Connect functions
    # Step 1: the button runs process_receipt and shows its result in the
    # JSON viewer. Step 2 (.then): the viewer's value is serialized with
    # json.dumps (non-ASCII characters kept, 2-space indent) into the text area.
    submit_btn.click(
        fn=process_receipt, 
        inputs=input_image, 
        outputs=output_json
    ).then(
        fn=lambda x: json.dumps(x, ensure_ascii=False, indent=2),
        inputs=output_json,
        outputs=output_text
    )
    
    
    
    # Model info
    # Static reference text only; nothing in this section is wired to an event.
    with gr.Accordion("Model Information", open=False):
        gr.Markdown("""
        - **Model:** Japanese-Receipt-VL-3B-JSON
        - **Base Model:** Qwen/Qwen2.5-VL-3B-Instruct
        - **Author:** Sabari Nathan / Couger Inc, Japan
        - **License:** Apache 2.0
        - **Type:** Vision-Language Model (Multimodal)
        - **Language:** Japanese (preserves original text exactly as printed)
        
        ### Output Format Example
        ```json
        {
            "店舗名": "セブンイレブン渋谷店",
            "日付": "2024年01月15日",
            "時刻": "14:30",
            "レシートNo": "0001234",
            "商品リスト": [
                {
                    "商品名": "おにぎり鮭",
                    "数量": 1,
                    "単価": 128,
                    "金額": 128
                }
            ],
            "小計": 840,
            "消費税": 84,
            "合計": 924,
            "支払方法": "現金",
            "お預り": 1000,
            "お釣り": 76
        }
        ```
        """)

# Load model at startup (for faster first inference)
# load_model() stores the model and processor in module globals, so the
# load_model() call inside process_receipt returns the cached pair instead
# of loading the checkpoint again on the first click.
load_model()

# Launch the app
# Runs at module level, after the model has been loaded above.
demo.launch()