# donut / app.py
# rks28042003's picture
# Commit message: Fix examples section to use local files
# Commit: 3a96409
import gradio as gr
import torch
from PIL import Image
import json
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
import os
# Process-wide cache for the model and its processor; filled by load_model()
model = None
processor = None


# Load model and processor
def load_model():
    """Return the shared ``(model, processor)`` pair, loading it on first use.

    The pair is cached in the module-level ``model`` / ``processor``
    globals.  If either one is still ``None``, both are (re)loaded from the
    ``sabaridsnfuji/Japanese-Receipt-VL-3B-JSON`` checkpoint and the model is
    moved to CUDA when a CUDA device is available.

    Returns:
        A ``(model, processor)`` tuple.
    """
    global model, processor

    # Fast path: everything is already cached.
    if model is not None and processor is not None:
        return model, processor

    print("Loading model and processor...")
    model_path = "sabaridsnfuji/Japanese-Receipt-VL-3B-JSON"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path)

    if not torch.cuda.is_available():
        print("Model loaded on CPU")
    else:
        model = model.to("cuda")
        print("Model loaded on GPU")

    return model, processor
# Preprocess image
def preprocess_image(image):
    """Resize a receipt image for the model while keeping its aspect ratio.

    Landscape images (width > height) are scaled to a width of 640 px; all
    other images are scaled to a height of 896 px.  The remaining side is
    derived from the aspect ratio, truncated to a whole pixel, and clamped
    to at least 1 px so that extremely elongated images do not produce a
    zero-sized target (which ``resize`` rejects).

    NOTE(review): the "Processing Tips" text in the UI mentions 896-640px
    for landscape images, but landscape images are scaled to 640 px wide
    here -- confirm which one is intended.

    Args:
        image: A PIL image (must expose ``size`` and ``resize``).

    Returns:
        A new, resized image produced with LANCZOS resampling.

    Raises:
        ValueError: If the image has a zero (or negative) width or height.
    """
    target_width, target_height = 640, 896

    width, height = image.size
    if width <= 0 or height <= 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError(
            f"Cannot resize an empty image (size {width}x{height})."
        )

    aspect = width / height
    if aspect > 1:  # Landscape
        new_width = target_width
        new_height = max(1, int(new_width / aspect))
    else:  # Portrait (or square)
        new_height = target_height
        new_width = max(1, int(new_height * aspect))

    return image.resize((new_width, new_height), Image.LANCZOS)
# Process receipt image
def _extract_json(text):
    """Parse the span from the first ``{`` to the last ``}`` in *text* as JSON.

    Returns the parsed object, or ``None`` when *text* has no such span.
    Raises ``ValueError`` (``json.JSONDecodeError``) if the span is not
    valid JSON.
    """
    start = text.find("{")
    end = text.rfind("}") + 1
    if start < 0 or end <= start:
        return None
    return json.loads(text[start:end])


def process_receipt(image):
    """Run the receipt model on *image* and return the extracted JSON.

    Args:
        image: The uploaded receipt as a PIL image, or ``None`` when nothing
            was uploaded.

    Returns:
        The parsed JSON object produced by the model, or a dict with an
        ``"error"`` key describing what went wrong.  When the model answered
        but no JSON could be extracted, the dict also carries the model's
        text under ``"raw_response"``.
    """
    if image is None:
        return {"error": "Please upload a receipt image."}

    try:
        # Local names avoid shadowing the module-level model/processor cache.
        vl_model, vl_processor = load_model()
        image = preprocess_image(image)

        # Optimized instruction prompt for Japanese receipt extraction
        instruct_prompt = """You are an intelligent document parser. Read the following Japanese receipt and extract every piece of information exactly as it appears, and present it in a well-structured JSON format using Japanese keys and values. Please strictly follow these rules: Only extract information that is actually present on the receipt. Do not include any missing, blank, or inferred fields. Do not summarize, omit, translate, or modify any part of the receipt. Every character, number, symbol, and line must be retained exactly as printed. Extract all available content including but not limited to: store details, receipt number, date, time, cashier name, product list, prices, tax breakdowns, payment details, receipt bags, barcodes, notices, and any footer messages. Preserve original formatting such as line breaks, symbols, and full-width characters (hiragana, katakana, kanji, numbers, etc.). Do not perform any translation, correction, interpretation, or reformatting of content. Use only what is present. Output the result in JSON format, using Japanese field names as keys."""

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": instruct_prompt},
                ],
            }
        ]

        text = vl_processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = vl_processor(text=[text], images=[image], return_tensors="pt")
        # Follow the model's actual device rather than assuming that CUDA
        # being available means the model lives on the GPU.
        inputs = inputs.to(vl_model.device)

        outputs = vl_model.generate(**inputs, max_new_tokens=512)

        # generate() returns prompt + completion.  Decode only the newly
        # generated tokens so the JSON search (and any raw_response shown to
        # the user) is not polluted by the echoed prompt text.
        prompt_length = inputs["input_ids"].shape[1]
        result = vl_processor.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens=True
        )[0]
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing.
        return {"error": f"Error processing image: {str(e)}"}

    # Extract JSON part from the response
    try:
        parsed_json = _extract_json(result)
    except (ValueError, RecursionError) as e:
        return {"error": f"Error parsing JSON: {str(e)}", "raw_response": result}

    if parsed_json is None:
        return {
            "error": "No valid JSON found in the response.",
            "raw_response": result,
        }
    return parsed_json
# Create Gradio interface
# Static Markdown shown on the page, kept apart from the layout code.
_INTRO_MD = """
## Upload a Japanese receipt image to extract structured data in JSON format
This app uses the [Japanese-Receipt-VL-3B-JSON](https://huggingface.co/sabaridsnfuji/Japanese-Receipt-VL-3B-JSON) model to:
- Extract store information, itemized purchases, tax calculations, and payment details
- Preserve original Japanese text exactly as printed
- Output structured JSON with Japanese keys
The model works best with mobile phone-captured images of Japanese receipts.
"""

_TIPS_MD = """
**For Best Results:**
- Use clear, well-lit photos
- Capture the entire receipt
- Avoid shadows and glare
- Optimal resolution: 640-896px (portrait) or 896-640px (landscape)
- Images are automatically resized to optimal dimensions
"""

_MODEL_INFO_MD = """
- **Model:** Japanese-Receipt-VL-3B-JSON
- **Base Model:** Qwen/Qwen2.5-VL-3B-Instruct
- **Author:** Sabari Nathan / Couger Inc, Japan
- **License:** Apache 2.0
- **Type:** Vision-Language Model (Multimodal)
- **Language:** Japanese (preserves original text exactly as printed)
### Output Format Example
```json
{
"店舗名": "セブンイレブン渋谷店",
"日付": "2024年01月15日",
"時刻": "14:30",
"レシートNo": "0001234",
"商品リスト": [
{
"商品名": "おにぎり鮭",
"数量": 1,
"単価": 128,
"金額": 128
}
],
"小計": 840,
"消費税": 84,
"合計": 924,
"支払方法": "現金",
"お預り": 1000,
"お釣り": 76
}
```
"""


def _to_json_text(data):
    """Serialize the extracted data as indented JSON, keeping non-ASCII text."""
    return json.dumps(data, ensure_ascii=False, indent=2)


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧾 Japanese Receipt OCR & JSON Extraction")
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Left column: upload, trigger button and usage tips.
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload Receipt Image")
            submit_btn = gr.Button("Extract Receipt Data", variant="primary")
            with gr.Accordion("Processing Tips", open=False):
                gr.Markdown(_TIPS_MD)

        # Right column: structured view plus a copy/paste-friendly text view.
        with gr.Column():
            output_json = gr.JSON(label="Extracted Data (JSON)")
            with gr.Accordion("JSON Text", open=False):
                output_text = gr.TextArea(
                    label="JSON Text (Copy/Paste)",
                    interactive=False,
                )

    # Connect functions: run the extraction first, then mirror its result
    # into the plain-text box.
    extraction = submit_btn.click(
        fn=process_receipt,
        inputs=input_image,
        outputs=output_json,
    )
    extraction.then(
        fn=_to_json_text,
        inputs=output_json,
        outputs=output_text,
    )

    # Model info
    with gr.Accordion("Model Information", open=False):
        gr.Markdown(_MODEL_INFO_MD)
# Load model at startup (for faster first inference): this fills the
# module-level model/processor cache before any request arrives, so the
# first call to process_receipt() does not pay the loading cost.
load_model()
# Launch the app: start serving the `demo` Blocks interface.
demo.launch()