"""Gradio app that extracts structured JSON from Japanese receipt images.

The app loads the ``sabaridsnfuji/Japanese-Receipt-VL-3B-JSON`` vision-language
model once, resizes each uploaded image, prompts the model, and returns the
JSON object found in the model's reply.
"""

import json
import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_PATH = "sabaridsnfuji/Japanese-Receipt-VL-3B-JSON"

# Bounding dimensions used by preprocess_image().
TARGET_WIDTH, TARGET_HEIGHT = 640, 896

# Upper bound on generated tokens per receipt.
MAX_NEW_TOKENS = 512

# Instruction prompt sent alongside the image. The wording and whitespace are
# kept verbatim; change them with care.
INSTRUCT_PROMPT = (
    "You are an intelligent document parser. Read the following Japanese "
    "receipt and extract every piece of information exactly as it appears, "
    "and present it in a well-structured JSON format using Japanese keys and "
    "values. Please strictly follow these rules: Only extract information "
    "that is actually present on the receipt. Do not include any missing, "
    "blank, or inferred fields. Do not summarize, omit, translate, or modify "
    "any part of the receipt. \n"
    "Every character, number, symbol, and line must be retained exactly as "
    "printed. Extract all available content including but not limited to: "
    "store details, receipt number, date, time, cashier name, product list, "
    "prices, tax breakdowns, payment details, receipt bags, barcodes, "
    "notices, and any footer messages. Preserve original formatting such as "
    "line breaks, symbols, and full-width characters (hiragana, katakana, "
    "kanji, numbers, etc.). Do not perform any translation, correction, "
    "interpretation, or reformatting of content. Use only what is present. "
    "Output the result in JSON format, using Japanese field names as keys."
)

# Markdown shown in the UI. Kept at column 0 so headings, lists and the code
# fence are parsed as block-level Markdown.
INTRO_MD = """
## Upload a Japanese receipt image to extract structured data in JSON format

This app uses the [Japanese-Receipt-VL-3B-JSON](https://huggingface.co/sabaridsnfuji/Japanese-Receipt-VL-3B-JSON) model to:

- Extract store information, itemized purchases, tax calculations, and payment details
- Preserve original Japanese text exactly as printed
- Output structured JSON with Japanese keys

The model works best with mobile phone-captured images of Japanese receipts.
"""

TIPS_MD = """
**For Best Results:**

- Use clear, well-lit photos
- Capture the entire receipt
- Avoid shadows and glare
- Optimal resolution: 640-896px (portrait) or 896-640px (landscape)
- Images are automatically resized to optimal dimensions
"""

MODEL_INFO_MD = """
- **Model:** Japanese-Receipt-VL-3B-JSON
- **Base Model:** Qwen/Qwen2.5-VL-3B-Instruct
- **Author:** Sabari Nathan / Couger Inc, Japan
- **License:** Apache 2.0
- **Type:** Vision-Language Model (Multimodal)
- **Language:** Japanese (preserves original text exactly as printed)

### Output Format Example

```json
{
  "店舗名": "セブンイレブン渋谷店",
  "日付": "2024年01月15日",
  "時刻": "14:30",
  "レシートNo": "0001234",
  "商品リスト": [
    {
      "商品名": "おにぎり鮭",
      "数量": 1,
      "単価": 128,
      "金額": 128
    }
  ],
  "小計": 840,
  "消費税": 84,
  "合計": 924,
  "支払方法": "現金",
  "お預り": 1000,
  "お釣り": 76
}
```
"""

# Global variables for model and processor (populated lazily by load_model).
model = None
processor = None


def load_model():
    """Load the model and processor once and return them.

    Subsequent calls reuse the module-level ``model`` and ``processor``.
    The model is moved to the GPU when CUDA is available.

    Returns:
        A ``(model, processor)`` tuple.
    """
    global model, processor
    if model is None or processor is None:
        print("Loading model and processor...")
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_PATH)
        processor = AutoProcessor.from_pretrained(MODEL_PATH)
        if torch.cuda.is_available():
            model = model.to("cuda")
            print("Model loaded on GPU")
        else:
            print("Model loaded on CPU")
    return model, processor


def preprocess_image(image):
    """Resize ``image`` while keeping its aspect ratio.

    Landscape images (width > height) are scaled to ``TARGET_WIDTH`` wide;
    all other images are scaled to ``TARGET_HEIGHT`` tall.

    Args:
        image: A PIL image.

    Returns:
        The resized PIL image.

    Raises:
        ValueError: If the image has a zero width or height.
    """
    width, height = image.size
    if width <= 0 or height <= 0:
        raise ValueError("Image has zero width or height.")

    aspect = width / height
    if aspect > 1:  # Landscape
        new_width = TARGET_WIDTH
        # int() truncates; clamp so extreme aspect ratios never yield 0 px.
        new_height = max(1, int(new_width / aspect))
    else:  # Portrait or square
        new_height = TARGET_HEIGHT
        new_width = max(1, int(new_height * aspect))

    return image.resize((new_width, new_height), Image.LANCZOS)


def _extract_json(text):
    """Parse the outermost ``{...}`` span of ``text`` as JSON.

    Returns the parsed object, or an ``{"error": ..., "raw_response": text}``
    dict when no brace-delimited span exists or it is not valid JSON.
    """
    json_start = text.find("{")
    json_end = text.rfind("}") + 1
    if json_start < 0 or json_end <= json_start:
        return {
            "error": "No valid JSON found in the response.",
            "raw_response": text,
        }
    try:
        return json.loads(text[json_start:json_end])
    except (json.JSONDecodeError, RecursionError) as e:
        return {"error": f"Error parsing JSON: {str(e)}", "raw_response": text}


def process_receipt(image):
    """Run the receipt model on ``image`` and return the extracted data.

    Args:
        image: A PIL image of a receipt, or ``None`` when nothing was uploaded.

    Returns:
        The JSON object parsed from the model's reply, or a dict with an
        ``"error"`` key (and ``"raw_response"`` when the reply could not be
        parsed) describing what went wrong. This function does not raise;
        failures are reported through the returned dict so the UI can show
        them.
    """
    if image is None:
        return {"error": "Please upload a receipt image."}

    try:
        vl_model, vl_processor = load_model()
        image = preprocess_image(image)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": INSTRUCT_PROMPT},
                ],
            }
        ]

        text = vl_processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = vl_processor(text=[text], images=[image], return_tensors="pt")
        # Follow the model's device instead of probing CUDA a second time.
        inputs = inputs.to(vl_model.device)

        outputs = vl_model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

        # generate() returns prompt + completion; drop the prompt tokens so
        # the echoed chat template and instructions are not scanned for JSON
        # or leaked into "raw_response".
        prompt_length = inputs["input_ids"].shape[1]
        result = vl_processor.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens=True
        )[0]
    except Exception as e:  # UI boundary: report the failure, don't crash.
        return {"error": f"Error processing image: {str(e)}"}

    return _extract_json(result)


def format_json(data):
    """Serialize ``data`` as indented JSON text, keeping non-ASCII characters."""
    return json.dumps(data, ensure_ascii=False, indent=2)


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧾 Japanese Receipt OCR & JSON Extraction")
    gr.Markdown(INTRO_MD)

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload Receipt Image")
            submit_btn = gr.Button("Extract Receipt Data", variant="primary")

            with gr.Accordion("Processing Tips", open=False):
                gr.Markdown(TIPS_MD)

        with gr.Column():
            output_json = gr.JSON(label="Extracted Data (JSON)")

            with gr.Accordion("JSON Text", open=False):
                output_text = gr.TextArea(
                    label="JSON Text (Copy/Paste)", interactive=False
                )

    # Connect functions: extract first, then mirror the result as plain text.
    submit_btn.click(
        fn=process_receipt,
        inputs=input_image,
        outputs=output_json,
    ).then(
        fn=format_json,
        inputs=output_json,
        outputs=output_text,
    )

    # Model info
    with gr.Accordion("Model Information", open=False):
        gr.Markdown(MODEL_INFO_MD)

# Load model at startup (for faster first inference)
load_model()

# Launch the app
demo.launch()