Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| import json | |
| from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor | |
| import os | |
# Lazily-initialised singletons shared by every request; filled in by load_model().
model = None
processor = None


def load_model():
    """Return the shared ``(model, processor)`` pair, loading it on first use.

    The checkpoint and its processor are fetched with ``from_pretrained`` only
    while at least one of the module-level globals is still ``None``; later
    calls hand back the cached objects untouched. Right after loading, the
    model is moved to CUDA when ``torch.cuda.is_available()`` reports a GPU;
    otherwise no device move is performed.

    Returns:
        tuple: ``(model, processor)`` as stored in the module-level globals.
    """
    global model, processor

    # Fast path: both objects already exist, so there is nothing to load.
    if model is not None and processor is not None:
        return model, processor

    print("Loading model and processor...")
    checkpoint = "sabaridsnfuji/Japanese-Receipt-VL-3B-JSON"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint)
    processor = AutoProcessor.from_pretrained(checkpoint)

    # Probe CUDA once and reuse the answer for both the move and the log line.
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model = model.to("cuda")
    print("Model loaded on GPU" if use_gpu else "Model loaded on CPU")

    return model, processor
# Preprocess image
def preprocess_image(image):
    """Resize a receipt image toward the 640x896 target, keeping its aspect ratio.

    Landscape images (width > height) are scaled to a width of 640 px;
    portrait and square images are scaled to a height of 896 px. The other
    side is derived from the aspect ratio, truncated to an integer and clamped
    to at least 1 px so very elongated images cannot collapse to a
    zero-sized dimension.

    NOTE(review): landscape input is scaled to 640 px wide (not 896 px) --
    confirm this is the intended target for landscape receipts.

    Args:
        image: Image object exposing ``size`` as ``(width, height)`` and a
            ``resize((w, h), resample)`` method (a PIL image in this app).

    Returns:
        The object returned by ``image.resize`` with LANCZOS resampling.

    Raises:
        ValueError: If the image reports a width or height of zero or less.
    """
    target_width, target_height = 640, 896

    width, height = image.size
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    if width <= 0 or height <= 0:
        raise ValueError(f"Cannot resize an empty image (size {width}x{height}).")

    aspect = width / height
    if aspect > 1:  # Landscape: pin the width, derive the height.
        new_width = target_width
        new_height = max(1, int(new_width / aspect))
    else:  # Portrait or square: pin the height, derive the width.
        new_height = target_height
        new_width = max(1, int(new_height * aspect))

    return image.resize((new_width, new_height), Image.LANCZOS)
def _extract_json(response):
    """Parse the outermost ``{...}`` span of *response* as JSON.

    Returns the parsed object on success; otherwise an ``{"error": ...}`` dict
    that also carries the raw response text for debugging.
    """
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        return {"error": "No valid JSON found in the response.", "raw_response": response}
    try:
        return json.loads(response[start:end])
    except json.JSONDecodeError as e:
        return {"error": f"Error parsing JSON: {str(e)}", "raw_response": response}


# Process receipt image
def process_receipt(image):
    """Run the receipt model on *image* and return the extracted data.

    The image is resized with ``preprocess_image``, wrapped in a single-turn
    chat message together with the extraction prompt, and passed to
    ``model.generate`` with ``max_new_tokens=512``. Only the newly generated
    tokens are decoded, so the prompt never leaks into the parsed text or
    into ``raw_response``.

    Args:
        image: The uploaded receipt image (a PIL image in this app), or
            ``None`` when nothing was uploaded.

    Returns:
        The parsed JSON object on success, or a dict with an ``"error"`` key
        (plus ``"raw_response"`` when the model answered but its output could
        not be parsed). This function does not raise; any exception is
        reported through the ``"error"`` entry.
    """
    if image is None:
        return {"error": "Please upload a receipt image."}

    try:
        # Load model and processor (cached after the first call)
        model, processor = load_model()

        # Preprocess image
        image = preprocess_image(image)

        # Optimized instruction prompt for Japanese receipt extraction
        instruct_prompt = """You are an intelligent document parser. Read the following Japanese receipt and extract every piece of information exactly as it appears, and present it in a well-structured JSON format using Japanese keys and values. Please strictly follow these rules: Only extract information that is actually present on the receipt. Do not include any missing, blank, or inferred fields. Do not summarize, omit, translate, or modify any part of the receipt. Every character, number, symbol, and line must be retained exactly as printed. Extract all available content including but not limited to: store details, receipt number, date, time, cashier name, product list, prices, tax breakdowns, payment details, receipt bags, barcodes, notices, and any footer messages. Preserve original formatting such as line breaks, symbols, and full-width characters (hiragana, katakana, kanji, numbers, etc.). Do not perform any translation, correction, interpretation, or reformatting of content. Use only what is present. Output the result in JSON format, using Japanese field names as keys."""

        # Prepare input
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": instruct_prompt},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], images=[image], return_tensors="pt")
        # Follow the model's actual device instead of probing CUDA separately.
        inputs = inputs.to(model.device)

        # Generate output
        outputs = model.generate(**inputs, max_new_tokens=512)

        # generate() returns prompt + completion; keep only the completion.
        prompt_length = inputs["input_ids"].shape[1]
        completion = outputs[:, prompt_length:]
        result = processor.batch_decode(completion, skip_special_tokens=True)[0]

        # Extract JSON part from the response
        return _extract_json(result)
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        return {"error": f"Error processing image: {str(e)}"}
# Static Markdown shown in the interface, kept apart from the layout code.
_INTRO_MD = """
## Upload a Japanese receipt image to extract structured data in JSON format
This app uses the [Japanese-Receipt-VL-3B-JSON](https://huggingface.co/sabaridsnfuji/Japanese-Receipt-VL-3B-JSON) model to:
- Extract store information, itemized purchases, tax calculations, and payment details
- Preserve original Japanese text exactly as printed
- Output structured JSON with Japanese keys
The model works best with mobile phone-captured images of Japanese receipts.
"""

_TIPS_MD = """
**For Best Results:**
- Use clear, well-lit photos
- Capture the entire receipt
- Avoid shadows and glare
- Optimal resolution: 640-896px (portrait) or 896-640px (landscape)
- Images are automatically resized to optimal dimensions
"""

_MODEL_INFO_MD = """
- **Model:** Japanese-Receipt-VL-3B-JSON
- **Base Model:** Qwen/Qwen2.5-VL-3B-Instruct
- **Author:** Sabari Nathan / Couger Inc, Japan
- **License:** Apache 2.0
- **Type:** Vision-Language Model (Multimodal)
- **Language:** Japanese (preserves original text exactly as printed)
### Output Format Example
```json
{
"店舗名": "セブンイレブン渋谷店",
"日付": "2024年01月15日",
"時刻": "14:30",
"レシートNo": "0001234",
"商品リスト": [
{
"商品名": "おにぎり鮭",
"数量": 1,
"単価": 128,
"金額": 128
}
],
"小計": 840,
"消費税": 84,
"合計": 924,
"支払方法": "現金",
"お預り": 1000,
"お釣り": 76
}
```
"""


def _format_json_text(data):
    """Serialise the JSON component's value as indented text, keeping non-ASCII characters."""
    return json.dumps(data, ensure_ascii=False, indent=2)


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧾 Japanese Receipt OCR & JSON Extraction")
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Left column: upload widget, trigger button and capture tips.
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload Receipt Image")
            submit_btn = gr.Button("Extract Receipt Data", variant="primary")
            with gr.Accordion("Processing Tips", open=False):
                gr.Markdown(_TIPS_MD)

        # Right column: structured result plus a copy-friendly text rendering.
        with gr.Column():
            output_json = gr.JSON(label="Extracted Data (JSON)")
            with gr.Accordion("JSON Text", open=False):
                output_text = gr.TextArea(label="JSON Text (Copy/Paste)", interactive=False)

    # Connect functions: run the extraction, then mirror its result as text.
    extraction_event = submit_btn.click(
        fn=process_receipt,
        inputs=input_image,
        outputs=output_json,
    )
    extraction_event.then(
        fn=_format_json_text,
        inputs=output_json,
        outputs=output_text,
    )

    # Model info
    with gr.Accordion("Model Information", open=False):
        gr.Markdown(_MODEL_INFO_MD)
# Start the app only when this file is executed as a script, so importing the
# module (tests, tooling) neither loads the model nor opens a server.
if __name__ == "__main__":
    # Load model at startup (for faster first inference)
    load_model()
    # Launch the app
    demo.launch()