Spaces:

rks28042003
/

donut

Sleeping

App Files Files Community

donut / app.py

rks28042003

Fix examples section to use local files

3a96409 6 months ago

raw

history blame contribute delete

7.58 kB

	import gradio as gr
	import torch
	from PIL import Image
	import json
	from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
	import os

	# Module-level cache for the loaded model and its processor.
	# Both start out empty and are filled in by load_model() on first use.
	model = processor = None

	# Load model and processor
	def load_model():
	    """Return the shared ``(model, processor)`` pair, loading it on first use.

	    The pair is cached in the module-level ``model`` and ``processor``
	    globals. When either one is still ``None`` both are (re)loaded from the
	    ``sabaridsnfuji/Japanese-Receipt-VL-3B-JSON`` checkpoint, and the model
	    is moved to CUDA when ``torch.cuda.is_available()`` reports a GPU.

	    Returns:
	        A ``(model, processor)`` tuple.
	    """
	    global model, processor

	    # Fast path: everything is already cached, nothing to load.
	    if model is not None and processor is not None:
	        return model, processor

	    print("Loading model and processor...")
	    checkpoint = "sabaridsnfuji/Japanese-Receipt-VL-3B-JSON"
	    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint)
	    processor = AutoProcessor.from_pretrained(checkpoint)

	    if not torch.cuda.is_available():
	        print("Model loaded on CPU")
	    else:
	        model = model.to("cuda")
	        print("Model loaded on GPU")

	    return model, processor

	# Preprocess image
	def preprocess_image(image):
	    """Resize a receipt image toward the 640x896 working size, keeping its aspect ratio.

	    Landscape images (width > height) are scaled to a width of 640 px;
	    portrait and square images are scaled to a height of 896 px. The other
	    side is derived from the aspect ratio and truncated to an integer, so
	    the 640x896 box is not enforced on the derived side. The derived side is
	    never allowed to drop below 1 px, so extremely elongated images do not
	    produce a zero-sized resize target.

	    Args:
	        image: A PIL image (must expose ``size`` and ``resize``).

	    Returns:
	        The image returned by ``image.resize`` with LANCZOS resampling.
	    """
	    target_width, target_height = 640, 896

	    width, height = image.size
	    aspect = width / height

	    if aspect > 1:  # Landscape: pin the width
	        new_width = target_width
	        new_height = int(new_width / aspect)
	    else:  # Portrait or square: pin the height
	        new_height = target_height
	        new_width = int(new_height * aspect)

	    # int() truncates toward zero, so a very wide or very tall image could
	    # otherwise end up with a 0-pixel side, which resize() rejects.
	    new_width = max(1, new_width)
	    new_height = max(1, new_height)

	    return image.resize((new_width, new_height), Image.LANCZOS)

	# Parse the JSON object embedded in the model's reply
	def _parse_json_response(response_text):
	    """Decode the outermost ``{...}`` span found in *response_text*.

	    Returns:
	        The decoded JSON value on success. Otherwise an ``{"error": ...}``
	        dict that also carries the undecoded reply under ``"raw_response"``
	        so the caller can inspect what the model actually produced.
	    """
	    json_start = response_text.find("{")
	    json_end = response_text.rfind("}") + 1

	    if json_start < 0 or json_end <= json_start:
	        return {
	            "error": "No valid JSON found in the response.",
	            "raw_response": response_text,
	        }

	    try:
	        return json.loads(response_text[json_start:json_end])
	    except json.JSONDecodeError as e:
	        return {"error": f"Error parsing JSON: {str(e)}", "raw_response": response_text}


	# Process receipt image
	def process_receipt(image):
	    """Run the receipt model on *image* and return the extracted data.

	    Args:
	        image: The uploaded receipt as a PIL image, or ``None`` when nothing
	            was uploaded.

	    Returns:
	        The JSON value decoded from the model's reply, or a dict with an
	        ``"error"`` message. When a reply was generated but could not be
	        decoded, the dict also contains the reply under ``"raw_response"``.
	        Exceptions raised while loading the model, preprocessing, or
	        generating are caught and reported as an ``"error"`` dict.
	    """
	    if image is None:
	        return {"error": "Please upload a receipt image."}

	    try:
	        # Load model and processor
	        model, processor = load_model()

	        # Preprocess image
	        image = preprocess_image(image)

	        # Optimized instruction prompt for Japanese receipt extraction
	        instruct_prompt = """You are an intelligent document parser. Read the following Japanese receipt and extract every piece of information exactly as it appears, and present it in a well-structured JSON format using Japanese keys and values. Please strictly follow these rules: Only extract information that is actually present on the receipt. Do not include any missing, blank, or inferred fields. Do not summarize, omit, translate, or modify any part of the receipt. Every character, number, symbol, and line must be retained exactly as printed. Extract all available content including but not limited to: store details, receipt number, date, time, cashier name, product list, prices, tax breakdowns, payment details, receipt bags, barcodes, notices, and any footer messages. Preserve original formatting such as line breaks, symbols, and full-width characters (hiragana, katakana, kanji, numbers, etc.). Do not perform any translation, correction, interpretation, or reformatting of content. Use only what is present. Output the result in JSON format, using Japanese field names as keys."""

	        # Prepare input
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image},
	                    {"type": "text", "text": instruct_prompt},
	                ],
	            }
	        ]

	        # Process
	        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	        inputs = processor(text=[text], images=[image], return_tensors="pt")

	        if torch.cuda.is_available():
	            inputs = {k: v.to("cuda") for k, v in inputs.items()}

	        # Generate output
	        outputs = model.generate(**inputs, max_new_tokens=512)

	        # generate() returns the prompt tokens followed by the new tokens.
	        # Decode only the newly generated part so the echoed chat prompt
	        # can neither confuse the brace search nor pollute raw_response.
	        prompt_length = inputs["input_ids"].shape[1]
	        generated = outputs[:, prompt_length:]
	        result = processor.batch_decode(generated, skip_special_tokens=True)[0]

	        # Extract JSON part from the response
	        return _parse_json_response(result)
	    except Exception as e:
	        # UI boundary: report any failure to the user instead of crashing
	        # the request handler.
	        return {"error": f"Error processing image: {str(e)}"}

	# Create Gradio interface
	# Builds the Blocks app bound to `demo`: a title, an intro blurb, an image
	# upload with a submit button, a JSON result viewer with a copyable text
	# mirror, and a static "Model Information" panel.
	# NOTE(review): leading indentation is flattened in this copy of the file, so
	# the exact nesting of the Row/Column/Accordion containers below cannot be
	# read off the source - confirm against the deployed layout before editing.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	# Page title
	gr.Markdown("# 🧾 Japanese Receipt OCR & JSON Extraction")

	# Intro text: what the app does and which model it uses
	gr.Markdown("""
	## Upload a Japanese receipt image to extract structured data in JSON format

	This app uses the [Japanese-Receipt-VL-3B-JSON](https://huggingface.co/sabaridsnfuji/Japanese-Receipt-VL-3B-JSON) model to:
	- Extract store information, itemized purchases, tax calculations, and payment details
	- Preserve original Japanese text exactly as printed
	- Output structured JSON with Japanese keys

	The model works best with mobile phone-captured images of Japanese receipts.
	""")

	# Input side: image upload plus the button that triggers extraction
	with gr.Row():
	with gr.Column():
	# type="pil" so the handler receives a PIL image (preprocess_image uses .size/.resize)
	input_image = gr.Image(type="pil", label="Upload Receipt Image")
	submit_btn = gr.Button("Extract Receipt Data", variant="primary")

	# Collapsed-by-default hints for taking a usable photo
	with gr.Accordion("Processing Tips", open=False):
	gr.Markdown("""
	For Best Results:
	- Use clear, well-lit photos
	- Capture the entire receipt
	- Avoid shadows and glare
	- Optimal resolution: 640-896px (portrait) or 896-640px (landscape)
	- Images are automatically resized to optimal dimensions
	""")

	# Output side: structured JSON viewer
	with gr.Column():
	output_json = gr.JSON(label="Extracted Data (JSON)")

	# Collapsed-by-default plain-text copy of the same JSON, for copy/paste
	with gr.Accordion("JSON Text", open=False):
	output_text = gr.TextArea(label="JSON Text (Copy/Paste)", interactive=False)

	# Connect functions
	# Step 1: run process_receipt on the uploaded image and show the result in output_json.
	# Step 2 (chained with .then): serialize the output_json value to indented text for
	# output_text; ensure_ascii=False keeps Japanese characters unescaped.
	submit_btn.click(
	fn=process_receipt,
	inputs=input_image,
	outputs=output_json
	).then(
	fn=lambda x: json.dumps(x, ensure_ascii=False, indent=2),
	inputs=output_json,
	outputs=output_text
	)



	# Model info
	# Static, collapsed-by-default panel with model metadata and a sample of the output format
	with gr.Accordion("Model Information", open=False):
	gr.Markdown("""
	- Model: Japanese-Receipt-VL-3B-JSON
	- Base Model: Qwen/Qwen2.5-VL-3B-Instruct
	- Author: Sabari Nathan / Couger Inc, Japan
	- License: Apache 2.0
	- Type: Vision-Language Model (Multimodal)
	- Language: Japanese (preserves original text exactly as printed)

	### Output Format Example
	```json
	{
	"店舗名": "セブンイレブン渋谷店",
	"日付": "2024年01月15日",
	"時刻": "14:30",
	"レシートNo": "0001234",
	"商品リスト": [
	{
	"商品名": "おにぎり鮭",
	"数量": 1,
	"単価": 128,
	"金額": 128
	}
	],
	"小計": 840,
	"消費税": 84,
	"合計": 924,
	"支払方法": "現金",
	"お預り": 1000,
	"お釣り": 76
	}
	```
	""")

	# Load model at startup (for faster first inference)
	# load_model() fills the module-level model/processor cache, so the first
	# request handled by process_receipt() does not have to pay the load cost.
	load_model()

	# Launch the app
	# Runs at module level, after the model has been loaded above.
	demo.launch()