Spaces:

davanstrien
/

vllm-index-card-extractor

Running on Zero

Update example images in app.py: remove bpl_1.jpg and add new examples (bpl_4.jpg, bpl_6.jpg, bpl_8.jpg, bpl_9.jpg, bpl_12.jpg, bpl_15.jpg, bpl_22.jpg)

2b9ac6b 2 months ago

raw

history blame contribute delete

6.33 kB

	import gradio as gr
	from PIL import Image
	import os
	import torch
	import json
	import spaces
	from transformers import AutoModelForImageTextToText, AutoProcessor
	from qwen_vl_utils import process_vision_info

	# Set the HF_HUB_ENABLE_HF_TRANSFER flag for the Hugging Face Hub client.
	# NOTE(review): this assignment runs after the imports at the top of the file;
	# if the Hub client reads the flag at import time it may have no effect here —
	# confirm, and confirm hf_transfer is installed before moving it earlier.
	os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

	# One checkpoint id shared by the model and its processor so they cannot drift apart.
	MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

	# Load the weights once at import time, in bfloat16, with automatic device placement.
	print("Loading Qwen3-VL-30B-A3B-Instruct model...")
	model = AutoModelForImageTextToText.from_pretrained(
	    MODEL_ID,
	    device_map="auto",
	    torch_dtype=torch.bfloat16,
	)
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	print("Model loaded successfully!")

	# Instruction text for the vision-language model. It lists the bibliographic,
	# cataloging/access and card-specific fields to extract, and asks for JSON only,
	# with null for absent fields and arrays for repeating fields.
	EXTRACTION_PROMPT = """Extract metadata from this library catalog card as JSON.

	Library catalog cards contain bibliographic information about materials and filing/access information. Extract whatever fields are present:

	CORE BIBLIOGRAPHIC FIELDS:
	- title: Full title of the work
	- author: Main author/creator (person or organization)
	- editor: Editor if different from author
	- contributor: Other contributors (translators, illustrators, etc.)
	- publication_date: Date(s) of publication
	- publisher: Publisher name
	- publication_place: Place of publication
	- physical_description: Physical details (volumes, pages, size, illustrations)
	- series: Series information if part of a series
	- edition: Edition statement
	- contents: Description of contents, volumes, or parts

	CATALOGING/ACCESS FIELDS:
	- call_number: Library classification number
	- subject_headings: Subject terms (often numbered list)
	- added_entries: Additional access points for co-authors, editors, etc. (often with Roman numerals)
	- notes: Any additional notes

	CARD-SPECIFIC:
	- filing_heading: The heading under which this card is filed (often at top, may be in all caps)
	- card_sequence: If this is a continuation card (e.g., "Card 2", "Card 3")

	Return ONLY valid JSON. Use null for fields not present on the card. Use arrays [] for repeating fields like subject_headings and added_entries."""


	def _strip_code_fence(text):
	    """Return ``text`` stripped of a surrounding Markdown code fence, if present."""
	    body = text.strip()
	    if not body.startswith("```"):
	        return body
	    # Drop the opening fence line, including an optional language tag such as "json".
	    _, newline, remainder = body.partition("\n")
	    if not newline:
	        # A lone fence line with nothing after it: leave the text alone.
	        return body
	    remainder = remainder.rstrip()
	    if remainder.endswith("```"):
	        remainder = remainder[:-3]
	    return remainder.strip()


	def _pretty_json_or_raw(output_text):
	    """Pretty-print ``output_text`` as JSON, or return it unchanged if it does not parse."""
	    try:
	        parsed = json.loads(_strip_code_fence(output_text))
	    except json.JSONDecodeError:
	        # Not valid JSON even without a fence: hand back exactly what the model said.
	        return output_text
	    # ensure_ascii=False keeps diacritics and non-Latin scripts readable in the output.
	    return json.dumps(parsed, indent=2, ensure_ascii=False)


	@spaces.GPU
	def extract_metadata(image):
	    """Extract structured metadata from a catalog card image.

	    Args:
	        image: A PIL image, or anything ``PIL.Image.open`` accepts (for example
	            a file path). ``None`` means nothing was uploaded.

	    Returns:
	        A string: indented JSON when the model reply parses as JSON (a
	        surrounding Markdown code fence is removed first), the raw model reply
	        when it does not, a prompt to upload an image when ``image`` is
	        ``None``, or an ``"Error during extraction: ..."`` message if any step
	        raises.
	    """
	    if image is None:
	        return "Please upload an image."

	    try:
	        # Ensure image is a PIL Image; close the underlying file once converted.
	        if not isinstance(image, Image.Image):
	            with Image.open(image) as opened:
	                image = opened.convert("RGB")

	        # Single-turn chat message: the card image followed by the instructions.
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image},
	                    {"type": "text", "text": EXTRACTION_PROMPT},
	                ],
	            }
	        ]

	        # Prepare inputs
	        text = processor.apply_chat_template(
	            messages, tokenize=False, add_generation_prompt=True
	        )
	        image_inputs, video_inputs = process_vision_info(messages)
	        inputs = processor(
	            text=[text],
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	        ).to(model.device)

	        # Sampling is disabled, so no temperature is passed: it would not be used.
	        with torch.inference_mode():
	            generated_ids = model.generate(
	                **inputs, max_new_tokens=512, do_sample=False
	            )

	        # Keep only the newly generated tokens by dropping each prompt prefix.
	        generated_ids_trimmed = [
	            out_ids[len(in_ids):]
	            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	        ]

	        # Decode output
	        output_text = processor.batch_decode(
	            generated_ids_trimmed,
	            skip_special_tokens=True,
	            clean_up_tokenization_spaces=False,
	        )[0]

	        return _pretty_json_or_raw(output_text)

	    except Exception as e:
	        # UI boundary: report the failure as text in the output panel instead of raising.
	        return f"Error during extraction: {e}"


	# Create Gradio interface
	# Create Gradio interface: intro text, an upload/result row, examples and a footer.
	with gr.Blocks(title="Library Card Metadata Extractor") as demo:
	    gr.Markdown("# 📇 Library Card Metadata Extractor")
	    gr.Markdown(
	        "Extract structured metadata from library catalog cards using Qwen/Qwen3-VL-30B-A3B-Instruct. "
	        "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
	        "call numbers, and more.\n\n"
	        "This demo works with catalog cards from libraries and archives, such as the "
	        "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
	        "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)."
	    )

	    gr.Markdown("---")

	    with gr.Row():
	        # Left column: image upload and the button that triggers extraction.
	        with gr.Column(scale=1):
	            gr.Markdown("### 📤 Upload Catalog Card")
	            image_input = gr.Image(label="Library Catalog Card", type="pil")
	            submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")

	        # Right column: the extracted metadata shown in a JSON code view.
	        with gr.Column(scale=1):
	            gr.Markdown("### 📋 Extracted Metadata (JSON)")
	            output = gr.Code(label="Metadata", language="json", lines=15)

	    submit_btn.click(fn=extract_metadata, inputs=image_input, outputs=output)

	    gr.Markdown("---")

	    # Examples: sample cards wired to the same input/output; output caching is off.
	    gr.Markdown("## 🎯 Try Examples")
	    gr.Examples(
	        examples=[
	            ["examples/bpl_0.jpg"],
	            ["examples/bpl_2.jpg"],
	            ["examples/bpl_4.jpg"],
	            ["examples/bpl_6.jpg"],
	            ["examples/bpl_8.jpg"],
	            ["examples/bpl_9.jpg"],
	            ["examples/bpl_12.jpg"],
	            ["examples/bpl_15.jpg"],
	            ["examples/bpl_22.jpg"],
	        ],
	        inputs=image_input,
	        outputs=output,
	        fn=extract_metadata,
	        cache_examples=False,
	    )

	    gr.Markdown("---")

	    # Footer. The backslash before "|" is written as "\\" so the string holds the
	    # same characters as before without using an invalid "\|" escape sequence.
	    gr.Markdown(
	        "<center>\n\n"
	        "Built for the GLAM community using [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) \\| "
	        "Example cards from [Rubenstein](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
	        "and [BPL](https://huggingface.co/datasets/biglam/bpl-card-catalog) collections\n\n"
	        "</center>"
	    )

	def _launch_demo():
	    """Print a startup message, then call ``demo.launch()``."""
	    print("Launching demo...")
	    demo.launch()


	# Only start the server when this file is executed as a script.
	if __name__ == "__main__":
	    _launch_demo()