Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,334 Bytes
a85cd29 bbe7feb a85cd29 bbe7feb a85cd29 bbe7feb a85cd29 bbe7feb caebeb2 a85cd29 bbe7feb a85cd29 4d1a611 caebeb2 a85cd29 bbe7feb a85cd29 bbe7feb caebeb2 bbe7feb a85cd29 bbe7feb caebeb2 bbe7feb caebeb2 bbe7feb caebeb2 bbe7feb caebeb2 bbe7feb a85cd29 bbe7feb 20edd58 caebeb2 a85cd29 caebeb2 a85cd29 caebeb2 a85cd29 caebeb2 a85cd29 caebeb2 a85cd29 2b9ac6b a85cd29 caebeb2 a85cd29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import json
import os

# huggingface_hub evaluates HF_HUB_ENABLE_HF_TRANSFER once, at import time, and
# the third-party libraries below import it. The flag therefore has to be set
# before those imports, otherwise the accelerated download path is never enabled.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gradio as gr  # noqa: E402
from PIL import Image  # noqa: E402
import torch  # noqa: E402
import spaces  # noqa: E402
from transformers import AutoModelForImageTextToText, AutoProcessor  # noqa: E402
from qwen_vl_utils import process_vision_info  # noqa: E402

# Single source of truth for the checkpoint used by both the model and its processor.
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model and processor once at start-up so every request reuses them.
print("Loading Qwen3-VL-30B-A3B-Instruct model...")
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
print("Model loaded successfully!")
# Instruction text sent to the model together with the card image (see
# extract_metadata). It lists the fields to extract and asks for JSON only, with
# null for absent fields and arrays for repeating ones. The text is part of the
# program's behavior: any wording change alters what the model is asked to do.
EXTRACTION_PROMPT = """Extract metadata from this library catalog card as JSON.
Library catalog cards contain bibliographic information about materials and filing/access information. Extract whatever fields are present:
CORE BIBLIOGRAPHIC FIELDS:
- title: Full title of the work
- author: Main author/creator (person or organization)
- editor: Editor if different from author
- contributor: Other contributors (translators, illustrators, etc.)
- publication_date: Date(s) of publication
- publisher: Publisher name
- publication_place: Place of publication
- physical_description: Physical details (volumes, pages, size, illustrations)
- series: Series information if part of a series
- edition: Edition statement
- contents: Description of contents, volumes, or parts
CATALOGING/ACCESS FIELDS:
- call_number: Library classification number
- subject_headings: Subject terms (often numbered list)
- added_entries: Additional access points for co-authors, editors, etc. (often with Roman numerals)
- notes: Any additional notes
CARD-SPECIFIC:
- filing_heading: The heading under which this card is filed (often at top, may be in all caps)
- card_sequence: If this is a continuation card (e.g., "Card 2", "Card 3")
Return ONLY valid JSON. Use null for fields not present on the card. Use arrays [] for repeating fields like subject_headings and added_entries."""
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line, which may carry a language tag such as "json".
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def _format_model_output(output_text):
    """Pretty-print *output_text* as JSON when it parses, else return it unchanged."""
    try:
        parsed = json.loads(_strip_code_fence(output_text))
    except json.JSONDecodeError:
        # Not valid JSON: show the raw model reply so the user can still read it.
        return output_text
    # ensure_ascii=False keeps accented/non-Latin characters readable instead of
    # turning them into \uXXXX escapes.
    return json.dumps(parsed, indent=2, ensure_ascii=False)


@spaces.GPU
def extract_metadata(image):
    """Extract structured metadata from a catalog card image.

    Args:
        image: A PIL image, or anything ``PIL.Image.open`` accepts (e.g. a file
            path). ``None`` means nothing was uploaded.

    Returns:
        A string for display: indented JSON when the model reply parses as JSON
        (a surrounding Markdown code fence is tolerated), the raw model reply
        otherwise, or a human-readable message when no image was supplied or
        extraction failed. Exceptions are reported in the returned string rather
        than raised.
    """
    if image is None:
        return "Please upload an image."

    try:
        # Ensure image is a PIL Image
        if not isinstance(image, Image.Image):
            image = Image.open(image).convert("RGB")

        # Single-turn chat message: the card image followed by the instructions.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": EXTRACTION_PROMPT},
                ],
            }
        ]

        # Prepare inputs
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        # Greedy decoding. No temperature is passed: it only applies when
        # sampling, and combining it with do_sample=False is contradictory.
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs, max_new_tokens=512, do_sample=False
            )

        # generate() returns prompt + completion; keep only the completion.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        return _format_model_output(output_text)
    except Exception as e:
        # UI boundary: surface the failure in the output panel instead of crashing.
        return f"Error during extraction: {str(e)}"
# Create Gradio interface: image upload on the left, JSON output on the right,
# followed by clickable example cards and a footer.
with gr.Blocks(title="Library Card Metadata Extractor") as demo:
    gr.Markdown("# 📚 Library Card Metadata Extractor")
    gr.Markdown(
        "Extract structured metadata from library catalog cards using **Qwen/Qwen3-VL-30B-A3B-Instruct**. "
        "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
        "call numbers, and more.\n\n"
        "This demo works with catalog cards from libraries and archives, such as the "
        "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)."
    )
    gr.Markdown("---")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Catalog Card")
            image_input = gr.Image(label="Library Catalog Card", type="pil")
            submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")
        with gr.Column(scale=1):
            gr.Markdown("### 📋 Extracted Metadata (JSON)")
            output = gr.Code(label="Metadata", language="json", lines=15)

    submit_btn.click(fn=extract_metadata, inputs=image_input, outputs=output)
    gr.Markdown("---")

    # Examples
    gr.Markdown("## 🎯 Try Examples")
    gr.Examples(
        # One single-element row per example image, matching the single input.
        examples=[
            [f"examples/bpl_{index}.jpg"]
            for index in (0, 2, 4, 6, 8, 9, 12, 15, 22)
        ],
        inputs=image_input,
        outputs=output,
        fn=extract_metadata,
        cache_examples=False,
    )
    gr.Markdown("---")

    # Footer
    gr.Markdown(
        "<center>\n\n"
        "Built for the GLAM community using [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) | "
        "Example cards from [Rubenstein](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [BPL](https://huggingface.co/datasets/biglam/bpl-card-catalog) collections\n\n"
        "</center>"
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    print("Launching demo...")
    demo.launch()
|