File size: 6,334 Bytes
a85cd29
bbe7feb
a85cd29
bbe7feb
a85cd29
 
bbe7feb
 
a85cd29
 
 
bbe7feb
 
 
caebeb2
a85cd29
bbe7feb
a85cd29
 
4d1a611
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
caebeb2
a85cd29
 
 
bbe7feb
a85cd29
 
 
 
 
bbe7feb
 
 
 
 
 
 
 
 
caebeb2
 
bbe7feb
 
 
 
 
 
a85cd29
bbe7feb
 
 
 
 
 
 
caebeb2
bbe7feb
 
 
 
 
 
caebeb2
bbe7feb
 
 
 
caebeb2
 
bbe7feb
 
 
 
 
 
caebeb2
bbe7feb
 
 
 
 
 
 
 
 
a85cd29
 
bbe7feb
20edd58
caebeb2
a85cd29
 
 
 
caebeb2
a85cd29
 
 
 
 
 
 
 
 
 
 
 
caebeb2
a85cd29
 
 
 
caebeb2
a85cd29
caebeb2
a85cd29
 
 
 
 
 
 
 
 
2b9ac6b
 
 
 
 
 
 
a85cd29
 
 
 
caebeb2
a85cd29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import gradio as gr
from PIL import Image
import os
import torch
import json
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info

# Request accelerated Hub downloads via hf_transfer.
# NOTE(review): this is set after the transformers import above; if the Hub
# client reads the variable at import time the assignment may have no effect.
# Confirm, and move it ahead of the imports if so.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Single source of truth for the checkpoint used by both model and processor.
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load the model weights and the matching processor once, at import time.
print("Loading Qwen3-VL-30B-A3B-Instruct model...")
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
print("Model loaded successfully!")

# Instruction text sent to the model together with the card image (it is the
# "text" part of the user message built in extract_metadata). It lists the
# field names the model should emit and asks for JSON only, with null for
# absent fields and arrays for repeating fields.
EXTRACTION_PROMPT = """Extract metadata from this library catalog card as JSON.

Library catalog cards contain bibliographic information about materials and filing/access information. Extract whatever fields are present:

CORE BIBLIOGRAPHIC FIELDS:
- title: Full title of the work
- author: Main author/creator (person or organization)
- editor: Editor if different from author
- contributor: Other contributors (translators, illustrators, etc.)
- publication_date: Date(s) of publication
- publisher: Publisher name
- publication_place: Place of publication
- physical_description: Physical details (volumes, pages, size, illustrations)
- series: Series information if part of a series
- edition: Edition statement
- contents: Description of contents, volumes, or parts

CATALOGING/ACCESS FIELDS:
- call_number: Library classification number
- subject_headings: Subject terms (often numbered list)
- added_entries: Additional access points for co-authors, editors, etc. (often with Roman numerals)
- notes: Any additional notes

CARD-SPECIFIC:
- filing_heading: The heading under which this card is filed (often at top, may be in all caps)
- card_sequence: If this is a continuation card (e.g., "Card 2", "Card 3")

Return ONLY valid JSON. Use null for fields not present on the card. Use arrays [] for repeating fields like subject_headings and added_entries."""


def _format_model_output(raw_text):
    """Return *raw_text* pretty-printed as JSON, or unchanged if it is not JSON.

    The model may wrap its answer in a Markdown code fence (```json ... ```),
    which would make ``json.loads`` fail on otherwise valid JSON, so a single
    surrounding fence is stripped before parsing.
    """
    candidate = raw_text.strip()
    if candidate.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        _, _, candidate = candidate.partition("\n")
        candidate = candidate.rstrip()
        # ... and the closing fence, if present.
        if candidate.endswith("```"):
            candidate = candidate[:-3]
        candidate = candidate.strip()

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Not valid JSON: show exactly what the model produced.
        return raw_text

    # ensure_ascii=False keeps diacritics and non-Latin scripts readable
    # instead of turning them into \uXXXX escapes.
    return json.dumps(parsed, indent=2, ensure_ascii=False)


@spaces.GPU
def extract_metadata(image):
    """Extract structured metadata from a catalog card image.

    Args:
        image: A PIL image, or anything ``PIL.Image.open`` accepts (path or
            file object). ``None`` means nothing was uploaded.

    Returns:
        A string: indented JSON when the model's answer parses as JSON, the
        raw model text otherwise, or a human-readable message when no image
        was given or an error occurred during extraction.
    """
    if image is None:
        return "Please upload an image."

    try:
        # Normalize the input to an RGB PIL image.
        if not isinstance(image, Image.Image):
            # Context manager closes the underlying file once converted.
            with Image.open(image) as opened:
                image = opened.convert("RGB")
        elif image.mode != "RGB":
            image = image.convert("RGB")

        # Single-turn chat message: the card image followed by the prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": EXTRACTION_PROMPT},
                ],
            }
        ]

        # Render the chat template to text and collect the vision inputs.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Greedy decoding; no temperature is passed because sampling
        # parameters are ignored when do_sample=False.
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs, max_new_tokens=512, do_sample=False
            )

        # generate() returns prompt + completion; keep only the completion.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        return _format_model_output(output_text)

    except Exception as e:
        # UI boundary: report the failure in the output panel instead of
        # raising into the web framework.
        return f"Error during extraction: {str(e)}"


# Build the Gradio interface: image upload on the left, JSON output on the
# right, followed by clickable example cards and a footer.
# The emoji in the headings/button were mis-encoded (UTF-8 bytes shown as
# single-byte characters) and are restored here.
with gr.Blocks(title="Library Card Metadata Extractor") as demo:
    gr.Markdown("# 📇 Library Card Metadata Extractor")
    gr.Markdown(
        "Extract structured metadata from library catalog cards using **Qwen/Qwen3-VL-30B-A3B-Instruct**. "
        "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
        "call numbers, and more.\n\n"
        "This demo works with catalog cards from libraries and archives, such as the "
        "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)."
    )

    gr.Markdown("---")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Catalog Card")
            # type="pil" makes Gradio hand extract_metadata a PIL image.
            image_input = gr.Image(label="Library Catalog Card", type="pil")
            submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### 📋 Extracted Metadata (JSON)")
            output = gr.Code(label="Metadata", language="json", lines=15)

    # Run extraction on the uploaded image when the button is clicked.
    submit_btn.click(fn=extract_metadata, inputs=image_input, outputs=output)

    gr.Markdown("---")

    # Examples
    gr.Markdown("## 🎯 Try Examples")
    gr.Examples(
        examples=[
            ["examples/bpl_0.jpg"],
            ["examples/bpl_2.jpg"],
            ["examples/bpl_4.jpg"],
            ["examples/bpl_6.jpg"],
            ["examples/bpl_8.jpg"],
            ["examples/bpl_9.jpg"],
            ["examples/bpl_12.jpg"],
            ["examples/bpl_15.jpg"],
            ["examples/bpl_22.jpg"],
        ],
        inputs=image_input,
        outputs=output,
        fn=extract_metadata,
        # Example outputs are not precomputed.
        cache_examples=False,
    )

    gr.Markdown("---")

    # Footer
    gr.Markdown(
        "<center>\n\n"
        "Built for the GLAM community using [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) | "
        "Example cards from [Rubenstein](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [BPL](https://huggingface.co/datasets/biglam/bpl-card-catalog) collections\n\n"
        "</center>"
    )

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    print("Launching demo...")
    demo.launch()