File size: 3,023 Bytes
a7fae6f
 
54724a9
 
 
 
 
 
a7fae6f
 
54724a9
 
 
a7fae6f
 
54724a9
a7fae6f
 
 
54724a9
 
 
 
 
 
 
 
 
42c3b13
a7fae6f
 
 
42c3b13
54724a9
a7fae6f
 
54724a9
a7fae6f
42c3b13
a7fae6f
 
54724a9
a7fae6f
 
e402baa
a7fae6f
42c3b13
54724a9
 
 
42c3b13
 
 
 
 
 
 
54724a9
 
42c3b13
 
 
 
 
54724a9
 
 
 
 
 
 
 
42c3b13
54724a9
 
 
42c3b13
 
 
 
e402baa
a7fae6f
54724a9
 
 
 
 
 
 
 
a7fae6f
 
 
 
42c3b13
a7fae6f
 
e402baa
 
 
 
a7fae6f
e402baa
42c3b13
e402baa
 
 
54724a9
 
42c3b13
54724a9
 
e402baa
 
42c3b13
e402baa
a7fae6f
 
 
 
 
54724a9
a7fae6f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gradio as gr
from PIL import Image
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    AutoTokenizer,
    AutoModelForSeq2SeqLM
)
import torch

# -------- Load Models -------- #

# BLIP image-captioning model: turns a PIL image into a short English caption.
# Downloads weights from the Hugging Face Hub on first run (network required).
blip_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)

# FLAN-T5 text-to-text model: rewrites the BLIP caption into longer,
# keyword-aware SEO alt text (used only when SEO mode is enabled).
seo_tokenizer = AutoTokenizer.from_pretrained(
    "google/flan-t5-base"
)
seo_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base"
)

# -------- Core Function -------- #

def generate_alt_text(image, seo_mode, keywords):
    """Generate alt text for an uploaded image.

    Args:
        image: PIL image to describe, or ``None`` when nothing is uploaded.
        seo_mode: When truthy, expand the base caption into longer,
            keyword-aware SEO alt text using FLAN-T5.
        keywords: Optional comma-separated keywords to weave into the SEO
            text. May be ``None`` or empty.

    Returns:
        The alt text string, or ``""`` when no image is provided.
    """
    if image is None:
        return ""

    # ---- Step 1: Base Caption (BLIP) ---- #
    inputs = blip_processor(image, return_tensors="pt")

    with torch.no_grad():
        output = blip_model.generate(
            **inputs,
            max_new_tokens=30
        )

    base_caption = blip_processor.decode(
        output[0],
        skip_special_tokens=True
    ).strip()

    # ---- Step 2: Normal Mode ---- #
    if not seo_mode:
        if not base_caption:
            # Previously an empty caption produced a lone "." — return
            # nothing instead so the UI box stays empty.
            return ""
        # Uppercase only the first character. str.capitalize() would
        # lowercase the rest of the caption, mangling proper nouns.
        return base_caption[0].upper() + base_caption[1:] + "."

    # ---- Step 3: SEO Prompt ---- #
    # Guard against None (cleared/unset textbox) before stripping;
    # keywords.strip() on None would raise AttributeError.
    keywords = (keywords or "").strip()

    keyword_instruction = (
        f"Include the following keywords naturally: {keywords}. "
        if keywords else ""
    )

    prompt = (
        "You are an SEO expert. "
        "Write a detailed, descriptive, and natural alt text for a website image. "
        "The alt text should be longer than the original caption and written in a single sentence. "
        f"{keyword_instruction}"
        f"Image description: {base_caption}."
    )

    seo_inputs = seo_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True
    )

    # ---- Step 4: Force Expansion ---- #
    # Sampling (do_sample=True) makes this output non-deterministic by design.
    with torch.no_grad():
        seo_output = seo_model.generate(
            **seo_inputs,
            max_new_tokens=120,
            do_sample=True,
            top_p=0.95,
            temperature=1.0
        )

    seo_alt_text = seo_tokenizer.decode(
        seo_output[0],
        skip_special_tokens=True
    ).strip()

    return seo_alt_text

# -------- Gradio UI -------- #

# Assemble the Gradio interface: one image input, an SEO toggle, an
# optional keyword box, and a button wired to generate_alt_text.
with gr.Blocks(title="Alt Text Generator") as app:
    gr.Markdown("""
    # 🖼️ Alt Text Generator
    AI-powered alt text for accessibility and SEO.
    """)

    # Inputs
    img_upload = gr.Image(type="pil", label="Upload Image")
    seo_checkbox = gr.Checkbox(
        value=False,
        label="SEO Mode (expanded, keyword-aware alt text)"
    )
    kw_textbox = gr.Textbox(
        placeholder="e.g. science experiment for kids, STEM education",
        label="Keywords (optional)"
    )

    # Output
    result_box = gr.Textbox(lines=6, label="Generated Alt Text")

    # Action
    run_button = gr.Button("Generate Alt Text 🚀")
    run_button.click(
        fn=generate_alt_text,
        inputs=[img_upload, seo_checkbox, kw_textbox],
        outputs=result_box
    )

app.launch()