"""

Local AI Vision Models for Alt Text Generation (100% FREE)

Uses Hugging Face transformers to run models locally - no API costs!



Supported models:

- BLIP: Good balance of speed and quality

- GIT: More detailed descriptions

- LLAVA: Most advanced (requires more resources)

"""

import os
from typing import Optional
from pathlib import Path
import io

try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    print("⚠️  Pillow not installed. Run: pip install pillow")

try:
    from transformers import BlipProcessor, BlipForConditionalGeneration
    from transformers import AutoProcessor, AutoModelForCausalLM
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️  Transformers not installed. Run: pip install transformers torch")


class LocalVisionModel:
    """

    Local AI model for generating image descriptions

    Runs on your computer - 100% FREE with no API limits!

    """
    
    def __init__(self, model_name: str = "blip-base"):
        """

        Initialize local vision model

        

        Args:

            model_name: Model to use

                - "blip-base" (default): Fast, good quality, ~1GB

                - "blip-large": Better quality, slower, ~2GB

                - "git-base": Alternative model, ~1.5GB

        """
        self.model_name = model_name
        self.enabled = False
        self.model = None
        self.processor = None
        # Guard the torch call - torch is only importable when transformers/torch are installed
        self.device = "cuda" if TRANSFORMERS_AVAILABLE and torch.cuda.is_available() else "cpu"
        
        if not TRANSFORMERS_AVAILABLE:
            print("❌ Transformers library not available")
            print("   Install with: pip install transformers torch")
            return
        
        if not PIL_AVAILABLE:
            print("❌ Pillow not available")
            print("   Install with: pip install pillow")
            return
        
        # Load model
        try:
            print(f"📥 Loading {model_name} model... (this may take a minute on first run)")
            
            if "blip" in model_name.lower():
                self._load_blip_model(model_name)
            elif "git" in model_name.lower():
                self._load_git_model()
            else:
                print(f"⚠️  Unknown model: {model_name}, defaulting to BLIP")
                self._load_blip_model("blip-base")
            
            self.enabled = True
            print(f"✅ {model_name} model loaded successfully on {self.device}")
            
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            self.enabled = False
    
    def _load_blip_model(self, model_name: str):
        """Load BLIP model (recommended for most use cases)"""
        if "large" in model_name:
            model_id = "Salesforce/blip-image-captioning-large"
        else:
            model_id = "Salesforce/blip-image-captioning-base"
        
        self.processor = BlipProcessor.from_pretrained(model_id)
        self.model = BlipForConditionalGeneration.from_pretrained(model_id)
        self.model.to(self.device)
        self.model_type = "blip"
    
    def _load_git_model(self):
        """Load GIT model (alternative to BLIP)"""
        model_id = "microsoft/git-base"
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(model_id)
        self.model.to(self.device)
        self.model_type = "git"
    
    def is_enabled(self) -> bool:
        """Check if model is loaded and ready"""
        return self.enabled and self.model is not None
    
    def generate_alt_text(
        self,
        image_data: bytes,
        shape_name: str = "",
        slide_number: int = 0,
        max_length: int = 250
    ) -> Optional[str]:
        """
        Generate alt text for an image using local AI.

        Args:
            image_data: Raw image bytes
            shape_name: Shape name (for context)
            slide_number: Slide number (for context)
            max_length: Maximum alt text length

        Returns:
            Generated alt text or None if failed
        """
        if not self.is_enabled():
            return None
        
        try:
            # Convert bytes to PIL Image
            image = Image.open(io.BytesIO(image_data)).convert("RGB")
            
            # Check if image looks decorative (very small, likely a logo/icon)
            if image.size[0] < 100 and image.size[1] < 100:
                # Small image - likely decorative
                if any(hint in shape_name.lower() for hint in ["logo", "icon", "background", "border"]):
                    return "decorative"
            
            # Generate description
            if self.model_type == "blip":
                alt_text = self._generate_blip(image)
            elif self.model_type == "git":
                alt_text = self._generate_git(image)
            else:
                return None
            
            # Clean up the text
            alt_text = self._clean_alt_text(alt_text, max_length)
            
            return alt_text
            
        except Exception as e:
            print(f"Error generating alt text: {e}")
            return None
    
    def _generate_blip(self, image: Image.Image) -> str:
        """Generate caption using BLIP model"""
        # Process image
        inputs = self.processor(image, return_tensors="pt").to(self.device)
        
        # Generate caption
        with torch.no_grad():
            out = self.model.generate(
                **inputs,
                max_length=50,
                num_beams=5,  # Better quality with beam search
                early_stopping=True
            )
        
        caption = self.processor.decode(out[0], skip_special_tokens=True)
        return caption
    
    def _generate_git(self, image: Image.Image) -> str:
        """Generate caption using GIT model"""
        # Process image
        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
        
        # Generate caption
        with torch.no_grad():
            generated_ids = self.model.generate(
                pixel_values=inputs.pixel_values,
                max_length=50
            )
        
        caption = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return caption
    
    def _clean_alt_text(self, alt_text: str, max_length: int) -> str:
        """Clean and format generated alt text"""
        # Remove common prefixes that BLIP adds
        prefixes_to_remove = [
            "a picture of ",
            "an image of ",
            "a photo of ",
            "there is ",
            "arafed ",  # Common BLIP artifact
        ]
        
        alt_text_lower = alt_text.lower()
        for prefix in prefixes_to_remove:
            if alt_text_lower.startswith(prefix):
                alt_text = alt_text[len(prefix):]
                break
        
        # Capitalize first letter
        if alt_text:
            alt_text = alt_text[0].upper() + alt_text[1:]
        
        # Truncate if needed
        if len(alt_text) > max_length:
            alt_text = alt_text[:max_length-3] + "..."
        
        return alt_text.strip()
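

# Direct-usage sketch for LocalVisionModel (illustrative; the image path is an
# assumption). The generate_alt_text_free() helper defined below is usually
# preferable, since it falls back to the hosted Inference API automatically:
#
#     model = LocalVisionModel("blip-large")
#     if model.is_enabled():
#         with open("chart.png", "rb") as f:
#             print(model.generate_alt_text(f.read(), shape_name="Chart 2"))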


class HuggingFaceInferenceAPI:
    """

    Hugging Face Inference API (FREE tier available)

    Falls back to this if local models don't work

    """
    
    def __init__(self, api_token: Optional[str] = None):
        """

        Initialize Hugging Face Inference API

        

        Args:

            api_token: HF token (if None, reads from HF_TOKEN env var)

                      Get free token at: https://huggingface.co/settings/tokens

        """
        self.api_token = api_token or os.getenv("HF_TOKEN")
        self.enabled = False
        
        if not self.api_token:
            print("⚠️  No Hugging Face token found. Set HF_TOKEN environment variable.")
            print("   Get free token at: https://huggingface.co/settings/tokens")
            return
        
        try:
            import requests
            self.requests = requests
            self.enabled = True
            self.api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
            print("✅ Hugging Face Inference API initialized")
        except ImportError:
            print("❌ 'requests' library not available. Run: pip install requests")
    
    def is_enabled(self) -> bool:
        """Check if API is ready"""
        return self.enabled and self.api_token is not None
    
    def generate_alt_text(
        self,
        image_data: bytes,
        shape_name: str = "",
        slide_number: int = 0,
        max_length: int = 250
    ) -> Optional[str]:
        """
        Generate alt text using the Hugging Face Inference API.

        Args:
            image_data: Raw image bytes
            shape_name: Shape name
            slide_number: Slide number
            max_length: Maximum length

        Returns:
            Generated alt text or None
        """
        if not self.is_enabled():
            return None
        
        try:
            headers = {"Authorization": f"Bearer {self.api_token}"}
            response = self.requests.post(
                self.api_url,
                headers=headers,
                data=image_data,
                timeout=30
            )
            
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and len(result) > 0:
                    caption = result[0].get("generated_text", "")
                    return self._clean_alt_text(caption, max_length)
                # Unexpected payload shape - treat as a failure
                return None
            else:
                print(f"HF API error: {response.status_code}")
                return None
                
        except Exception as e:
            print(f"HF API request failed: {e}")
            return None
    
    def _clean_alt_text(self, alt_text: str, max_length: int) -> str:
        """Clean generated text"""
        # Remove common prefixes
        prefixes = ["a picture of ", "an image of ", "a photo of "]
        alt_text_lower = alt_text.lower()
        for prefix in prefixes:
            if alt_text_lower.startswith(prefix):
                alt_text = alt_text[len(prefix):]
                break
        
        # Capitalize first letter
        if alt_text:
            alt_text = alt_text[0].upper() + alt_text[1:]
        
        # Truncate if needed
        if len(alt_text) > max_length:
            alt_text = alt_text[:max_length-3] + "..."
        
        return alt_text.strip()


# Singleton instances
_local_model: Optional[LocalVisionModel] = None
_hf_api: Optional[HuggingFaceInferenceAPI] = None


def get_vision_model() -> Optional[LocalVisionModel]:
    """Get or create local vision model singleton"""
    global _local_model
    if _local_model is None:
        model_name = os.getenv("LOCAL_VISION_MODEL", "blip-base")
        _local_model = LocalVisionModel(model_name)
    return _local_model


def get_hf_api() -> Optional[HuggingFaceInferenceAPI]:
    """Get or create Hugging Face API singleton"""
    global _hf_api
    if _hf_api is None:
        _hf_api = HuggingFaceInferenceAPI()
    return _hf_api


def generate_alt_text_free(
    image_data: bytes,
    shape_name: str = "",
    slide_number: int = 0,
    max_length: int = 250
) -> Optional[str]:
    """
    Generate alt text using FREE methods (tries local first, then HF API).

    Priority:
    1. Local AI model (completely free, unlimited)
    2. Hugging Face Inference API (free tier)
    3. None (fallback to placeholder in main code)

    Args:
        image_data: Raw image bytes
        shape_name: Shape name
        slide_number: Slide number
        max_length: Maximum length

    Returns:
        Generated alt text or None
    """
    # Try local model first (best option - free and unlimited)
    local_model = get_vision_model()
    if local_model and local_model.is_enabled():
        result = local_model.generate_alt_text(image_data, shape_name, slide_number, max_length)
        if result:
            return result
    
    # Fallback to Hugging Face API (free tier)
    hf_api = get_hf_api()
    if hf_api and hf_api.is_enabled():
        result = hf_api.generate_alt_text(image_data, shape_name, slide_number, max_length)
        if result:
            return result
    
    # If both fail, return None (main code will use placeholder)
    return None
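

if __name__ == "__main__":
    # Minimal smoke-test sketch (an addition, not part of the original API):
    # pass an image path on the command line and print the generated alt text.
    import sys

    if len(sys.argv) < 2:
        print(f"Usage: python {Path(sys.argv[0]).name} <image_path>")
        sys.exit(1)

    image_path = Path(sys.argv[1])
    alt = generate_alt_text_free(image_path.read_bytes(), shape_name=image_path.stem)
    print(alt or "No alt text generated (check model/API setup)")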