# Multimodal Processing: Agentic Business Digitization Framework ## Overview Multimodal processing handles non-text content (images, videos) to extract business-relevant information. This is critical for businesses that rely heavily on visual content (restaurants, travel agencies, retail stores). ## Vision AI Strategy ### Qwen3.5:0.8B Vision Capabilities (via Ollama) **Why Qwen3.5:0.8B?** - Lightweight and fast for local inference - Good visual understanding capabilities - JSON-structured responses - No API costs (local execution) - Privacy-preserving (runs locally) ### Vision Agent Architecture ```python class VisionAgent: """ Intelligent image analysis using Qwen3.5:0.8B via Ollama """ def __init__(self): from ollama import Client self.client = Client(host='http://localhost:11434') self.model = "qwen3.5:0.8b" # Rate limiting (Ollama is local, but still manage concurrency) self.rate_limiter = RateLimiter( max_requests_per_minute=30, max_concurrent=5 ) async def analyze_image( self, image: ExtractedImage, context: str = "" ) -> ImageAnalysis: """ Analyze single image with optional context """ # Build context-aware prompt prompt = self.build_vision_prompt(context) # Rate limiting await self.rate_limiter.acquire() # Call Qwen via Ollama with image response = self.client.chat( model=self.model, messages=[{ "role": "user", "content": prompt, "images": [image.file_path] }] ) # Parse structured response analysis = self.parse_vision_response(response['message']['content']) return ImageAnalysis( image_id=image.image_id, description=analysis['description'], category=ImageCategory(analysis['category']), tags=analysis['tags'], is_product=analysis['is_product'], is_service_related=analysis['is_service_related'], suggested_associations=analysis.get('associations', []), confidence=analysis.get('confidence', 0.8), analyzed_at=datetime.now(), metadata=analysis.get('metadata', {}) ) ``` ### Vision Prompting Strategy #### Product Image Prompt ```python def build_product_vision_prompt(self, context: str) -> str: """ Optimized prompt for product image analysis with Qwen """ return f""" Analyze this product image in detail for a business digitization system. Context from documents: {context[:300] if context else "No additional context"} Provide a JSON response with the following structure: {{ "description": "Detailed 3-4 sentence description of the product shown", "category": "product", "product_name": "Best guess of product name based on image", "tags": ["tag1", "tag2", "tag3"], "is_product": true, "is_service_related": false, "visual_attributes": {{ "color": "predominant color", "style": "modern/vintage/minimalist", "setting": "studio/lifestyle/packshot" }}, "suggested_specifications": {{ "material": "if visible", "size": "if determinable", "features": ["feature1", "feature2"] }}, "associations": ["suggested product names this could match"], "confidence": 0.8 }} Guidelines: - Be specific and descriptive - Focus on business-relevant details - Identify brand names or logos if visible - Note quality indicators (professional photography, lighting) - Suggest product category (electronics, clothing, food, etc.) - Respond ONLY with valid JSON, no additional text """ ``` #### Service/Destination Image Prompt ```python def build_service_vision_prompt(self, context: str) -> str: """ Optimized prompt for service/destination images """ return f""" Analyze this image which may represent a service, destination, or experience. Context from documents: {context[:300] if context else "No additional context"} Provide a JSON response: {{ "description": "Detailed 3-4 sentence description of what's shown", "category": "service|destination|food|experience|other", "location_type": "if applicable: beach/mountain/city/restaurant/hotel/etc", "tags": ["tag1", "tag2", ...], "is_product": false, "is_service_related": true, "visual_attributes": {{ "setting": "indoor/outdoor/natural/urban", "time_of_day": "if determinable", "weather": "if visible", "crowd_level": "empty/moderate/crowded" }}, "service_indicators": {{ "activity_type": "dining/touring/adventure/relaxation/etc", "difficulty_level": "if applicable", "suitable_for": ["families", "couples", "solo travelers", etc] }}, "associations": ["suggested service/package names"], "confidence": 0.0-1.0 }} Guidelines: - Identify location characteristics - Note activities or experiences visible - Describe atmosphere and ambiance - Identify target audience indicators """ ``` #### Food/Menu Image Prompt ```python def build_food_vision_prompt(self, context: str) -> str: """ Specialized prompt for food/menu images """ return f""" Analyze this food or menu image. Context: {context[:300] if context else "No context"} JSON response: {{ "description": "Detailed description of food/dishes shown", "category": "food", "cuisine_type": "Italian/Chinese/Indian/etc", "dishes_visible": [ {{ "name": "estimated dish name", "description": "brief description", "presentation_style": "plating style" }} ], "tags": ["cuisine type", "dish names", "ingredients visible"], "is_product": true, "is_service_related": true, "visual_attributes": {{ "presentation_quality": "casual/fine_dining/street_food", "portion_size": "small/medium/large", "color_palette": "appetizing/vibrant/etc" }}, "menu_indicators": {{ "price_visible": true/false, "dish_count": number if menu, "menu_type": "a_la_carte/set_menu/etc" }}, "confidence": 0.0-1.0 }} """ ``` ### Batch Image Processing ```python async def analyze_images_batch( self, images: List[ExtractedImage], context: str = "" ) -> List[ImageAnalysis]: """ Process multiple images efficiently with Qwen """ # Group images into batches of 5 for parallel processing batch_size = 5 batches = [images[i:i+batch_size] for i in range(0, len(images), batch_size)] all_analyses = [] for batch in batches: # Process batch in parallel tasks = [ self.analyze_image(img, context) for img in batch ] batch_results = await asyncio.gather(*tasks, return_exceptions=True) # Handle errors gracefully for img, result in zip(batch, batch_results): if isinstance(result, Exception): logger.error(f"Vision analysis failed for {img.image_id}: {result}") all_analyses.append(self.create_fallback_analysis(img)) else: all_analyses.append(result) return all_analyses def create_fallback_analysis(self, image: ExtractedImage) -> ImageAnalysis: """ Create minimal analysis when vision AI fails """ return ImageAnalysis( image_id=image.image_id, description="Image analysis unavailable", category=ImageCategory.OTHER, tags=[], is_product=False, is_service_related=False, suggested_associations=[], confidence=0.0, analyzed_at=datetime.now(), metadata={'error': 'vision_analysis_failed'} ) ``` ## Image Association Logic ### Matching Images to Products/Services ```python class ImageAssociationEngine: """ Associate images with products or services """ def associate_images( self, images: List[ImageAnalysis], products: List[Product], services: List[Service], page_index: PageIndex ) -> dict: """ Match images to inventory items """ associations = { 'product_associations': {}, 'service_associations': {}, 'unassociated': [] } # Associate product images for product in products: matched_images = self.match_images_to_product( product, images, page_index ) if matched_images: associations['product_associations'][product.product_id] = matched_images # Associate service images for service in services: matched_images = self.match_images_to_service( service, images, page_index ) if matched_images: associations['service_associations'][service.service_id] = matched_images # Track unassociated images associated_ids = set() for imgs in associations['product_associations'].values(): associated_ids.update(img.image_id for img in imgs) for imgs in associations['service_associations'].values(): associated_ids.update(img.image_id for img in imgs) associations['unassociated'] = [ img for img in images if img.image_id not in associated_ids ] return associations def match_images_to_product( self, product: Product, images: List[ImageAnalysis], page_index: PageIndex ) -> List[ImageAnalysis]: """ Find images that belong to this product """ matched = [] for image in images: if not image.is_product: continue # Strategy 1: Direct name matching if product.name and self.name_match(product.name, image): matched.append(image) continue # Strategy 2: Tag overlap if self.tag_overlap(product.tags, image.tags) > 0.5: matched.append(image) continue # Strategy 3: Context proximity if self.context_proximity(product, image, page_index) > 0.7: matched.append(image) continue return matched def name_match(self, product_name: str, image: ImageAnalysis) -> bool: """ Check if product name appears in image analysis """ product_name_lower = product_name.lower() # Check description if product_name_lower in image.description.lower(): return True # Check suggested associations for association in image.suggested_associations: if product_name_lower in association.lower(): return True return False def tag_overlap(self, tags1: List[str], tags2: List[str]) -> float: """ Calculate tag similarity (Jaccard index) """ if not tags1 or not tags2: return 0.0 set1 = set(tag.lower() for tag in tags1) set2 = set(tag.lower() for tag in tags2) intersection = len(set1 & set2) union = len(set1 | set2) return intersection / union if union > 0 else 0.0 def context_proximity( self, product: Product, image: ImageAnalysis, page_index: PageIndex ) -> float: """ Check if image and product appear in similar context """ # Get pages mentioning product product_pages = self.find_product_pages(product, page_index) # Get page where image was found image_page = self.find_image_page(image, page_index) # Check if same document/page if image_page and image_page in product_pages: return 1.0 return 0.0 ``` ## Video Processing ### Video Metadata Extraction ```python class VideoProcessor: """ Video file handling and metadata extraction """ def process_video(self, video_path: str) -> VideoMetadata: """ Extract metadata without full processing """ try: probe = ffmpeg.probe(video_path) video_stream = next( (s for s in probe['streams'] if s['codec_type'] == 'video'), None ) if not video_stream: raise ValueError("No video stream found") return VideoMetadata( file_path=video_path, duration=float(probe['format']['duration']), width=int(video_stream['width']), height=int(video_stream['height']), codec=video_stream['codec_name'], frame_rate=self.parse_frame_rate(video_stream['r_frame_rate']), file_size=int(probe['format']['size']), format=probe['format']['format_name'] ) except Exception as e: logger.error(f"Video processing failed: {e}") return self.create_fallback_metadata(video_path) def extract_thumbnail(self, video_path: str, timestamp: float = 1.0) -> str: """ Extract frame as thumbnail """ output_path = f"{video_path}_thumb.jpg" try: ( ffmpeg .input(video_path, ss=timestamp) .filter('scale', 640, -1) .output(output_path, vframes=1) .overwrite_output() .run(quiet=True) ) return output_path except Exception as e: logger.error(f"Thumbnail extraction failed: {e}") return None ``` ### Video Frame Analysis (Optional) ```python async def analyze_video_frames( self, video_path: str, sample_rate: int = 30 # Extract 1 frame per 30 seconds ) -> List[ImageAnalysis]: """ Analyze key frames from video """ # Extract frames at intervals frames = self.extract_frames(video_path, sample_rate) # Analyze each frame with vision AI analyses = [] for i, frame_path in enumerate(frames): try: # Create temporary ExtractedImage temp_image = ExtractedImage( image_id=f"video_frame_{i}", file_path=frame_path, width=0, height=0, file_size=os.path.getsize(frame_path), mime_type="image/jpeg", extraction_method="video_frame", is_embedded=True ) # Analyze with vision agent analysis = await self.vision_agent.analyze_image(temp_image) analyses.append(analysis) except Exception as e: logger.warning(f"Frame analysis failed: {e}") finally: # Cleanup temporary frame if os.path.exists(frame_path): os.remove(frame_path) return analyses ``` ## Image Quality Assessment ```python class ImageQualityChecker: """ Assess image quality for business use """ def assess_quality(self, image_path: str) -> dict: """ Check if image meets quality standards """ with Image.open(image_path) as img: width, height = img.size quality_score = { 'resolution': self.check_resolution(width, height), 'aspect_ratio': self.check_aspect_ratio(width, height), 'file_size': self.check_file_size(image_path), 'format': self.check_format(img), 'overall': 0.0 } # Calculate overall score quality_score['overall'] = sum(quality_score.values()) / 4 return quality_score def check_resolution(self, width: int, height: int) -> float: """ Score based on resolution (0.0 to 1.0) """ pixels = width * height if pixels >= 1920 * 1080: # Full HD or better return 1.0 elif pixels >= 1280 * 720: # HD return 0.8 elif pixels >= 640 * 480: # VGA return 0.6 else: return 0.4 def check_aspect_ratio(self, width: int, height: int) -> float: """ Check if aspect ratio is standard """ ratio = width / height # Common aspect ratios: 16:9, 4:3, 1:1, 3:2 standard_ratios = [16/9, 4/3, 1.0, 3/2] # Find closest standard ratio closest_diff = min(abs(ratio - sr) for sr in standard_ratios) if closest_diff < 0.1: return 1.0 elif closest_diff < 0.2: return 0.8 else: return 0.6 ``` ## Image Deduplication ```python class ImageDeduplicator: """ Identify and remove duplicate images """ def deduplicate(self, images: List[ExtractedImage]) -> List[ExtractedImage]: """ Remove duplicate images using perceptual hashing """ seen_hashes = {} unique_images = [] for image in images: # Calculate perceptual hash img_hash = self.calculate_perceptual_hash(image.file_path) # Check for near-duplicates is_duplicate = False for existing_hash in seen_hashes.keys(): if self.hamming_distance(img_hash, existing_hash) < 5: is_duplicate = True logger.info(f"Duplicate image found: {image.image_id}") break if not is_duplicate: seen_hashes[img_hash] = image unique_images.append(image) return unique_images def calculate_perceptual_hash(self, image_path: str, hash_size: int = 8) -> str: """ Calculate perceptual hash for image comparison """ with Image.open(image_path) as img: # Convert to grayscale img = img.convert('L') # Resize to hash_size x hash_size img = img.resize((hash_size, hash_size), Image.Resampling.LANCZOS) # Get pixel data pixels = list(img.getdata()) # Calculate average avg = sum(pixels) / len(pixels) # Create hash bits = ''.join('1' if pixel > avg else '0' for pixel in pixels) return bits def hamming_distance(self, hash1: str, hash2: str) -> int: """ Calculate Hamming distance between two hashes """ return sum(c1 != c2 for c1, c2 in zip(hash1, hash2)) ``` ## Local Model Management for Vision ```python class VisionModelManager: """ Manage local Qwen model for vision processing """ def __init__(self): self.model_name = "qwen3.5:0.8b" self.ollama_client = Client(host='http://localhost:11434') self.max_images_per_job = 100 # Limit for batch processing def ensure_model_available(self): """ Check if Qwen model is available, pull if needed """ try: self.ollama_client.show(self.model_name) logger.info(f"Model {self.model_name} is available") except Exception: logger.info(f"Pulling model {self.model_name}...") self.ollama_client.pull(self.model_name) logger.info(f"Model {self.model_name} pulled successfully") def estimate_processing_time(self, image_count: int) -> float: """ Estimate processing time for batch of images ~2-3 seconds per image on typical hardware """ return image_count * 2.5 # seconds def check_system_resources(self) -> dict: """ Check if system has enough resources for vision processing """ import psutil ram = psutil.virtual_memory() # Qwen3.5:0.8B needs ~1-2GB RAM min_ram_gb = 4 return { 'available_ram_gb': ram.available / (1024**3), 'sufficient': ram.available > (min_ram_gb * 1024**3), 'recommendation': 'Close other applications if processing fails' } ``` ## Conclusion This multimodal processing strategy provides: - **Intelligent image analysis** using Qwen3.5:0.8B (local via Ollama) - **Context-aware prompting** for accurate categorization - **Image-to-inventory association** logic - **Quality assessment** for business usability - **Deduplication** to reduce redundancy - **Cost-effective** local processing with no API costs The Qwen-powered vision approach enables rich metadata extraction from visual content, significantly enhancing the digitization process for visually-oriented businesses while maintaining privacy and zero API costs.