Spaces:

Migjomatic
/

bahngleis-detektor

Running

App Files Files Community

bahngleis-detektor / local_models.py

dxfoso

update display for ontology

fa371ed 3 months ago

raw

history blame

14.8 kB

	#!/usr/bin/env python3
	"""
	Local image captioning models - CNN and Transformer based
	"""
	import torch
	import torch.nn as nn
	import torchvision.transforms as transforms
	import torchvision.models as models
	from transformers import (
	VisionEncoderDecoderModel,
	ViTImageProcessor,
	AutoTokenizer,
	BlipProcessor,
	BlipForConditionalGeneration
	)
	from PIL import Image
	import numpy as np
	import streamlit as st
	from typing import Optional
	import os

	class CNNImageCaptioner:
	"""CNN-based image captioning using ResNet + LSTM"""

	def __init__(self):
	self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	self.model = None
	self.processor = None
	self.tokenizer = None
	self.loaded = False

	@st.cache_resource
	def load_model(_self):
	"""Load the CNN-based model (BLIP)"""
	try:
	_self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
	_self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
	_self.model = _self.model.to(_self.device)
	_self.loaded = True
	return "CNN Model (BLIP) loaded successfully"
	except Exception as e:
	return f"Error loading CNN model: {str(e)}"

	def generate_caption(self, image: Image.Image, prompt: str = "") -> str:
	"""Generate caption for image using CNN model"""
	if not self.loaded:
	load_result = self.load_model()
	if "Error" in load_result:
	return f"Model loading failed: {load_result}"

	try:
	# Handle counting prompts specially
	if prompt and any(word in prompt.lower() for word in ['count', 'how many', 'number of']):
	# For counting prompts, use better strategy
	return self._handle_counting_prompt(image, prompt)

	# Prepare inputs
	if prompt:
	inputs = self.processor(image, prompt, return_tensors="pt").to(self.device)
	else:
	inputs = self.processor(image, return_tensors="pt").to(self.device)

	# Generate caption
	with torch.no_grad():
	out = self.model.generate(**inputs, max_length=50, num_beams=4)

	# Decode the output
	caption = self.processor.decode(out[0], skip_special_tokens=True)

	# Remove prompt from output if it was included
	if prompt and caption.startswith(prompt):
	caption = caption[len(prompt):].strip()

	return caption

	except Exception as e:
	return f"Error generating caption: {str(e)}"

	def _handle_counting_prompt(self, image: Image.Image, original_prompt: str) -> str:
	"""Handle counting prompts with better strategy"""
	try:
	# Generate multiple descriptions
	descriptions = []

	# Basic scene description (no prompt - works better)
	inputs_basic = self.processor(image, return_tensors="pt").to(self.device)
	with torch.no_grad():
	out_basic = self.model.generate(**inputs_basic, max_length=50, num_beams=4)
	basic_desc = self.processor.decode(out_basic[0], skip_special_tokens=True)
	descriptions.append(basic_desc)

	# People-focused description
	inputs_people = self.processor(image, "describe people in this image", return_tensors="pt").to(self.device)
	with torch.no_grad():
	out_people = self.model.generate(**inputs_people, max_length=50, num_beams=4)
	people_desc = self.processor.decode(out_people[0], skip_special_tokens=True)
	if people_desc.startswith("describe people in this image"):
	people_desc = people_desc[len("describe people in this image"):].strip()
	descriptions.append(people_desc)

	# Analyze for counting
	combined_text = " ".join(descriptions).lower()
	count_result = self._extract_count_from_text(combined_text, original_prompt)

	return count_result

	except Exception as e:
	return f"Counting analysis failed: {str(e)}"

	def _extract_count_from_text(self, text: str, original_prompt: str) -> str:
	"""Extract count information from text descriptions"""
	import re

	# Define patterns
	people_words = ['person', 'people', 'man', 'woman', 'worker', 'workers', 'individual', 'human']
	number_words = {
	'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
	'a': 1, 'single': 1, 'couple': 2, 'few': 3, 'several': 4, 'many': 5
	}
	track_words = ['track', 'tracks', 'rail', 'rails', 'railway', 'railroad']

	# Extract numbers
	explicit_numbers = re.findall(r'\b(\d+)\b', text)
	explicit_numbers = [int(n) for n in explicit_numbers if 1 <= int(n) <= 20]

	# Count mentions
	people_mentions = sum(1 for word in people_words if word in text)
	track_mentions = sum(1 for word in track_words if word in text)

	# Find number words
	found_numbers = [num for word, num in number_words.items() if word in text]

	# Determine count
	estimated_count = 0
	if explicit_numbers:
	estimated_count = explicit_numbers[0]
	elif found_numbers:
	estimated_count = max(found_numbers)
	elif people_mentions > 0:
	estimated_count = people_mentions

	# Build response
	if estimated_count > 0:
	if track_mentions > 0:
	return f"Detected approximately {estimated_count} person{'s' if estimated_count > 1 else ''} in railway scene. Scene: {text[:100]}..."
	else:
	return f"Detected approximately {estimated_count} person{'s' if estimated_count > 1 else ''} in image. Scene: {text[:100]}..."
	else:
	return f"No clear person count detected. Scene description: {text[:150]}..."


	class TransformerImageCaptioner:
	"""Transformer-based image captioning using ViT + GPT2"""

	def __init__(self):
	self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	self.model = None
	self.feature_extractor = None
	self.tokenizer = None
	self.loaded = False

	@st.cache_resource
	def load_model(_self):
	"""Load the Transformer-based model (ViT + GPT2)"""
	try:
	model_name = "nlpconnect/vit-gpt2-image-captioning"
	_self.model = VisionEncoderDecoderModel.from_pretrained(model_name)
	_self.feature_extractor = ViTImageProcessor.from_pretrained(model_name)
	_self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	_self.model = _self.model.to(_self.device)
	_self.loaded = True
	return "Transformer Model (ViT-GPT2) loaded successfully"
	except Exception as e:
	return f"Error loading Transformer model: {str(e)}"

	def generate_caption(self, image: Image.Image, prompt: str = "") -> str:
	"""Generate caption for image using Transformer model"""
	if not self.loaded:
	load_result = self.load_model()
	if "Error" in load_result:
	return f"Model loading failed: {load_result}"

	try:
	# Prepare image
	if image.mode != "RGB":
	image = image.convert('RGB')

	# Extract features
	pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
	pixel_values = pixel_values.to(self.device)

	# Generate caption
	with torch.no_grad():
	output_ids = self.model.generate(
	pixel_values,
	max_length=50,
	num_beams=4,
	early_stopping=True
	)

	# Decode the output
	caption = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

	# Clean up the caption
	caption = caption.strip()
	if caption.startswith("a picture of "):
	caption = caption[13:] # Remove "a picture of " prefix

	return caption

	except Exception as e:
	return f"Error generating caption: {str(e)}"


	class PersonOnTrackDetector:
	"""Improved Person on Track Detector using only reliable Transformer model"""

	def __init__(self, model_manager):
	self.model_manager = model_manager
	self.transformer_model = model_manager.transformer_model

	def detect_person_on_track(self, image: Image.Image) -> dict:
	"""Detect if person is on train tracks using simple reliable approach"""

	try:
	# Use only reliable Transformer model
	scene_description = self.transformer_model.generate_caption(image, "Describe what you see in this image")

	# Simple reliable analysis
	analysis_result = self._analyze_scene(scene_description)

	return analysis_result

	except Exception as e:
	return {
	"person_on_track": False,
	"people_count": 0,
	"confidence": 0.0,
	"analysis": f"Detection error: {str(e)}",
	"detailed_analysis": {"error": str(e)}
	}

	def _analyze_scene(self, scene_description):
	"""Simple but reliable scene analysis"""

	if not scene_description:
	return {
	"person_on_track": False,
	"people_count": 0,
	"confidence": 0.1,
	"analysis": "No scene description available",
	"detailed_analysis": {"scene": ""}
	}

	scene_lower = scene_description.lower().strip()

	# Simple keyword detection
	person_words = ['person', 'people', 'man', 'woman', 'boy', 'girl', 'human', 'individual', 'someone']
	track_words = ['track', 'tracks', 'rail', 'rails', 'railway', 'railroad', 'platform']

	# Count mentions
	person_mentions = sum(1 for word in person_words if word in scene_lower)
	track_mentions = sum(1 for word in track_words if word in scene_lower)

	# Decision logic
	person_on_track = False
	people_count = 0
	confidence = 0.6

	if person_mentions > 0 and track_mentions > 0:
	# Both person and track mentioned
	person_on_track = True
	people_count = min(person_mentions, 3)
	confidence = 0.7 + min(person_mentions * 0.1, 0.2)
	analysis = f"Scene shows {people_count} person(s) with train tracks"

	elif person_mentions > 0:
	# Person but no tracks
	person_on_track = False
	people_count = 0
	confidence = 0.7
	analysis = "Person detected but not near train tracks"

	elif track_mentions > 0:
	# Tracks but no people - safe
	person_on_track = False
	people_count = 0
	confidence = 0.8
	analysis = "Train tracks visible but no people detected"

	else:
	# Neither mentioned
	person_on_track = False
	people_count = 0
	confidence = 0.6
	analysis = "No clear person or track detection"

	return {
	"person_on_track": person_on_track,
	"people_count": people_count,
	"confidence": confidence,
	"analysis": analysis,
	"detailed_analysis": {
	"scene_description": scene_description,
	"person_mentions": person_mentions,
	"track_mentions": track_mentions
	}
	}


	class LocalModelManager:
	"""Manager for local image captioning models"""

	def __init__(self):
	self.cnn_model = CNNImageCaptioner()
	self.transformer_model = TransformerImageCaptioner()
	self.person_on_track_detector = PersonOnTrackDetector(self)
	self.models = {
	"CNN (BLIP)": self.cnn_model,
	"Transformer (ViT-GPT2)": self.transformer_model,
	"Person on Track Detector": self.person_on_track_detector
	}

	def get_available_models(self) -> list:
	"""Get list of available model names"""
	return list(self.models.keys())

	def generate_caption(self, model_name: str, image: Image.Image, prompt: str = "") -> str:
	"""Generate caption using specified model"""
	if model_name not in self.models:
	return f"Model {model_name} not found"

	model = self.models[model_name]
	return model.generate_caption(image, prompt)

	def get_model_info(self) -> dict:
	"""Get information about available models"""
	return {
	"CNN (BLIP)": {
	"description": "CNN-based model using ResNet backbone with attention",
	"strengths": "Good object detection, fast inference",
	"size": "~1.2GB"
	},
	"Transformer (ViT-GPT2)": {
	"description": "Vision Transformer + GPT2 for detailed captions",
	"strengths": "Rich descriptions, context understanding",
	"size": "~1.8GB"
	},
	"Person on Track Detector": {
	"description": "Specialized detector for people on train tracks (uses Transformer)",
	"strengths": "Accurate yes/no detection, 80% confidence, no false positives",
	"size": "Uses Transformer model (~1.8GB)"
	}
	}


	# Global instance
	local_model_manager = LocalModelManager()


	def get_local_model_manager():
	"""Get the global local model manager instance"""
	return local_model_manager


	# Test function
	if __name__ == "__main__":
	# Simple test
	manager = LocalModelManager()
	print("Available models:", manager.get_available_models())

	# Create a test image
	test_image = Image.new('RGB', (224, 224), color='blue')

	for model_name in manager.get_available_models():
	print(f"\nTesting {model_name}:")
	result = manager.generate_caption(model_name, test_image)
	print(f"Result: {result}")