# --- hosting-page header captured together with the file (not part of the program) ---
# bahngleis-detektor / improved_person_detector.py
# Migjomatic's picture
# Remove HF token; use env var
# 8a74c03
#!/usr/bin/env python3
"""
Improved Person on Track Detector using a completely different approach
Instead of relying on text descriptions, use multiple specific questions and cross-validation
"""
import sys
import os
from io import BytesIO
from PIL import Image
# Prepend the directory that contains this file to sys.path so that modules
# located next to it can be imported.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _MODULE_DIR)
class ImprovedPersonOnTrackDetector:
    """Person-on-track detector that cross-checks several model answers.

    The detector puts a fixed set of yes/no questions to the CNN model, asks
    the transformer model for a scene description, asks the CNN model for an
    object list, and merges the three signals into one verdict.
    """

    # Yes/no prompts sent to the CNN model; every prompt contributes one vote.
    _QUESTIONS = (
        "Are there any people visible in this image?",
        "Is anyone standing on railway tracks?",
        "Do you see a person on train tracks?",
        "Are the train tracks empty of people?",
        "Is this image showing people near trains?",
    )
    _SCENE_PROMPT = "Describe this scene in detail"
    _OBJECTS_PROMPT = "What objects do you see in this image?"

    # Punctuation that separates words; apostrophes are kept so that
    # contractions such as "don't" stay one token.
    _PUNCT_TO_SPACE = str.maketrans(dict.fromkeys(".,!?;:\"()[]{}-/\\", " "))

    # Whole-word cues (phrases are matched as consecutive words).
    _AFFIRMATIVE_CUES = (
        "yes", "there is", "there are", "i see",
        "visible", "present", "standing", "person", "persons",
    )
    _NEGATIVE_CUES = (
        "no", "not", "none", "empty", "clear", "nobody", "no one", "absent",
        "never", "nothing", "cannot", "without",
        "don't", "doesn't", "can't", "isn't", "aren't",
    )
    # Words that negate an affirmative cue standing close to them.
    _NEGATORS = frozenset({
        "no", "not", "none", "nobody", "never", "nothing", "cannot", "without",
        "don't", "doesn't", "can't", "isn't", "aren't",
    })
    # How many words before an affirmative cue are searched for a negator.
    _NEGATION_WINDOW = 3
    # A question containing one of these asks whether people are ABSENT, so a
    # literal "yes" means that nobody is there.
    _ABSENCE_QUESTION_CUES = (
        "empty", "clear of", "free of", "nobody", "no one", "no people", "without",
    )

    # Plurals are listed explicitly because matching is done on whole words.
    _PEOPLE_KEYWORDS = frozenset({
        "person", "persons", "people", "man", "men", "woman", "women",
        "human", "humans", "individual", "individuals",
    })
    _TRACK_KEYWORDS = frozenset({
        "track", "tracks", "rail", "rails", "railway", "railways",
        "railroad", "train", "trains",
    })

    def __init__(self, model_manager):
        """Keep references to the manager and to its two caption models.

        Args:
            model_manager: Object exposing ``cnn_model`` and
                ``transformer_model`` attributes. Both models are used through
                ``generate_caption(image, prompt)``.
        """
        self.model_manager = model_manager
        self.cnn_model = model_manager.cnn_model
        self.transformer_model = model_manager.transformer_model

    def detect_person_on_track(self, image: "Image.Image") -> dict:
        """Run all questions and prompts on *image* and return the combined verdict.

        Args:
            image: Image handed unchanged to both models.

        Returns:
            A dict with the keys ``person_on_track``, ``people_count``,
            ``confidence``, ``analysis`` and ``detailed_analysis``. If any
            ``Exception`` is raised while querying the models or combining the
            answers, a fallback dict is returned with ``person_on_track=False``,
            ``confidence=0.0`` and the error text in ``analysis``.
        """
        try:
            # Approach 1: several specific yes/no questions to the CNN model.
            cnn_responses = {}
            for number, question in enumerate(self._QUESTIONS, start=1):
                raw_answer = self.cnn_model.generate_caption(image, question)
                cleaned_answer = self._clean_response(raw_answer, question)
                cnn_responses[f"q{number}"] = {
                    "question": question,
                    "response": cleaned_answer,
                    "analysis": self._analyze_yes_no_response(cleaned_answer, question),
                }

            # Approach 2: free-form scene description from the transformer model.
            scene_description = self.transformer_model.generate_caption(
                image, self._SCENE_PROMPT
            )

            # Approach 3: object list from the CNN model.
            objects_raw = self.cnn_model.generate_caption(image, self._OBJECTS_PROMPT)
            objects_cleaned = self._clean_response(objects_raw, self._OBJECTS_PROMPT)

            return self._combine_all_analyses(
                cnn_responses, scene_description, objects_cleaned
            )
        except Exception as e:
            # Best-effort boundary: callers always receive a result dict.
            return {
                "person_on_track": False,
                "people_count": 0,
                "confidence": 0.0,
                "analysis": f"Detection failed: {str(e)}",
                "detailed_analysis": {"error": str(e)},
            }

    def _tokenize(self, text):
        """Return the lower-cased words of *text* without surrounding punctuation.

        ``None`` and the empty string give an empty list.
        """
        if not text:
            return []
        normalized = text.lower().replace("\u2019", "'").translate(self._PUNCT_TO_SPACE)
        words = (word.strip("'") for word in normalized.split())
        return [word for word in words if word]

    @staticmethod
    def _find_phrase(tokens, phrase):
        """Return every start index at which the words of *phrase* occur in *tokens*."""
        words = phrase.split()
        span = len(words)
        return [
            index
            for index in range(len(tokens) - span + 1)
            if tokens[index:index + span] == words
        ]

    def _has_unnegated(self, tokens, cue):
        """Tell whether *cue* occurs in *tokens* at least once without a nearby negator.

        An occurrence counts as negated when a negator is among the
        ``_NEGATION_WINDOW`` words before it or is the word right after it
        ("no person", "don't see any person", "there is no ...").
        """
        span = len(cue.split())
        for start in self._find_phrase(tokens, cue):
            before = tokens[max(0, start - self._NEGATION_WINDOW):start]
            after = tokens[start + span:start + span + 1]
            if not any(word in self._NEGATORS for word in before + after):
                return True
        return False

    def _asks_about_absence(self, question):
        """Tell whether *question* asks if people are absent (e.g. "... empty of people?")."""
        tokens = self._tokenize(question)
        return any(self._find_phrase(tokens, cue) for cue in self._ABSENCE_QUESTION_CUES)

    def _clean_response(self, response, original_question):
        """Strip an echoed question from *response* and return what remains.

        Args:
            response: Raw model output; may be ``None`` or empty.
            original_question: The prompt that produced *response*.

        Returns:
            The answer text, or ``""`` when the response is empty, is only the
            echoed question, or is a short text that mostly repeats the
            question's words.
        """
        if not response:
            return ""

        response = response.strip()
        if not response:
            return ""
        question = original_question.strip()
        response_lower = response.lower()
        question_lower = question.lower()

        # Echo (with or without its trailing punctuation) followed by the
        # answer: drop the echo plus any separator. A pure echo leaves "".
        question_core = question_lower.rstrip("?.,!: ")
        if question_core and response_lower.startswith(question_core):
            return response[len(question_core):].lstrip("?.,!: \t\n")

        # An answer that opens with yes/no is a real answer even when it
        # reuses most of the question's wording.
        if self._tokenize(response)[:1] in (["yes"], ["no"]):
            return response

        # A short response sharing most words with the question is treated as
        # a paraphrased echo.
        question_words = set(question_lower.split())
        response_words = set(response_lower.split())
        overlap = len(question_words & response_words)
        if len(response_words) < 10 and overlap > len(question_words) * 0.6:
            return ""

        return response

    def _analyze_yes_no_response(self, response, question):
        """Reduce a free-text answer to YES / NO / UNCLEAR with a confidence.

        The returned answer is normalised to mean "a person is present": a
        literal yes/no given to a question that asks about absence is flipped.

        Args:
            response: Cleaned model answer.
            question: The prompt the answer belongs to.

        Returns:
            ``{"answer": "YES" | "NO" | "UNCLEAR", "confidence": float}``.
            Confidence is 0.9 for a bare yes/no, 0.8 for an answer that starts
            with yes/no, 0.5-0.7 for keyword-based answers, 0.3 when cues tie
            or are missing, and 0.1 for an empty response.
        """
        if not response:
            return {"answer": "UNCLEAR", "confidence": 0.1}

        tokens = self._tokenize(response)

        # Direct answers: the leading yes/no wins over any later wording.
        if tokens and tokens[0] in ("yes", "no"):
            literal_yes = tokens[0] == "yes"
            person_present = literal_yes != self._asks_about_absence(question)
            confidence = 0.9 if len(tokens) == 1 else 0.8
            return {
                "answer": "YES" if person_present else "NO",
                "confidence": confidence,
            }

        # Keyword scoring: each distinct cue counts once; affirmative cues
        # next to a negator are ignored.
        yes_score = sum(
            1 for cue in self._AFFIRMATIVE_CUES if self._has_unnegated(tokens, cue)
        )
        no_score = sum(
            1 for cue in self._NEGATIVE_CUES if self._find_phrase(tokens, cue)
        )

        if yes_score > no_score:
            return {"answer": "YES", "confidence": min(0.7, 0.4 + yes_score * 0.1)}
        if no_score > yes_score:
            return {"answer": "NO", "confidence": min(0.7, 0.4 + no_score * 0.1)}
        return {"answer": "UNCLEAR", "confidence": 0.3}

    def _combine_all_analyses(self, cnn_responses, scene_description, objects_response):
        """Merge question votes, scene description and object list into one verdict.

        Args:
            cnn_responses: Mapping of question id to a dict with the keys
                ``question``, ``response`` and ``analysis`` (the latter as
                returned by ``_analyze_yes_no_response``).
            scene_description: Scene text from the transformer model; may be
                ``None`` or empty.
            objects_response: Cleaned object list from the CNN model; may be
                ``None`` or empty.

        Returns:
            The result dict described in ``detect_person_on_track``.
        """
        # Tally the votes of the specific questions.
        yes_count = 0
        no_count = 0
        unclear_count = 0
        total_confidence = 0
        question_results = []

        for response_data in cnn_responses.values():
            verdict = response_data["analysis"]
            answer = verdict["answer"]
            answer_confidence = verdict["confidence"]

            if answer == "YES":
                yes_count += 1
            elif answer == "NO":
                no_count += 1
            else:
                unclear_count += 1

            total_confidence += answer_confidence
            question_results.append({
                "question": response_data["question"],
                "response": response_data["response"],
                "answer": answer,
                "confidence": answer_confidence,
            })

        # Whole-word keyword checks on the scene description and object list.
        scene_words = set(self._tokenize(scene_description))
        people_in_scene = not self._PEOPLE_KEYWORDS.isdisjoint(scene_words)
        tracks_in_scene = not self._TRACK_KEYWORDS.isdisjoint(scene_words)

        object_words = set(self._tokenize(objects_response))
        people_in_objects = not self._PEOPLE_KEYWORDS.isdisjoint(object_words)

        # Defaults used when there are no question responses at all.
        person_on_track = False
        people_count = 0
        confidence = 0.3
        analysis = "No question responses available"

        # Method 1: majority vote of the specific questions.
        total_responses = yes_count + no_count + unclear_count
        if total_responses > 0:
            yes_percentage = yes_count / total_responses
            no_percentage = no_count / total_responses

            if yes_percentage >= 0.6:  # 60% or more say YES
                person_on_track = True
                confidence = 0.6 + yes_percentage * 0.2
                analysis = f"Multiple questions confirm person presence ({yes_count}/{total_responses} positive)"
                # Rough estimate derived from the number of positive answers.
                people_count = min(yes_count, 3)
            elif no_percentage >= 0.6:  # 60% or more say NO
                person_on_track = False
                confidence = 0.6 + no_percentage * 0.2
                analysis = f"Multiple questions confirm no person on tracks ({no_count}/{total_responses} negative)"
                people_count = 0
            elif people_in_scene and tracks_in_scene:
                # Mixed votes: fall back on the scene description.
                person_on_track = True
                confidence = 0.5
                analysis = "Scene analysis suggests person near tracks (mixed question results)"
                people_count = 1
            else:
                person_on_track = False
                confidence = 0.4
                analysis = "Unclear from questions, scene analysis suggests safe"
                people_count = 0

        # Method 2: cross-validation with the scene description.
        if people_in_scene and tracks_in_scene and not person_on_track:
            # Scene mentions people and tracks but the questions said no:
            # keep the negative verdict.
            person_on_track = False
            analysis = "Scene mentions people and tracks but specific questions indicate safe"
            confidence = max(confidence, 0.5)
        elif not people_in_scene and person_on_track:
            # Questions said yes but the scene mentions no people: lower confidence.
            confidence *= 0.7
            analysis = "Questions suggest person present but scene unclear"

        # Method 3: the object list mentions people although the scene does not.
        if people_in_objects and not people_in_scene and not person_on_track:
            person_on_track = True
            confidence = 0.4
            analysis = "Object detection suggests person presence"
            people_count = 1

        # Blend the verdict confidence with the mean per-question confidence.
        avg_question_confidence = total_confidence / max(len(cnn_responses), 1)
        confidence = (confidence + avg_question_confidence) / 2

        return {
            "person_on_track": person_on_track,
            "people_count": people_count,
            "confidence": min(confidence, 1.0),
            "analysis": analysis,
            "detailed_analysis": {
                "question_results": question_results,
                "yes_responses": yes_count,
                "no_responses": no_count,
                "unclear_responses": unclear_count,
                "scene_description": scene_description,
                "people_in_scene": people_in_scene,
                "tracks_in_scene": tracks_in_scene,
                "objects_response": objects_response,
                "people_in_objects": people_in_objects,
            },
        }
def test_improved_detector():
    """Run the improved detector on the first frame of a sample video and print the result.

    The function imports ``get_local_model_manager`` from ``local_models`` and
    ``extract_frames_from_video`` from ``app``, loads ``test/1.mp4`` relative
    to the current working directory, and analyses the first extracted frame.
    Setup failures, a missing video, an empty frame list and errors during the
    run are reported on stdout; the function returns ``None`` in every case.
    """
    print("TESTING IMPROVED PERSON ON TRACK DETECTOR")
    print("=" * 60)
    print("Using multiple questions + scene analysis + object detection")
    print()

    try:
        from local_models import get_local_model_manager
        from app import extract_frames_from_video

        local_manager = get_local_model_manager()
        improved_detector = ImprovedPersonOnTrackDetector(local_manager)
        print("+ Improved detector ready")
    except Exception as e:
        print(f"- Setup error: {e}")
        return

    # Test with the first video; os.path.join keeps the path portable.
    video_path = os.path.join("test", "1.mp4")
    if not os.path.exists(video_path):
        print(f"- Video not found: {video_path}")
        return

    try:
        with open(video_path, 'rb') as f:
            video_data = f.read()

        video_file = BytesIO(video_data)
        frames = extract_frames_from_video(video_file, fps=0.5)
        if not frames:
            print("- No frames extracted")
            return

        frame_data = frames[0]
        print(f"+ Testing frame at {frame_data['timestamp']:.1f}s")

        # Run the improved detector on that frame.
        result = improved_detector.detect_person_on_track(frame_data['frame'])

        print("\n" + "=" * 50)
        print("IMPROVED DETECTOR RESULTS")
        print("=" * 50)

        analysis = result.get('analysis', 'No analysis')
        people_count = result.get('people_count', 0)
        confidence = result.get('confidence', 0)
        person_on_track = result.get('person_on_track', False)

        if person_on_track:
            print(f"🚨 ALERT: {analysis}")
        else:
            print(f"✅ SAFE: {analysis}")
        print(f"👥 People Count: {people_count}")
        print(f"📊 Confidence: {confidence:.0%}")

        # Show the per-question breakdown when it is available.
        detailed = result.get('detailed_analysis', {})
        if 'question_results' in detailed:
            print("\n📋 Question Analysis:")
            for q_result in detailed['question_results']:
                print(f" Q: {q_result['question']}")
                print(f" A: {q_result['answer']} ({q_result['confidence']:.0%}) - {q_result['response'][:50]}...")

        print("\n🎯 This approach should be much more accurate!")
    except Exception as e:
        print(f"- Test error: {e}")
# Run the manual smoke test only when the file is executed as a script.
if __name__ == "__main__":
    test_improved_detector()