|
|
""" |
|
|
Google Gemini Vision - Trash Detection Module |
|
|
|
|
|
Uses Gemini's multimodal capabilities to detect and describe trash in images. |
|
|
Complements YOLOv8 with natural language understanding and detailed analysis. |
|
|
""" |
|
|
|
|
|
import base64
import os
import re
from io import BytesIO
from typing import Dict, List, Optional

from dotenv import load_dotenv
from PIL import Image
|
|
|
|
|
# Load variables from a .env file at import time, before GeminiVisionDetector
# looks up GEMINI_API_KEY with os.getenv.
load_dotenv()
|
|
|
|
|
|
|
|
class GeminiVisionDetector:
    """Trash detection using Google Gemini Vision API.

    The detector is only ``enabled`` when ``GEMINI_API_KEY`` is set and the
    ``google.generativeai`` package can be imported and configured. When it
    is not enabled, or when a request fails, :meth:`detect_trash` returns the
    fixed mock payload from :meth:`_offline_fallback` instead of raising.

    Attributes:
        enabled: True once the Gemini client has been configured.
        model: The ``genai.GenerativeModel`` instance, or None when disabled.
    """

    # Model used when the caller does not pass ``model_name``.
    # NOTE(review): confirm this model alias is still served by the API.
    DEFAULT_MODEL = 'gemini-1.5-flash-latest'

    # Finds the first severity label anywhere in the SEVERITY value, so
    # answers such as "HIGH (20+ items)" are still accepted.
    _SEVERITY_RE = re.compile(r'\b(?:LOW|MEDIUM|HIGH)\b')

    # NOTE(review): the status glyphs in the messages of this class were
    # reconstructed from mis-decoded text; confirm them against upstream.

    # Instruction sent with every image. The response format requested here
    # is exactly what _parse_gemini_response reads back.
    _PROMPT = "\n".join((
        "Analyze this image for trash and litter. Provide a detailed analysis:",
        "",
        '1. List all visible trash items (be specific: "plastic bottle", '
        '"cigarette butt", not just "trash")',
        "2. Estimate the total number of items",
        "3. Categorize items (plastic, paper, metal, organic, other)",
        "4. Assess severity (LOW: 1-5 items, MEDIUM: 6-15 items, HIGH: 16+ items)",
        "5. Describe the location/context",
        "",
        "Format your response as:",
        "ITEMS: item1, item2, item3, ...",
        "COUNT: <number>",
        "CATEGORIES: plastic:<count>, paper:<count>, metal:<count>, "
        "organic:<count>, other:<count>",
        "SEVERITY: <LOW|MEDIUM|HIGH>",
        "DESCRIPTION: <1-2 sentence description>",
    ))

    def __init__(self, model_name: Optional[str] = None):
        """Initialize Gemini Vision detector.

        Args:
            model_name: Gemini model identifier; defaults to
                ``DEFAULT_MODEL`` when omitted.

        Never raises: any setup problem is printed and leaves the detector
        disabled (``enabled`` False, ``model`` None or unusable).
        """
        self.enabled = False
        self.model = None

        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            print("ℹ Gemini Vision disabled (no GEMINI_API_KEY)")
            return

        try:
            # Imported lazily so the module still loads without the package.
            import google.generativeai as genai

            genai.configure(api_key=api_key)
            self.model = genai.GenerativeModel(model_name or self.DEFAULT_MODEL)
            self.enabled = True
            print("✓ Gemini Vision detector initialized")
        except ImportError:
            print("⚠ Google GenAI package not installed (pip install google-generativeai)")
        except Exception as e:
            print(f"⚠ Gemini Vision initialization failed: {e}")

    def detect_trash(self, image: "Image.Image") -> Dict:
        """
        Detect trash items in image using Gemini Vision.

        Args:
            image: PIL Image object

        Returns:
            Dictionary with detection results:
            {
                'items': List[str],     # List of detected items
                'count': int,           # Total item count
                'description': str,     # Detailed description
                'severity': str,        # LOW, MEDIUM, HIGH
                'categories': Dict      # Category counts
            }
            When the detector is disabled or the request fails, the mock
            payload from ``_offline_fallback`` is returned instead.
        """
        if not self.enabled or self.model is None:
            return self._offline_fallback()

        try:
            response = self.model.generate_content([self._PROMPT, image])
            return self._parse_gemini_response(response.text)
        except Exception as e:
            # Best-effort boundary: any API or parsing failure degrades to
            # the mock payload rather than propagating to the caller.
            print(f"⚠ Gemini Vision detection failed: {e}")
            return self._offline_fallback()

    @staticmethod
    def _first_int(text: str) -> Optional[int]:
        """Return ``text`` as an int, else the first digit run in it, else None."""
        text = text.strip()
        try:
            return int(text)
        except ValueError:
            match = re.search(r'\d+', text)
            return int(match.group()) if match else None

    def _parse_gemini_response(self, response_text: str) -> Dict:
        """Parse Gemini's structured response.

        Reads the ``ITEMS``/``COUNT``/``CATEGORIES``/``SEVERITY``/
        ``DESCRIPTION`` lines requested by the prompt. Labels are matched
        case-insensitively and may carry Markdown list or bold markers.
        Lines that cannot be understood are skipped, so this never raises on
        malformed text.

        Args:
            response_text: Raw text returned by the model.

        Returns:
            Dict with keys ``items``, ``count``, ``description``,
            ``severity`` (default ``'MEDIUM'``) and ``categories``. When no
            usable ``COUNT`` line exists, ``count`` is the number of items.
        """
        result = {
            'items': [],
            'count': 0,
            'description': '',
            'severity': 'MEDIUM',
            'categories': {},
        }
        count_parsed = False

        for raw_line in (response_text or '').splitlines():
            # Drop list/heading/bold markers in front of the label,
            # e.g. "- **ITEMS:** bottle".
            line = raw_line.strip().lstrip('*#- ')
            label, sep, value = line.partition(':')
            if not sep:
                continue
            label = label.strip().strip('*').strip().upper()
            value = value.strip().strip('*').strip()

            if label == 'ITEMS':
                # Filter blanks so an empty "ITEMS:" line yields no items.
                result['items'] = [
                    item.strip() for item in value.split(',') if item.strip()
                ]
            elif label == 'COUNT':
                count = self._first_int(value)
                if count is not None:
                    result['count'] = count
                    count_parsed = True
            elif label == 'CATEGORIES':
                for pair in value.split(','):
                    # Split on the last colon so a name containing ':' cannot
                    # break the whole parse.
                    name, pair_sep, amount = pair.rpartition(':')
                    name = name.strip()
                    number = self._first_int(amount) if pair_sep else None
                    if name and number is not None:
                        result['categories'][name] = number
            elif label == 'SEVERITY':
                match = self._SEVERITY_RE.search(value.upper())
                if match:
                    result['severity'] = match.group()
            elif label == 'DESCRIPTION':
                result['description'] = value

        # Applied after the loop so the fallback does not depend on whether
        # ITEMS came before COUNT in the response.
        if not count_parsed:
            result['count'] = len(result['items'])

        return result

    def _offline_fallback(self) -> Dict:
        """Return mock results when Gemini is not available."""
        return {
            'items': ['plastic bottle', 'food wrapper', 'paper cup'],
            'count': 3,
            'description': 'Gemini Vision not configured. Using mock detection.',
            'severity': 'MEDIUM',
            'categories': {
                'plastic': 2,
                'paper': 1,
            },
        }

    def compare_with_yolo(self, yolo_results: Dict, gemini_results: Dict) -> str:
        """
        Generate comparison between YOLOv8 and Gemini detections.

        Args:
            yolo_results: Results from YOLOv8 detector. A dict is read via
                its ``total_items``, ``detections``, ``categories`` and
                ``avg_confidence`` keys; a list is counted by length; any
                other value counts as zero detections.
            gemini_results: Results from Gemini Vision (the dict shape
                returned by ``detect_trash``). Missing keys are tolerated.

        Returns:
            Formatted comparison text
        """
        if isinstance(yolo_results, dict):
            yolo_count = yolo_results.get('total_items') or 0
            if not yolo_count:
                # No explicit total: fall back to the number of detections.
                yolo_count = len(yolo_results.get('detections') or [])
            yolo_categories = [str(k) for k in (yolo_results.get('categories') or {})]
            yolo_confidence = yolo_results.get('avg_confidence') or 0
        else:
            yolo_count = len(yolo_results) if isinstance(yolo_results, list) else 0
            yolo_categories = []
            yolo_confidence = 0

        gemini_results = gemini_results or {}
        gemini_categories = ', '.join(
            str(k) for k in (gemini_results.get('categories') or {})
        )
        insights = self._generate_insights(yolo_count, yolo_confidence, gemini_results)

        lines = [
            "🔍 **Dual-Engine Detection Comparison**",
            "",
            "**YOLOv8 (Computer Vision):**",
            f"- Detected: {yolo_count} items",
            f"- Categories: {', '.join(yolo_categories) if yolo_categories else 'Multiple types'}",
            f"- Confidence: {yolo_confidence:.1f}%",
            "",
            "**Gemini Vision (Multimodal AI):**",
            f"- Detected: {gemini_results.get('count') or 0} items",
            f"- Categories: {gemini_categories}",
            f"- Severity: {gemini_results.get('severity', 'UNKNOWN')}",
            f"- Context: {gemini_results.get('description', '')}",
            "",
            "**Insights:**",
            insights,
        ]
        return '\n'.join(lines).strip()

    def _generate_insights(self, yolo_count: int, yolo_confidence: float, gemini: Dict) -> str:
        """Generate insights from comparing both detections.

        Args:
            yolo_count: Number of items YOLOv8 detected.
            yolo_confidence: YOLOv8 confidence, compared against 80 and
                printed with a ``%`` sign.
            gemini: Gemini result dict; ``count`` and ``description`` are
                read and may be missing.

        Returns:
            Newline-joined insight lines; always at least the count line.
        """
        gemini = gemini or {}
        gemini_count = gemini.get('count') or 0
        description = gemini.get('description')
        insights = []

        # Counts within two of each other are treated as agreement.
        if abs(yolo_count - gemini_count) <= 2:
            insights.append("✓ Both models agree on item count")
        else:
            insights.append(f"⚠ Count difference: YOLOv8={yolo_count}, Gemini={gemini_count}")

        if description:
            insights.append(f"💡 Gemini provides context: \"{description}\"")

        if yolo_confidence > 80:
            insights.append(
                f"🎯 YOLOv8 provides precise bounding boxes ({yolo_confidence:.0f}% confidence)"
            )
        elif yolo_confidence > 0:
            insights.append("📍 YOLOv8 provides precise bounding boxes")

        return '\n'.join(insights)
|
|
|
|
|
|
|
|
|
|
|
# Module-wide cache for the detector; stays None until the first call to
# get_gemini_detector().
_gemini_detector = None


def get_gemini_detector() -> GeminiVisionDetector:
    """Return the shared GeminiVisionDetector, building it on first use.

    Returns:
        The single module-level detector instance; every call after the
        first returns the same object.
    """
    global _gemini_detector
    detector = _gemini_detector
    if detector is None:
        detector = _gemini_detector = GeminiVisionDetector()
    return detector
|
|
|