| from PIL import Image |
| import imagehash |
| import requests |
| import io |
| import json |
| from pathlib import Path |
| from math import radians, cos, sin, asin, sqrt |
|
|
| |
| from app import dataset |
|
|
| def _load_accepted_reports(): |
| """Load all accepted reports from dataset.jsonl.""" |
| try: |
| if not dataset.DATA_FILE.exists(): |
| return [] |
| |
| accepted_reports = [] |
| with dataset.DATA_FILE.open("r", encoding="utf8") as f: |
| for line in f: |
| line = line.strip() |
| if not line: |
| continue |
| try: |
| report = json.loads(line) |
| |
| if report.get("status") == "accepted" and report.get("accept") is True: |
| accepted_reports.append(report) |
| except json.JSONDecodeError: |
| continue |
| |
| return accepted_reports |
| except Exception as e: |
| print(f"[ERROR] Failed to load accepted reports from dataset: {str(e)}") |
| return [] |
|
|
|
|
| def is_duplicate(user_id: str, description: str, category: str, store: bool = True) -> bool: |
| """ |
| Check if this exact report has been submitted before by checking dataset.jsonl. |
| Only returns True for exact matches (same user, same description, same category). |
| Only checks ACCEPTED reports from dataset.jsonl. |
| Set store=False to check without storing (for validation before acceptance). |
| Note: store parameter is kept for compatibility but doesn't do anything (data is stored via dataset.save_report). |
| """ |
| try: |
| |
| normalized_desc = " ".join(description.strip().lower().split()) |
| user_id_normalized = (user_id or "anon").lower() |
| category_normalized = category.lower() |
| |
| |
| accepted_reports = _load_accepted_reports() |
| |
| |
| for report in accepted_reports: |
| report_user_id = (report.get("user_id") or "anon").lower() |
| report_desc = " ".join((report.get("description") or "").strip().lower().split()) |
| report_category = (report.get("category") or "").lower() |
| |
| |
| if (report_user_id == user_id_normalized and |
| report_desc == normalized_desc and |
| report_category == category_normalized): |
| print(f"[DEBUG] Text duplicate found in dataset: user_id={user_id_normalized}, category={category}") |
| return True |
| |
| return False |
| except Exception as e: |
| print(f"[ERROR] Text duplicate check failed: {str(e)}") |
| import traceback |
| print(traceback.format_exc()) |
| return False |
|
|
| def is_duplicate_image(image_url: str, threshold: int = 0, store: bool = True) -> bool: |
| """Check if an image is a duplicate using URL first, then perceptual hash (pHash). |
| Checks ACCEPTED reports from dataset.jsonl. |
| threshold = maximum Hamming distance allowed to consider images equal. |
| threshold=0 means EXACT hash match only (most strict). |
| Set store=False to check without storing (for validation before acceptance). |
| DEPRECATED: Use is_duplicate_image_from_bytes instead. |
| Note: store parameter is kept for compatibility but doesn't do anything (data is stored via dataset.save_report). |
| """ |
| if not image_url: |
| return False |
| |
| try: |
| |
| try: |
| from urllib.parse import urlparse, urlunparse |
| parsed = urlparse(image_url) |
| normalized_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', '', '')) |
| |
| |
| accepted_reports = _load_accepted_reports() |
| for report in accepted_reports: |
| |
| report_image_url = report.get("image_url") |
| if report_image_url: |
| try: |
| report_parsed = urlparse(report_image_url) |
| report_normalized = urlunparse((report_parsed.scheme, report_parsed.netloc, report_parsed.path, '', '', '')) |
| if report_normalized == normalized_url: |
| print(f"[DEBUG] Duplicate detected: Exact URL match in dataset for {normalized_url}") |
| return True |
| except Exception: |
| continue |
| except Exception as e: |
| print(f"[WARNING] URL normalization failed: {str(e)}") |
| |
| |
| |
| try: |
| resp = requests.get(image_url, timeout=10) |
| resp.raise_for_status() |
| img = Image.open(io.BytesIO(resp.content)).convert('RGB') |
| img_hash = imagehash.phash(img) |
| img_hash_int = int(str(img_hash), 16) |
|
|
| |
| accepted_reports = _load_accepted_reports() |
| for report in accepted_reports: |
| report_hash = report.get("image_hash") |
| if report_hash is not None: |
| try: |
| if isinstance(report_hash, str): |
| report_hash_int = int(report_hash, 16) |
| else: |
| report_hash_int = int(report_hash) |
| |
| if abs(img_hash_int - report_hash_int) == 0: |
| print(f"[DEBUG] Duplicate detected: Exact hash match in dataset") |
| return True |
| except (ValueError, TypeError): |
| continue |
| |
| return False |
| except Exception as e: |
| |
| print(f"[ERROR] Image hash check failed for {image_url}: {str(e)}") |
| return False |
| except Exception as e: |
| print(f"[ERROR] Image duplicate check failed: {str(e)}") |
| return False |
|
|
|
|
| def is_duplicate_image_from_bytes(image_bytes: bytes, threshold: int = 0, store: bool = True) -> bool: |
| """Check if an image is a duplicate using perceptual hash (pHash) from bytes. |
| Works with image bytes directly (no URL required). |
| Checks ACCEPTED reports from dataset.jsonl for image hashes. |
| threshold = maximum Hamming distance allowed to consider images equal. |
| threshold=0 means EXACT hash match only (most strict). |
| Set store=False to check without storing (for validation before acceptance). |
| Note: store parameter is kept for compatibility but doesn't do anything (image hash is stored via dataset.save_report). |
| """ |
| if not image_bytes: |
| return False |
| |
| try: |
| |
| img = Image.open(io.BytesIO(image_bytes)).convert('RGB') |
| img_hash = imagehash.phash(img) |
| img_hash_int = int(str(img_hash), 16) |
|
|
| |
| accepted_reports = _load_accepted_reports() |
| |
| |
| for report in accepted_reports: |
| |
| report_hash = report.get("image_hash") |
| if report_hash is not None: |
| try: |
| |
| if isinstance(report_hash, str): |
| report_hash_int = int(report_hash, 16) |
| else: |
| report_hash_int = int(report_hash) |
| |
| |
| if abs(img_hash_int - report_hash_int) == 0: |
| print(f"[DEBUG] Image duplicate detected: Exact hash match in dataset") |
| return True |
| except (ValueError, TypeError): |
| continue |
| |
| return False |
| except Exception as e: |
| |
| |
| print(f"[ERROR] Image hash check failed: {str(e)}") |
| import traceback |
| print(traceback.format_exc()) |
| return False |
|
|
| def haversine(lat1, lon1, lat2, lon2): |
| """Calculate great-circle distance between two lat/lon points in meters.""" |
| lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2]) |
| delta_lat = lat2 - lat1 |
| delta_lon = lon2 - lon1 |
| a = sin(delta_lat/2)**2 + cos(lat1) * cos(lat2) * sin(delta_lon/2)**2 |
| c = 2 * asin(sqrt(a)) |
| r = 6371000 |
| return c * r |
|
|
| def is_duplicate_location(lat: float, lon: float, description: str, category: str, threshold: float = 10.0, store: bool = True) -> bool: |
| """ |
| Return True if an existing ACCEPTED report with same category exists within threshold meters. |
| Checks dataset.jsonl for accepted reports with same category within threshold (default 10 meters). |
| Set store=False to check without storing (for validation before acceptance). |
| threshold can be a float for more precise control (e.g., 10.0 meters). |
| Note: store parameter is kept for compatibility but doesn't do anything (location is stored via dataset.save_report). |
| """ |
| try: |
| |
| accepted_reports = _load_accepted_reports() |
| |
| category_normalized = category.lower() |
| |
| |
| for report in accepted_reports: |
| report_category = (report.get("category") or "").lower() |
| |
| |
| if report_category == category_normalized: |
| |
| report_lat = report.get("latitude") |
| report_lon = report.get("longitude") |
| |
| if report_lat is not None and report_lon is not None: |
| |
| dist = haversine(lat, lon, float(report_lat), float(report_lon)) |
| |
| if dist <= threshold: |
| print(f"[DEBUG] Location duplicate found in dataset: ({lat}, {lon}) is {dist:.2f}m from ({report_lat}, {report_lon}) for category '{category}'") |
| return True |
| |
| return False |
| except Exception as e: |
| |
| print(f"[ERROR] Location duplicate check failed: {str(e)}") |
| import traceback |
| print(traceback.format_exc()) |
| return False |
|
|