# handyhome-web-scripts / detect_duplicate.py
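# Usage sketch (mirrors the argument check at the bottom of this script; the
# example.com URLs are placeholders, not real endpoints):
#   python detect_duplicate.py <target_image_url> <reference_url1> [reference_url2] ...
#   e.g. python detect_duplicate.py https://example.com/target.jpg https://example.com/ref1.jpg https://example.com/ref2.jpg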
import json
import os
import sys
import tempfile
from io import BytesIO

import requests
from PIL import Image
from imagededup.methods import PHash
def download_image(url, output_path='temp_image.jpg'):
    """Download an image from a URL, normalise it to RGB, and save it as JPEG."""
    print(f"DUPLICATE DOWNLOAD Downloading image from {url}", file=sys.stderr)
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    image_data = response.content
    print(f"DUPLICATE DOWNLOAD Image data size: {len(image_data)} bytes", file=sys.stderr)
    print(f"DUPLICATE DOWNLOAD Saving to {output_path}", file=sys.stderr)

    # Process the image
    image = Image.open(BytesIO(image_data))

    # Convert to RGB if necessary
    if image.mode == 'RGBA':
        # Flatten the alpha channel onto a white background
        background = Image.new('RGB', image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[-1])
        image = background
        print("DUPLICATE DOWNLOAD Converted RGBA to RGB", file=sys.stderr)
    elif image.mode != 'RGB':
        original_mode = image.mode
        image = image.convert('RGB')
        print(f"DUPLICATE DOWNLOAD Converted {original_mode} to RGB", file=sys.stderr)

    # Save as JPG
    image.save(output_path, 'JPEG', quality=95)
    print(f"DUPLICATE DOWNLOAD Saved image to {output_path}", file=sys.stderr)
    return output_path
def download_reference_images(urls, temp_dir):
    """Download all reference images to a temporary directory."""
    downloaded_paths = []
    for i, url in enumerate(urls):
        try:
            output_path = os.path.join(temp_dir, f"ref_image_{i}.jpg")
            download_image(url, output_path)
            downloaded_paths.append(output_path)
            print(f"[DUPLICATE] Downloaded reference image {i+1}/{len(urls)}", file=sys.stderr)
        except Exception as e:
            print(f"[DUPLICATE] Failed to download reference image {i+1}: {str(e)}", file=sys.stderr)
            continue
    return downloaded_paths
def detect_duplicates(target_image_path, reference_image_paths, reference_urls, similarity_threshold=0.85):
    """Detect duplicates using perceptual hashing."""
    print("DUPLICATE ANALYSIS Starting duplicate detection...", file=sys.stderr)
    try:
        phasher = PHash()

        # Encode the target image
        print("DUPLICATE ANALYSIS Encoding target image...", file=sys.stderr)
        target_encoding = phasher.encode_image(target_image_path)
        print(f"DUPLICATE ANALYSIS Target image hash: {target_encoding}", file=sys.stderr)

        # Encode all reference images
        print("DUPLICATE ANALYSIS Encoding reference images...", file=sys.stderr)
        reference_encodings = {}
        for i, ref_path in enumerate(reference_image_paths):
            try:
                encoding = phasher.encode_image(ref_path)
                reference_encodings[ref_path] = encoding
                print(f"DUPLICATE ANALYSIS Reference image {i+1} hash: {encoding}", file=sys.stderr)
            except Exception as e:
                print(f"DUPLICATE ANALYSIS Failed to encode reference image {i+1}: {str(e)}", file=sys.stderr)
                continue

        if not reference_encodings:
            return {
                "duplicates_found": False,
                "message": "No reference images could be processed",
                "similar_images": [],
                "highest_similarity": 0.0
            }

        # Create a mapping of paths to URLs
        path_to_url = dict(zip(reference_image_paths, reference_urls))

        # Calculate similarities manually
        print("DUPLICATE ANALYSIS Comparing images...", file=sys.stderr)
        similar_images = []
        highest_similarity = 0.0
        for ref_path, ref_encoding in reference_encodings.items():
            try:
                # Calculate Hamming distance between hashes
                hamming_distance = phasher.hamming_distance(target_encoding, ref_encoding)
                # Convert to similarity score (lower distance = higher similarity)
                # PHash uses 64-bit hashes, so the maximum distance is 64
                similarity = 1 - (hamming_distance / 64.0)
                print(f"DUPLICATE ANALYSIS Comparing with {ref_path}: Hamming distance = {hamming_distance}, Similarity = {similarity:.4f}", file=sys.stderr)
                if similarity >= similarity_threshold:
                    similar_images.append({
                        "reference_path": path_to_url[ref_path],  # Use URL instead of path
                        "similarity_score": round(float(similarity), 4),
                        "hamming_distance": int(hamming_distance)
                    })
                if similarity > highest_similarity:
                    highest_similarity = similarity
            except Exception as e:
                print(f"DUPLICATE ANALYSIS Error comparing with reference image: {str(e)}", file=sys.stderr)
                continue

        # Sort by similarity (highest first)
        similar_images.sort(key=lambda x: x["similarity_score"], reverse=True)
        print(f"DUPLICATE ANALYSIS Analysis complete. Found {len(similar_images)} similar images.", file=sys.stderr)
        return {
            "duplicates_found": len(similar_images) > 0,
            "message": f"Found {len(similar_images)} similar images" if similar_images else "No duplicates found",
            "similar_images": similar_images,
            "highest_similarity": round(float(highest_similarity), 4),
            "threshold_used": similarity_threshold
        }
    except Exception as e:
        print(f"DUPLICATE ANALYSIS Error in duplicate detection: {str(e)}", file=sys.stderr)
        return {
            "duplicates_found": False,
            "message": f"Error during duplicate detection: {str(e)}",
            "similar_images": [],
            "highest_similarity": 0.0
        }
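# For reference: with 64-bit hashes, the default threshold of 0.85 corresponds to a
# Hamming distance of at most 9 bits (1 - 9/64 ≈ 0.859, whereas 1 - 10/64 ≈ 0.844).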
# Main execution
if len(sys.argv) < 3:
    print(json.dumps({"error": "Usage: python detect_duplicate.py <target_image_url> <reference_url1> [reference_url2] ..."}))
    sys.exit(1)

target_image_url = sys.argv[1]
reference_urls = sys.argv[2:]

try:
    # Create temporary directory for processing
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"[DUPLICATE] Using temporary directory: {temp_dir}", file=sys.stderr)

        # Download target image
        target_image_path = os.path.join(temp_dir, "target_image.jpg")
        download_image(target_image_url, target_image_path)

        # Download reference images
        reference_image_paths = download_reference_images(reference_urls, temp_dir)
        if not reference_image_paths:
            print(json.dumps({
                "error": "No reference images could be downloaded"
            }))
            sys.exit(1)

        # Detect duplicates
        results = detect_duplicates(target_image_path, reference_image_paths, reference_urls)

        # Clean up
        if os.path.exists(target_image_path):
            os.remove(target_image_path)
        for ref_path in reference_image_paths:
            if os.path.exists(ref_path):
                os.remove(ref_path)

        print(json.dumps({
            "success": True,
            "result": results
        }))
except Exception as e:
    print(json.dumps({"error": str(e)}))
    sys.exit(1)
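# Expected stdout on success (values are illustrative; the shape follows the dicts built above):
#   {"success": true, "result": {"duplicates_found": true, "message": "Found 1 similar images",
#    "similar_images": [{"reference_path": "<reference_url>", "similarity_score": 0.9219,
#    "hamming_distance": 5}], "highest_similarity": 0.9219, "threshold_used": 0.85}}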