Spaces:
Runtime error
Runtime error
Delete germanShepherdCatalog.py
Browse files- germanShepherdCatalog.py +0 -258
germanShepherdCatalog.py
DELETED
|
@@ -1,258 +0,0 @@
|
|
| 1 |
-
import math
import os
import sys
import urllib
import urllib.request

import numpy as np
import streamlit as st

from utils import *
|
| 9 |
-
|
| 10 |
-
class LoadCatalogue:
    """Locate catalogue data files, downloading and caching them locally
    when the app is not running on the development machine.
    """

    def __init__(self, germanShepherdData='data/'):
        """Set up the data source location.

        Parameters
        ----------
        germanShepherdData : str
            Directory containing the data files when running locally.
        """
        # Detect the development box by hostname...
        self.germanShepherdRunningData = os.popen('hostname').read().startswith('Abhinit-Sundar-HP-Desktop')
        # ...but force remote mode for now, so deployments always download.
        self.germanShepherdRunningData = False
        if self.germanShepherdRunningData:
            self.germanShepherdData = germanShepherdData
        else:
            self.germanShepherdData = 'https://www.pedigreedatabase.com/german_shepherd_dog/search.html'
            self.germanShepherdDataLocal = 'data/'

    def germanShepherdLocalURL(self, germanShepherdFileInput, check_fullsize=False, fullsize=None):
        """Return a local path for *germanShepherdFileInput*, downloading it
        first if it is missing or (optionally) not the expected size.

        Parameters
        ----------
        germanShepherdFileInput : str
            Remote URL (or local path when running locally) of the file.
        check_fullsize : bool
            If True, re-download when the cached file size differs from *fullsize*.
        fullsize : int or None
            Expected file size in bytes, compared against the cached copy.
        """
        if self.germanShepherdRunningData:
            # Running locally: the input already points at a usable file.
            return germanShepherdFileInput

        germanShepherdLocalFilepath = os.path.join(self.germanShepherdDataLocal, os.path.basename(germanShepherdFileInput))
        if not os.path.exists(germanShepherdLocalFilepath) or check_fullsize:

            germanShepherdFilesizeLocal = 0

            if os.path.exists(germanShepherdLocalFilepath):
                germanShepherdFileInfo = os.stat(germanShepherdLocalFilepath)
                germanShepherdFilesizeLocal = germanShepherdFileInfo.st_size

            # BUG FIX: this branch referenced undefined names (filesize_local,
            # filepath_local, file_in) left over from an incomplete rename, so
            # every actual download attempt raised NameError.
            if fullsize != germanShepherdFilesizeLocal:
                lab = "Downloading {:s}... ".format(germanShepherdLocalFilepath)
                if os.path.basename(germanShepherdFileInput) == 'representations.npy':
                    lab += 'This one may take a while (up to ~5 mins), please stand by!\nOnce downloaded subsequent runs will be fast'

                with st.spinner(lab):
                    urllib.request.urlretrieve(germanShepherdFileInput, germanShepherdLocalFilepath)

        return germanShepherdLocalFilepath

    # cache works on local version, but not when deployed to share.streamlit.io
    # due to small memory allowance. Do not use for now
    # @st.cache(persist=True, max_entries=1, allow_output_mutation=True, ttl=3600, hash_funcs={dict: lambda _: None})#
    def download_catalogue_files(self, include_extra_features=True, file_ext='.npy',
                                 extra_features=None):
        """Fetch (or locate) the catalogue files and return a dictionary
        holding the total galaxy count under 'ngals_tot'.

        Parameters
        ----------
        include_extra_features : bool
            Whether to also download additional catalogue info beyond sky location
        file_ext : str
            File extension of the catalogue arrays
        extra_features : list of strings or None
            The extra features to fetch; defaults to
            ['mag', 'photometric_redshift', 'source_type']
        """
        # BUG FIX: the original def line ended with ')' instead of '):'
        # (SyntaxError), and the body referenced undefined self.data_loc /
        # self.get_local_or_url from a partial rename. Also avoid a mutable
        # default argument by using None as the sentinel.
        if extra_features is None:
            extra_features = ['mag', 'photometric_redshift', 'source_type']
        self.extra_features = extra_features
        full_catalogue = {}

        # always load in ra and dec
        self.features_radec = ['ra', 'dec']
        for fstr in self.features_radec:
            self.germanShepherdLocalURL(os.path.join(self.germanShepherdData, fstr+file_ext))
            if fstr == 'ra':
                # mmap to read only the shape without loading the full array
                full_catalogue['ngals_tot'] = np.load(self.germanShepherdLocalURL(os.path.join(self.germanShepherdData, fstr+file_ext)), mmap_mode='r').shape[0]

        # add in additional catalogue features of interest
        if include_extra_features:
            # download files to use later, as don't actually need this info yet
            for fstr in self.extra_features:
                self.germanShepherdLocalURL(os.path.join(self.germanShepherdData, fstr+file_ext))

        return full_catalogue

    def load_catalogue_coordinates(self, include_extra_features=True,
                                   extra_features=None):
        """
        Return dictionary containing galaxy catalogue information.

        Parameters
        ----------
        include_extra_features : bool
            Whether or not to include additional catalogue info beyond sky location
        extra_features : list of strings or None
            The extra features to include; defaults to
            ['mag', 'photometric_redshift', 'source_type']
        """
        # if not self.running_local:
        #     st.write('Needs to retreive a few large files the first time you run it - please stand by!')

        # BUG FIX: mutable default argument replaced by None sentinel.
        if extra_features is None:
            extra_features = ['mag', 'photometric_redshift', 'source_type']
        self.extra_features = extra_features
        full_catalogue = {}
        file_type = '.npy'

        # always load in ra and dec
        self.features_radec = ['ra', 'dec']
        for fstr in self.features_radec:
            # BUG FIX: original called undefined self.get_local_or_url with
            # undefined self.data_loc; use the renamed method/attribute.
            full_catalogue[fstr] = np.load(self.germanShepherdLocalURL(os.path.join(self.germanShepherdData, fstr+file_type)))

        # add in additional catalogue features of interest
        if include_extra_features:
            # download files to use later, as don't actually need this info yet
            for fstr in self.extra_features:
                self.germanShepherdLocalURL(os.path.join(self.germanShepherdData, fstr+file_type))

        return full_catalogue

    # cache works on local version, but not when deployed to share.streamlit.io
    # due to small memory allowance. Do not use for now
    # @st.cache(persist=True, max_entries=1, allow_output_mutation=True, ttl=3600, hash_funcs={dict: lambda _: None})# #(suppress_st_warning=True)
    def load_representations(self):
        """Return array containing galaxy image representations."""
        # Keep separate from loading in catalogues, as when representation file
        # starts to get large will need to add in chunked access.
        # BUG FIX: original called undefined self.get_local_or_url / self.data_loc.
        representations = np.load(self.germanShepherdLocalURL(os.path.join(self.germanShepherdData, 'representations.npy'), check_fullsize=True, fullsize=224000128))

        return representations
| 114 |
-
class Catalogue:
    """Contains a variety of operations to perform on galaxy catalogue/representation pairs"""

    def __init__(self, full_catalogue, representations=None,
                 data_loc='data/', file_ext='.npy'):
        """Store catalogue data and configuration.

        Parameters
        ----------
        full_catalogue : dict
            Catalogue dictionary (must contain 'ngals_tot' for searches)
        representations : ndarray or None
            Galaxy image representations, if already loaded
        data_loc : str
            Directory containing the per-feature .npy arrays
        file_ext : str
            File extension of the catalogue arrays
        """
        self.full_catalogue = full_catalogue
        self.representations = representations

        self.data_loc = data_loc
        # BUG FIX: the file_ext parameter was previously ignored ('.npy' was
        # hardcoded); honour it (default unchanged, so callers are unaffected).
        self.file_ext = file_ext
        self.pixel_size = 0.262 / 3600  # arcsec to degrees

    def load_from_catalogue_indices(self, inds_load=None, include_extra_features=True,
                                    extra_features=None):
        """
        Return dictionary containing galaxy catalogue information for desired indices.

        Parameters
        ----------
        inds_load : array of ints or None
            Indices to load from disk; defaults to self.similar_inds
        include_extra_features : bool
            Whether or not to include additional catalogue info beyond sky location
        extra_features : list of strings or None
            The extra features to include; defaults to
            ['mag', 'photometric_redshift', 'source_type']
        """
        # BUG FIX: mutable default argument replaced by None sentinel.
        if extra_features is None:
            extra_features = ['mag', 'photometric_redshift', 'source_type']

        # map catalogue source-type codes to their string labels
        source_type_dict = {0: 'DEV',
                            1: 'EXP',
                            2: 'REX',
                            3: 'SER'}

        if inds_load is None:
            inds_load = self.similar_inds

        self.extra_features = extra_features
        file_type = '.npy'

        similarity_catalogue = {}

        # always load in ra and dec
        self.features_radec = ['ra', 'dec']
        for fstr in self.features_radec:
            # mmap so only the requested rows are read from disk
            similarity_catalogue[fstr] = np.load(os.path.join(self.data_loc, fstr+file_type), mmap_mode='r')[inds_load]

        # option to add in additional catalogue features of interest
        if include_extra_features:
            for fstr in self.extra_features:
                similarity_catalogue[fstr] = np.load(os.path.join(self.data_loc, fstr+file_type), mmap_mode='r')[inds_load]

                if fstr == 'source_type':  # map from bytes to string
                    similarity_catalogue[fstr] = np.array([source_type_dict[i] for i in similarity_catalogue[fstr]])

        return similarity_catalogue

    def search_catalogue(self, ra, dec, nnearest=1, far_distance_npix=10):
        """Return array index and ra dec of nearest galaxy to search point (query_ra, query_dec).

        Sets self.query_ind / self.query_ra / self.query_dec / self.query_distance.

        Parameters
        ----------
        ra, dec : float
            Search coordinates in degrees
        nnearest : int
            Unused here; kept for interface compatibility
        far_distance_npix : int
            Warn the user if the closest match is further than this many pixels away
        """
        self.search_ra = ra
        self.search_dec = dec

        # calculate angular separation of all objects from query point
        # perform in chunks, as streamlit app does not have large memory
        # and concurrent users will crash the ram allocation
        min_sep = 1e9
        chunksize = 1000000
        nchunks = math.ceil(self.full_catalogue['ngals_tot']/chunksize)
        for ichunk in range(nchunks):
            istart = ichunk*chunksize
            iend = (ichunk+1)*chunksize

            rai = np.load(os.path.join(self.data_loc, 'ra'+self.file_ext), mmap_mode='r')[istart:iend]
            deci = np.load(os.path.join(self.data_loc, 'dec'+self.file_ext), mmap_mode='r')[istart:iend]

            sep = angular_separation(self.search_ra, self.search_dec, rai, deci)

            min_sep_i = np.min(sep)
            if min_sep_i < min_sep:
                query_ind = np.argmin(sep)
                self.query_distance = sep[query_ind]
                self.query_ra = rai[query_ind]
                self.query_dec = deci[query_ind]
                # offset by the chunk start to get the global catalogue index
                self.query_ind = query_ind + istart
                min_sep = min_sep_i

        if self.query_distance > far_distance_npix*self.pixel_size:
            # notify of bad query
            st.write(('\nClosest galaxy in catalogue is quite far away from search point ({:.3f} degrees). Either this galaxy is not yet in our database, or is not in the DECaLS DR9 footprint. Using galaxy at (RA, Dec)=({:.4f}, {:.4f}) instead\n'.format(self.query_distance, self.query_ra, self.query_dec)))

        # release the last chunk's separation array to keep peak memory down
        del sep

    def similarity_search(self, nnearest=5, min_angular_separation=96,
                          similarity_inv=False, model_version='v1'):
        """
        Return indices and similarity scores to nearest nnearest data samples.
        First index returned is the query galaxy.

        Parameters
        ----------
        nnearest: int
            Number of most similar galaxies to return
        min_angular_separation: int
            Minimum angular seperation of galaxies in pixelsize. Anything below is thrown out
        similarity_inv: bool
            If True returns most similar, if False returns least similar
        model_version: str
            Which precalculated similarity model to query
        """
        nnearest_intermediate = int(nnearest*1.5)  # some may be thrown out due to angular seperation constraints, so oversample
        # Calculate similarity on the fly
        # self.similar_inds, self.similarity_score = calculate_similarity(self.representations, self.query_ind, nnearest=nnearest_intermediate, similarity_inv=similarity_inv)

        # Use precalculated values
        self.similar_inds, self.similarity_score = retrieve_similarity(self.query_ind,
                                                                       model_version=model_version)

        if similarity_inv:
            # append query to start of list, as it will no longer be most similar to itself
            self.similar_inds = np.insert(self.similar_inds, 0, self.query_ind)
            self.similarity_score = np.insert(self.similarity_score, 0, 1.)

        # now remove galaxies that are suspiciously close to each other on the sky
        # which happens when individual galaxies in a cluster are included as separate sources in the catalogue
        similarity_dict = self.load_from_catalogue_indices(include_extra_features=False,
                                                           inds_load=self.similar_inds)

        # all vs all calculation
        sep = angular_separation(similarity_dict['ra'][np.newaxis, ...],
                                 similarity_dict['dec'][np.newaxis, ...],
                                 similarity_dict['ra'][..., np.newaxis],
                                 similarity_dict['dec'][..., np.newaxis])

        # compile indices of galaxies too close in angular coordinates
        germanShepherdDeleteIndices = set()
        for y in range(sep.shape[0]):
            # BUG FIX: original added undefined `i` here (NameError); the
            # offset must be (y+1) so indices from the sliced row map back to
            # positions in the full array.
            inds_cuti = set(np.where(sep[y, (y+1):] < min_angular_separation*self.pixel_size)[0] + (y+1))
            germanShepherdDeleteIndices = germanShepherdDeleteIndices | inds_cuti

        germanShepherdDeleteIndices = sorted(germanShepherdDeleteIndices)
        self.similar_inds = np.delete(self.similar_inds, germanShepherdDeleteIndices)
        self.similarity_score = np.delete(self.similarity_score, germanShepherdDeleteIndices)

        # BUG FIX: original truncated with undefined `germanShepherdNumNearest`;
        # the requested count is the `nnearest` parameter.
        self.similar_inds = self.similar_inds[:nnearest]
        self.similarity_score = self.similarity_score[:nnearest]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|