Spaces:

mg643
/

MetDataAnalysis

Sleeping

yash-b18

Update to CLI to prevent crashing on invalid inputs

f78d74e 9 months ago

7.3 kB

	import time
	import requests
	import random
	import pandas as pd

	# Base endpoint of the Met collection API. It ends with a trailing "/", so
	# endpoint paths appended to it should not start with another "/".
	URL = "https://collectionapi.metmuseum.org/public/collection/v1/"

	'''
	Need to add error handling
	'''

	# gets information on a single object
	def get_object(objectID):
	    """Fetch the metadata record for a single Met object.

	    Args:
	        objectID: Identifier of the object to look up.

	    Returns:
	        The decoded JSON response, or None when the request or the JSON
	        decoding fails (the error is printed rather than raised).
	    """
	    try:
	        # URL already ends with "/", so the path is appended without a
	        # leading slash to avoid requesting ".../v1//objects/<id>".
	        # The timeout keeps a stalled connection from blocking forever.
	        response = requests.get(f"{URL}objects/{objectID}", timeout=15)
	        response.raise_for_status()
	        return response.json()
	    except Exception as e:
	        # Best-effort helper: callers treat None as "object unavailable".
	        print(f"Error fetching object {objectID}: {e}")
	        return None


	# gets all the objects that have some sort of image
	def get_objectsWithImages():
	    """Search for every object that has at least one image.

	    Returns:
	        A ``(total, objectIDs)`` tuple taken from the search response.
	        On any failure the error is printed and ``(0, [])`` is returned.
	    """
	    try:
	        # The timeout keeps a stalled connection from blocking forever.
	        response = requests.get(f"{URL}search?hasImages=true&q=*", timeout=15)
	        response.raise_for_status()
	        data = response.json()
	        total = data.get("total", 0)
	        # "or []" also covers an explicit null value, which
	        # data.get("objectIDs", []) would hand back as None.
	        objectIDs = data.get("objectIDs") or []
	        return total, objectIDs
	    except Exception as e:
	        print(f"Error fetching objects with images: {e}")
	        return 0, []

	# gets the urls for random objects with images
	def get_images(totalObjects, objectIDs, limit):
	    """Collect (image URL, title) pairs for randomly chosen objects.

	    Args:
	        totalObjects: Number of leading entries of ``objectIDs`` to draw from.
	        objectIDs: Sequence of candidate object IDs.
	        limit: Maximum number of images to return.

	    Returns:
	        A list of ``(primaryImage, title)`` tuples with at most ``limit``
	        entries. An empty list is returned when ``limit`` is not positive,
	        when there are no candidates, or when an unexpected error occurs
	        (the error is printed).
	    """
	    try:
	        if limit <= 0 or not objectIDs:
	            return []

	        # Never index past the end of objectIDs, even if the reported total
	        # is larger than the list that was actually supplied.
	        pool_size = min(totalObjects, len(objectIDs))
	        if pool_size <= 0:
	            return []

	        # Grab extra candidates in case a primary image is blank, but cap the
	        # sample at the pool size so random.sample cannot raise on small pools.
	        sample_size = min(pool_size, limit + 20)

	        images = []
	        for i in random.sample(range(pool_size), sample_size):
	            obj = get_object(objectIDs[i])
	            if obj and obj.get("primaryImage"):
	                images.append((obj["primaryImage"], obj.get("title", "Untitled")))
	                if len(images) >= limit:
	                    break

	        return images
	    except Exception as e:
	        print(f"Error in get_images: {e}")
	        return []

	def department_counts(q="*", max_ids=200):
	    """Tally how many search results belong to each department.

	    Args:
	        q: Search query (default '*').
	        max_ids: Cap on how many object IDs from the search are inspected.

	    Returns:
	        A list of ``(department, count)`` pairs sorted by count, highest
	        first. Objects without a department are counted under "(unknown)".
	        An empty list is returned when the search itself fails.
	    """
	    try:
	        # Send the lowercase string "true", as the other search calls in this
	        # file do; a Python bool would be serialized as "True".
	        resp = requests.get(
	            f"{URL}search",
	            params={"q": q, "hasImages": "true"},
	            timeout=15,
	        )
	        resp.raise_for_status()
	        ids = (resp.json().get("objectIDs") or [])[:max_ids]
	    except Exception as e:
	        print(f"Error searching for department counts: {e}")
	        return []

	    counts = {}
	    for oid in ids:
	        obj = get_object(oid)
	        # get_object returns None on failure; skip such objects explicitly
	        # instead of relying on a swallowed AttributeError.
	        if not isinstance(obj, dict):
	            continue
	        dep = obj.get("department") or "(unknown)"
	        counts[dep] = counts.get(dep, 0) + 1

	    # Sorted (department, count) pairs, highest count first.
	    return sorted(counts.items(), key=lambda x: x[1], reverse=True)

	"""
	Get a list of departments
	"""
	def list_met_departments():
	    """Return a DataFrame of all Met departments (id + name) to help you choose.

	    Returns:
	        A DataFrame with the columns "departmentId" and "displayName".
	        If the request or the parsing fails, the error is printed and an
	        empty DataFrame with those two columns is returned.
	    """
	    try:
	        # URL already ends with "/", so no leading slash on the path.
	        # The timeout keeps a stalled connection from blocking forever.
	        r = requests.get(f"{URL}departments", timeout=15)
	        r.raise_for_status()
	        depts = r.json().get("departments", [])
	        return pd.DataFrame(depts)[["departmentId", "displayName"]]
	    except Exception as e:
	        print(f"Error fetching departments: {e}")
	        return pd.DataFrame(columns=["departmentId", "displayName"])

	"""
	Returns a DataFrame of objects that have images and match the search term, with their metadata.
	To be polite to the API, at most 5 random results are fetched from each department by default;
	this limit can be changed via a parameter. The departments to search can also be chosen.
	"""
	def search_for_images(query,
	                      max_per_department=5,
	                      departments=None):
	    """Search the Met collection for objects with images matching ``query``.

	    Each department is searched separately, a random sample of the matching
	    object IDs is taken, and the details of each sampled object are fetched.

	    Args:
	        query: Search term; must contain at least one non-blank character.
	        max_per_department: Maximum number of objects sampled per department.
	        departments: Iterable of department IDs to search, or None to look
	            up and search every department.

	    Returns:
	        A DataFrame with one row per object that has a primary image, sorted
	        by department and title. It is empty when nothing was found.

	    Raises:
	        ValueError: If ``query`` is empty or blank.
	        requests.RequestException: If ``departments`` is None and the
	            department list cannot be fetched.
	    """
	    if not query or not query.strip():
	        raise ValueError("Please provide a query.")

	    # 1) Departments
	    if departments is None:
	        # URL already ends with "/", so no leading slash on the path.
	        resp = requests.get(f"{URL}departments", timeout=15)
	        resp.raise_for_status()
	        dept_list = resp.json().get("departments", [])
	        dept_ids = [d["departmentId"] for d in dept_list]
	        dept_id_to_name = {d["departmentId"]: d["displayName"] for d in dept_list}
	    else:
	        dept_ids = list(departments)
	        # Names will be filled from object details; provide a generic fallback
	        dept_id_to_name = {d: f"Department {d}" for d in dept_ids}

	    # 2) Per-department search -> random sample of IDs -> fetch details
	    rows = []

	    print("Searching for", query)
	    print("Across departments:", dept_ids)

	    for dept_id in dept_ids:
	        # Limit the search to objects that have images.
	        params = {
	            "q": query,
	            "hasImages": "true",
	            "departmentId": dept_id,
	        }
	        try:
	            r = requests.get(f"{URL}search", params=params, timeout=15)
	            r.raise_for_status()
	            # Decode inside the try so a malformed body skips this
	            # department instead of aborting the whole search.
	            object_ids = (r.json() or {}).get("objectIDs") or []
	        except (requests.RequestException, ValueError):
	            continue

	        if not object_ids:
	            continue

	        # Clamp at zero so a negative max_per_department yields no samples
	        # rather than making random.sample raise.
	        sample_size = max(0, min(max_per_department, len(object_ids)))
	        sample_ids = random.sample(object_ids, k=sample_size)

	        for oid in sample_ids:
	            try:
	                obj_resp = requests.get(f"{URL}objects/{oid}", timeout=15)
	                obj_resp.raise_for_status()
	                obj = obj_resp.json()
	            except (requests.RequestException, ValueError):
	                continue

	            # Skip if no image. Using .get() keeps a payload without the
	            # "primaryImage" key from raising KeyError.
	            if not isinstance(obj, dict) or not obj.get("primaryImage"):
	                continue

	            rows.append({
	                "objectID": obj.get("objectID"),
	                "title": obj.get("title"),
	                "artistDisplayName": obj.get("artistDisplayName"),
	                "objectDate": obj.get("objectDate"),
	                "culture": obj.get("culture"),
	                "medium": obj.get("medium"),
	                "department": obj.get("department") or dept_id_to_name.get(dept_id),
	                "objectName": obj.get("objectName"),
	                "classification": obj.get("classification"),
	                "primaryImageSmall": obj.get("primaryImageSmall"),
	                "primaryImage": obj.get("primaryImage"),
	                "objectURL": obj.get("objectURL"),
	                "isPublicDomain": obj.get("isPublicDomain"),
	            })

	    df = pd.DataFrame(rows)
	    if not df.empty:
	        df = df.sort_values(["department", "title"]).reset_index(drop=True)
	    return df

	"""
	CLI when run from command line
	Displays the departments to allow user to input a department
	Then searches the department based on input, and displays the results with images.
	"""
	def main():
	    """Run the interactive command-line search loop.

	    Prints the department table, then repeatedly asks for a department ID
	    (blank for all departments, 'q' or 'quit' to exit) and a search term,
	    and prints the matching objects. The loop also ends when standard input
	    is closed.
	    """
	    print("Welcome to Met Search.")
	    departments = list_met_departments()
	    print(departments.to_string(index=False))
	    random.seed(time.time())
	    while True:
	        try:
	            dept_no = input("Choose a departmentId #: (Type 'q', 'quit' to stop the program, or enter for all) ").strip()
	        except EOFError:
	            # Input stream closed (e.g. piped input ran out): exit quietly.
	            break
	        if dept_no.lower() in ['q', 'quit']:
	            break
	        elif dept_no == '':
	            dept = None
	        else:
	            # isdecimal() matches exactly what int() can parse; isdigit()
	            # also accepts characters such as superscript digits, for which
	            # int() raises ValueError.
	            if not dept_no.isdecimal():
	                print("Invalid input. Please try again.")
	                continue
	            dept = [int(dept_no)]

	        try:
	            query = input("Search the Met for: ").strip()
	        except EOFError:
	            break
	        if query == '':
	            print("Please enter a query.")
	            continue
	        try:
	            results = search_for_images(query, 2, departments=dept)
	        except requests.RequestException as e:
	            # A network failure should not end the session; let the user retry.
	            print(f"Search failed: {e}")
	            continue
	        if results.empty:
	            print("No results found.")
	            continue
	        print(results.to_string(index=False))

	# Start the interactive CLI only when this file is run as a script,
	# not when it is imported as a module.
	if __name__ == '__main__':
	main()