Spaces:

tokgae
/

cora

Sleeping

App Files Files Community

cora / loaders /smithsonian_loader.py

tokgae

Upload folder using huggingface_hub

38ab39c verified 2 months ago

raw

history blame contribute delete

3.68 kB

	import os
	import requests
	import uuid
	from PIL import Image
	from io import BytesIO
	from dotenv import load_dotenv
	from cora_vision import CoraVision
	from cora_memory import CoraMemory

	# Merge variables from a local .env file into the process environment, then
	# read the Smithsonian API key. The key may be absent: SI_API_KEY is None in
	# that case and the loader only prints a warning before calling the API.
	load_dotenv()
	SI_API_KEY = os.environ.get("SI_API_KEY")

	class SmithsonianLoader:
	    """Search the Smithsonian Open Access API and index image results.

	    For every search hit that carries an online image, the loader downloads
	    the image, stores it under ``ARCHIVE_DIR``, asks ``CoraVision`` for an
	    embedding and tags, and records the result in ``CoraMemory``.

	    All failures are reported with ``print`` rather than raised, so one bad
	    artifact never aborts the rest of the batch.
	    """

	    # Directory (relative to the current working directory) for saved images.
	    ARCHIVE_DIR = "archive_images"

	    # Seconds allowed for each HTTP call. Without a timeout, requests waits
	    # indefinitely on a stalled server and the loader would hang.
	    REQUEST_TIMEOUT = 30

	    # Image modes Pillow's JPEG writer is expected to accept; any other mode
	    # (RGBA, P, LA, ...) is converted to RGB before saving.
	    # NOTE(review): list taken from Pillow's JPEG plugin - confirm against
	    # the Pillow version in use.
	    _JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

	    def __init__(self):
	        """Create the vision and memory backends and set the search endpoint."""
	        self.vision = CoraVision()
	        self.memory = CoraMemory()
	        self.base_url = "https://api.si.edu/openaccess/api/v1.0/search"

	    def search_and_index(self, query, limit=5):
	        """Search the Smithsonian API and index the results into CoraMemory.

	        Args:
	            query: Free-text search string, sent as the ``q`` parameter.
	            limit: Maximum number of rows requested from the API (``rows``).

	        Returns:
	            None. Progress and errors are printed; nothing is raised for
	            HTTP, decoding or indexing failures.
	        """
	        print(f"🏛️ Searching Smithsonian for: '{query}'...")

	        if not SI_API_KEY:
	            print("⚠️ Warning: SI_API_KEY not found in .env. API calls might fail if key is required.")

	        params = {
	            "q": query,
	            "rows": limit,
	            "api_key": SI_API_KEY,
	        }

	        try:
	            response = requests.get(
	                self.base_url, params=params, timeout=self.REQUEST_TIMEOUT
	            )
	            if response.status_code != 200:
	                print(f"❌ API Error: {response.text}")
	                return

	            # `or` fallbacks tolerate explicit JSON nulls, which the default
	            # argument of dict.get() does not cover.
	            payload = response.json() or {}
	            rows = (payload.get('response') or {}).get('rows') or []

	            print(f"Found {len(rows)} artifacts. Processing...")

	            for item in rows:
	                # Isolate each artifact so one failure does not stop the batch.
	                try:
	                    self._index_item(item)
	                except Exception as e:
	                    print(f"⚠️ Failed to process item: {e}")

	        except Exception as e:
	            # Deliberate top-level boundary: the loader is best-effort.
	            print(f"Critical Loader Error: {e}")

	    @staticmethod
	    def _first_image_url(item):
	        """Return the URL of the first media entry of a search row, or None.

	        Walks ``content -> descriptiveNonRepeating -> online_media -> media``
	        and returns ``media[0]['content']``. Any missing, null or wrongly
	        typed level yields None instead of raising.
	        """
	        node = item
	        for key in ("content", "descriptiveNonRepeating", "online_media", "media"):
	            if not isinstance(node, dict):
	                return None
	            node = node.get(key)

	        if not isinstance(node, list) or not node:
	            return None

	        first = node[0]
	        if not isinstance(first, dict):
	            return None
	        return first.get("content") or None

	    def _download_image(self, url):
	        """Fetch ``url`` and return it as a PIL image.

	        Raises on a non-success HTTP status so that an error page is reported
	        as an HTTP failure rather than as an undecodable image.
	        """
	        resp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
	        resp.raise_for_status()
	        return Image.open(BytesIO(resp.content))

	    def _index_item(self, item):
	        """Download, save, embed, tag and index a single search row.

	        Rows without a usable image URL are skipped silently. Exceptions
	        propagate to the caller, which reports them per item.
	        """
	        image_url = self._first_image_url(item)
	        if not image_url:
	            return

	        title = item.get('title') or 'Unknown Artifact'
	        print(f"📥 Downloading: {title}...")

	        img = self._download_image(image_url)

	        # The file is always written as .jpg; modes JPEG cannot store (for
	        # example RGBA with an alpha channel) would make save() fail.
	        if img.mode not in self._JPEG_MODES:
	            img = img.convert("RGB")

	        # exist_ok avoids the check-then-create race of a separate exists().
	        os.makedirs(self.ARCHIVE_DIR, exist_ok=True)
	        local_path = os.path.join(self.ARCHIVE_DIR, f"si_{uuid.uuid4()}.jpg")
	        img.save(local_path)

	        # Embed & tag the same pixels that were written to disk.
	        emb = self.vision.embed_image(img)

	        # Copy before appending so a list owned by CoraVision is never
	        # mutated, and a None result does not crash the append.
	        tags = list(self.vision.detect_tags(img) or [])
	        tags.append("smithsonian_open_access")

	        self.memory.save(local_path, emb, title, tags)
	        print(f"✅ Indexed: {title}")

	if __name__ == "__main__":
	    # Manual smoke test: run one small search end to end when the module is
	    # executed directly (performs real network calls and writes images).
	    loader = SmithsonianLoader()
	    loader.search_and_index(query="Roman Armor", limit=3)