"""Search the Smithsonian Open Access API and index image results into CoraMemory."""

import os
import uuid
from io import BytesIO

import requests
from dotenv import load_dotenv
from PIL import Image

from cora_vision import CoraVision
from cora_memory import CoraMemory

# Load environment variables from .env. SI_API_KEY may be required by the
# Smithsonian endpoint; a warning is printed at search time when it is missing.
load_dotenv()
SI_API_KEY = os.getenv("SI_API_KEY")

# Directory (relative to the current working directory) where images are stored.
ARCHIVE_DIR = "archive_images"

# Seconds to wait on any single HTTP request before giving up; without a
# timeout, requests may block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Image modes that can be written directly as JPEG; anything else (RGBA, P,
# LA, ...) is converted to RGB before saving.
_JPEG_MODES = ("L", "RGB", "CMYK")


class SmithsonianLoader:
    """Downloads Smithsonian search results and indexes them via CoraVision/CoraMemory."""

    def __init__(self):
        self.vision = CoraVision()
        self.memory = CoraMemory()
        self.base_url = "https://api.si.edu/openaccess/api/v1.0/search"

    def search_and_index(self, query: str, limit: int = 5) -> None:
        """Search the Smithsonian API and index the results into CoraMemory.

        Each result that exposes an image is downloaded, saved under
        ``ARCHIVE_DIR``, embedded and tagged, then stored in memory. Failures
        are reported with ``print`` rather than raised: a failing item is
        skipped, and a failing search aborts the run.

        Args:
            query: Free-text search string sent as the ``q`` parameter.
            limit: Maximum number of rows requested from the API.
        """
        print(f"🏛️ Searching Smithsonian for: '{query}'...")

        if not SI_API_KEY:
            print("⚠️ Warning: SI_API_KEY not found in .env. API calls might fail if key is required.")

        params = {
            "q": query,
            "rows": limit,
            "api_key": SI_API_KEY,
        }

        try:
            response = requests.get(
                self.base_url, params=params, timeout=REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                print(f"❌ API Error: {response.text}")
                return

            data = response.json()
            rows = data.get('response', {}).get('rows', [])
            print(f"Found {len(rows)} artifacts. Processing...")

            for item in rows:
                # Best effort: one bad artifact must not stop the whole batch.
                try:
                    self._process_item(item)
                except Exception as e:
                    print(f"⚠️ Failed to process item: {e}")
        except Exception as e:
            print(f"Critical Loader Error: {e}")

    def _process_item(self, item) -> None:
        """Download, save, embed, tag and index a single search result row.

        Rows without any online media, or whose first media entry has no
        ``content`` URL, are skipped silently. Any error propagates to the
        caller, which reports it and moves on to the next row.
        """
        title = item.get('title', 'Unknown Artifact')
        content = item.get('content', {})

        media = (
            content.get('descriptiveNonRepeating', {})
            .get('online_media', {})
            .get('media', [])
        )
        if not media:
            return

        # Only the first media entry is used (usually a thumbnail or screen image).
        image_url = media[0].get('content')
        if not image_url:
            return

        print(f"📥 Downloading: {title}...")

        img_resp = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        # Fail with a clear HTTP error instead of handing an error page to PIL.
        img_resp.raise_for_status()
        img = Image.open(BytesIO(img_resp.content))

        # Embed and tag before touching the disk so that a vision failure does
        # not leave an unindexed file behind in the archive directory.
        emb = self.vision.embed_image(img)
        tags = self.vision.detect_tags(img)
        tags.append("smithsonian_open_access")

        os.makedirs(ARCHIVE_DIR, exist_ok=True)
        filename = f"si_{uuid.uuid4()}.jpg"
        local_path = os.path.join(ARCHIVE_DIR, filename)

        # The file is always written as JPEG, which cannot store alpha or
        # palette modes; convert only the copy that goes to disk.
        saveable = img if img.mode in _JPEG_MODES else img.convert("RGB")
        saveable.save(local_path)

        self.memory.save(local_path, emb, title, tags)
        print(f"✅ Indexed: {title}")


if __name__ == "__main__":
    loader = SmithsonianLoader()
    # Test run
    loader.search_and_index("Roman Armor", limit=3)