# cora/loaders/smithsonian_loader.py
# Uploaded by tokgae: "Upload folder using huggingface_hub"
# Commit 38ab39c (verified)
import os
import requests
import uuid
from PIL import Image
from io import BytesIO
from dotenv import load_dotenv
from cora_vision import CoraVision
from cora_memory import CoraMemory
# Pull settings from a local .env file into the process environment.
# SI_API_KEY is the key sent to api.si.edu; it is None when the variable is
# not set, and the loader only warns about that rather than refusing to run.
load_dotenv()
SI_API_KEY = os.environ.get("SI_API_KEY")
class SmithsonianLoader:
    """Search the Smithsonian Open Access API and index the images it returns.

    For every search hit that carries online media, the first media entry is
    downloaded, written to ``ARCHIVE_DIR``, embedded and tagged through
    ``CoraVision``, and stored through ``CoraMemory``.
    """

    # Directory (relative to the current working directory) for saved images.
    ARCHIVE_DIR = "archive_images"
    # Seconds allowed per HTTP call; requests waits forever when none is given.
    REQUEST_TIMEOUT = 30
    # Tag added to every indexed item to record where it came from.
    SOURCE_TAG = "smithsonian_open_access"
    # Image modes saved as-is; any other mode is converted to RGB before the
    # JPEG write, since modes such as RGBA or P make the JPEG encoder raise.
    _JPEG_MODES = ("RGB", "L", "CMYK")

    def __init__(self):
        self.vision = CoraVision()
        self.memory = CoraMemory()
        self.base_url = "https://api.si.edu/openaccess/api/v1.0/search"

    def search_and_index(self, query, limit=5):
        """Search the Smithsonian API and index the results into CoraMemory.

        Args:
            query: Free-text search string sent as the ``q`` parameter.
            limit: Maximum number of rows requested from the API.

        Returns:
            None. Progress and failures are reported with ``print``; a failure
            on one item does not stop the remaining items.
        """
        # Nothing useful can come back for a blank query or a non-positive
        # row count, so skip the network round trip entirely.
        if not str(query or "").strip() or limit <= 0:
            print("⚠️ Nothing to search: query is empty or limit is not positive.")
            return

        print(f"🏛️ Searching Smithsonian for: '{query}'...")
        if not SI_API_KEY:
            print("⚠️ Warning: SI_API_KEY not found in .env. API calls might fail if key is required.")

        rows = self._fetch_rows(query, limit)
        if rows is None:
            return

        print(f"Found {len(rows)} artifacts. Processing...")
        for item in rows:
            # Best effort per item: one bad record must not abort the batch.
            try:
                self._index_item(item)
            except Exception as e:
                print(f"⚠️ Failed to process item: {e}")

    def _fetch_rows(self, query, limit):
        """Return the list of result rows for *query*, or None if the call failed."""
        params = {
            "q": query,
            "rows": limit,
            "api_key": SI_API_KEY,
        }
        try:
            response = requests.get(
                self.base_url, params=params, timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                print(f"❌ API Error: {response.text}")
                return None
            data = response.json()
        except Exception as e:
            # Top-level boundary: network errors and malformed JSON end the run
            # with a message instead of a traceback.
            print(f"Critical Loader Error: {e}")
            return None
        # ``or`` guards against keys that are present but explicitly null.
        return (data.get('response') or {}).get('rows') or []

    @staticmethod
    def _extract_image_url(item):
        """Return the URL of the first media entry of *item*, or None if absent."""
        content = item.get('content') or {}
        descriptive = content.get('descriptiveNonRepeating') or {}
        online_media = descriptive.get('online_media') or {}
        media = online_media.get('media') or []
        if not media:
            return None
        # First entry only (usually a thumbnail or screen image).
        return media[0].get('content') or None

    def _index_item(self, item):
        """Download, save, embed, tag and store one search result.

        Returns True when the item was indexed and False when it has no usable
        image URL. Any download, decode, save or indexing error propagates to
        the caller.
        """
        title = item.get('title', 'Unknown Artifact')
        image_url = self._extract_image_url(item)
        if not image_url:
            return False

        print(f"📥 Downloading: {title}...")
        img_resp = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
        # Fail with the HTTP status rather than a confusing image-decode error.
        img_resp.raise_for_status()
        img = Image.open(BytesIO(img_resp.content))

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.ARCHIVE_DIR, exist_ok=True)
        local_path = os.path.join(self.ARCHIVE_DIR, f"si_{uuid.uuid4()}.jpg")
        if img.mode in self._JPEG_MODES:
            img.save(local_path)
        else:
            img.convert("RGB").save(local_path)

        # Embed & tag. The vision model still receives the image as downloaded;
        # only the copy written to disk is mode-converted.
        emb = self.vision.embed_image(img)
        # Copy so the list returned by detect_tags is not mutated in place.
        tags = list(self.vision.detect_tags(img))
        tags.append(self.SOURCE_TAG)

        self.memory.save(local_path, emb, title, tags)
        print(f"✅ Indexed: {title}")
        return True
if __name__ == "__main__":
    # Smoke test when run as a script: index up to three hits for a sample query.
    SmithsonianLoader().search_and_index("Roman Armor", limit=3)