Spaces:
Runtime error
Runtime error
| import os | |
| import requests | |
| import pandas as pd | |
| import os | |
| import time | |
| from datetime import datetime | |
| from tqdm import tqdm | |
| from dotenv import load_dotenv | |
# Run load_dotenv() first so that a key supplied through a .env file is
# already in the process environment when it is looked up below.
load_dotenv()
# None when the variable is not set; get_actor_images() rejects a None key.
BING_API_KEY = os.environ.get("BING_API_KEY")
def get_actor_images(
    name: str, role: str = None, count: int = 50, api_key: str = BING_API_KEY
):
    """Query the Bing Image Search API for photos of an actor.

    Args:
        name: Actor name; it is wrapped in double quotes inside the query.
        role: Optional text appended to the query in parentheses.
        count: Number of results requested (sent as the ``count`` parameter).
        api_key: Bing subscription key. Defaults to the module-level
            ``BING_API_KEY``.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: If ``api_key`` is None.
        requests.RequestException: If the request fails, times out, or the
            API answers with an error status (``requests.HTTPError``).
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # Authenticate with the key that was passed in (and validated above),
    # not the module global, so an explicit ``api_key`` is actually honoured.
    headers = {"Ocp-Apim-Subscription-Key": api_key}

    query = f'"{name}"'
    if role:
        query = f"{query} ({role})"

    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year",
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params,
        # Without a timeout a single stalled connection blocks forever.
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
def read_actors_list(
    max_actors: int = None, last_year_active: int = None, sort_by: str = None
):
    """Load ``data/imdb_actors.csv`` and optionally filter, sort and truncate it.

    Args:
        max_actors: If truthy, keep only the first ``max_actors`` rows
            (applied after filtering and sorting).
        last_year_active: If truthy, keep only rows whose ``lastYear`` is
            greater than or equal to this value.
        sort_by: If truthy, column (or columns) to sort by, descending.

    Returns:
        The resulting pandas DataFrame.
    """
    actors = pd.read_csv("data/imdb_actors.csv")

    if last_year_active:
        recent_enough = actors["lastYear"] >= last_year_active
        actors = actors[recent_enough]

    if sort_by:
        actors = actors.sort_values(by=sort_by, ascending=False)

    return actors.head(max_actors) if max_actors else actors
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file=None,
    max_api_calls_per_second: int = 3,
):
    """Fetch Bing image results for each actor and store them as CSV.

    The run is resumable: if ``output_file`` already exists, actors whose
    ``nconst`` appears in it are skipped, and the file is rewritten after
    every successfully processed actor.

    Args:
        max_actors: Forwarded to ``read_actors_list``.
        images_per_actor: Number of results requested per actor.
        last_year_active: Forwarded to ``read_actors_list``.
        output_file: CSV path used both to resume from and to write to.
            When None, nothing is persisted.
        max_api_calls_per_second: Upper bound on the request rate; the loop
            sleeps ``1 / max_api_calls_per_second`` seconds per actor.

    Returns:
        None.

    Raises:
        ZeroDivisionError: If ``max_api_calls_per_second`` is 0.
    """
    actors = read_actors_list(max_actors, last_year_active)

    collected = None
    if output_file:
        try:
            collected = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable previous output yet: start from scratch. Any other
            # read error propagates, so an existing file that merely failed
            # to parse is not silently overwritten further down.
            collected = None

    # Remove actors for which we already have images data.
    if collected is not None:
        already_done = collected["nconst"].unique()
        actors = actors[~actors["nconst"].isin(already_done)]

    # Loop-invariant delay; computing it here also fails fast on a zero rate.
    pause = 1.0 / max_api_calls_per_second

    print(f"Start retrieving images from Bing for {len(actors)} actors")
    for _, row in tqdm(actors.iterrows(), total=actors.shape[0]):
        try:
            images_data = get_actor_images(
                name=row["primaryName"], count=images_per_actor
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next actor.
            print(e)
            # A failed call still counts against the rate limit, so keep
            # throttling instead of retrying at full speed.
            time.sleep(pause)
            continue

        actor_images = pd.DataFrame(images_data["value"])
        actor_images["nconst"] = row["nconst"]
        actor_images["resultPosition"] = list(range(len(actor_images)))

        if collected is not None:
            collected = pd.concat([collected, actor_images])
        else:
            collected = actor_images

        # Store progress after every actor so an interrupted run can resume.
        # Skipped when there is no output file: to_csv(None) would only
        # build the whole CSV as a throwaway string.
        if output_file is not None:
            collected.to_csv(output_file, index=False)

        # Limit speed of requests to Bing Search.
        time.sleep(pause)
if __name__ == "__main__":
    # Restrict the run to actors whose last active year is within the past
    # five years.
    earliest_active_year = datetime.now().year - 5
    store_all_actor_images_data(
        max_actors=2000,
        images_per_actor=20,
        last_year_active=earliest_active_year,
        output_file="data/actors_images_new.csv",
        max_api_calls_per_second=100,
    )