Spaces:
Runtime error
Runtime error
scripts for downloading actors data and extract embeddings
Browse files- .env.example +1 -0
- .gitignore +4 -0
- README.md +25 -2
- combine_actors_data.py +61 -0
- data/.gitkeep +0 -0
- download_imdb_data.py +32 -0
- get_images_data.py +107 -0
- process_images.py +48 -0
- requirements.txt +10 -0
.env.example
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
BING_API_KEY=000000000000000000000000
|
.gitignore
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Byte-compiled / optimized / DLL files
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
|
|
|
| 1 |
+
# data files from imdb
|
| 2 |
+
data/title.*.tsv*
|
| 3 |
+
data/name.*.tsv*
|
| 4 |
+
|
| 5 |
# Byte-compiled / optimized / DLL files
|
| 6 |
__pycache__/
|
| 7 |
*.py[cod]
|
README.md
CHANGED
|
@@ -1,2 +1,25 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Actor matching demo
|
| 2 |
+
|
| 3 |
+
Who should play Hannibal (the Carthaginian, not the cannibal) if HBO ever adapts his story? How about you? Who should be your actor?
|
| 4 |
+
This application lets you input an image and see the top three actors that most closely resemble the image based on facial features.
|
| 5 |
+
|
| 6 |
+
Try it out on HuggingFace _[Coming Soon]_
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
## Data
|
| 10 |
+
|
| 11 |
+
The data comes from two sources:
|
| 12 |
+
|
| 13 |
+
1. I built a list of relevant actors that have been in popular movies across their careers. The datasets that I used to build it can be found on the [IMDB datasets page](https://datasets.imdbws.com/) (see instructions [here](https://www.imdb.com/interfaces/))
|
| 14 |
+
2. I then found 20 images of each actor using Microsoft Bing Search API using queries such as *"Brad Pitt, actor or actress"*
|
| 15 |
+
|
| 16 |
+
Note that due to API limits, I only took images from 1,000 actors.
|
| 17 |
+
|
| 18 |
+
## Application
|
| 19 |
+
|
| 20 |
+
The application is built with Gradio and deployed on HuggingFace Space. In the background, it uses:
|
| 21 |
+
|
| 22 |
+
1. The [`face_recognition` library](https://github.com/ageitgey/face_recognition) to compute an embedding of the image
|
| 23 |
+
2. Spotify's `annoy` library to efficiently search the closest actors based on the image embedding and a small database of actors' faces embeddings.
|
| 24 |
+
3. Show you your best matches!
|
| 25 |
+
|
combine_actors_data.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def process_actors_data(keep_alive: bool = True):
    """Build a per-actor statistics table from the raw IMDB tsv dumps.

    Reads name.basics, title.principals, title.ratings and title.basics from
    the data/ directory, keeps people whose primary profession includes
    "actor", and aggregates ratings/votes over every title they acted in.

    Args:
        keep_alive: if True, drop people with a known death year.

    Returns:
        A DataFrame with one row per actor, sorted descending by
        totalRating (sum of averageRating * numVotes over their titles).
    """
    current_year = datetime.now().year

    # Read actors data. IMDB encodes missing values as the literal string
    # "\N", so numeric columns are coerced (invalid entries become NaN).
    df = pd.read_csv("data/name.basics.tsv", sep="\t")
    df["birthYear"] = pd.to_numeric(df["birthYear"], errors="coerce")
    df["deathYear"] = pd.to_numeric(df["deathYear"], errors="coerce")

    # Prepare and clean up actors data
    if keep_alive:
        df = df[df["deathYear"].isna()]
    df = df[df.knownForTitles.apply(len) > 0]
    df = df.dropna(subset=["primaryProfession"])
    df = df[df.primaryProfession.apply(lambda x: "actor" in x.split(","))]
    df = df[df.knownForTitles != "\\N"]
    df = df.dropna(subset=["birthYear"])

    # Credits: every title each person appeared in as an actor or as themselves
    dfat = pd.read_csv("data/title.principals.tsv.gz", sep="\t")
    dfat = dfat[dfat.category.isin(["actor", "self"])][["tconst", "nconst"]]

    # Get data for the movies/shows the actors were known for
    dftr = pd.read_csv("data/title.ratings.tsv", sep="\t")
    dftb = pd.read_csv("data/title.basics.tsv", sep="\t")
    dftb["startYear"] = pd.to_numeric(dftb["startYear"], errors="coerce")
    dftb["endYear"] = pd.to_numeric(dftb["endYear"], errors="coerce")

    # Estimate the last year the show/movie was released (TV shows span
    # several years and might still be active)
    dftb.loc[(dftb.titleType.isin(["tvSeries", "tvMiniSeries"]) & (dftb.endYear.isna())), "lastYear"] = current_year
    dftb["lastYear"] = dftb["lastYear"].fillna(dftb["startYear"])
    dftb = dftb.dropna(subset=["lastYear"])
    dftb = dftb[dftb.isAdult == 0]

    # Aggregate stats for all titles the actor appeared in
    dft = pd.merge(dftb, dftr, how="inner", on="tconst")
    del dftb, dftr  # free memory before the next (large) merge
    dfat = pd.merge(dfat, dft, how="inner", on="tconst")
    del dft
    dfat["totalRating"] = dfat.averageRating * dfat.numVotes
    dfat = dfat.groupby("nconst").agg({
        "averageRating": "mean",
        "totalRating": "sum",
        "numVotes": "sum",
        "tconst": "count",
        "startYear": "min",
        "lastYear": "max",
    })

    # Merge everything with actor data and clean up
    df = df.drop(["deathYear", "knownForTitles", "primaryProfession"], axis=1)
    # merge on="nconst" matches the dfat index level name after the groupby
    df = pd.merge(df, dfat, how="inner", on="nconst").sort_values("totalRating", ascending=False)
    df = df.dropna(subset=["birthYear", "startYear", "lastYear"])
    df[["birthYear", "startYear", "lastYear"]] = df[["birthYear", "startYear", "lastYear"]].astype(int)
    df = df.round(2)

    return df
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
if __name__ == "__main__":
    df = process_actors_data()
    # Persist the actor table consumed later by get_images_data.py
    df.to_csv("data/imdb_actors.csv", index=False)
|
data/.gitkeep
ADDED
|
File without changes
|
download_imdb_data.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gzip
|
| 3 |
+
import shutil
|
| 4 |
+
from urllib.request import urlretrieve
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
|
| 7 |
+
def download_large_file(url: str, output_file: str):
    """Download *url* to *output_file*, skipping files already on disk."""
    if os.path.exists(output_file):
        # Already downloaded on a previous run; nothing to do.
        return
    urlretrieve(url, output_file)
|
| 10 |
+
|
| 11 |
+
def unzip_file(input_file):
    """Decompress a gzip file next to the original (xxx.tsv.gz -> xxx.tsv).

    Does nothing when the decompressed file is already present.
    """
    # Dropping the last extension turns e.g. name.basics.tsv.gz into name.basics.tsv
    target, _ = os.path.splitext(input_file)
    if os.path.exists(target):
        return
    with gzip.open(input_file, "rb") as compressed, open(target, "wb") as raw:
        shutil.copyfileobj(compressed, raw)
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
    imdb_url = "https://datasets.imdbws.com"
    filenames = [
        "name.basics.tsv.gz",
        "title.basics.tsv.gz",
        "title.ratings.tsv.gz",
        "title.principals.tsv.gz",
    ]
    for filename in tqdm(filenames):
        # BUG FIX: the URL must interpolate the dataset filename; the previous
        # code pointed every request at a non-existent constant path.
        url = f"{imdb_url}/{filename}"
        output_file = os.path.join("data", filename)
        download_large_file(url, output_file)
        unzip_file(output_file)
|
| 32 |
+
|
get_images_data.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import time

import requests
import pandas as pd

from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv

# Load BING_API_KEY (see .env.example) from a local .env file into the environment
load_dotenv()

# FIX: `os` was imported twice; the duplicate import has been removed.
BING_API_KEY = os.getenv("BING_API_KEY", None)
|
| 14 |
+
|
| 15 |
+
def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
    """Get a list of actor images from the Bing Image Search API.

    Args:
        name: the actor's name, used to build the search query.
        count: number of image results to request.
        api_key: Bing subscription key (defaults to the BING_API_KEY env var).

    Returns:
        The parsed JSON response on success, or None when the request fails.

    Raises:
        ValueError: if no API key is available.
    """
    if api_key is None:
        raise ValueError("You must provide a Bing API key")

    # BUG FIX: use the api_key argument instead of the module-level constant,
    # so callers can actually override the key.
    headers = {
        "Ocp-Apim-Subscription-Key": api_key
    }
    query = f"{name}, actor or actress"
    params = {
        "q": query,
        "count": count,
        "imageType": "Photo",
        "safeSearch": "Strict",
        "imageContent": "Face",
        "freshness": "Year"
    }
    response = requests.get(
        "https://api.bing.microsoft.com/v7.0/images/search",
        headers=headers,
        params=params
    )

    if response.status_code == 200:
        return response.json()
    # Implicitly returns None on a non-200 response; callers must handle it.
|
| 40 |
+
|
| 41 |
+
def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
    """Read and filter the list of actors.

    Args:
        max_actors: keep only the first rows (applied after sorting).
        last_year_active: drop actors whose last activity predates this year.
        sort_by: optional column name to sort by, descending.
    """
    actors = pd.read_csv("data/imdb_actors.csv")

    if last_year_active:
        recently_active = actors["lastYear"] >= last_year_active
        actors = actors[recently_active]

    if sort_by:
        actors = actors.sort_values(sort_by, ascending=False)

    if max_actors:
        actors = actors.head(max_actors)

    return actors
|
| 55 |
+
|
| 56 |
+
def store_all_actor_images_data(
    max_actors: int = None,
    images_per_actor: int = 10,
    last_year_active: int = None,
    output_file = None,
    max_api_calls_per_second: int = 3
):
    """Get images data for each actor from the Bing Image Search API and store the results as csv.

    Progress is written to *output_file* after every actor, so the run can be
    interrupted and restarted without re-querying actors already processed.
    """
    df = read_actors_list(max_actors, last_year_active)
    df_im = None
    if output_file:
        try:
            df_im = pd.read_csv(output_file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Output file does not exist yet (or is empty): nothing to resume.
            pass

    # Remove actors for which we already have images data
    if df_im is not None:
        df = df[~df["nconst"].isin(df_im["nconst"].unique())]

    print(f"Start retrieving images from Bing for {len(df)} actors")
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        images_data = get_actor_images(
            name=row["primaryName"],
            count=images_per_actor
        )
        # BUG FIX: get_actor_images returns None when the API call fails;
        # skip this actor instead of crashing on images_data["value"].
        if not images_data or "value" not in images_data:
            continue
        df_im_tmp = pd.DataFrame(images_data["value"])
        df_im_tmp["nconst"] = row["nconst"]
        # Remember the rank of each result in Bing's response
        df_im_tmp["resultPosition"] = list(range(len(df_im_tmp)))

        if df_im is not None:
            df_im = pd.concat([df_im, df_im_tmp])
        else:
            df_im = df_im_tmp

        # Store progress
        df_im.to_csv(output_file, index=False)

        # Limit speed of requests to Bing Search
        time.sleep(1.0 / max_api_calls_per_second)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
if __name__ == "__main__":
    # Fetch up to 20 images each for the top 1,000 actors active in the
    # last 5 years (API limits cap the total number of actors).
    store_all_actor_images_data(
        output_file="data/actors_images.csv",
        max_actors=1000,
        images_per_actor=20,
        last_year_active=datetime.now().year - 5,
        max_api_calls_per_second=2
    )
|
process_images.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import face_recognition
|
| 2 |
+
import requests
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_image(url: str):
    """Download the image at *url* and decode it via face_recognition.

    Raises an HTTPError (from raise_for_status) on a non-success status code.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    # Wrap the raw bytes in a file-like object so decoding happens in memory
    return face_recognition.load_image_file(BytesIO(resp.content))
|
| 13 |
+
|
| 14 |
+
def get_embeddings(url: str):
    """Return the face encoding for the image at *url*, or None on failure.

    Best effort: any error (download failure, no face detected, ...) is
    printed and swallowed so a long batch run keeps going.
    """
    try:
        image = get_image(url)
        encodings = face_recognition.face_encodings(image, num_jitters=5, model="large")
        return list(encodings[0])
    except Exception as e:
        # Implicitly returns None; callers treat a falsy result as "skip"
        print(e)
|
| 20 |
+
|
| 21 |
+
def process_all_images(input_file, output_file):
    """Compute face embeddings for every image URL listed in *input_file*.

    Resumes from *output_file* when it exists (already-processed URLs are
    skipped) and saves progress after each successful embedding.

    Returns:
        DataFrame with columns nconst, contentUrl, embeddings.
    """
    df = pd.read_csv(input_file)[["nconst", "contentUrl"]]

    try:
        df_emb = pd.read_csv(output_file)
        df = df[~df["contentUrl"].isin(df_emb["contentUrl"])]
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Output file does not exist yet: start from scratch.
        df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])

    print(f"Start processing of {df.shape[0]} images")
    df = df.sample(frac=1)  # shuffle so you get some images for everybody while it's running
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        embeddings = get_embeddings(row["contentUrl"])
        if embeddings:
            new_row = row.copy()
            new_row["embeddings"] = embeddings
            # BUG FIX: DataFrame.append was removed in pandas 2.0; build a
            # one-row frame from the Series and concatenate instead.
            df_emb = pd.concat([df_emb, new_row.to_frame().T], ignore_index=True)
            # Store progress after every successful embedding
            df_emb.to_csv(output_file, index=False)

    return df_emb
|
| 42 |
+
|
| 43 |
+
def build_annoy_index():
    # TODO: build the Annoy nearest-neighbour index from the stored
    # embeddings (see README); not implemented yet.
    pass
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
    # Embeddings CSV consumed by the Gradio demo's nearest-neighbour search
    output_file = "data/actors_embeddings.csv"
    df_embeddings = process_all_images(input_file="data/actors_images.csv", output_file=output_file)
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Inference
|
| 2 |
+
cmake # required for dlib (used by face_recognition)
|
| 3 |
+
face_recognition
|
| 4 |
+
annoy
|
| 5 |
+
|
| 6 |
+
# Preprocessing
|
| 7 |
+
microsoft-bing-imagesearch
|
| 8 |
+
python-dotenv
|
| 9 |
+
pandas
|
| 10 |
+
tqdm
|