Spaces:
Runtime error
Runtime error
fix image gathering
Browse files
- data/actors_embeddings.csv +2 -2
- data/actors_images.csv +2 -2
- data/imdb_actors.csv +2 -2
- get_images_data.py +15 -10
- process_images.py +4 -2
data/actors_embeddings.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9f1da52b8d6f8926a9aac335a4125f646359c5d5a882aea9ded679e4066f057
|
| 3 |
+
size 36828171
|
data/actors_images.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e859801f01b0dd87938c23be5211a66244489b7cdcd784a5c4dc008f3964869
|
| 3 |
+
size 38713146
|
data/imdb_actors.csv
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a95d36387eb646a14ea8038d3d02efbfa6d424d69d32a8b931ff8331d1951b97
|
| 3 |
+
size 7829655
|
get_images_data.py
CHANGED
|
@@ -20,7 +20,7 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
|
|
| 20 |
headers = {
|
| 21 |
"Ocp-Apim-Subscription-Key": BING_API_KEY
|
| 22 |
}
|
| 23 |
-
query = f"{name}
|
| 24 |
params = {
|
| 25 |
"q": query,
|
| 26 |
"count": count,
|
|
@@ -35,8 +35,8 @@ def get_actor_images(name: str, count: int = 50, api_key: str = BING_API_KEY):
|
|
| 35 |
params=params
|
| 36 |
)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
|
| 41 |
def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
|
| 42 |
"""Read and filter the list of actors"""
|
|
@@ -77,10 +77,15 @@ def store_all_actor_images_data(
|
|
| 77 |
|
| 78 |
print(f"Start retrieving images from Bing for {len(df)} actors")
|
| 79 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
df_im_tmp = pd.DataFrame(images_data["value"])
|
| 85 |
df_im_tmp["nconst"] = row["nconst"]
|
| 86 |
df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
|
|
@@ -99,9 +104,9 @@ def store_all_actor_images_data(
|
|
| 99 |
|
| 100 |
if __name__ == "__main__":
|
| 101 |
store_all_actor_images_data(
|
| 102 |
-
output_file="data/
|
| 103 |
-
max_actors=
|
| 104 |
images_per_actor=20,
|
| 105 |
last_year_active=datetime.now().year - 5,
|
| 106 |
-
max_api_calls_per_second=
|
| 107 |
)
|
|
|
|
| 20 |
headers = {
|
| 21 |
"Ocp-Apim-Subscription-Key": BING_API_KEY
|
| 22 |
}
|
| 23 |
+
query = f'"{name}"'
|
| 24 |
params = {
|
| 25 |
"q": query,
|
| 26 |
"count": count,
|
|
|
|
| 35 |
params=params
|
| 36 |
)
|
| 37 |
|
| 38 |
+
response.raise_for_status()
|
| 39 |
+
return response.json()
|
| 40 |
|
| 41 |
def read_actors_list(max_actors: int = None, last_year_active: int = None, sort_by: str = None):
|
| 42 |
"""Read and filter the list of actors"""
|
|
|
|
| 77 |
|
| 78 |
print(f"Start retrieving images from Bing for {len(df)} actors")
|
| 79 |
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
|
| 80 |
+
try:
|
| 81 |
+
images_data = get_actor_images(
|
| 82 |
+
name=row["primaryName"],
|
| 83 |
+
count=images_per_actor
|
| 84 |
+
)
|
| 85 |
+
except Exception as e:
|
| 86 |
+
print(e)
|
| 87 |
+
continue
|
| 88 |
+
|
| 89 |
df_im_tmp = pd.DataFrame(images_data["value"])
|
| 90 |
df_im_tmp["nconst"] = row["nconst"]
|
| 91 |
df_im_tmp["resultPosition"] = list(range(0, len(df_im_tmp)))
|
|
|
|
| 104 |
|
| 105 |
if __name__ == "__main__":
|
| 106 |
store_all_actor_images_data(
|
| 107 |
+
output_file="data/actors_images_new.csv",
|
| 108 |
+
max_actors=2000,
|
| 109 |
images_per_actor=20,
|
| 110 |
last_year_active=datetime.now().year - 5,
|
| 111 |
+
max_api_calls_per_second=100
|
| 112 |
)
|
process_images.py
CHANGED
|
@@ -21,7 +21,7 @@ def get_embeddings(url: str):
|
|
| 21 |
print(e)
|
| 22 |
|
| 23 |
def process_all_images(input_file, output_file):
|
| 24 |
-
df = pd.read_csv(input_file)[["nconst","contentUrl"]]
|
| 25 |
|
| 26 |
try:
|
| 27 |
df_emb = pd.read_csv(output_file)
|
|
@@ -31,11 +31,13 @@ def process_all_images(input_file, output_file):
|
|
| 31 |
df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
|
| 32 |
|
| 33 |
print(f"Start processing of {df.shape[0]} images")
|
| 34 |
-
df = df.
|
|
|
|
| 35 |
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
|
| 36 |
embeddings = get_embeddings(row["contentUrl"])
|
| 37 |
new_row = row.copy()
|
| 38 |
new_row["embeddings"] = embeddings
|
|
|
|
| 39 |
df_emb = df_emb.append(new_row, ignore_index=True)
|
| 40 |
|
| 41 |
if i % 5 == 0:
|
|
|
|
| 21 |
print(e)
|
| 22 |
|
| 23 |
def process_all_images(input_file, output_file):
|
| 24 |
+
df = pd.read_csv(input_file)[["nconst","contentUrl","resultPosition"]]
|
| 25 |
|
| 26 |
try:
|
| 27 |
df_emb = pd.read_csv(output_file)
|
|
|
|
| 31 |
df_emb = pd.DataFrame(columns=list(df.columns) + ["embeddings"])
|
| 32 |
|
| 33 |
print(f"Start processing of {df.shape[0]} images")
|
| 34 |
+
df = df.sort_values("resultPosition", ascending=True)
|
| 35 |
+
#df = df.sample(frac=1) # shuffle so you get some images for everybody while it's running
|
| 36 |
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
|
| 37 |
embeddings = get_embeddings(row["contentUrl"])
|
| 38 |
new_row = row.copy()
|
| 39 |
new_row["embeddings"] = embeddings
|
| 40 |
+
new_row = new_row[["nconst", "contentUrl", "embeddings"]]
|
| 41 |
df_emb = df_emb.append(new_row, ignore_index=True)
|
| 42 |
|
| 43 |
if i % 5 == 0:
|