Use bundled exploration dataset by default
Browse files
README.md
CHANGED
|
@@ -11,9 +11,12 @@ license: cc-by-sa-4.0
|
|
| 11 |
short_description: Explore yearly ad and non-ad distributions in Impresso
|
| 12 |
---
|
| 13 |
|
| 14 |
-
The app loads this
|
| 15 |
|
| 16 |
-
`
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
Expected row shape:
|
| 19 |
|
|
@@ -33,7 +36,7 @@ Expected row shape:
|
|
| 33 |
]
|
| 34 |
```
|
| 35 |
|
| 36 |
-
Optional S3 env vars:
|
| 37 |
- `AWS_ENDPOINT_URL` or `S3_ENDPOINT_URL`
|
| 38 |
- `AWS_REGION` or `S3_REGION`
|
| 39 |
- `AWS_PROFILE` or `S3_PROFILE`
|
|
|
|
| 11 |
short_description: Explore yearly ad and non-ad distributions in Impresso
|
| 12 |
---
|
| 13 |
|
| 14 |
+
The app loads this aggregated JSON file from the repo by default:
|
| 15 |
|
| 16 |
+
`content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json`
|
| 17 |
+
|
| 18 |
+
You can override the source with the `DATA_SOURCE` environment variable.
|
| 19 |
+
Supported values are local paths (relative paths are resolved against the app directory), `http(s)` URLs, and `s3://` URLs.
|
| 20 |
|
| 21 |
Expected row shape:
|
| 22 |
|
|
|
|
| 36 |
]
|
| 37 |
```
|
| 38 |
|
| 39 |
+
Optional S3 env vars when `DATA_SOURCE` uses `s3://` (this also requires `boto3` to be installed):
|
| 40 |
- `AWS_ENDPOINT_URL` or `S3_ENDPOINT_URL`
|
| 41 |
- `AWS_REGION` or `S3_REGION`
|
| 42 |
- `AWS_PROFILE` or `S3_PROFILE`
|
app.py
CHANGED
|
@@ -1,25 +1,60 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import random
|
|
|
|
| 4 |
from urllib.parse import urlparse
|
| 5 |
from urllib.request import urlopen
|
| 6 |
|
| 7 |
-
import boto3
|
| 8 |
import gradio as gr
|
| 9 |
import pandas as pd
|
| 10 |
import plotly.graph_objects as go
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# -------------------------------------------------------------------
|
| 13 |
# Load data
|
| 14 |
# -------------------------------------------------------------------
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"s3://140-processed-data-sandbox/content-item-classification/"
|
| 18 |
"content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
|
| 19 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
def load_json_from_s3(source: str):
|
|
|
|
|
|
|
|
|
|
| 23 |
parsed = urlparse(source)
|
| 24 |
bucket = parsed.netloc
|
| 25 |
key = parsed.path.lstrip("/")
|
|
@@ -57,7 +92,11 @@ def load_data(source: str):
|
|
| 57 |
with urlopen(source) as response:
|
| 58 |
return json.load(response)
|
| 59 |
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
return json.load(handle)
|
| 62 |
|
| 63 |
|
|
@@ -332,7 +371,7 @@ with gr.Blocks() as demo:
|
|
| 332 |
gr.Markdown("## Ad classification exploration")
|
| 333 |
gr.Markdown(
|
| 334 |
"Explore yearly ad-share distributions by provider and newspaper. "
|
| 335 |
-
f"Source: `{
|
| 336 |
)
|
| 337 |
|
| 338 |
with gr.Row():
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
+
from pathlib import Path
|
| 5 |
from urllib.parse import urlparse
|
| 6 |
from urllib.request import urlopen
|
| 7 |
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
import pandas as pd
|
| 10 |
import plotly.graph_objects as go
|
| 11 |
|
| 12 |
+
try:
|
| 13 |
+
import boto3
|
| 14 |
+
except ImportError:
|
| 15 |
+
boto3 = None
|
| 16 |
+
|
| 17 |
# -------------------------------------------------------------------
|
| 18 |
# Load data
|
| 19 |
# -------------------------------------------------------------------
|
| 20 |
|
| 21 |
+
# Directory containing this module. It anchors the bundled dataset path and
# is also used to resolve relative local DATA_SOURCE values.
APP_DIR = Path(__file__).resolve().parent
DEFAULT_DATA_FILENAME = (
    "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
)
# Copy of the dataset bundled alongside the app.
DEFAULT_LOCAL_DATA_SOURCE = APP_DIR / DEFAULT_DATA_FILENAME
# Remote fallback. Built from DEFAULT_DATA_FILENAME so the local and remote
# defaults always point at the same file name and cannot drift apart.
DEFAULT_REMOTE_DATA_SOURCE = (
    "s3://140-processed-data-sandbox/content-item-classification/"
    + DEFAULT_DATA_FILENAME
)
# Resolution order: a non-empty DATA_SOURCE environment variable wins, then
# the bundled file if it exists on disk, then the S3 object.
DATA_SOURCE = os.environ.get("DATA_SOURCE") or (
    str(DEFAULT_LOCAL_DATA_SOURCE)
    if DEFAULT_LOCAL_DATA_SOURCE.exists()
    else DEFAULT_REMOTE_DATA_SOURCE
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def format_source_label(source: str) -> str:
    """Return a display-friendly label for a data source.

    Remote sources (``s3://``, ``http://``, ``https://``) and relative local
    paths are returned unchanged. An absolute local path is rendered relative
    to ``APP_DIR`` with POSIX separators when it lies under that directory;
    otherwise its plain string form is returned.
    """
    remote_prefixes = ("s3://", "http://", "https://")
    if source.startswith(remote_prefixes) or not Path(source).is_absolute():
        return source

    absolute = Path(source)
    try:
        inside_app = absolute.relative_to(APP_DIR)
    except ValueError:
        # The path is not located under APP_DIR; show it as given.
        return str(absolute)
    return inside_app.as_posix()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Display form of DATA_SOURCE, interpolated into the "Source: ..." text of the UI.
SOURCE_LABEL = format_source_label(DATA_SOURCE)
|
| 52 |
|
| 53 |
|
| 54 |
def load_json_from_s3(source: str):
|
| 55 |
+
if boto3 is None:
|
| 56 |
+
raise ImportError("boto3 is required to load data from S3.")
|
| 57 |
+
|
| 58 |
parsed = urlparse(source)
|
| 59 |
bucket = parsed.netloc
|
| 60 |
key = parsed.path.lstrip("/")
|
|
|
|
| 92 |
with urlopen(source) as response:
|
| 93 |
return json.load(response)
|
| 94 |
|
| 95 |
+
path = Path(source)
|
| 96 |
+
if not path.is_absolute():
|
| 97 |
+
path = APP_DIR / path
|
| 98 |
+
|
| 99 |
+
with path.open(encoding="utf-8") as handle:
|
| 100 |
return json.load(handle)
|
| 101 |
|
| 102 |
|
|
|
|
| 371 |
gr.Markdown("## Ad classification exploration")
|
| 372 |
gr.Markdown(
|
| 373 |
"Explore yearly ad-share distributions by provider and newspaper. "
|
| 374 |
+
f"Source: `{SOURCE_LABEL}`"
|
| 375 |
)
|
| 376 |
|
| 377 |
with gr.Row():
|
content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|