Maslionok committed on
Commit
2a60f29
·
1 Parent(s): f838384

Use bundled exploration dataset by default

Browse files
README.md CHANGED
@@ -11,9 +11,12 @@ license: cc-by-sa-4.0
11
  short_description: Explore yearly ad and non-ad distributions in Impresso
12
  ---
13
 
14
- The app loads this hardcoded aggregated JSON source:
15
 
16
- `s3://140-processed-data-sandbox/content-item-classification/content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json`
 
 
 
17
 
18
  Expected row shape:
19
 
@@ -33,7 +36,7 @@ Expected row shape:
33
  ]
34
  ```
35
 
36
- Optional S3 env vars:
37
  - `AWS_ENDPOINT_URL` or `S3_ENDPOINT_URL`
38
  - `AWS_REGION` or `S3_REGION`
39
  - `AWS_PROFILE` or `S3_PROFILE`
 
11
  short_description: Explore yearly ad and non-ad distributions in Impresso
12
  ---
13
 
14
+ The app loads this aggregated JSON file from the repo by default:
15
 
16
+ `content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json`
17
+
18
+ You can override the source with the `DATA_SOURCE` environment variable.
19
+ Supported values are local paths, `http(s)` URLs, and `s3://` URLs.
20
 
21
  Expected row shape:
22
 
 
36
  ]
37
  ```
38
 
39
+ Optional S3 env vars when `DATA_SOURCE` uses `s3://`:
40
  - `AWS_ENDPOINT_URL` or `S3_ENDPOINT_URL`
41
  - `AWS_REGION` or `S3_REGION`
42
  - `AWS_PROFILE` or `S3_PROFILE`
app.py CHANGED
@@ -1,25 +1,60 @@
1
  import json
2
  import os
3
  import random
 
4
  from urllib.parse import urlparse
5
  from urllib.request import urlopen
6
 
7
- import boto3
8
  import gradio as gr
9
  import pandas as pd
10
  import plotly.graph_objects as go
11
 
 
 
 
 
 
12
  # -------------------------------------------------------------------
13
  # Load data
14
  # -------------------------------------------------------------------
15
 
16
- DATA_SOURCE = (
 
 
 
 
 
17
  "s3://140-processed-data-sandbox/content-item-classification/"
18
  "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
19
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  def load_json_from_s3(source: str):
 
 
 
23
  parsed = urlparse(source)
24
  bucket = parsed.netloc
25
  key = parsed.path.lstrip("/")
@@ -57,7 +92,11 @@ def load_data(source: str):
57
  with urlopen(source) as response:
58
  return json.load(response)
59
 
60
- with open(source, encoding="utf-8") as handle:
 
 
 
 
61
  return json.load(handle)
62
 
63
 
@@ -332,7 +371,7 @@ with gr.Blocks() as demo:
332
  gr.Markdown("## Ad classification exploration")
333
  gr.Markdown(
334
  "Explore yearly ad-share distributions by provider and newspaper. "
335
- f"Source: `{DATA_SOURCE}`"
336
  )
337
 
338
  with gr.Row():
 
1
  import json
2
  import os
3
  import random
4
+ from pathlib import Path
5
  from urllib.parse import urlparse
6
  from urllib.request import urlopen
7
 
 
8
  import gradio as gr
9
  import pandas as pd
10
  import plotly.graph_objects as go
11
 
12
+ try:
13
+ import boto3
14
+ except ImportError:
15
+ boto3 = None
16
+
17
  # -------------------------------------------------------------------
18
  # Load data
19
  # -------------------------------------------------------------------
20
 
21
+ APP_DIR = Path(__file__).resolve().parent
22
+ DEFAULT_DATA_FILENAME = (
23
+ "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
24
+ )
25
+ DEFAULT_LOCAL_DATA_SOURCE = APP_DIR / DEFAULT_DATA_FILENAME
26
+ DEFAULT_REMOTE_DATA_SOURCE = (
27
  "s3://140-processed-data-sandbox/content-item-classification/"
28
  "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
29
  )
30
+ DATA_SOURCE = os.environ.get("DATA_SOURCE") or (
31
+ str(DEFAULT_LOCAL_DATA_SOURCE)
32
+ if DEFAULT_LOCAL_DATA_SOURCE.exists()
33
+ else DEFAULT_REMOTE_DATA_SOURCE
34
+ )
35
+
36
+
37
+ def format_source_label(source: str) -> str:
38
+ if source.startswith(("s3://", "http://", "https://")):
39
+ return source
40
+
41
+ path = Path(source)
42
+ if not path.is_absolute():
43
+ return source
44
+
45
+ try:
46
+ return path.relative_to(APP_DIR).as_posix()
47
+ except ValueError:
48
+ return str(path)
49
+
50
+
51
+ SOURCE_LABEL = format_source_label(DATA_SOURCE)
52
 
53
 
54
  def load_json_from_s3(source: str):
55
+ if boto3 is None:
56
+ raise ImportError("boto3 is required to load data from S3.")
57
+
58
  parsed = urlparse(source)
59
  bucket = parsed.netloc
60
  key = parsed.path.lstrip("/")
 
92
  with urlopen(source) as response:
93
  return json.load(response)
94
 
95
+ path = Path(source)
96
+ if not path.is_absolute():
97
+ path = APP_DIR / path
98
+
99
+ with path.open(encoding="utf-8") as handle:
100
  return json.load(handle)
101
 
102
 
 
371
  gr.Markdown("## Ad classification exploration")
372
  gr.Markdown(
373
  "Explore yearly ad-share distributions by provider and newspaper. "
374
+ f"Source: `{SOURCE_LABEL}`"
375
  )
376
 
377
  with gr.Row():
content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json ADDED
The diff for this file is too large to render. See raw diff