Spaces:

impresso-project
/

ad-classification-exploration

Sleeping

App Files Files Community

Maslionok committed on Apr 2

Commit

2a60f29

1 Parent(s): f838384

Use bundled exploration dataset by default

Browse files

Files changed (3) hide show

README.md +6 -3
app.py +43 -4
content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json +0 -0

README.md CHANGED Viewed

@@ -11,9 +11,12 @@ license: cc-by-sa-4.0
 short_description: Explore yearly ad and non-ad distributions in Impresso
 ---
-The app loads this hardcoded aggregated JSON source:
-`s3://140-processed-data-sandbox/content-item-classification/content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json`
 Expected row shape:
@@ -33,7 +36,7 @@ Expected row shape:
 ]
 ```
-Optional S3 env vars:
 - `AWS_ENDPOINT_URL` or `S3_ENDPOINT_URL`
 - `AWS_REGION` or `S3_REGION`
 - `AWS_PROFILE` or `S3_PROFILE`

 short_description: Explore yearly ad and non-ad distributions in Impresso
 ---
+The app loads this aggregated JSON file from the repo by default:
+`content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json`
+You can override the source with the `DATA_SOURCE` environment variable.
+Supported values are local paths, `http(s)` URLs, and `s3://` URLs.
 Expected row shape:
 ]
 ```
+Optional S3 env vars when `DATA_SOURCE` uses `s3://`:
 - `AWS_ENDPOINT_URL` or `S3_ENDPOINT_URL`
 - `AWS_REGION` or `S3_REGION`
 - `AWS_PROFILE` or `S3_PROFILE`

app.py CHANGED Viewed

@@ -1,25 +1,60 @@
 import json
 import os
 import random
 from urllib.parse import urlparse
 from urllib.request import urlopen
-import boto3
 import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
 # -------------------------------------------------------------------
 # Load data
 # -------------------------------------------------------------------
-DATA_SOURCE = (
     "s3://140-processed-data-sandbox/content-item-classification/"
     "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
 )
 def load_json_from_s3(source: str):
     parsed = urlparse(source)
     bucket = parsed.netloc
     key = parsed.path.lstrip("/")
@@ -57,7 +92,11 @@ def load_data(source: str):
         with urlopen(source) as response:
             return json.load(response)
-    with open(source, encoding="utf-8") as handle:
         return json.load(handle)
@@ -332,7 +371,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## Ad classification exploration")
     gr.Markdown(
         "Explore yearly ad-share distributions by provider and newspaper. "
-        f"Source: `{DATA_SOURCE}`"
     )
     with gr.Row():

 import json
 import os
 import random
+from pathlib import Path
 from urllib.parse import urlparse
 from urllib.request import urlopen
 import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
+try:
+    import boto3
+except ImportError:
+    boto3 = None
 # -------------------------------------------------------------------
 # Load data
 # -------------------------------------------------------------------
+APP_DIR = Path(__file__).resolve().parent
+DEFAULT_DATA_FILENAME = (
+    "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
+)
+DEFAULT_LOCAL_DATA_SOURCE = APP_DIR / DEFAULT_DATA_FILENAME
+DEFAULT_REMOTE_DATA_SOURCE = (
     "s3://140-processed-data-sandbox/content-item-classification/"
     "content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json"
 )
+DATA_SOURCE = os.environ.get("DATA_SOURCE") or (
+    str(DEFAULT_LOCAL_DATA_SOURCE)
+    if DEFAULT_LOCAL_DATA_SOURCE.exists()
+    else DEFAULT_REMOTE_DATA_SOURCE
+)
+def format_source_label(source: str) -> str:
+    if source.startswith(("s3://", "http://", "https://")):
+        return source
+    path = Path(source)
+    if not path.is_absolute():
+        return source
+    try:
+        return path.relative_to(APP_DIR).as_posix()
+    except ValueError:
+        return str(path)
+SOURCE_LABEL = format_source_label(DATA_SOURCE)
 def load_json_from_s3(source: str):
+    if boto3 is None:
+        raise ImportError("boto3 is required to load data from S3.")
     parsed = urlparse(source)
     bucket = parsed.netloc
     key = parsed.path.lstrip("/")
         with urlopen(source) as response:
             return json.load(response)
+    path = Path(source)
+    if not path.is_absolute():
+        path = APP_DIR / path
+    with path.open(encoding="utf-8") as handle:
         return json.load(handle)
     gr.Markdown("## Ad classification exploration")
     gr.Markdown(
         "Explore yearly ad-share distributions by provider and newspaper. "
+        f"Source: `{SOURCE_LABEL}`"
     )
     with gr.Row():

content-item-classification-base-multilingual_v1-0-0_aggregated_for_exploration.json ADDED Viewed

The diff for this file is too large to render. See raw diff