Spaces:
Runtime error
Runtime error
Commit ·
03da54f
1
Parent(s): 03b47cd
Upload E-Commerce Product Intelligence Dashboard
Browse files
- app.py +21 -20
- backend/scraper.py +1 -27
app.py
CHANGED
|
@@ -4,63 +4,60 @@ from fastapi import FastAPI
|
|
| 4 |
from fastapi.staticfiles import StaticFiles
|
| 5 |
from fastapi.responses import HTMLResponse, JSONResponse
|
| 6 |
from pathlib import Path
|
| 7 |
-
from huggingface_hub import hf_hub_download
|
| 8 |
|
| 9 |
logging.basicConfig(level=logging.INFO)
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
| 12 |
app = FastAPI(title="E-Commerce Product Intelligence Platform")
|
| 13 |
|
| 14 |
-
HF_DATASET_REPO = "Vincentran/ecommerce-dataset"
|
| 15 |
-
HF_CSV_FILENAME = "ecommerce_products.csv"
|
| 16 |
LOCAL_CSV_PATH = Path("data/ecommerce_products.csv")
|
| 17 |
|
| 18 |
-
def load_data():
|
| 19 |
-
"""Load CSV từ HF Dataset."""
|
| 20 |
-
data_dir = Path("data")
|
| 21 |
-
data_dir.mkdir(parents=True, exist_ok=True)
|
| 22 |
|
|
|
|
|
|
|
| 23 |
if not LOCAL_CSV_PATH.exists():
|
| 24 |
-
|
| 25 |
-
local_path = hf_hub_download(
|
| 26 |
-
repo_id=HF_DATASET_REPO,
|
| 27 |
-
filename=HF_CSV_FILENAME,
|
| 28 |
-
repo_type="dataset",
|
| 29 |
-
cache_dir=str(data_dir)
|
| 30 |
-
)
|
| 31 |
-
logger.info(f"Downloaded to: {local_path}")
|
| 32 |
|
| 33 |
logger.info(f"Loading CSV from: {LOCAL_CSV_PATH}")
|
| 34 |
return pd.read_csv(LOCAL_CSV_PATH)
|
| 35 |
|
|
|
|
| 36 |
@app.get("/")
|
| 37 |
def root():
|
| 38 |
return {"status": "E-Commerce Product Intelligence API is running"}
|
| 39 |
|
|
|
|
| 40 |
@app.get("/data")
|
| 41 |
def get_data():
|
| 42 |
df = load_data()
|
| 43 |
return df.head(200).to_dict("records")
|
| 44 |
|
|
|
|
| 45 |
@app.get("/stats/categories")
|
| 46 |
def stats_categories():
|
| 47 |
df = load_data()
|
| 48 |
return df["category"].value_counts().head(10).to_dict()
|
| 49 |
|
|
|
|
| 50 |
@app.get("/stats/brands")
|
| 51 |
def stats_brands():
|
| 52 |
df = load_data()
|
| 53 |
return df["brand"].value_counts().head(10).to_dict()
|
| 54 |
|
|
|
|
| 55 |
@app.get("/stats/price")
|
| 56 |
def stats_price():
|
| 57 |
df = load_data()
|
| 58 |
-
return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict("records")
|
|
|
|
|
|
|
| 59 |
|
| 60 |
@app.get("/stats/rating")
|
| 61 |
def stats_rating():
|
| 62 |
df = load_data()
|
| 63 |
-
return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict("records")
|
|
|
|
|
|
|
| 64 |
|
| 65 |
@app.get("/insights")
|
| 66 |
def insights():
|
|
@@ -73,25 +70,28 @@ def insights():
|
|
| 73 |
"avg_rating": df["rating"].mean(),
|
| 74 |
})
|
| 75 |
|
|
|
|
| 76 |
@app.get("/search")
|
| 77 |
def search(query: str):
|
| 78 |
df = load_data()
|
| 79 |
q = query.lower()
|
| 80 |
mask = (
|
| 81 |
-
|
| 82 |
-
|
| 83 |
)
|
| 84 |
return df[mask].head(100).to_dict("records")
|
| 85 |
|
|
|
|
| 86 |
@app.get("/recommend")
|
| 87 |
def recommend(category: str):
|
| 88 |
df = load_data()
|
| 89 |
subset = df[df["category"] == category]
|
| 90 |
return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
|
| 91 |
|
|
|
|
| 92 |
@app.post("/run-scraper")
|
| 93 |
def trigger_scraper():
|
| 94 |
-
"""Trigger
|
| 95 |
import subprocess
|
| 96 |
result = subprocess.run(["python", "backend/scraper.py"], capture_output=True, text=True)
|
| 97 |
if result.returncode == 0:
|
|
@@ -99,6 +99,7 @@ def trigger_scraper():
|
|
| 99 |
else:
|
| 100 |
return {"status": "Scraper failed", "error": result.stderr}
|
| 101 |
|
|
|
|
| 102 |
# Serve the static frontend from the site root, but only when the
# "frontend" directory is present next to the app.
frontend_dir = Path("frontend")
if frontend_dir.exists():
    # Use frontend_dir (defined just above). The earlier `str(frontend)`
    # referred to a name that is not defined anywhere in the visible module
    # and would raise NameError as soon as the directory exists.
    app.mount("/", StaticFiles(directory=str(frontend_dir), html=True), name="frontend")
|
|
|
|
| 4 |
from fastapi.staticfiles import StaticFiles
|
| 5 |
from fastapi.responses import HTMLResponse, JSONResponse
|
| 6 |
from pathlib import Path
|
|
|
|
| 7 |
|
| 8 |
logging.basicConfig(level=logging.INFO)
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
| 11 |
app = FastAPI(title="E-Commerce Product Intelligence Platform")
|
| 12 |
|
|
|
|
|
|
|
| 13 |
LOCAL_CSV_PATH = Path("data/ecommerce_products.csv")
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
def load_data():
|
| 17 |
+
"""Load CSV từ local."""
|
| 18 |
if not LOCAL_CSV_PATH.exists():
|
| 19 |
+
raise FileNotFoundError(f"CSV not found: {LOCAL_CSV_PATH}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
logger.info(f"Loading CSV from: {LOCAL_CSV_PATH}")
|
| 22 |
return pd.read_csv(LOCAL_CSV_PATH)
|
| 23 |
|
| 24 |
+
|
| 25 |
@app.get("/")
|
| 26 |
def root():
|
| 27 |
return {"status": "E-Commerce Product Intelligence API is running"}
|
| 28 |
|
| 29 |
+
|
| 30 |
@app.get("/data")
|
| 31 |
def get_data():
|
| 32 |
df = load_data()
|
| 33 |
return df.head(200).to_dict("records")
|
| 34 |
|
| 35 |
+
|
| 36 |
@app.get("/stats/categories")
|
| 37 |
def stats_categories():
|
| 38 |
df = load_data()
|
| 39 |
return df["category"].value_counts().head(10).to_dict()
|
| 40 |
|
| 41 |
+
|
| 42 |
@app.get("/stats/brands")
|
| 43 |
def stats_brands():
|
| 44 |
df = load_data()
|
| 45 |
return df["brand"].value_counts().head(10).to_dict()
|
| 46 |
|
| 47 |
+
|
| 48 |
@app.get("/stats/price")
|
| 49 |
def stats_price():
|
| 50 |
df = load_data()
|
| 51 |
+
return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
|
| 52 |
+
"records")
|
| 53 |
+
|
| 54 |
|
| 55 |
@app.get("/stats/rating")
|
| 56 |
def stats_rating():
|
| 57 |
df = load_data()
|
| 58 |
+
return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
|
| 59 |
+
"records")
|
| 60 |
+
|
| 61 |
|
| 62 |
@app.get("/insights")
|
| 63 |
def insights():
|
|
|
|
| 70 |
"avg_rating": df["rating"].mean(),
|
| 71 |
})
|
| 72 |
|
| 73 |
+
|
| 74 |
@app.get("/search")
def search(query: str):
    """Return up to 100 products whose title or description contains *query*.

    Matching is a case-insensitive, literal substring test. Rows whose
    title/description is missing count as non-matching for that column
    (``na=False``).

    Args:
        query: Free-text search string taken from the request.

    Returns:
        A list of row dicts (``DataFrame.to_dict("records")``), at most 100.
    """
    df = load_data()
    # regex=False: the query comes straight from the request, so characters
    # such as "(" or "+" must be matched literally instead of being compiled
    # as a regular expression (which can raise re.error or over-match).
    # case=False already makes the comparison case-insensitive, so no
    # separate lower-casing of the query is needed.
    mask = (
        df["title"].str.contains(query, case=False, na=False, regex=False)
        | df["description"].str.contains(query, case=False, na=False, regex=False)
    )
    return df[mask].head(100).to_dict("records")
|
| 83 |
|
| 84 |
+
|
| 85 |
@app.get("/recommend")
|
| 86 |
def recommend(category: str):
|
| 87 |
df = load_data()
|
| 88 |
subset = df[df["category"] == category]
|
| 89 |
return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
|
| 90 |
|
| 91 |
+
|
| 92 |
@app.post("/run-scraper")
|
| 93 |
def trigger_scraper():
|
| 94 |
+
"""Trigger download Kaggle → save CSV."""
|
| 95 |
import subprocess
|
| 96 |
result = subprocess.run(["python", "backend/scraper.py"], capture_output=True, text=True)
|
| 97 |
if result.returncode == 0:
|
|
|
|
| 99 |
else:
|
| 100 |
return {"status": "Scraper failed", "error": result.stderr}
|
| 101 |
|
| 102 |
+
|
| 103 |
# Serve the static frontend from the site root, but only when the
# "frontend" directory is present next to the app.
frontend_dir = Path("frontend")
if frontend_dir.exists():
    # Use frontend_dir (defined just above). The earlier `str(frontend)`
    # referred to a name that is not defined anywhere in the visible module
    # and would raise NameError as soon as the directory exists.
    app.mount("/", StaticFiles(directory=str(frontend_dir), html=True), name="frontend")
|
backend/scraper.py
CHANGED
|
@@ -3,7 +3,6 @@ import logging
|
|
| 3 |
import pandas as pd
|
| 4 |
from pathlib import Path
|
| 5 |
import shutil
|
| 6 |
-
from huggingface_hub import upload_file
|
| 7 |
|
| 8 |
# Set Kaggle env vars TRƯỚC khi import Kaggle
|
| 9 |
token = os.getenv("KAGGLE_API_TOKEN")
|
|
@@ -12,17 +11,12 @@ if token:
|
|
| 12 |
os.environ['KAGGLE_KEY'] = token_value
|
| 13 |
os.environ['KAGGLE_USERNAME'] = 'johnsontrann'
|
| 14 |
|
| 15 |
-
# Lấy HF_TOKEN từ Space (mặc định có sẵn)
|
| 16 |
-
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 17 |
-
|
| 18 |
logging.basicConfig(level=logging.INFO)
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
|
| 22 |
TEMP_DIR = Path("data/temp_kaggle")
|
| 23 |
OUTPUT_CSV = Path("data/ecommerce_products.csv")
|
| 24 |
-
HF_DATASET_REPO = "Vincentran/ecommerce-dataset"
|
| 25 |
-
HF_CSV_FILENAME = "ecommerce_products.csv"
|
| 26 |
|
| 27 |
os.makedirs("data", exist_ok=True)
|
| 28 |
|
|
@@ -71,33 +65,13 @@ def save_csv(df: pd.DataFrame):
|
|
| 71 |
logger.info(f"Saved to: {OUTPUT_CSV}")
|
| 72 |
|
| 73 |
|
| 74 |
-
def upload_to_hf():
|
| 75 |
-
"""Upload CSV lên HF Dataset."""
|
| 76 |
-
if not OUTPUT_CSV.exists():
|
| 77 |
-
raise FileNotFoundError(f"CSV not found: {OUTPUT_CSV}")
|
| 78 |
-
|
| 79 |
-
if not HF_TOKEN:
|
| 80 |
-
logger.warning("HF_TOKEN not found, trying without authentication...")
|
| 81 |
-
|
| 82 |
-
logger.info(f"Uploading CSV to HF Dataset: {HF_DATASET_REPO}")
|
| 83 |
-
upload_file(
|
| 84 |
-
path_or_fileobj=str(OUTPUT_CSV),
|
| 85 |
-
path_in_repo=HF_CSV_FILENAME,
|
| 86 |
-
repo_id=HF_DATASET_REPO,
|
| 87 |
-
repo_type="dataset",
|
| 88 |
-
token=HF_TOKEN if HF_TOKEN else None
|
| 89 |
-
)
|
| 90 |
-
logger.info("Upload completed successfully.")
|
| 91 |
-
|
| 92 |
-
|
| 93 |
def run_scraper():
|
| 94 |
-
"""Full pipeline: download Kaggle → save CSV
|
| 95 |
try:
|
| 96 |
download_dataset()
|
| 97 |
csv_files = find_csv_files(TEMP_DIR)
|
| 98 |
df = load_and_concatenate(csv_files)
|
| 99 |
save_csv(df)
|
| 100 |
-
upload_to_hf()
|
| 101 |
finally:
|
| 102 |
shutil.rmtree(TEMP_DIR, ignore_errors=True)
|
| 103 |
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
from pathlib import Path
|
| 5 |
import shutil
|
|
|
|
| 6 |
|
| 7 |
# Set Kaggle env vars TRƯỚC khi import Kaggle
|
| 8 |
token = os.getenv("KAGGLE_API_TOKEN")
|
|
|
|
| 11 |
os.environ['KAGGLE_KEY'] = token_value
|
| 12 |
os.environ['KAGGLE_USERNAME'] = 'johnsontrann'
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
|
| 18 |
TEMP_DIR = Path("data/temp_kaggle")
|
| 19 |
OUTPUT_CSV = Path("data/ecommerce_products.csv")
|
|
|
|
|
|
|
| 20 |
|
| 21 |
os.makedirs("data", exist_ok=True)
|
| 22 |
|
|
|
|
| 65 |
logger.info(f"Saved to: {OUTPUT_CSV}")
|
| 66 |
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def run_scraper():
|
| 69 |
+
"""Full pipeline: download Kaggle → save CSV (không upload HF)."""
|
| 70 |
try:
|
| 71 |
download_dataset()
|
| 72 |
csv_files = find_csv_files(TEMP_DIR)
|
| 73 |
df = load_and_concatenate(csv_files)
|
| 74 |
save_csv(df)
|
|
|
|
| 75 |
finally:
|
| 76 |
shutil.rmtree(TEMP_DIR, ignore_errors=True)
|
| 77 |
|