Spaces: Runtime error
Commit · 9058528
1 Parent(s): 2c7b3a2
Upload E-Commerce Product Intelligence Dashboard
Browse files
- Dockerfile +7 -1
- app.py +27 -10
- backend/scraper.py +16 -15
- requirements.txt +3 -6
Dockerfile
CHANGED
|
@@ -7,6 +7,12 @@ RUN pip install --no-cache-dir -r requirements.txt
Before:
|  7 |   |
|  8 |   | COPY . /app
|  9 |   |
| 10 |   | EXPOSE 8000
| 11 |   |
| 12 | - | CMD ["
(the rest of removed line 12 is missing from this extract)
After:
|  7 |   |
|  8 |   | COPY . /app
|  9 |   |
| 10 | + | # Tạo data dir
| 11 | + | RUN mkdir -p /app/data
| 12 | + |
| 13 | + | # Script start: chạy scraper + uvicorn
| 14 | + | RUN echo '#!/bin/bash\nexport KAGGLE_API_TOKEN="$KAGGLE_API_TOKEN"\npython backend/scraper.py && uvicorn app:app --host 0.0.0.0 --port 8000' > /app/start.sh && chmod +x /app/start.sh
| 15 | + |
| 16 |   | EXPOSE 8000
| 17 |   |
| 18 | + | CMD ["/app/start.sh"]
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import logging
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
from fastapi import FastAPI
|
| 4 |
from fastapi.staticfiles import StaticFiles
|
|
@@ -10,43 +11,56 @@ logger = logging.getLogger(__name__)
|
|
| 10 |
|
| 11 |
app = FastAPI(title="E-Commerce Product Intelligence Platform")
|
| 12 |
|
| 13 |
-
# ==================== Load data ====================
|
|
|
|
|
|
|
|
|
|
| 14 |
def load_data():
|
| 15 |
-
"""Load parquet."""
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# ==================== API Routes ====================
|
| 22 |
@app.get("/")
|
| 23 |
def root():
|
| 24 |
return {"status": "E-Commerce Product Intelligence API is running"}
|
| 25 |
|
|
|
|
| 26 |
@app.get("/data")
|
| 27 |
def get_data():
|
| 28 |
df = load_data()
|
| 29 |
return df.head(200).to_dict("records")
|
| 30 |
|
|
|
|
| 31 |
@app.get("/stats/categories")
|
| 32 |
def stats_categories():
|
| 33 |
df = load_data()
|
| 34 |
return df["category"].value_counts().head(10).to_dict()
|
| 35 |
|
|
|
|
| 36 |
@app.get("/stats/brands")
|
| 37 |
def stats_brands():
|
| 38 |
df = load_data()
|
| 39 |
return df["brand"].value_counts().head(10).to_dict()
|
| 40 |
|
|
|
|
| 41 |
@app.get("/stats/price")
|
| 42 |
def stats_price():
|
| 43 |
df = load_data()
|
| 44 |
-
return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
|
|
|
|
|
|
|
| 45 |
|
| 46 |
@app.get("/stats/rating")
|
| 47 |
def stats_rating():
|
| 48 |
df = load_data()
|
| 49 |
-
return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
|
|
|
|
|
|
|
| 50 |
|
| 51 |
@app.get("/insights")
|
| 52 |
def insights():
|
|
@@ -59,22 +73,25 @@ def insights():
|
|
| 59 |
"avg_rating": df["rating"].mean(),
|
| 60 |
})
|
| 61 |
|
|
|
|
| 62 |
@app.get("/search")
|
| 63 |
def search(query: str):
|
| 64 |
df = load_data()
|
| 65 |
q = query.lower()
|
| 66 |
mask = (
|
| 67 |
-
|
| 68 |
-
|
| 69 |
)
|
| 70 |
return df[mask].head(100).to_dict("records")
|
| 71 |
|
|
|
|
| 72 |
@app.get("/recommend")
|
| 73 |
def recommend(category: str):
|
| 74 |
df = load_data()
|
| 75 |
subset = df[df["category"] == category]
|
| 76 |
return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
|
| 77 |
|
|
|
|
| 78 |
# ==================== Frontend ====================
|
| 79 |
frontend_dir = Path("frontend")
|
| 80 |
if frontend_dir.exists():
|
|
|
|
| 1 |
import logging
|
| 2 |
+
import os
|
| 3 |
import pandas as pd
|
| 4 |
from fastapi import FastAPI
|
| 5 |
from fastapi.staticfiles import StaticFiles
|
|
|
|
| 11 |
|
| 12 |
app = FastAPI(title="E-Commerce Product Intelligence Platform")
|
| 13 |
|
| 14 |
+
# ==================== Load data (local parquet) ====================
|
| 15 |
+
LOCAL_PARQUET_PATH = Path("data/ecommerce_products.parquet")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
def load_data():
|
| 19 |
+
"""Load parquet từ local."""
|
| 20 |
+
if not LOCAL_PARQUET_PATH.exists():
|
| 21 |
+
raise FileNotFoundError(f"Parquet not found: {LOCAL_PARQUET_PATH}")
|
| 22 |
+
|
| 23 |
+
logger.info(f"Loading parquet from: {LOCAL_PARQUET_PATH}")
|
| 24 |
+
return pd.read_parquet(LOCAL_PARQUET_PATH)
|
| 25 |
+
|
| 26 |
|
| 27 |
# ==================== API Routes ====================
|
| 28 |
@app.get("/")
|
| 29 |
def root():
|
| 30 |
return {"status": "E-Commerce Product Intelligence API is running"}
|
| 31 |
|
| 32 |
+
|
| 33 |
@app.get("/data")
|
| 34 |
def get_data():
|
| 35 |
df = load_data()
|
| 36 |
return df.head(200).to_dict("records")
|
| 37 |
|
| 38 |
+
|
| 39 |
@app.get("/stats/categories")
|
| 40 |
def stats_categories():
|
| 41 |
df = load_data()
|
| 42 |
return df["category"].value_counts().head(10).to_dict()
|
| 43 |
|
| 44 |
+
|
| 45 |
@app.get("/stats/brands")
|
| 46 |
def stats_brands():
|
| 47 |
df = load_data()
|
| 48 |
return df["brand"].value_counts().head(10).to_dict()
|
| 49 |
|
| 50 |
+
|
| 51 |
@app.get("/stats/price")
|
| 52 |
def stats_price():
|
| 53 |
df = load_data()
|
| 54 |
+
return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
|
| 55 |
+
"records")
|
| 56 |
+
|
| 57 |
|
| 58 |
@app.get("/stats/rating")
|
| 59 |
def stats_rating():
|
| 60 |
df = load_data()
|
| 61 |
+
return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
|
| 62 |
+
"records")
|
| 63 |
+
|
| 64 |
|
| 65 |
@app.get("/insights")
|
| 66 |
def insights():
|
|
|
|
| 73 |
"avg_rating": df["rating"].mean(),
|
| 74 |
})
|
| 75 |
|
| 76 |
+
|
| 77 |
@app.get("/search")
|
| 78 |
def search(query: str):
|
| 79 |
df = load_data()
|
| 80 |
q = query.lower()
|
| 81 |
mask = (
|
| 82 |
+
df["title"].str.contains(q, case=False, na=False) |
|
| 83 |
+
df["description"].str.contains(q, case=False, na=False)
|
| 84 |
)
|
| 85 |
return df[mask].head(100).to_dict("records")
|
| 86 |
|
| 87 |
+
|
| 88 |
@app.get("/recommend")
|
| 89 |
def recommend(category: str):
|
| 90 |
df = load_data()
|
| 91 |
subset = df[df["category"] == category]
|
| 92 |
return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
|
| 93 |
|
| 94 |
+
|
| 95 |
# ==================== Frontend ====================
|
| 96 |
frontend_dir = Path("frontend")
|
| 97 |
if frontend_dir.exists():
|
backend/scraper.py
CHANGED
|
@@ -1,9 +1,13 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
-
from pathlib import Path
|
| 4 |
import pandas as pd
|
|
|
|
|
|
|
| 5 |
import shutil
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
|
| 8 |
TEMP_DIR = Path("data/temp_kaggle")
|
| 9 |
OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
|
|
@@ -12,32 +16,29 @@ os.makedirs("data", exist_ok=True)
|
|
| 12 |
|
| 13 |
|
| 14 |
def setup_kaggle_api():
|
| 15 |
-
"""Auth Kaggle API
|
| 16 |
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
| 17 |
-
|
| 18 |
token = os.getenv("KAGGLE_API_TOKEN")
|
| 19 |
if not token:
|
| 20 |
raise ValueError("KAGGLE_API_TOKEN environment variable not set!")
|
| 21 |
|
| 22 |
api = KaggleApi()
|
| 23 |
api.api_token = token
|
| 24 |
-
|
| 25 |
return api
|
| 26 |
|
| 27 |
|
| 28 |
def download_dataset():
|
| 29 |
-
"""Download
|
| 30 |
api = setup_kaggle_api()
|
| 31 |
TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
| 32 |
|
| 33 |
-
|
| 34 |
api.dataset_download_files(DATASET_SLUG, path=str(TEMP_DIR), unzip=True)
|
| 35 |
-
|
| 36 |
return TEMP_DIR
|
| 37 |
|
| 38 |
|
| 39 |
def find_csv_files(temp_dir: Path):
|
| 40 |
-
"""
|
| 41 |
csv_files = list(temp_dir.glob("**/*.csv"))
|
| 42 |
if not csv_files:
|
| 43 |
raise FileNotFoundError("No CSV files found.")
|
|
@@ -45,23 +46,23 @@ def find_csv_files(temp_dir: Path):
|
|
| 45 |
|
| 46 |
|
| 47 |
def load_and_concatenate(csv_files):
|
| 48 |
-
"""Concatenate
|
| 49 |
dfs = []
|
| 50 |
for f in csv_files:
|
| 51 |
-
|
| 52 |
df = pd.read_csv(f)
|
| 53 |
dfs.append(df)
|
| 54 |
return pd.concat(dfs, ignore_index=True)
|
| 55 |
|
| 56 |
|
| 57 |
def save_parquet(df: pd.DataFrame):
|
| 58 |
-
"""Save to Parquet
|
| 59 |
df.to_parquet(OUTPUT_PARQUET, index=False)
|
| 60 |
-
|
| 61 |
|
| 62 |
|
| 63 |
def run_scraper():
|
| 64 |
-
"""Full pipeline."""
|
| 65 |
try:
|
| 66 |
download_dataset()
|
| 67 |
csv_files = find_csv_files(TEMP_DIR)
|
|
|
|
| 1 |
import os
|
| 2 |
+
import logging
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
| 6 |
import shutil
|
| 7 |
|
| 8 |
+
logging.basicConfig(level=logging.INFO)
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
|
| 12 |
TEMP_DIR = Path("data/temp_kaggle")
|
| 13 |
OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def setup_kaggle_api():
|
| 19 |
+
"""Auth Kaggle API."""
|
|
|
|
|
|
|
| 20 |
token = os.getenv("KAGGLE_API_TOKEN")
|
| 21 |
if not token:
|
| 22 |
raise ValueError("KAGGLE_API_TOKEN environment variable not set!")
|
| 23 |
|
| 24 |
api = KaggleApi()
|
| 25 |
api.api_token = token
|
|
|
|
| 26 |
return api
|
| 27 |
|
| 28 |
|
| 29 |
def download_dataset():
|
| 30 |
+
"""Download dataset từ Kaggle."""
|
| 31 |
api = setup_kaggle_api()
|
| 32 |
TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
| 33 |
|
| 34 |
+
logger.info(f"Downloading dataset: {DATASET_SLUG}")
|
| 35 |
api.dataset_download_files(DATASET_SLUG, path=str(TEMP_DIR), unzip=True)
|
| 36 |
+
logger.info("Download complete.")
|
| 37 |
return TEMP_DIR
|
| 38 |
|
| 39 |
|
| 40 |
def find_csv_files(temp_dir: Path):
|
| 41 |
+
"""Tìm tất CSV files."""
|
| 42 |
csv_files = list(temp_dir.glob("**/*.csv"))
|
| 43 |
if not csv_files:
|
| 44 |
raise FileNotFoundError("No CSV files found.")
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
def load_and_concatenate(csv_files):
|
| 49 |
+
"""Concatenate tất CSVs."""
|
| 50 |
dfs = []
|
| 51 |
for f in csv_files:
|
| 52 |
+
logger.info(f"Loading: {f}")
|
| 53 |
df = pd.read_csv(f)
|
| 54 |
dfs.append(df)
|
| 55 |
return pd.concat(dfs, ignore_index=True)
|
| 56 |
|
| 57 |
|
| 58 |
def save_parquet(df: pd.DataFrame):
|
| 59 |
+
"""Save to Parquet."""
|
| 60 |
df.to_parquet(OUTPUT_PARQUET, index=False)
|
| 61 |
+
logger.info(f"Saved to: {OUTPUT_PARQUET}")
|
| 62 |
|
| 63 |
|
| 64 |
def run_scraper():
|
| 65 |
+
"""Full pipeline: download Kaggle → save parquet."""
|
| 66 |
try:
|
| 67 |
download_dataset()
|
| 68 |
csv_files = find_csv_files(TEMP_DIR)
|
requirements.txt
CHANGED
|
@@ -1,7 +1,4 @@
Before:
| 1 |   | fastapi==0.109.2
| 2 | - | uvicorn
| 3 | - | pandas
| 4 | - | kaggle
| 5 | - | pyarrow==18.0.0
| 6 | - | apscheduler==3.11.0
| 7 | - | httpx
After:
| 1 |   | fastapi==0.109.2
| 2 | + | uvicorn
| 3 | + | pandas
| 4 | + | kaggle