Vincentran committed on
Commit
4332540
·
1 Parent(s): d939d66

Upload E-Commerce Product Intelligence Dashboard

Browse files
Files changed (3) hide show
  1. app.py +35 -20
  2. backend/scraper.py +20 -1
  3. requirements.txt +1 -1
app.py CHANGED
@@ -1,66 +1,73 @@
1
  import logging
2
- import os
3
  import pandas as pd
4
  from fastapi import FastAPI
5
  from fastapi.staticfiles import StaticFiles
6
  from fastapi.responses import HTMLResponse, JSONResponse
7
  from pathlib import Path
 
8
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
  app = FastAPI(title="E-Commerce Product Intelligence Platform")
13
 
14
- # ==================== Load data (local CSV) ====================
 
 
15
  LOCAL_CSV_PATH = Path("data/ecommerce_products.csv")
16
 
17
-
18
  def load_data():
19
- """Load CSV từ local."""
 
 
 
20
  if not LOCAL_CSV_PATH.exists():
21
- raise FileNotFoundError(f"CSV not found: {LOCAL_CSV_PATH}")
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  logger.info(f"Loading CSV from: {LOCAL_CSV_PATH}")
24
  return pd.read_csv(LOCAL_CSV_PATH)
25
 
26
-
27
  # ==================== API Routes ====================
28
  @app.get("/")
29
  def root():
30
  return {"status": "E-Commerce Product Intelligence API is running"}
31
 
32
-
33
  @app.get("/data")
34
  def get_data():
35
  df = load_data()
36
  return df.head(200).to_dict("records")
37
 
38
-
39
  @app.get("/stats/categories")
40
  def stats_categories():
41
  df = load_data()
42
  return df["category"].value_counts().head(10).to_dict()
43
 
44
-
45
  @app.get("/stats/brands")
46
  def stats_brands():
47
  df = load_data()
48
  return df["brand"].value_counts().head(10).to_dict()
49
 
50
-
51
  @app.get("/stats/price")
52
  def stats_price():
53
  df = load_data()
54
- return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
55
- "records")
56
-
57
 
58
  @app.get("/stats/rating")
59
  def stats_rating():
60
  df = load_data()
61
- return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
62
- "records")
63
-
64
 
65
  @app.get("/insights")
66
  def insights():
@@ -73,24 +80,32 @@ def insights():
73
  "avg_rating": df["rating"].mean(),
74
  })
75
 
76
-
77
  @app.get("/search")
78
  def search(query: str):
79
  df = load_data()
80
  q = query.lower()
81
  mask = (
82
- df["title"].str.contains(q, case=False, na=False) |
83
- df["description"].str.contains(q, case=False, na=False)
84
  )
85
  return df[mask].head(100).to_dict("records")
86
 
87
-
88
  @app.get("/recommend")
89
  def recommend(category: str):
90
  df = load_data()
91
  subset = df[df["category"] == category]
92
  return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
93
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  # ==================== Frontend ====================
96
  frontend_dir = Path("frontend")
 
1
  import logging
 
2
  import pandas as pd
3
  from fastapi import FastAPI
4
  from fastapi.staticfiles import StaticFiles
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from pathlib import Path
7
+ from huggingface_hub import hf_hub_download
8
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
  app = FastAPI(title="E-Commerce Product Intelligence Platform")
13
 
14
+ # ==================== HF Dataset Config ====================
15
+ HF_DATASET_REPO = "Vincentran/ecommerce-dataset"
16
+ HF_CSV_FILENAME = "ecommerce_products.csv"
17
  LOCAL_CSV_PATH = Path("data/ecommerce_products.csv")
18
 
# ==================== Load data from HF ====================
def load_data():
    """Load the product CSV, fetching it from the HF dataset when missing.

    The CSV is expected at ``LOCAL_CSV_PATH``. When it is not there yet it is
    downloaded from ``HF_DATASET_REPO`` into the ``data`` directory.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV.

    Raises:
        FileNotFoundError: If the CSV is absent locally and the download fails.
    """
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)

    csv_path = LOCAL_CSV_PATH
    if not csv_path.exists():
        logger.info(f"Downloading CSV from HF Dataset: {HF_DATASET_REPO}")
        try:
            # Use local_dir rather than cache_dir: cache_dir keeps the file in
            # the hub's nested cache layout, so it would never appear at
            # LOCAL_CSV_PATH and the read below would fail.
            local_path = hf_hub_download(
                repo_id=HF_DATASET_REPO,
                filename=HF_CSV_FILENAME,
                repo_type="dataset",
                local_dir=str(data_dir),
            )
        except Exception as e:
            logger.error(f"Failed to download CSV: {e}")
            raise FileNotFoundError(f"CSV not found on HF: {e}") from e
        logger.info(f"Downloaded to: {local_path}")
        # Read exactly what the hub client reports it wrote.
        csv_path = Path(local_path)

    logger.info(f"Loading CSV from: {csv_path}")
    return pd.read_csv(csv_path)
41
 
 
42
  # ==================== API Routes ====================
@app.get("/")
def root():
    """Return a fixed status payload showing the API is up."""
    status_message = "E-Commerce Product Intelligence API is running"
    return {"status": status_message}
46
 
 
@app.get("/data")
def get_data():
    """Return the first 200 rows of the dataset as a list of row dicts."""
    preview = load_data().head(200)
    return preview.to_dict("records")
51
 
 
@app.get("/stats/categories")
def stats_categories():
    """Return the ten most frequent ``category`` values mapped to their counts."""
    category_counts = load_data()["category"].value_counts()
    return category_counts.head(10).to_dict()
56
 
 
@app.get("/stats/brands")
def stats_brands():
    """Return the ten most frequent ``brand`` values mapped to their counts."""
    brand_counts = load_data()["brand"].value_counts()
    return brand_counts.head(10).to_dict()
61
 
 
@app.get("/stats/price")
def stats_price():
    """Summarise ``price`` per category: mean, median, min, max and count."""
    summary_fns = ["mean", "median", "min", "max", "count"]
    per_category = load_data().groupby("category")["price"].agg(summary_fns)
    return per_category.reset_index().to_dict("records")
 
 
66
 
@app.get("/stats/rating")
def stats_rating():
    """Summarise ``rating`` per category: mean, median, min, max and count."""
    summary_fns = ["mean", "median", "min", "max", "count"]
    per_category = load_data().groupby("category")["rating"].agg(summary_fns)
    return per_category.reset_index().to_dict("records")
 
 
71
 
72
  @app.get("/insights")
73
  def insights():
 
80
  "avg_rating": df["rating"].mean(),
81
  })
82
 
 
@app.get("/search")
def search(query: str):
    """Return up to 100 products whose title or description contains ``query``.

    Matching is case-insensitive and literal: the query is compared as plain
    text, not as a regular expression. Rows with a missing title or
    description are treated as non-matching for that column.

    Args:
        query: Text to look for.

    Returns:
        A list of row dicts, at most 100 entries long.
    """
    df = load_data()
    # regex=False keeps user input from being parsed as a pattern; otherwise
    # queries such as "(" or "c++" raise re.error or match the wrong rows.
    mask = (
        df["title"].str.contains(query, case=False, na=False, regex=False)
        | df["description"].str.contains(query, case=False, na=False, regex=False)
    )
    return df[mask].head(100).to_dict("records")
92
 
 
@app.get("/recommend")
def recommend(category: str):
    """Return the ten highest-rated products whose category equals ``category``."""
    products = load_data()
    in_category = products[products["category"] == category]
    ranked = in_category.sort_values("rating", ascending=False)
    return ranked.head(10).to_dict("records")
98
 
# ==================== Scraper Trigger ====================
@app.post("/run-scraper")
def trigger_scraper():
    """Run ``backend/scraper.py`` in a child process and report the outcome.

    Returns:
        dict: ``{"status": ..., "output": <stdout>}`` when the script exits
        with code 0, otherwise ``{"status": ..., "error": <stderr>}``.
    """
    import subprocess
    import sys

    # NOTE(review): this route has no visible access control, so any client
    # can start the scraper — confirm that is intended.
    # sys.executable runs the script with the same interpreter (and installed
    # packages) as this app instead of whatever "python" resolves to on PATH.
    result = subprocess.run(
        [sys.executable, "backend/scraper.py"],
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        return {"status": "Scraper completed successfully", "output": result.stdout}
    return {"status": "Scraper failed", "error": result.stderr}
109
 
110
  # ==================== Frontend ====================
111
  frontend_dir = Path("frontend")
backend/scraper.py CHANGED
@@ -3,6 +3,7 @@ import logging
3
  import pandas as pd
4
  from pathlib import Path
5
  import shutil
 
6
 
7
  # Set Kaggle env vars TRƯỚC khi import Kaggle
8
  token = os.getenv("KAGGLE_API_TOKEN")
@@ -17,6 +18,8 @@ logger = logging.getLogger(__name__)
17
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
18
  TEMP_DIR = Path("data/temp_kaggle")
19
  OUTPUT_CSV = Path("data/ecommerce_products.csv")
 
 
20
 
21
  os.makedirs("data", exist_ok=True)
22
 
@@ -65,13 +68,29 @@ def save_csv(df: pd.DataFrame):
65
  logger.info(f"Saved to: {OUTPUT_CSV}")
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def run_scraper():
69
- """Full pipeline: download Kaggle → save CSV."""
70
  try:
71
  download_dataset()
72
  csv_files = find_csv_files(TEMP_DIR)
73
  df = load_and_concatenate(csv_files)
74
  save_csv(df)
 
75
  finally:
76
  shutil.rmtree(TEMP_DIR, ignore_errors=True)
77
 
 
3
  import pandas as pd
4
  from pathlib import Path
5
  import shutil
6
+ from huggingface_hub import upload_file
7
 
8
  # Set Kaggle env vars TRƯỚC khi import Kaggle
9
  token = os.getenv("KAGGLE_API_TOKEN")
 
18
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
19
  TEMP_DIR = Path("data/temp_kaggle")
20
  OUTPUT_CSV = Path("data/ecommerce_products.csv")
21
+ HF_DATASET_REPO = "Vincentran/ecommerce-dataset"
22
+ HF_CSV_FILENAME = "ecommerce_products.csv"
23
 
24
  os.makedirs("data", exist_ok=True)
25
 
 
68
  logger.info(f"Saved to: {OUTPUT_CSV}")
69
 
70
 
def upload_to_hf():
    """Upload the CSV at ``OUTPUT_CSV`` to the HF dataset repository.

    Raises:
        FileNotFoundError: If ``OUTPUT_CSV`` does not exist.
    """
    if not OUTPUT_CSV.exists():
        raise FileNotFoundError(f"CSV not found: {OUTPUT_CSV}")

    logger.info(f"Uploading CSV to HF Dataset: {HF_DATASET_REPO}")
    upload_kwargs = {
        "path_or_fileobj": str(OUTPUT_CSV),
        "path_in_repo": HF_CSV_FILENAME,
        "repo_id": HF_DATASET_REPO,
        "repo_type": "dataset",
    }
    upload_file(**upload_kwargs)
    logger.info("Upload completed successfully.")
84
+
85
+
def run_scraper():
    """Full pipeline: download Kaggle → save CSV → upload HF Dataset.

    ``TEMP_DIR`` is removed afterwards whether or not a step raised.
    """
    try:
        download_dataset()
        combined = load_and_concatenate(find_csv_files(TEMP_DIR))
        save_csv(combined)
        upload_to_hf()
    finally:
        shutil.rmtree(TEMP_DIR, ignore_errors=True)
96
 
requirements.txt CHANGED
@@ -2,4 +2,4 @@ fastapi==0.109.2
2
  uvicorn
3
  pandas
4
  kaggle
5
- pyarrow
 
2
  uvicorn
3
  pandas
4
  kaggle
5
+ huggingface_hub