Vincentran committed on
Commit
03da54f
·
1 Parent(s): 03b47cd

Upload E-Commerce Product Intelligence Dashboard

Browse files
Files changed (2) hide show
  1. app.py +21 -20
  2. backend/scraper.py +1 -27
app.py CHANGED
@@ -4,63 +4,60 @@ from fastapi import FastAPI
4
  from fastapi.staticfiles import StaticFiles
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from pathlib import Path
7
- from huggingface_hub import hf_hub_download
8
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
  app = FastAPI(title="E-Commerce Product Intelligence Platform")
13
 
14
- HF_DATASET_REPO = "Vincentran/ecommerce-dataset"
15
- HF_CSV_FILENAME = "ecommerce_products.csv"
16
  LOCAL_CSV_PATH = Path("data/ecommerce_products.csv")
17
 
18
- def load_data():
19
- """Load CSV từ HF Dataset."""
20
- data_dir = Path("data")
21
- data_dir.mkdir(parents=True, exist_ok=True)
22
 
 
 
23
  if not LOCAL_CSV_PATH.exists():
24
- logger.info(f"Downloading CSV from HF Dataset: {HF_DATASET_REPO}")
25
- local_path = hf_hub_download(
26
- repo_id=HF_DATASET_REPO,
27
- filename=HF_CSV_FILENAME,
28
- repo_type="dataset",
29
- cache_dir=str(data_dir)
30
- )
31
- logger.info(f"Downloaded to: {local_path}")
32
 
33
  logger.info(f"Loading CSV from: {LOCAL_CSV_PATH}")
34
  return pd.read_csv(LOCAL_CSV_PATH)
35
 
 
36
  @app.get("/")
37
  def root():
38
  return {"status": "E-Commerce Product Intelligence API is running"}
39
 
 
40
  @app.get("/data")
41
  def get_data():
42
  df = load_data()
43
  return df.head(200).to_dict("records")
44
 
 
45
  @app.get("/stats/categories")
46
  def stats_categories():
47
  df = load_data()
48
  return df["category"].value_counts().head(10).to_dict()
49
 
 
50
  @app.get("/stats/brands")
51
  def stats_brands():
52
  df = load_data()
53
  return df["brand"].value_counts().head(10).to_dict()
54
 
 
55
  @app.get("/stats/price")
56
  def stats_price():
57
  df = load_data()
58
- return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict("records")
 
 
59
 
60
  @app.get("/stats/rating")
61
  def stats_rating():
62
  df = load_data()
63
- return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict("records")
 
 
64
 
65
  @app.get("/insights")
66
  def insights():
@@ -73,25 +70,28 @@ def insights():
73
  "avg_rating": df["rating"].mean(),
74
  })
75
 
 
76
  @app.get("/search")
77
  def search(query: str):
78
  df = load_data()
79
  q = query.lower()
80
  mask = (
81
- df["title"].str.contains(q, case=False, na=False) |
82
- df["description"].str.contains(q, case=False, na=False)
83
  )
84
  return df[mask].head(100).to_dict("records")
85
 
 
86
  @app.get("/recommend")
87
  def recommend(category: str):
88
  df = load_data()
89
  subset = df[df["category"] == category]
90
  return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
91
 
 
92
  @app.post("/run-scraper")
93
  def trigger_scraper():
94
- """Trigger upload CSV lên HF Dataset."""
95
  import subprocess
96
  result = subprocess.run(["python", "backend/scraper.py"], capture_output=True, text=True)
97
  if result.returncode == 0:
@@ -99,6 +99,7 @@ def trigger_scraper():
99
  else:
100
  return {"status": "Scraper failed", "error": result.stderr}
101
 
 
102
  frontend_dir = Path("frontend")
103
  if frontend_dir.exists():
104
  app.mount("/", StaticFiles(directory=str(frontend), html=True), name="frontend")
 
4
  from fastapi.staticfiles import StaticFiles
5
  from fastapi.responses import HTMLResponse, JSONResponse
6
  from pathlib import Path
 
7
 
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger(__name__)
10
 
11
  app = FastAPI(title="E-Commerce Product Intelligence Platform")
12
 
 
 
13
  LOCAL_CSV_PATH = Path("data/ecommerce_products.csv")
14
 
 
 
 
 
15
 
16
def load_data():
    """Load the product CSV from local disk.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for ``LOCAL_CSV_PATH``.

    Raises:
        FileNotFoundError: If ``LOCAL_CSV_PATH`` does not exist.
    """
    csv_is_missing = not LOCAL_CSV_PATH.exists()
    if csv_is_missing:
        raise FileNotFoundError(f"CSV not found: {LOCAL_CSV_PATH}")

    logger.info(f"Loading CSV from: {LOCAL_CSV_PATH}")
    frame = pd.read_csv(LOCAL_CSV_PATH)
    return frame
23
 
24
+
25
  @app.get("/")
26
  def root():
27
  return {"status": "E-Commerce Product Intelligence API is running"}
28
 
29
+
30
  @app.get("/data")
31
  def get_data():
32
  df = load_data()
33
  return df.head(200).to_dict("records")
34
 
35
+
36
  @app.get("/stats/categories")
37
  def stats_categories():
38
  df = load_data()
39
  return df["category"].value_counts().head(10).to_dict()
40
 
41
+
42
  @app.get("/stats/brands")
43
  def stats_brands():
44
  df = load_data()
45
  return df["brand"].value_counts().head(10).to_dict()
46
 
47
+
48
  @app.get("/stats/price")
49
  def stats_price():
50
  df = load_data()
51
+ return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
52
+ "records")
53
+
54
 
55
  @app.get("/stats/rating")
56
  def stats_rating():
57
  df = load_data()
58
+ return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict(
59
+ "records")
60
+
61
 
62
  @app.get("/insights")
63
  def insights():
 
70
  "avg_rating": df["rating"].mean(),
71
  })
72
 
73
+
74
  @app.get("/search")
75
  def search(query: str):
76
  df = load_data()
77
  q = query.lower()
78
  mask = (
79
+ df["title"].str.contains(q, case=False, na=False) |
80
+ df["description"].str.contains(q, case=False, na=False)
81
  )
82
  return df[mask].head(100).to_dict("records")
83
 
84
+
85
  @app.get("/recommend")
86
  def recommend(category: str):
87
  df = load_data()
88
  subset = df[df["category"] == category]
89
  return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
90
 
91
+
92
  @app.post("/run-scraper")
93
  def trigger_scraper():
94
+ """Trigger download Kaggle save CSV."""
95
  import subprocess
96
  result = subprocess.run(["python", "backend/scraper.py"], capture_output=True, text=True)
97
  if result.returncode == 0:
 
99
  else:
100
  return {"status": "Scraper failed", "error": result.stderr}
101
 
102
+
103
  frontend_dir = Path("frontend")
104
  if frontend_dir.exists():
105
  app.mount("/", StaticFiles(directory=str(frontend), html=True), name="frontend")
backend/scraper.py CHANGED
@@ -3,7 +3,6 @@ import logging
3
  import pandas as pd
4
  from pathlib import Path
5
  import shutil
6
- from huggingface_hub import upload_file
7
 
8
  # Set Kaggle env vars BEFORE importing Kaggle
9
  token = os.getenv("KAGGLE_API_TOKEN")
@@ -12,17 +11,12 @@ if token:
12
  os.environ['KAGGLE_KEY'] = token_value
13
  os.environ['KAGGLE_USERNAME'] = 'johnsontrann'
14
 
15
- # Lấy HF_TOKEN từ Space (mặc định có sẵn)
16
- HF_TOKEN = os.getenv("HF_TOKEN")
17
-
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
22
  TEMP_DIR = Path("data/temp_kaggle")
23
  OUTPUT_CSV = Path("data/ecommerce_products.csv")
24
- HF_DATASET_REPO = "Vincentran/ecommerce-dataset"
25
- HF_CSV_FILENAME = "ecommerce_products.csv"
26
 
27
  os.makedirs("data", exist_ok=True)
28
 
@@ -71,33 +65,13 @@ def save_csv(df: pd.DataFrame):
71
  logger.info(f"Saved to: {OUTPUT_CSV}")
72
 
73
 
74
- def upload_to_hf():
75
- """Upload CSV lên HF Dataset."""
76
- if not OUTPUT_CSV.exists():
77
- raise FileNotFoundError(f"CSV not found: {OUTPUT_CSV}")
78
-
79
- if not HF_TOKEN:
80
- logger.warning("HF_TOKEN not found, trying without authentication...")
81
-
82
- logger.info(f"Uploading CSV to HF Dataset: {HF_DATASET_REPO}")
83
- upload_file(
84
- path_or_fileobj=str(OUTPUT_CSV),
85
- path_in_repo=HF_CSV_FILENAME,
86
- repo_id=HF_DATASET_REPO,
87
- repo_type="dataset",
88
- token=HF_TOKEN if HF_TOKEN else None
89
- )
90
- logger.info("Upload completed successfully.")
91
-
92
-
93
  def run_scraper():
94
- """Full pipeline: download Kaggle → save CSV upload HF Dataset."""
95
  try:
96
  download_dataset()
97
  csv_files = find_csv_files(TEMP_DIR)
98
  df = load_and_concatenate(csv_files)
99
  save_csv(df)
100
- upload_to_hf()
101
  finally:
102
  shutil.rmtree(TEMP_DIR, ignore_errors=True)
103
 
 
3
  import pandas as pd
4
  from pathlib import Path
5
  import shutil
 
6
 
7
  # Set Kaggle env vars BEFORE importing Kaggle
8
  token = os.getenv("KAGGLE_API_TOKEN")
 
11
  os.environ['KAGGLE_KEY'] = token_value
12
  os.environ['KAGGLE_USERNAME'] = 'johnsontrann'
13
 
 
 
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
18
  TEMP_DIR = Path("data/temp_kaggle")
19
  OUTPUT_CSV = Path("data/ecommerce_products.csv")
 
 
20
 
21
  os.makedirs("data", exist_ok=True)
22
 
 
65
  logger.info(f"Saved to: {OUTPUT_CSV}")
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def run_scraper():
69
+ """Full pipeline: download Kaggle → save CSV (không upload HF)."""
70
  try:
71
  download_dataset()
72
  csv_files = find_csv_files(TEMP_DIR)
73
  df = load_and_concatenate(csv_files)
74
  save_csv(df)
 
75
  finally:
76
  shutil.rmtree(TEMP_DIR, ignore_errors=True)
77