Vincentran committed on
Commit
9058528
·
1 Parent(s): 2c7b3a2

Upload E-Commerce Product Intelligence Dashboard

Browse files
Files changed (4) hide show
  1. Dockerfile +7 -1
  2. app.py +27 -10
  3. backend/scraper.py +16 -15
  4. requirements.txt +3 -6
Dockerfile CHANGED
@@ -7,6 +7,12 @@ RUN pip install --no-cache-dir -r requirements.txt
7
 
8
  COPY . /app
9
 
 
 
 
 
 
 
10
  EXPOSE 8000
11
 
12
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
 
7
 
8
  COPY . /app
9
 
# Create the data directory the scraper writes into
RUN mkdir -p /app/data

# Startup script: run the scraper once, then serve the API.
# printf is used instead of echo because not every shell's echo expands "\n".
# KAGGLE_API_TOKEN is inherited from the container environment at run time.
# "set -e" keeps the original "scraper && uvicorn" semantics; "exec" makes
# uvicorn replace the shell so it receives container stop signals directly.
RUN printf '%s\n' \
    '#!/bin/bash' \
    'set -e' \
    'python backend/scraper.py' \
    'exec uvicorn app:app --host 0.0.0.0 --port 8000' \
    > /app/start.sh && chmod +x /app/start.sh

EXPOSE 8000

CMD ["/app/start.sh"]
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import logging
 
2
  import pandas as pd
3
  from fastapi import FastAPI
4
  from fastapi.staticfiles import StaticFiles
@@ -10,43 +11,56 @@ logger = logging.getLogger(__name__)
10
 
11
  app = FastAPI(title="E-Commerce Product Intelligence Platform")
12
 
13
- # ==================== Load data ====================
 
 
 
14
  def load_data():
15
- """Load parquet."""
16
- parquet_path = Path("data/ecommerce_products.parquet")
17
- if not parquet_path.exists():
18
- raise FileNotFoundError(f"Parquet not found: {parquet_path}")
19
- return pd.read_parquet(parquet_path)
 
 
20
 
21
  # ==================== API Routes ====================
22
  @app.get("/")
23
  def root():
24
  return {"status": "E-Commerce Product Intelligence API is running"}
25
 
 
26
  @app.get("/data")
27
  def get_data():
28
  df = load_data()
29
  return df.head(200).to_dict("records")
30
 
 
31
  @app.get("/stats/categories")
32
  def stats_categories():
33
  df = load_data()
34
  return df["category"].value_counts().head(10).to_dict()
35
 
 
36
  @app.get("/stats/brands")
37
  def stats_brands():
38
  df = load_data()
39
  return df["brand"].value_counts().head(10).to_dict()
40
 
 
41
  @app.get("/stats/price")
42
  def stats_price():
43
  df = load_data()
44
- return df.groupby("category")["price"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict("records")
 
 
45
 
46
  @app.get("/stats/rating")
47
  def stats_rating():
48
  df = load_data()
49
- return df.groupby("category")["rating"].agg(["mean", "median", "min", "max", "count"]).reset_index().to_dict("records")
 
 
50
 
51
  @app.get("/insights")
52
  def insights():
@@ -59,22 +73,25 @@ def insights():
59
  "avg_rating": df["rating"].mean(),
60
  })
61
 
 
62
  @app.get("/search")
63
  def search(query: str):
64
  df = load_data()
65
  q = query.lower()
66
  mask = (
67
- df["title"].str.contains(q, case=False, na=False) |
68
- df["description"].str.contains(q, case=False, na=False)
69
  )
70
  return df[mask].head(100).to_dict("records")
71
 
 
72
  @app.get("/recommend")
73
  def recommend(category: str):
74
  df = load_data()
75
  subset = df[df["category"] == category]
76
  return subset.sort_values("rating", ascending=False).head(10).to_dict("records")
77
 
 
78
  # ==================== Frontend ====================
79
  frontend_dir = Path("frontend")
80
  if frontend_dir.exists():
 
1
  import logging
2
+ import os
3
  import pandas as pd
4
  from fastapi import FastAPI
5
  from fastapi.staticfiles import StaticFiles
 
11
 
12
  app = FastAPI(title="E-Commerce Product Intelligence Platform")
13
 
# ==================== Load data (local parquet) ====================
LOCAL_PARQUET_PATH = Path("data/ecommerce_products.parquet")


def load_data():
    """Read the product dataset from the local parquet file.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` for
        ``LOCAL_PARQUET_PATH``.

    Raises:
        FileNotFoundError: If ``LOCAL_PARQUET_PATH`` does not exist.
    """
    if LOCAL_PARQUET_PATH.exists():
        logger.info(f"Loading parquet from: {LOCAL_PARQUET_PATH}")
        return pd.read_parquet(LOCAL_PARQUET_PATH)

    raise FileNotFoundError(f"Parquet not found: {LOCAL_PARQUET_PATH}")
25
+
26
 
27
  # ==================== API Routes ====================
@app.get("/")
def root():
    """Health-check endpoint reporting that the API is running."""
    status_payload = {"status": "E-Commerce Product Intelligence API is running"}
    return status_payload
31
 
32
+
@app.get("/data")
def get_data():
    """Return the first 200 rows of the dataset as a list of records.

    Missing values are converted to ``None`` (JSON ``null``) because the
    JSON response encoder rejects NaN floats.
    """
    preview = load_data().head(200)
    # Cast to object first so None is kept instead of being coerced back to NaN.
    preview = preview.astype(object).where(preview.notna(), None)
    return preview.to_dict("records")
37
 
38
+
@app.get("/stats/categories")
def stats_categories():
    """Return the ten most frequent ``category`` values with their counts."""
    category_counts = load_data()["category"].value_counts()
    top_ten = category_counts.head(10)
    return top_ten.to_dict()
43
 
44
+
@app.get("/stats/brands")
def stats_brands():
    """Return the ten most frequent ``brand`` values with their counts."""
    brand_counts = load_data()["brand"].value_counts()
    top_ten = brand_counts.head(10)
    return top_ten.to_dict()
49
 
50
+
@app.get("/stats/price")
def stats_price():
    """Summarise ``price`` per category (mean, median, min, max, count) as records."""
    price_by_category = load_data().groupby("category")["price"]
    summary = price_by_category.agg(["mean", "median", "min", "max", "count"])
    return summary.reset_index().to_dict("records")
56
+
57
 
@app.get("/stats/rating")
def stats_rating():
    """Summarise ``rating`` per category (mean, median, min, max, count) as records."""
    rating_by_category = load_data().groupby("category")["rating"]
    summary = rating_by_category.agg(["mean", "median", "min", "max", "count"])
    return summary.reset_index().to_dict("records")
63
+
64
 
65
  @app.get("/insights")
66
  def insights():
 
73
  "avg_rating": df["rating"].mean(),
74
  })
75
 
76
+
@app.get("/search")
def search(query: str):
    """Return up to 100 products whose title or description contains ``query``.

    The match is a case-insensitive *literal* substring test. The query comes
    straight from the request, so it must not be interpreted as a regular
    expression: input such as ``(`` would raise, and ``.`` would match
    everything. Rows with a missing title or description count as
    non-matching for that column.

    Missing values in the result are converted to ``None`` (JSON ``null``)
    because the JSON response encoder rejects NaN floats.
    """
    df = load_data()
    mask = (
        df["title"].str.contains(query, case=False, na=False, regex=False)
        | df["description"].str.contains(query, case=False, na=False, regex=False)
    )
    hits = df[mask].head(100)
    # Cast to object first so None is kept instead of being coerced back to NaN.
    hits = hits.astype(object).where(hits.notna(), None)
    return hits.to_dict("records")
86
 
87
+
@app.get("/recommend")
def recommend(category: str):
    """Return the ten highest-rated products in ``category`` as records.

    Rows are selected by exact equality on the ``category`` column and sorted
    by ``rating`` in descending order. Missing values in the result are
    converted to ``None`` (JSON ``null``) because the JSON response encoder
    rejects NaN floats.
    """
    df = load_data()
    in_category = df[df["category"] == category]
    top_rated = in_category.sort_values("rating", ascending=False).head(10)
    # Cast to object first so None is kept instead of being coerced back to NaN.
    top_rated = top_rated.astype(object).where(top_rated.notna(), None)
    return top_rated.to_dict("records")
93
 
94
+
95
  # ==================== Frontend ====================
96
  frontend_dir = Path("frontend")
97
  if frontend_dir.exists():
backend/scraper.py CHANGED
@@ -1,9 +1,13 @@
1
  import os
2
- import json
3
- from pathlib import Path
4
  import pandas as pd
 
 
5
  import shutil
6
 
 
 
 
7
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
8
  TEMP_DIR = Path("data/temp_kaggle")
9
  OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
@@ -12,32 +16,29 @@ os.makedirs("data", exist_ok=True)
12
 
13
 
14
  def setup_kaggle_api():
15
- """Auth Kaggle API từ environment variable."""
16
- from kaggle.api.kaggle_api_extended import KaggleApi
17
-
18
  token = os.getenv("KAGGLE_API_TOKEN")
19
  if not token:
20
  raise ValueError("KAGGLE_API_TOKEN environment variable not set!")
21
 
22
  api = KaggleApi()
23
  api.api_token = token
24
-
25
  return api
26
 
27
 
28
  def download_dataset():
29
- """Download full dataset."""
30
  api = setup_kaggle_api()
31
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
32
 
33
- print(f"Downloading dataset: {DATASET_SLUG}")
34
  api.dataset_download_files(DATASET_SLUG, path=str(TEMP_DIR), unzip=True)
35
- print("Download complete.")
36
  return TEMP_DIR
37
 
38
 
39
  def find_csv_files(temp_dir: Path):
40
- """Find all CSV files."""
41
  csv_files = list(temp_dir.glob("**/*.csv"))
42
  if not csv_files:
43
  raise FileNotFoundError("No CSV files found.")
@@ -45,23 +46,23 @@ def find_csv_files(temp_dir: Path):
45
 
46
 
47
  def load_and_concatenate(csv_files):
48
- """Concatenate all CSVs."""
49
  dfs = []
50
  for f in csv_files:
51
- print(f"Loading: {f}")
52
  df = pd.read_csv(f)
53
  dfs.append(df)
54
  return pd.concat(dfs, ignore_index=True)
55
 
56
 
57
  def save_parquet(df: pd.DataFrame):
58
- """Save to Parquet, overwrite."""
59
  df.to_parquet(OUTPUT_PARQUET, index=False)
60
- print(f"Saved to: {OUTPUT_PARQUET}")
61
 
62
 
63
  def run_scraper():
64
- """Full pipeline."""
65
  try:
66
  download_dataset()
67
  csv_files = find_csv_files(TEMP_DIR)
 
1
  import os
2
+ import logging
 
3
  import pandas as pd
4
+ from pathlib import Path
5
+ from kaggle.api.kaggle_api_extended import KaggleApi
6
  import shutil
7
 
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
+
11
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
12
  TEMP_DIR = Path("data/temp_kaggle")
13
  OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
 
16
 
17
 
def setup_kaggle_api():
    """Create an authenticated Kaggle API client.

    Returns:
        A ``KaggleApi`` instance on which ``authenticate()`` has been called.

    Raises:
        ValueError: If the ``KAGGLE_API_TOKEN`` environment variable is unset
            or empty.
    """
    token = os.getenv("KAGGLE_API_TOKEN")
    if not token:
        raise ValueError("KAGGLE_API_TOKEN environment variable not set!")

    api = KaggleApi()
    api.api_token = token
    # The client has to be authenticated explicitly before any dataset call.
    # NOTE(review): authenticate() loads credentials from the environment or
    # kaggle.json -- confirm the installed kaggle version honours
    # KAGGLE_API_TOKEN.
    api.authenticate()
    return api
27
 
28
 
def download_dataset():
    """Download and unzip the Kaggle dataset ``DATASET_SLUG`` into ``TEMP_DIR``.

    Returns:
        ``TEMP_DIR``, the directory the files were extracted into.
    """
    client = setup_kaggle_api()
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    target_dir = str(TEMP_DIR)

    logger.info(f"Downloading dataset: {DATASET_SLUG}")
    client.dataset_download_files(DATASET_SLUG, path=target_dir, unzip=True)
    logger.info("Download complete.")
    return TEMP_DIR
38
 
39
 
40
  def find_csv_files(temp_dir: Path):
41
+ """Tìm tất CSV files."""
42
  csv_files = list(temp_dir.glob("**/*.csv"))
43
  if not csv_files:
44
  raise FileNotFoundError("No CSV files found.")
 
46
 
47
 
def load_and_concatenate(csv_files):
    """Read every CSV in ``csv_files`` and stack them into one DataFrame.

    Each path is logged before it is read. The combined frame gets a fresh
    index (``ignore_index=True``).
    """

    def _read_one(csv_path):
        logger.info(f"Loading: {csv_path}")
        return pd.read_csv(csv_path)

    frames = [_read_one(csv_path) for csv_path in csv_files]
    return pd.concat(frames, ignore_index=True)
56
 
57
 
def save_parquet(df: pd.DataFrame):
    """Write ``df`` to ``OUTPUT_PARQUET`` without its index, then log the path."""
    destination = OUTPUT_PARQUET
    df.to_parquet(destination, index=False)
    logger.info(f"Saved to: {destination}")
62
 
63
 
64
  def run_scraper():
65
+ """Full pipeline: download Kaggle → save parquet."""
66
  try:
67
  download_dataset()
68
  csv_files = find_csv_files(TEMP_DIR)
requirements.txt CHANGED
@@ -1,7 +1,4 @@
1
  fastapi==0.109.2
2
- uvicorn[standard]==0.27.1
3
- pandas==2.2.0
4
- kaggle==1.6.17
5
- pyarrow==18.0.0
6
- apscheduler==3.11.0
7
- httpx
 
fastapi==0.109.2
uvicorn
pandas
kaggle
# pandas needs a parquet engine for read_parquet / to_parquet
pyarrow