Vincentran committed on
Commit
c5afea2
·
1 Parent(s): 9058528

Upload E-Commerce Product Intelligence Dashboard

Browse files
Files changed (1) hide show
  1. backend/scraper.py +12 -15
backend/scraper.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import logging
3
  import pandas as pd
4
  from pathlib import Path
5
- from kaggle.api.kaggle_api_extended import KaggleApi
6
  import shutil
7
 
8
  logging.basicConfig(level=logging.INFO)
@@ -15,24 +15,21 @@ OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
15
  os.makedirs("data", exist_ok=True)
16
 
17
 
18
- def setup_kaggle_api():
19
- """Auth Kaggle API."""
20
- token = os.getenv("KAGGLE_API_TOKEN")
21
- if not token:
22
- raise ValueError("KAGGLE_API_TOKEN environment variable not set!")
23
-
24
- api = KaggleApi()
25
- api.api_token = token
26
- return api
27
-
28
-
29
  def download_dataset():
30
- """Download dataset từ Kaggle."""
31
- api = setup_kaggle_api()
32
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
33
 
34
  logger.info(f"Downloading dataset: {DATASET_SLUG}")
35
- api.dataset_download_files(DATASET_SLUG, path=str(TEMP_DIR), unzip=True)
 
 
 
 
 
 
 
 
 
36
  logger.info("Download complete.")
37
  return TEMP_DIR
38
 
 
2
  import logging
3
  import pandas as pd
4
  from pathlib import Path
5
+ import subprocess
6
  import shutil
7
 
8
  logging.basicConfig(level=logging.INFO)
 
15
  os.makedirs("data", exist_ok=True)
16
 
17
 
 
 
 
 
 
 
 
 
 
 
 
18
  def download_dataset():
19
+ """Download dataset từ Kaggle bằng CLI."""
 
20
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
21
 
22
  logger.info(f"Downloading dataset: {DATASET_SLUG}")
23
+
24
+ result = subprocess.run(
25
+ ["kaggle", "datasets", "download", "-d", DATASET_SLUG, "-p", str(TEMP_DIR), "-u"],
26
+ capture_output=True, text=True
27
+ )
28
+
29
+ if result.returncode != 0:
30
+ logger.error(f"Download failed: {result.stderr}")
31
+ raise RuntimeError(f"Kaggle download failed: {result.stderr}")
32
+
33
  logger.info("Download complete.")
34
  return TEMP_DIR
35