Vincentran committed on
Commit
d3157ae
·
1 Parent(s): 72ea64a

Upload E-Commerce Product Intelligence Dashboard (frontend + backend)

Browse files
Files changed (1) hide show
  1. backend/scraper.py +44 -60
backend/scraper.py CHANGED
@@ -1,82 +1,66 @@
1
- import os
2
- import logging
3
- import pandas as pd
4
- from pathlib import Path
5
- import shutil
6
 
7
- # Set Kaggle env vars BEFORE importing Kaggle
8
- token = os.getenv("KAGGLE_API_TOKEN")
9
- if token:
10
- # Kaggle API v2: use the access token directly
11
- os.environ['KAGGLE_API_TOKEN'] = token
12
- os.environ['KAGGLE_USERNAME'] = 'johnsontrann'
13
 
14
- logging.basicConfig(level=logging.INFO)
15
- logger = logging.getLogger(__name__)
16
 
17
- DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
18
- TEMP_DIR = Path("data/temp_kaggle")
19
- OUTPUT_CSV = Path("data/ecommerce_products.csv")
 
20
 
21
- os.makedirs("data", exist_ok=True)
 
 
 
22
 
 
 
 
23
 
24
- def setup_kaggle_api():
25
- """Auth Kaggle API."""
26
- from kaggle.api.kaggle_api_extended import KaggleApi
27
 
28
- api = KaggleApi()
29
- api.authenticate()
30
- return api
31
 
32
 
33
  def download_dataset():
34
- """Download dataset từ Kaggle."""
35
  api = setup_kaggle_api()
36
- TEMP_DIR.mkdir(parents=True, exist_ok=True)
37
 
38
- logger.info(f"Downloading dataset: {DATASET_SLUG}")
39
- api.dataset_download_files(DATASET_SLUG, path=str(TEMP_DIR), unzip=True)
40
- logger.info("Download complete.")
41
- return TEMP_DIR
42
 
 
 
 
43
 
44
- def find_csv_files(temp_dir: Path):
45
- """Tìm tất CSV files."""
46
- csv_files = list(temp_dir.glob("**/*.csv"))
47
- if not csv_files:
48
- raise FileNotFoundError("No CSV files found.")
49
- return csv_files
50
 
51
-
52
- def load_and_concatenate(csv_files):
53
- """Concatenate tất CSVs."""
54
- dfs = []
55
- for f in csv_files:
56
- logger.info(f"Loading: {f}")
57
- df = pd.read_csv(f)
58
- dfs.append(df)
59
- return pd.concat(dfs, ignore_index=True)
60
-
61
-
62
- def save_csv(df: pd.DataFrame):
63
- """Save to CSV."""
64
- df.to_csv(OUTPUT_CSV, index=False)
65
- logger.info(f"Saved to: {OUTPUT_CSV}")
66
 
67
 
68
  def run_scraper():
69
- """Full pipeline: download Kaggle → save CSV (không upload HF)."""
70
- try:
71
- download_dataset()
72
- csv_files = find_csv_files(TEMP_DIR)
73
- df = load_and_concatenate(csv_files)
74
- save_csv(df)
75
- finally:
76
- shutil.rmtree(TEMP_DIR, ignore_errors=True)
 
77
 
78
- return df
79
 
 
 
 
80
 
 
81
  if __name__ == "__main__":
82
- run_scraper()
 
 
1
+ # backend/scraper.py
 
 
 
 
2
 
3
+ import os
4
+ import json
5
+ from kaggle.api.kaggle_api_extended import KaggleApi # ❌ XÃI import này
 
 
 
6
 
 
 
7
 
8
+ def setup_kaggle_api():
9
+ """Setup Kaggle API."""
10
+ # Check if kaggle.json exists
11
+ kaggle_json_path = os.path.join(os.environ.get('HOME', '/root'), '.config/kaggle/kaggle.json')
12
 
13
+ if not os.path.exists(kaggle_json_path):
14
+ print(f"⚠️ kaggle.json not found at {kaggle_json_path}")
15
+ print("⚠️ Kaggle API will not work. Upload kaggle.json to HF Space secrets.")
16
+ return None
17
 
18
+ # Setup kaggle
19
+ with open(kaggle_json_path) as f:
20
+ kaggle_credentials = json.load(f)
21
 
22
+ os.environ['KAGGLE_USERNAME'] = kaggle_credentials['username']
23
+ os.environ['KAGGLE_KEY'] = kaggle_credentials['key']
 
24
 
25
+ return KaggleApi()
 
 
26
 
27
 
28
  def download_dataset():
29
+ """Download dataset from Kaggle."""
30
  api = setup_kaggle_api()
 
31
 
32
+ if api is None:
33
+ print("❌ Kaggle API not available. Skipping download.")
34
+ return False
 
35
 
36
+ # Download dataset
37
+ dataset_name = "jackdaug/ecommerce-products-dataset"
38
+ os.makedirs("data", exist_ok=True)
39
 
40
+ print(f"📥 Downloading dataset: {dataset_name}")
41
+ api.dataset_download_files(dataset_name, path="data", unzip=True)
42
+ print("✅ Dataset downloaded!")
 
 
 
43
 
44
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  def run_scraper():
48
+ """Run scraper."""
49
+ print("🚀 Running scraper...")
50
+
51
+ if download_dataset():
52
+ print("✅ Scraper completed successfully!")
53
+ return True
54
+ else:
55
+ print("❌ Scraper failed!")
56
+ return False
57
 
 
58
 
59
+ # ✅ REMOVE ENTIRELY:
60
+ # if __name__ == "__main__":
61
+ # run_scraper()
62
 
63
+ # ✅ Keep only:
64
  if __name__ == "__main__":
65
+ print("📦 scraper.py imported (not running automatically)")
66
+ print("📦 Call run_scraper() manually or via API")