Vincentran committed on
Commit
e48c3a5
·
1 Parent(s): 019d08d

Upload E-Commerce Product Intelligence Dashboard

Browse files
Files changed (2) hide show
  1. backend/scheduler.py +6 -1
  2. backend/scraper.py +23 -4
backend/scheduler.py CHANGED
@@ -1,20 +1,24 @@
1
  from apscheduler.schedulers.background import BackgroundScheduler
2
  from apscheduler.triggers.cron import CronTrigger
3
- from backend.scraper import run_scraper
4
  import logging
5
 
6
  logger = logging.getLogger("scheduler")
7
 
8
  scheduler = BackgroundScheduler()
9
 
 
10
  def scheduled_scraper_job():
 
11
  logger.info("Running scheduled scraper job...")
12
  try:
 
 
13
  run_scraper()
14
  logger.info("Scheduled scraper job completed.")
15
  except Exception as e:
16
  logger.error(f"Scheduled scraper job failed: {e}")
17
 
 
18
  def init_scheduler():
19
  """Schedule scraper to run daily at 02:00 AM."""
20
  scheduler.add_job(
@@ -26,5 +30,6 @@ def init_scheduler():
26
  logger.info("Scheduled scraper job added: daily at 02:00 AM")
27
  scheduler.start()
28
 
 
29
  def shutdown_scheduler():
30
  scheduler.shutdown()
 
1
  from apscheduler.schedulers.background import BackgroundScheduler
2
  from apscheduler.triggers.cron import CronTrigger
 
3
  import logging
4
 
5
  logger = logging.getLogger("scheduler")
6
 
7
  scheduler = BackgroundScheduler()
8
 
9
+
10
  def scheduled_scraper_job():
11
+ """Chạy scraper trong job."""
12
  logger.info("Running scheduled scraper job...")
13
  try:
14
+ # Import dynamic khi cần
15
+ from backend.scraper import run_scraper
16
  run_scraper()
17
  logger.info("Scheduled scraper job completed.")
18
  except Exception as e:
19
  logger.error(f"Scheduled scraper job failed: {e}")
20
 
21
+
22
  def init_scheduler():
23
  """Schedule scraper to run daily at 02:00 AM."""
24
  scheduler.add_job(
 
30
  logger.info("Scheduled scraper job added: daily at 02:00 AM")
31
  scheduler.start()
32
 
33
+
34
  def shutdown_scheduler():
35
  scheduler.shutdown()
backend/scraper.py CHANGED
@@ -1,23 +1,37 @@
1
  import os
2
  import json
3
- import zipfile
4
  from pathlib import Path
5
- from kaggle.api.kaggle_api_extended import KaggleApi
6
  import pandas as pd
7
  import shutil
8
 
 
 
 
9
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
10
  TEMP_DIR = Path("data/temp_kaggle")
11
  OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
12
 
13
  os.makedirs("data", exist_ok=True)
14
 
 
15
  def setup_kaggle_api():
16
- """Auth Kaggle API from environment or file."""
 
 
 
 
 
 
17
  api = KaggleApi()
18
- api.authenticate()
 
 
 
 
 
19
  return api
20
 
 
21
  def download_dataset():
22
  """Download full dataset."""
23
  api = setup_kaggle_api()
@@ -28,6 +42,7 @@ def download_dataset():
28
  print("Download complete.")
29
  return TEMP_DIR
30
 
 
31
  def find_csv_files(temp_dir: Path):
32
  """Find all CSV files."""
33
  csv_files = list(temp_dir.glob("**/*.csv"))
@@ -35,6 +50,7 @@ def find_csv_files(temp_dir: Path):
35
  raise FileNotFoundError("No CSV files found.")
36
  return csv_files
37
 
 
38
  def load_and_concatenate(csv_files):
39
  """Concatenate all CSVs."""
40
  dfs = []
@@ -44,11 +60,13 @@ def load_and_concatenate(csv_files):
44
  dfs.append(df)
45
  return pd.concat(dfs, ignore_index=True)
46
 
 
47
  def save_parquet(df: pd.DataFrame):
48
  """Save to Parquet, overwrite."""
49
  df.to_parquet(OUTPUT_PARQUET, index=False)
50
  print(f"Saved to: {OUTPUT_PARQUET}")
51
 
 
52
  def run_scraper():
53
  """Full pipeline."""
54
  try:
@@ -61,5 +79,6 @@ def run_scraper():
61
 
62
  return df
63
 
 
64
  if __name__ == "__main__":
65
  run_scraper()
 
1
  import os
2
  import json
 
3
  from pathlib import Path
 
4
  import pandas as pd
5
  import shutil
6
 
7
+ # Do not import KaggleApi here!
8
+ # It is imported and authenticated only when needed.
9
+
10
  DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
11
  TEMP_DIR = Path("data/temp_kaggle")
12
  OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
13
 
14
  os.makedirs("data", exist_ok=True)
15
 
16
+
17
  def setup_kaggle_api():
18
+ """Auth Kaggle API từ environment variable."""
19
+ from kaggle.api.kaggle_api_extended import KaggleApi
20
+
21
+ token = os.getenv("KAGGLE_API_TOKEN")
22
+ if not token:
23
+ raise ValueError("KAGGLE_API_TOKEN environment variable not set!")
24
+
25
  api = KaggleApi()
26
+
27
+ # Tự authenticate từ token
28
+ # Token format: KGAT_xxxxx
29
+ # Kaggle cần: username + key
30
+ api.api_token = token
31
+
32
  return api
33
 
34
+
35
  def download_dataset():
36
  """Download full dataset."""
37
  api = setup_kaggle_api()
 
42
  print("Download complete.")
43
  return TEMP_DIR
44
 
45
+
46
  def find_csv_files(temp_dir: Path):
47
  """Find all CSV files."""
48
  csv_files = list(temp_dir.glob("**/*.csv"))
 
50
  raise FileNotFoundError("No CSV files found.")
51
  return csv_files
52
 
53
+
54
  def load_and_concatenate(csv_files):
55
  """Concatenate all CSVs."""
56
  dfs = []
 
60
  dfs.append(df)
61
  return pd.concat(dfs, ignore_index=True)
62
 
63
+
64
  def save_parquet(df: pd.DataFrame):
65
  """Save to Parquet, overwrite."""
66
  df.to_parquet(OUTPUT_PARQUET, index=False)
67
  print(f"Saved to: {OUTPUT_PARQUET}")
68
 
69
+
70
  def run_scraper():
71
  """Full pipeline."""
72
  try:
 
79
 
80
  return df
81
 
82
+
83
  if __name__ == "__main__":
84
  run_scraper()