Spaces:
Runtime error
Runtime error
Commit ·
e48c3a5
1
Parent(s): 019d08d
Upload E-Commerce Product Intelligence Dashboard
Browse files- backend/scheduler.py +6 -1
- backend/scraper.py +23 -4
backend/scheduler.py
CHANGED
|
@@ -1,20 +1,24 @@
|
|
| 1 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 2 |
from apscheduler.triggers.cron import CronTrigger
|
| 3 |
-
from backend.scraper import run_scraper
|
| 4 |
import logging
|
| 5 |
|
| 6 |
logger = logging.getLogger("scheduler")
|
| 7 |
|
| 8 |
scheduler = BackgroundScheduler()
|
| 9 |
|
|
|
|
| 10 |
def scheduled_scraper_job():
|
|
|
|
| 11 |
logger.info("Running scheduled scraper job...")
|
| 12 |
try:
|
|
|
|
|
|
|
| 13 |
run_scraper()
|
| 14 |
logger.info("Scheduled scraper job completed.")
|
| 15 |
except Exception as e:
|
| 16 |
logger.error(f"Scheduled scraper job failed: {e}")
|
| 17 |
|
|
|
|
| 18 |
def init_scheduler():
|
| 19 |
"""Schedule scraper to run daily at 02:00 AM."""
|
| 20 |
scheduler.add_job(
|
|
@@ -26,5 +30,6 @@ def init_scheduler():
|
|
| 26 |
logger.info("Scheduled scraper job added: daily at 02:00 AM")
|
| 27 |
scheduler.start()
|
| 28 |
|
|
|
|
| 29 |
def shutdown_scheduler():
|
| 30 |
scheduler.shutdown()
|
|
|
|
| 1 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 2 |
from apscheduler.triggers.cron import CronTrigger
|
|
|
|
| 3 |
import logging
|
| 4 |
|
| 5 |
logger = logging.getLogger("scheduler")
|
| 6 |
|
| 7 |
scheduler = BackgroundScheduler()
|
| 8 |
|
| 9 |
+
|
| 10 |
def scheduled_scraper_job():
    """Run the scraper pipeline as a scheduled job.

    ``run_scraper`` is imported inside the job instead of at module import
    time, so importing this module does not import ``backend.scraper``.
    Any exception raised by the import or by the scraper is logged and
    swallowed here; it does not propagate out of the job.
    """
    logger.info("Running scheduled scraper job...")
    try:
        # Imported lazily, only when the job actually runs.
        from backend.scraper import run_scraper

        run_scraper()
        logger.info("Scheduled scraper job completed.")
    except Exception as e:
        # logger.exception logs at ERROR level like logger.error, but also
        # records the traceback, which the message text alone does not carry.
        logger.exception("Scheduled scraper job failed: %s", e)
|
| 20 |
|
| 21 |
+
|
| 22 |
def init_scheduler():
|
| 23 |
"""Schedule scraper to run daily at 02:00 AM."""
|
| 24 |
scheduler.add_job(
|
|
|
|
| 30 |
logger.info("Scheduled scraper job added: daily at 02:00 AM")
|
| 31 |
scheduler.start()
|
| 32 |
|
| 33 |
+
|
| 34 |
def shutdown_scheduler():
    """Stop the background scheduler if it is currently running.

    Calling this when the scheduler was never started (or was already shut
    down) is a no-op instead of an error, so it is safe to call from
    application teardown regardless of whether startup succeeded.
    """
    # shutdown() raises on a scheduler that is not running; check first.
    if scheduler.running:
        scheduler.shutdown()
|
backend/scraper.py
CHANGED
|
@@ -1,23 +1,37 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
-
import zipfile
|
| 4 |
from pathlib import Path
|
| 5 |
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
| 6 |
import pandas as pd
|
| 7 |
import shutil
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
|
| 10 |
TEMP_DIR = Path("data/temp_kaggle")
|
| 11 |
OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
|
| 12 |
|
| 13 |
os.makedirs("data", exist_ok=True)
|
| 14 |
|
|
|
|
| 15 |
def setup_kaggle_api():
|
| 16 |
-
"""Auth Kaggle API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
api = KaggleApi()
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
return api
|
| 20 |
|
|
|
|
| 21 |
def download_dataset():
|
| 22 |
"""Download full dataset."""
|
| 23 |
api = setup_kaggle_api()
|
|
@@ -28,6 +42,7 @@ def download_dataset():
|
|
| 28 |
print("Download complete.")
|
| 29 |
return TEMP_DIR
|
| 30 |
|
|
|
|
| 31 |
def find_csv_files(temp_dir: Path):
|
| 32 |
"""Find all CSV files."""
|
| 33 |
csv_files = list(temp_dir.glob("**/*.csv"))
|
|
@@ -35,6 +50,7 @@ def find_csv_files(temp_dir: Path):
|
|
| 35 |
raise FileNotFoundError("No CSV files found.")
|
| 36 |
return csv_files
|
| 37 |
|
|
|
|
| 38 |
def load_and_concatenate(csv_files):
|
| 39 |
"""Concatenate all CSVs."""
|
| 40 |
dfs = []
|
|
@@ -44,11 +60,13 @@ def load_and_concatenate(csv_files):
|
|
| 44 |
dfs.append(df)
|
| 45 |
return pd.concat(dfs, ignore_index=True)
|
| 46 |
|
|
|
|
| 47 |
def save_parquet(df: pd.DataFrame):
|
| 48 |
"""Save to Parquet, overwrite."""
|
| 49 |
df.to_parquet(OUTPUT_PARQUET, index=False)
|
| 50 |
print(f"Saved to: {OUTPUT_PARQUET}")
|
| 51 |
|
|
|
|
| 52 |
def run_scraper():
|
| 53 |
"""Full pipeline."""
|
| 54 |
try:
|
|
@@ -61,5 +79,6 @@ def run_scraper():
|
|
| 61 |
|
| 62 |
return df
|
| 63 |
|
|
|
|
| 64 |
if __name__ == "__main__":
|
| 65 |
run_scraper()
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
|
|
|
| 3 |
from pathlib import Path
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import shutil
|
| 6 |
|
| 7 |
+
# Do not import KaggleApi here!
|
| 8 |
+
# KaggleApi is imported and authenticated only when needed.
|
| 9 |
+
|
| 10 |
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
|
| 11 |
TEMP_DIR = Path("data/temp_kaggle")
|
| 12 |
OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")
|
| 13 |
|
| 14 |
os.makedirs("data", exist_ok=True)
|
| 15 |
|
| 16 |
+
|
| 17 |
def setup_kaggle_api():
    """Build an authenticated Kaggle API client from the environment.

    Returns:
        A ``KaggleApi`` instance on which ``authenticate()`` has been called.

    Raises:
        ValueError: If the ``KAGGLE_API_TOKEN`` environment variable is
            unset or empty.
    """
    # Validate the token before importing kaggle, so a missing token is
    # reported with this message rather than as an import-time failure.
    token = os.getenv("KAGGLE_API_TOKEN")
    if not token:
        raise ValueError("KAGGLE_API_TOKEN environment variable not set!")

    # Imported lazily so that importing this module does not require the
    # kaggle package to be importable.
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()

    # Kept from the previous implementation (token format: KGAT_xxxxx).
    api.api_token = token

    # Setting the attribute alone does not configure the client; the client
    # must be authenticated before any dataset call is made.
    # NOTE(review): authenticating from KAGGLE_API_TOKEN presumably needs a
    # kaggle release with access-token support -- confirm the pinned version.
    api.authenticate()

    return api
|
| 33 |
|
| 34 |
+
|
| 35 |
def download_dataset():
|
| 36 |
"""Download full dataset."""
|
| 37 |
api = setup_kaggle_api()
|
|
|
|
| 42 |
print("Download complete.")
|
| 43 |
return TEMP_DIR
|
| 44 |
|
| 45 |
+
|
| 46 |
def find_csv_files(temp_dir: Path):
|
| 47 |
"""Find all CSV files."""
|
| 48 |
csv_files = list(temp_dir.glob("**/*.csv"))
|
|
|
|
| 50 |
raise FileNotFoundError("No CSV files found.")
|
| 51 |
return csv_files
|
| 52 |
|
| 53 |
+
|
| 54 |
def load_and_concatenate(csv_files):
|
| 55 |
"""Concatenate all CSVs."""
|
| 56 |
dfs = []
|
|
|
|
| 60 |
dfs.append(df)
|
| 61 |
return pd.concat(dfs, ignore_index=True)
|
| 62 |
|
| 63 |
+
|
| 64 |
def save_parquet(df: pd.DataFrame):
    """Write *df* to ``OUTPUT_PARQUET`` without its index, replacing any existing file.

    Args:
        df: The DataFrame to persist.
    """
    # Create the output directory at write time rather than relying on the
    # directory having been created when the module was imported.
    OUTPUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(OUTPUT_PARQUET, index=False)
    print(f"Saved to: {OUTPUT_PARQUET}")
|
| 68 |
|
| 69 |
+
|
| 70 |
def run_scraper():
|
| 71 |
"""Full pipeline."""
|
| 72 |
try:
|
|
|
|
| 79 |
|
| 80 |
return df
|
| 81 |
|
| 82 |
+
|
| 83 |
if __name__ == "__main__":
|
| 84 |
run_scraper()
|