Spaces:
Runtime error
Runtime error
Commit ·
d3157ae
1
Parent(s): 72ea64a
Upload E-Commerce Product Intelligence Dashboard (frontend + backend)
Browse files- backend/scraper.py +44 -60
backend/scraper.py
CHANGED
|
@@ -1,82 +1,66 @@
|
|
| 1 |
-
|
| 2 |
-
import logging
|
| 3 |
-
import pandas as pd
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
import shutil
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
# Kaggle API v2: use the access_token directly
|
| 11 |
-
os.environ['KAGGLE_API_TOKEN'] = token
|
| 12 |
-
os.environ['KAGGLE_USERNAME'] = 'johnsontrann'
|
| 13 |
|
| 14 |
-
logging.basicConfig(level=logging.INFO)
|
| 15 |
-
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
| 20 |
|
| 21 |
-
os.
|
|
|
|
|
|
|
|
|
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
| 27 |
|
| 28 |
-
|
| 29 |
-
api.authenticate()
|
| 30 |
-
return api
|
| 31 |
|
| 32 |
|
| 33 |
def download_dataset():
|
| 34 |
-
"""Download dataset
|
| 35 |
api = setup_kaggle_api()
|
| 36 |
-
TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
return TEMP_DIR
|
| 42 |
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
if not csv_files:
|
| 48 |
-
raise FileNotFoundError("No CSV files found.")
|
| 49 |
-
return csv_files
|
| 50 |
|
| 51 |
-
|
| 52 |
-
def load_and_concatenate(csv_files):
|
| 53 |
-
"""Concatenate all CSVs."""
|
| 54 |
-
dfs = []
|
| 55 |
-
for f in csv_files:
|
| 56 |
-
logger.info(f"Loading: {f}")
|
| 57 |
-
df = pd.read_csv(f)
|
| 58 |
-
dfs.append(df)
|
| 59 |
-
return pd.concat(dfs, ignore_index=True)
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
def save_csv(df: pd.DataFrame):
|
| 63 |
-
"""Save to CSV."""
|
| 64 |
-
df.to_csv(OUTPUT_CSV, index=False)
|
| 65 |
-
logger.info(f"Saved to: {OUTPUT_CSV}")
|
| 66 |
|
| 67 |
|
| 68 |
def run_scraper():
|
| 69 |
-
"""
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
| 77 |
|
| 78 |
-
return df
|
| 79 |
|
|
|
|
|
|
|
|
|
|
| 80 |
|
|
|
|
| 81 |
if __name__ == "__main__":
|
| 82 |
-
|
|
|
|
|
|
| 1 |
+
# backend/scraper.py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
from kaggle.api.kaggle_api_extended import KaggleApi # ❌ XÃI import này
|
|
|
|
|
|
|
|
|
|
| 6 |
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
def setup_kaggle_api():
    """Build a Kaggle API client from the credentials in ``kaggle.json``.

    The credentials file is looked up at
    ``$HOME/.config/kaggle/kaggle.json`` (``/root`` is used when ``HOME`` is
    unset). Its ``username`` and ``key`` values are exported as the
    ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` environment variables, then the
    client's ``authenticate()`` is called so it is ready for requests.

    Returns:
        A ``KaggleApi`` instance on which ``authenticate()`` has been called,
        or ``None`` when the credentials file is missing, unreadable,
        malformed, or lacks a string ``username``/``key``.
    """
    kaggle_json_path = os.path.join(os.environ.get('HOME', '/root'), '.config/kaggle/kaggle.json')

    if not os.path.exists(kaggle_json_path):
        print(f"⚠️ kaggle.json not found at {kaggle_json_path}")
        print("⚠️ Kaggle API will not work. Upload kaggle.json to HF Space secrets.")
        return None

    # Validate both fields before touching the environment so a broken file
    # never leaves a half-configured KAGGLE_* state behind.
    try:
        with open(kaggle_json_path, encoding="utf-8") as f:
            kaggle_credentials = json.load(f)
        username = kaggle_credentials['username']
        key = kaggle_credentials['key']
        if not isinstance(username, str) or not isinstance(key, str):
            raise TypeError("username and key must be strings")
    except (OSError, ValueError, KeyError, TypeError) as exc:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"⚠️ kaggle.json at {kaggle_json_path} is invalid: {exc!r}")
        return None

    os.environ['KAGGLE_USERNAME'] = username
    os.environ['KAGGLE_KEY'] = key

    # A freshly constructed client has not loaded any credentials yet;
    # authenticate() must run before dataset calls are made.
    api = KaggleApi()
    api.authenticate()
    return api
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def download_dataset(dataset_name="jackdaug/ecommerce-products-dataset", dest_dir="data"):
    """Download a Kaggle dataset and unzip it into a local directory.

    Args:
        dataset_name: Kaggle dataset identifier passed to
            ``dataset_download_files``. Defaults to the e-commerce products
            dataset this module was written for.
        dest_dir: Directory that receives the unzipped files; it is created
            when it does not exist. Defaults to ``"data"``.

    Returns:
        ``True`` once the download call has returned, ``False`` when
        ``setup_kaggle_api()`` provides no client. Exceptions raised by the
        Kaggle client itself are not caught here and propagate to the caller.
    """
    api = setup_kaggle_api()
    if api is None:
        print("❌ Kaggle API not available. Skipping download.")
        return False

    os.makedirs(dest_dir, exist_ok=True)

    print(f"📥 Downloading dataset: {dataset_name}")
    api.dataset_download_files(dataset_name, path=dest_dir, unzip=True)
    print("✅ Dataset downloaded!")

    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
def run_scraper():
    """Run the dataset download step and report the outcome.

    Returns:
        ``True`` when ``download_dataset()`` reports success, ``False``
        otherwise. A status line is printed in both cases.
    """
    print("🚀 Running scraper...")

    # Guard clause: handle the failure case first, then fall through to success.
    if not download_dataset():
        print("❌ Scraper failed!")
        return False

    print("✅ Scraper completed successfully!")
    return True
|
| 57 |
|
|
|
|
| 58 |
|
| 59 |
+
# ✅ REMOVED ENTIRELY:
|
| 60 |
+
# if __name__ == "__main__":
|
| 61 |
+
# run_scraper()
|
| 62 |
|
| 63 |
+
# ✅ Keep only:
|
| 64 |
if __name__ == "__main__":
    # This branch runs only when the file is executed as a script, never on
    # import, and it deliberately does not start the scraper. The message
    # therefore says "executed directly" rather than "imported".
    print("📦 scraper.py executed directly (not running automatically)")
    print("📦 Call run_scraper() manually or via API")
|