Vincentran committed on
Commit
019d08d
·
1 Parent(s): e1a120a

Upload E-Commerce Product Intelligence Dashboard

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ data/
3
+ __pycache__/
4
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt /app/requirements.txt
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . /app
9
+
10
+ EXPOSE 8000
11
+
12
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from contextlib import asynccontextmanager
3
+ from fastapi import FastAPI
4
+ from fastapi.staticfiles import StaticFiles
5
+ from fastapi.responses import HTMLResponse
6
+ from pathlib import Path
7
+ from backend.api import app as api_app
8
+ from backend.scheduler import init_scheduler, shutdown_scheduler
9
+
10
# Configure root logging at import so the startup/shutdown messages emitted
# by the lifespan handler are visible at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
12
+
13
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Start the background scheduler on startup and stop it on shutdown.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        Control to the running application between startup and shutdown.
    """
    logger.info("Starting app...")
    init_scheduler()
    try:
        yield
    finally:
        # Run the shutdown even if the application exits via an exception or
        # cancellation, so the scheduler thread is not left running.
        logger.info("Shutting down app...")
        shutdown_scheduler()
20
+
21
app = FastAPI(title="E-Commerce Product Intelligence Platform", lifespan=lifespan)

# ``api_app`` is a complete FastAPI application, not an APIRouter;
# ``include_router`` needs the router object that holds its routes.
# NOTE(review): the API defines its own "/" route, which is registered before
# the frontend mount/placeholder below and therefore appears to take
# precedence for GET "/" - confirm this is intended.
app.include_router(api_app.router)

# Serve the static frontend when the directory is present (resolved relative
# to the current working directory); otherwise expose a placeholder page.
frontend_dir = Path("frontend")
if frontend_dir.exists():
    app.mount("/", StaticFiles(directory=str(frontend_dir), html=True), name="frontend")
else:

    @app.get("/")
    def frontend_placeholder():
        """Return a minimal HTML page when no frontend build is available."""
        return HTMLResponse(
            content="<h1>E-Commerce Product Intelligence Dashboard</h1><p>Frontend placeholder.</p>"
        )
backend/agent.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def _mean_or_none(series: pd.Series):
    """Return the mean of *series* as a plain float, or None when undefined."""
    value = series.mean()
    return None if pd.isna(value) else float(value)


def generate_insights(df: pd.DataFrame):
    """Summarise the product catalogue as a JSON-serialisable dict.

    Args:
        df: Product data with ``category``, ``brand``, ``price`` and
            ``rating`` columns.

    Returns:
        A dict with the row count, the number of distinct categories and
        brands, and the average price and rating. An average is ``None``
        when it cannot be computed (empty or all-missing column), because
        NaN is rejected by strict JSON encoders.
    """
    return {
        "total_products": len(df),
        "categories": int(df["category"].nunique()),
        "brands": int(df["brand"].nunique()),
        "avg_price": _mean_or_none(df["price"]),
        "avg_rating": _mean_or_none(df["rating"]),
    }
11
+
12
def semantic_search(query: str, df: pd.DataFrame):
    """Return up to 100 rows whose title or description contains *query*.

    The match is a case-insensitive literal substring search. The query is
    deliberately NOT interpreted as a regular expression, so user input such
    as ``"c++"`` or ``"("`` is matched verbatim instead of raising.

    Args:
        query: Text to look for.
        df: Product data with ``title`` and ``description`` columns.

    Returns:
        The first 100 matching rows of *df*, in their original order.
    """
    in_title = df["title"].str.contains(query, case=False, na=False, regex=False)
    in_description = df["description"].str.contains(
        query, case=False, na=False, regex=False
    )
    return df[in_title | in_description].head(100)
19
+
20
def recommend_by_category(df: pd.DataFrame, category: str, top_n: int = 10):
    """Return the highest-rated products of one category.

    Args:
        df: Product data with ``category`` and ``rating`` columns.
        category: Category value to match exactly.
        top_n: Maximum number of rows to return (defaults to 10).

    Returns:
        Rows of *df* in the given category, sorted by rating in descending
        order and limited to *top_n* rows; empty when nothing matches.
    """
    in_category = df[df["category"] == category]
    ranked = in_category.sort_values("rating", ascending=False)
    return ranked.head(top_n)
backend/api.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from backend.services import load_data, get_top_categories, get_top_brands, get_price_stats, get_rating_stats
4
+ from backend.agent import generate_insights, semantic_search, recommend_by_category
5
+
6
app = FastAPI(title="E-Commerce Product Intelligence API")


def _load_data_or_503():
    """Load the dataset, mapping a missing parquet file to an HTTP 503."""
    try:
        return load_data()
    except FileNotFoundError as exc:
        # The parquet file only exists after the scraper has run at least
        # once; report that as "service unavailable" instead of a bare 500.
        raise HTTPException(
            status_code=503, detail="Product dataset is not available yet."
        ) from exc


def _to_records(df):
    """Convert a DataFrame to a list of dicts with missing values as None.

    NaN is not valid JSON and is rejected by the JSON response encoder, so
    missing cells are replaced with ``None`` (serialised as ``null``).
    """
    return df.astype(object).where(df.notna(), None).to_dict("records")


@app.get("/")
def root():
    """Health-check endpoint."""
    return {"status": "E-Commerce Product Intelligence API is running"}


@app.get("/data")
def get_data():
    """Return the first 200 rows of the dataset."""
    df = _load_data_or_503()
    return _to_records(df.head(200))


@app.get("/stats/categories")
def stats_categories():
    """Return product counts for the 10 most frequent categories."""
    df = _load_data_or_503()
    return get_top_categories(df, n=10).to_dict()


@app.get("/stats/brands")
def stats_brands():
    """Return product counts for the 10 most frequent brands."""
    df = _load_data_or_503()
    return get_top_brands(df, n=10).to_dict()


@app.get("/stats/price")
def stats_price():
    """Return per-category price statistics."""
    df = _load_data_or_503()
    return _to_records(get_price_stats(df))


@app.get("/stats/rating")
def stats_rating():
    """Return per-category rating statistics."""
    df = _load_data_or_503()
    return _to_records(get_rating_stats(df))


@app.get("/insights")
def insights():
    """Return headline figures for the whole dataset."""
    df = _load_data_or_503()
    return JSONResponse(content=generate_insights(df))


@app.get("/search")
def search(query: str):
    """Return up to 100 products matching *query*."""
    df = _load_data_or_503()
    return _to_records(semantic_search(query, df).head(100))


@app.get("/recommend")
def recommend(category: str):
    """Return the top-rated products for *category*."""
    df = _load_data_or_503()
    return _to_records(recommend_by_category(df, category))
backend/scheduler.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apscheduler.schedulers.background import BackgroundScheduler
2
+ from apscheduler.triggers.cron import CronTrigger
3
+ from backend.scraper import run_scraper
4
+ import logging
5
+
6
logger = logging.getLogger("scheduler")

# Single module-level scheduler instance shared by init_scheduler() and
# shutdown_scheduler().
scheduler = BackgroundScheduler()
9
+
10
def scheduled_scraper_job():
    """Run the scraper once, logging the outcome instead of raising.

    Failures are caught at this boundary so that one bad run is reported in
    the log rather than propagating out of the scheduled job.
    """
    logger.info("Running scheduled scraper job...")
    try:
        run_scraper()
    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception(f"Scheduled scraper job failed: {e}")
    else:
        logger.info("Scheduled scraper job completed.")
17
+
18
def init_scheduler():
    """Schedule scraper to run daily at 02:00 AM and start the scheduler.

    Safe to call more than once: the job is registered with
    ``replace_existing=True`` and the scheduler is only started when it is
    not already running.
    """
    scheduler.add_job(
        scheduled_scraper_job,
        CronTrigger(hour=2, minute=0),
        id="daily_scraper",
        replace_existing=True,
    )
    logger.info("Scheduled scraper job added: daily at 02:00 AM")
    # Starting an already running scheduler raises, so guard the call.
    if not scheduler.running:
        scheduler.start()
28
+
29
def shutdown_scheduler():
    """Stop the background scheduler if it is running.

    Shutting down a scheduler that was never started raises, so the call is
    skipped in that case (for example when startup failed early).
    """
    if scheduler.running:
        scheduler.shutdown()
backend/scraper.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import zipfile
4
+ from pathlib import Path
5
+ from kaggle.api.kaggle_api_extended import KaggleApi
6
+ import pandas as pd
7
+ import shutil
8
+
9
# Kaggle dataset identifier in "<owner>/<dataset>" form.
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"
# Scratch directory for the downloaded files; removed after each run.
TEMP_DIR = Path("data/temp_kaggle")
# Final consolidated dataset written by the pipeline.
OUTPUT_PARQUET = Path("data/ecommerce_products.parquet")

# Make sure the output directory exists as soon as the module is imported.
os.makedirs("data", exist_ok=True)
14
+
15
def setup_kaggle_api():
    """Create and authenticate a Kaggle API client.

    Returns:
        An authenticated ``KaggleApi`` instance. Credentials are resolved by
        ``KaggleApi.authenticate()`` (from the environment or a credentials
        file, per the original note on this function).
    """
    client = KaggleApi()
    client.authenticate()
    return client
20
+
21
def download_dataset():
    """Download and unzip the full dataset into ``TEMP_DIR``.

    Returns:
        The directory (``TEMP_DIR``) that now contains the extracted files.
    """
    api = setup_kaggle_api()
    TEMP_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Downloading dataset: {DATASET_SLUG}")
    api.dataset_download_files(DATASET_SLUG, path=str(TEMP_DIR), unzip=True)
    print("Download complete.")
    return TEMP_DIR
30
+
31
def find_csv_files(temp_dir: Path):
    """Find all CSV files under *temp_dir*, searching recursively.

    Args:
        temp_dir: Directory to search.

    Returns:
        A sorted list of CSV paths. Sorting makes the order, and therefore
        the row order of the concatenated dataset, deterministic.

    Raises:
        FileNotFoundError: If no CSV file exists under *temp_dir*.
    """
    csv_files = sorted(temp_dir.glob("**/*.csv"))
    if not csv_files:
        raise FileNotFoundError("No CSV files found.")
    return csv_files
37
+
38
def load_and_concatenate(csv_files):
    """Read every CSV in *csv_files* and stack them into one DataFrame.

    Args:
        csv_files: Iterable of paths to CSV files.

    Returns:
        A single DataFrame with the rows of all files in input order and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If *csv_files* is empty (raised by ``pandas.concat``).
    """
    frames = []
    for csv_path in csv_files:
        print(f"Loading: {csv_path}")
        frames.append(pd.read_csv(csv_path))
    return pd.concat(frames, ignore_index=True)
46
+
47
def save_parquet(df: pd.DataFrame):
    """Save *df* to ``OUTPUT_PARQUET``, replacing any existing file.

    The data is written to a temporary sibling file first and then moved
    into place, so a reader never sees a half-written parquet file while
    the scraper is running.

    Args:
        df: The DataFrame to persist (written without its index).
    """
    OUTPUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = OUTPUT_PARQUET.with_name(OUTPUT_PARQUET.name + ".tmp")
    try:
        df.to_parquet(tmp_path, index=False)
        tmp_path.replace(OUTPUT_PARQUET)
    finally:
        # No-op after a successful replace; removes leftovers on failure.
        tmp_path.unlink(missing_ok=True)
    print(f"Saved to: {OUTPUT_PARQUET}")
51
+
52
def run_scraper():
    """Run the full pipeline: download, load, consolidate and save.

    Returns:
        The consolidated DataFrame that was written to parquet.

    Raises:
        Any exception raised by a pipeline step; the temporary download
        directory is removed in every case.
    """
    try:
        download_dataset()
        csv_files = find_csv_files(TEMP_DIR)
        df = load_and_concatenate(csv_files)
        save_parquet(df)
        return df
    finally:
        # Always clean up the scratch directory, even when a step fails.
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


if __name__ == "__main__":
    run_scraper()
backend/services.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+
4
# Location of the consolidated dataset, relative to the working directory.
PARQUET_PATH = Path("data/ecommerce_products.parquet")


def load_data():
    """Load the product dataset from ``PARQUET_PATH``.

    Returns:
        The dataset as a DataFrame.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    if not PARQUET_PATH.exists():
        raise FileNotFoundError(f"Parquet not found: {PARQUET_PATH}")
    return pd.read_parquet(PARQUET_PATH)
11
+
12
def get_top_categories(df: pd.DataFrame, n: int = 10) -> pd.Series:
    """Return product counts for the *n* most frequent categories.

    Args:
        df: Product data with a ``category`` column.
        n: Number of categories to keep.

    Returns:
        A Series indexed by category, with counts in descending order.
    """
    category_counts = df["category"].value_counts()
    return category_counts.head(n)
14
+
15
def get_top_brands(df: pd.DataFrame, n: int = 10) -> pd.Series:
    """Return product counts for the *n* most frequent brands.

    Args:
        df: Product data with a ``brand`` column.
        n: Number of brands to keep.

    Returns:
        A Series indexed by brand, with counts in descending order.
    """
    brand_counts = df["brand"].value_counts()
    return brand_counts.head(n)
17
+
18
def get_price_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Return per-category price statistics.

    Args:
        df: Product data with ``category`` and ``price`` columns.

    Returns:
        One row per category with the columns ``category``, ``mean``,
        ``median``, ``min``, ``max`` and ``count``.
    """
    price_by_category = df.groupby("category")["price"]
    summary = price_by_category.agg(["mean", "median", "min", "max", "count"])
    return summary.reset_index()
20
+
21
def get_rating_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Return per-category rating statistics.

    Args:
        df: Product data with ``category`` and ``rating`` columns.

    Returns:
        One row per category with the columns ``category``, ``mean``,
        ``median``, ``min``, ``max`` and ``count``.
    """
    rating_by_category = df.groupby("category")["rating"]
    summary = rating_by_category.agg(["mean", "median", "min", "max", "count"])
    return summary.reset_index()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.34.0
3
+ pandas==2.2.0
4
+ kaggle==1.6.17
5
+ pyarrow==18.0.0
6
+ apscheduler==3.11.0
7
+ httpx