Spaces:
Runtime error
Runtime error
Commit ·
019d08d
1
Parent(s): e1a120a
Upload E-Commerce Product Intelligence Dashboard
Browse files- .gitignore +4 -0
- Dockerfile +12 -0
- app.py +33 -0
- backend/agent.py +22 -0
- backend/api.py +50 -0
- backend/scheduler.py +30 -0
- backend/scraper.py +65 -0
- backend/services.py +22 -0
- requirements.txt +7 -0
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
data/
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt /app/requirements.txt
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . /app
|
| 9 |
+
|
| 10 |
+
EXPOSE 8000
|
| 11 |
+
|
| 12 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
app.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
from fastapi import FastAPI
|
| 4 |
+
from fastapi.staticfiles import StaticFiles
|
| 5 |
+
from fastapi.responses import HTMLResponse
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from backend.api import app as api_app
|
| 8 |
+
from backend.scheduler import init_scheduler, shutdown_scheduler
|
| 9 |
+
|
| 10 |
+
# Configure the root logger once at import time so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)

# Module-scoped logger used by the application lifecycle hooks.
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Start the scraper scheduler on startup and stop it on shutdown.

    Args:
        app: The application whose lifespan is being managed (not used here).

    Yields:
        Nothing; control returns to the server while the application runs.
    """
    logger.info("Starting app...")
    init_scheduler()
    try:
        yield
    finally:
        # Teardown lives in ``finally`` so it still runs when an exception or
        # cancellation is thrown into the generator at the yield point;
        # otherwise shutdown_scheduler() would be skipped.
        logger.info("Shutting down app...")
        shutdown_scheduler()
|
| 20 |
+
|
| 21 |
+
app = FastAPI(title="E-Commerce Product Intelligence Platform", lifespan=lifespan)

# backend.api exposes a complete FastAPI application rather than an APIRouter.
# include_router() expects a router, so attach the sub-application's underlying
# router; this keeps every API path at the same URL as declared there.
app.include_router(api_app.router)

# Serve the bundled frontend when its directory is present; otherwise expose a
# small placeholder page at the site root.
# NOTE(review): the API module also declares a "/" route, which is registered
# first and therefore likely shadows whatever is served at "/" here - confirm.
frontend_dir = Path("frontend")
if frontend_dir.is_dir():
    # Fixed: the original passed the undefined name ``frontend`` here.
    app.mount("/", StaticFiles(directory=str(frontend_dir), html=True), name="frontend")
else:

    @app.get("/")
    def frontend_placeholder():
        """Return a minimal HTML page when no frontend directory exists."""
        return HTMLResponse(
            content="<h1>E-Commerce Product Intelligence Dashboard</h1><p>Frontend placeholder.</p>"
        )
|
backend/agent.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
def generate_insights(df: pd.DataFrame):
    """Summarise the product table as a JSON-serialisable dict.

    Args:
        df: Product table with "category", "brand", "price" and "rating"
            columns.

    Returns:
        A dict with the row count ("total_products"), the number of distinct
        categories and brands, and the mean price and rating. A mean is
        ``None`` when it is undefined (no non-missing values in the column).
    """

    def _mean_or_none(column: str):
        # Series.mean() yields NaN for empty or all-missing data, and NaN is
        # rejected by strict JSON encoders; report it as None instead.
        value = df[column].mean()
        return None if pd.isna(value) else float(value)

    return {
        "total_products": len(df),
        "categories": int(df["category"].nunique()),
        "brands": int(df["brand"].nunique()),
        "avg_price": _mean_or_none("price"),
        "avg_rating": _mean_or_none("rating"),
    }
|
| 11 |
+
|
| 12 |
+
def semantic_search(query: str, df: pd.DataFrame):
    """Return up to 100 rows whose title or description contains *query*.

    The match is a case-insensitive, literal substring test. The query is not
    interpreted as a regular expression, so characters such as ``+`` or ``(``
    in user input are matched as-is instead of raising a pattern error.

    Args:
        query: Text to look for.
        df: Product table with "title" and "description" columns.

    Returns:
        The first 100 matching rows of *df*, in their original order. Rows
        whose title and description are both missing never match.
    """
    mask = (
        df["title"].str.contains(query, case=False, na=False, regex=False)
        | df["description"].str.contains(query, case=False, na=False, regex=False)
    )
    return df[mask].head(100)
|
| 19 |
+
|
| 20 |
+
def recommend_by_category(df: pd.DataFrame, category: str):
    """Return the ten highest-rated products of one category.

    Args:
        df: Product table with "category" and "rating" columns.
        category: Category value to match exactly.

    Returns:
        At most ten rows of *df* from that category, best rating first.
    """
    in_category = df["category"] == category
    ranked = df.loc[in_category].sort_values(by="rating", ascending=False)
    return ranked.iloc[:10]
|
backend/api.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from fastapi.responses import JSONResponse
|
| 3 |
+
from backend.services import load_data, get_top_categories, get_top_brands, get_price_stats, get_rating_stats
|
| 4 |
+
from backend.agent import generate_insights, semantic_search, recommend_by_category
|
| 5 |
+
|
| 6 |
+
app = FastAPI(title="E-Commerce Product Intelligence API")


def _load_or_503():
    """Load the product table, mapping a missing dataset file to HTTP 503.

    load_data() raises FileNotFoundError while the Parquet file does not exist
    yet; without this translation every endpoint would answer with an
    unhandled 500 error.
    """
    try:
        return load_data()
    except FileNotFoundError as exc:
        raise HTTPException(status_code=503, detail="Dataset is not available yet.") from exc


def _records(frame):
    """Convert a DataFrame to a list of row dicts with missing values as None."""
    # NaN/NaT are not valid JSON. Casting to object first lets None replace
    # them without being coerced back to NaN.
    return frame.astype(object).where(frame.notna(), None).to_dict("records")


@app.get("/")
def root():
    """Report that the API is up."""
    return {"status": "E-Commerce Product Intelligence API is running"}


@app.get("/data")
def get_data():
    """Return the first 200 rows of the dataset."""
    df = _load_or_503()
    return _records(df.head(200))


@app.get("/stats/categories")
def stats_categories():
    """Return the ten most frequent categories with their row counts."""
    df = _load_or_503()
    return get_top_categories(df, n=10).to_dict()


@app.get("/stats/brands")
def stats_brands():
    """Return the ten most frequent brands with their row counts."""
    df = _load_or_503()
    return get_top_brands(df, n=10).to_dict()


@app.get("/stats/price")
def stats_price():
    """Return per-category price statistics."""
    df = _load_or_503()
    return _records(get_price_stats(df))


@app.get("/stats/rating")
def stats_rating():
    """Return per-category rating statistics."""
    df = _load_or_503()
    return _records(get_rating_stats(df))


@app.get("/insights")
def insights():
    """Return headline figures for the whole dataset."""
    df = _load_or_503()
    summary = generate_insights(df)
    # JSONResponse encodes strictly, so replace NaN floats (the only values
    # that compare unequal to themselves) with None before encoding.
    safe = {
        key: None if isinstance(value, float) and value != value else value
        for key, value in summary.items()
    }
    return JSONResponse(content=safe)


@app.get("/search")
def search(query: str):
    """Return up to 100 products matching *query*."""
    df = _load_or_503()
    return _records(semantic_search(query, df).head(100))


@app.get("/recommend")
def recommend(category: str):
    """Return the top-rated products of *category*."""
    df = _load_or_503()
    return _records(recommend_by_category(df, category))
|
backend/scheduler.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 2 |
+
from apscheduler.triggers.cron import CronTrigger
|
| 3 |
+
from backend.scraper import run_scraper
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
# Logger shared by the scheduling helpers in this module.
logger = logging.getLogger("scheduler")

# Single module-wide scheduler instance; jobs are registered by init_scheduler().
scheduler = BackgroundScheduler()
|
| 9 |
+
|
| 10 |
+
def scheduled_scraper_job():
    """Run the scraper pipeline once, logging the outcome instead of raising.

    Any exception from run_scraper() is caught here so that a failed run does
    not propagate out of the scheduled job.
    """
    logger.info("Running scheduled scraper job...")
    try:
        run_scraper()
    except Exception as e:
        # Boundary handler: keep swallowing the error as before, but use
        # logger.exception so the traceback is recorded with the message.
        logger.exception("Scheduled scraper job failed: %s", e)
    else:
        logger.info("Scheduled scraper job completed.")
|
| 17 |
+
|
| 18 |
+
def init_scheduler():
    """Schedule scraper to run daily at 02:00 AM and start the scheduler.

    Safe to call more than once: the job is re-registered under the same id
    (``replace_existing=True``) and the scheduler is only started when it is
    not already running.
    """
    scheduler.add_job(
        scheduled_scraper_job,
        CronTrigger(hour=2, minute=0),
        id="daily_scraper",
        replace_existing=True,
    )
    logger.info("Scheduled scraper job added: daily at 02:00 AM")
    # Starting an already-running scheduler raises, so guard the call.
    if not scheduler.running:
        scheduler.start()
|
| 28 |
+
|
| 29 |
+
def shutdown_scheduler():
    """Stop the scheduler if it is running; do nothing otherwise."""
    # Shutting down a scheduler that never started raises, which would turn a
    # failed startup into a second error during teardown.
    if scheduler.running:
        scheduler.shutdown()
|
backend/scraper.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import zipfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import shutil
|
| 8 |
+
|
| 9 |
+
# Kaggle dataset slug passed to the download call.
DATASET_SLUG = "anujsaha0123456789/e-commerce-product-intelligence-dataset"

# Scratch directory for downloaded files; run_scraper() removes it afterwards.
TEMP_DIR = Path("data") / "temp_kaggle"

# Destination of the combined dataset.
OUTPUT_PARQUET = Path("data") / "ecommerce_products.parquet"

# Make sure the data directory exists as soon as the module is imported.
Path("data").mkdir(parents=True, exist_ok=True)
|
| 14 |
+
|
| 15 |
+
def setup_kaggle_api():
    """Create a Kaggle API client and authenticate it.

    Credentials are resolved by ``KaggleApi.authenticate()`` itself.

    Returns:
        The authenticated ``KaggleApi`` instance.
    """
    client = KaggleApi()
    client.authenticate()
    return client
|
| 20 |
+
|
| 21 |
+
def download_dataset():
    """Download and unzip the whole dataset into TEMP_DIR.

    Returns:
        TEMP_DIR, the directory that now holds the extracted files.
    """
    client = setup_kaggle_api()
    # Create the scratch directory only after authentication has succeeded.
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    destination = str(TEMP_DIR)

    print(f"Downloading dataset: {DATASET_SLUG}")
    client.dataset_download_files(DATASET_SLUG, path=destination, unzip=True)
    print("Download complete.")
    return TEMP_DIR
|
| 30 |
+
|
| 31 |
+
def find_csv_files(temp_dir: Path):
    """Find all CSV files below *temp_dir*, searching recursively.

    Args:
        temp_dir: Directory to search.

    Returns:
        The matching paths as a list in sorted order. Sorting makes the order
        (and therefore the row order of anything built from these files)
        independent of the order in which the filesystem lists entries.

    Raises:
        FileNotFoundError: If no CSV file is found.
    """
    csv_files = sorted(temp_dir.rglob("*.csv"))
    if not csv_files:
        raise FileNotFoundError("No CSV files found.")
    return csv_files
|
| 37 |
+
|
| 38 |
+
def load_and_concatenate(csv_files):
    """Read every CSV and stack the results into one table.

    Args:
        csv_files: Iterable of CSV file paths, read in the given order.

    Returns:
        A single DataFrame holding all rows, with a fresh 0..n-1 index.
    """

    def _read(csv_path):
        # Announce each file right before it is parsed.
        print(f"Loading: {csv_path}")
        return pd.read_csv(csv_path)

    frames = [_read(csv_path) for csv_path in csv_files]
    return pd.concat(frames, ignore_index=True)
|
| 46 |
+
|
| 47 |
+
def save_parquet(df: pd.DataFrame):
    """Save *df* to OUTPUT_PARQUET, replacing any previous file atomically.

    The data is first written to a sibling temporary file and then renamed
    over the destination, so a reader opening OUTPUT_PARQUET during a refresh
    sees either the old file or the new one, never a partially written one.

    Args:
        df: Table to persist; the index is not written.
    """
    OUTPUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = OUTPUT_PARQUET.with_name(OUTPUT_PARQUET.name + ".tmp")
    try:
        df.to_parquet(tmp_path, index=False)
        os.replace(tmp_path, OUTPUT_PARQUET)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the partial temporary file.
        tmp_path.unlink(missing_ok=True)
    print(f"Saved to: {OUTPUT_PARQUET}")
|
| 51 |
+
|
| 52 |
+
def run_scraper():
    """Run the full pipeline: download, combine the CSVs, write the Parquet file.

    The scratch directory is removed afterwards whether or not a step failed.

    Returns:
        The combined DataFrame that was saved.
    """
    try:
        download_dataset()
        combined = load_and_concatenate(find_csv_files(TEMP_DIR))
        save_parquet(combined)
        return combined
    finally:
        # Best-effort cleanup; errors while deleting are ignored.
        shutil.rmtree(TEMP_DIR, ignore_errors=True)


if __name__ == "__main__":
    run_scraper()
|
backend/services.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
# Location of the Parquet file this module reads.
PARQUET_PATH = Path("data") / "ecommerce_products.parquet"


def load_data():
    """Read the product dataset from PARQUET_PATH.

    Returns:
        The file's contents as a DataFrame.

    Raises:
        FileNotFoundError: If PARQUET_PATH does not exist.
    """
    if PARQUET_PATH.exists():
        return pd.read_parquet(PARQUET_PATH)
    raise FileNotFoundError(f"Parquet not found: {PARQUET_PATH}")
|
| 11 |
+
|
| 12 |
+
def get_top_categories(df: pd.DataFrame, n: int = 10):
    """Return the *n* most frequent categories and their row counts.

    Args:
        df: Product table with a "category" column.
        n: Number of entries to keep.

    Returns:
        A Series indexed by category, most frequent first.
    """
    counts = df["category"].value_counts()
    return counts.iloc[:n]
|
| 14 |
+
|
| 15 |
+
def get_top_brands(df: pd.DataFrame, n: int = 10):
    """Return the *n* most frequent brands and their row counts.

    Args:
        df: Product table with a "brand" column.
        n: Number of entries to keep.

    Returns:
        A Series indexed by brand, most frequent first.
    """
    counts = df["brand"].value_counts()
    return counts.iloc[:n]
|
| 17 |
+
|
| 18 |
+
def get_price_stats(df: pd.DataFrame):
    """Summarise prices per category.

    Args:
        df: Product table with "category" and "price" columns.

    Returns:
        One row per category with "mean", "median", "min", "max" and "count"
        columns, plus the category itself as a regular column.
    """
    prices_by_category = df.groupby("category")["price"]
    summary = prices_by_category.agg(["mean", "median", "min", "max", "count"])
    return summary.reset_index()
|
| 20 |
+
|
| 21 |
+
def get_rating_stats(df: pd.DataFrame):
    """Summarise ratings per category.

    Args:
        df: Product table with "category" and "rating" columns.

    Returns:
        One row per category with "mean", "median", "min", "max" and "count"
        columns, plus the category itself as a regular column.
    """
    ratings_by_category = df.groupby("category")["rating"]
    summary = ratings_by_category.agg(["mean", "median", "min", "max", "count"])
    return summary.reset_index()
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn==0.34.0
|
| 3 |
+
pandas==2.2.0
|
| 4 |
+
kaggle==1.6.17
|
| 5 |
+
pyarrow==18.0.0
|
| 6 |
+
apscheduler==3.11.0
|
| 7 |
+
httpx
|