Spaces:
Sleeping
Sleeping
Upload 51 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .env +30 -0
- .gitattributes +1 -0
- app/__init__.py +80 -0
- app/__pycache__/__init__.cpython-310.pyc +0 -0
- app/config/__init__.py +3 -0
- app/config/__pycache__/__init__.cpython-310.pyc +0 -0
- app/config/__pycache__/settings.cpython-310.pyc +0 -0
- app/config/settings.py +107 -0
- app/middleware/__init__.py +3 -0
- app/middleware/__pycache__/__init__.cpython-310.pyc +0 -0
- app/middleware/__pycache__/security.cpython-310.pyc +0 -0
- app/middleware/security.py +67 -0
- app/models/__init__.py +22 -0
- app/models/__pycache__/__init__.cpython-310.pyc +0 -0
- app/models/__pycache__/models.cpython-310.pyc +0 -0
- app/models/models.py +371 -0
- app/routes/__init__.py +4 -0
- app/routes/__pycache__/__init__.cpython-310.pyc +0 -0
- app/routes/__pycache__/jobs.cpython-310.pyc +0 -0
- app/routes/__pycache__/main.cpython-310.pyc +0 -0
- app/routes/jobs.py +265 -0
- app/routes/main.py +34 -0
- app/scrapers/__init__.py +3 -0
- app/scrapers/__pycache__/__init__.cpython-310.pyc +0 -0
- app/scrapers/__pycache__/engine.cpython-310.pyc +0 -0
- app/scrapers/engine.py +572 -0
- app/services/__init__.py +28 -0
- app/services/__pycache__/__init__.cpython-310.pyc +0 -0
- app/services/__pycache__/export_service.cpython-310.pyc +0 -0
- app/services/__pycache__/job_service.cpython-310.pyc +0 -0
- app/services/export_service.py +165 -0
- app/services/job_service.py +281 -0
- app/static/css/style.css +379 -0
- app/static/js/app.js +51 -0
- app/templates/base.html +80 -0
- app/templates/pages/dashboard.html +113 -0
- app/templates/pages/error.html +30 -0
- app/templates/pages/job_detail.html +262 -0
- app/templates/pages/jobs.html +122 -0
- app/templates/pages/new_job.html +181 -0
- app/utils/__init__.py +4 -0
- app/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- app/utils/__pycache__/logging_config.cpython-310.pyc +0 -0
- app/utils/__pycache__/validators.cpython-310.pyc +0 -0
- app/utils/logging_config.py +60 -0
- app/utils/validators.py +113 -0
- database/scraper.db +3 -0
- env.example +30 -0
- logs/app.log +262 -0
- logs/errors.log +0 -0
.env
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Application
|
| 2 |
+
FLASK_ENV=development
|
| 3 |
+
SECRET_KEY=dev-secret-key-change-in-production-use-32-chars
|
| 4 |
+
DEBUG=True
|
| 5 |
+
|
| 6 |
+
# Database (leave unset to use default SQLite in project/database/)
|
| 7 |
+
# DATABASE_URL=sqlite:///database/scraper.db
|
| 8 |
+
|
| 9 |
+
# Security
|
| 10 |
+
WTF_CSRF_ENABLED=True
|
| 11 |
+
SESSION_COOKIE_SECURE=False
|
| 12 |
+
SESSION_COOKIE_HTTPONLY=True
|
| 13 |
+
SESSION_COOKIE_SAMESITE=Lax
|
| 14 |
+
|
| 15 |
+
# Rate Limiting
|
| 16 |
+
RATELIMIT_DEFAULT=100 per hour
|
| 17 |
+
RATELIMIT_STORAGE_URL=memory://
|
| 18 |
+
|
| 19 |
+
# Scraping
|
| 20 |
+
MAX_CONCURRENT_JOBS=5
|
| 21 |
+
REQUEST_TIMEOUT=30
|
| 22 |
+
MAX_RETRIES=3
|
| 23 |
+
DEFAULT_DELAY=1.0
|
| 24 |
+
|
| 25 |
+
# Exports
|
| 26 |
+
EXPORT_DIR=exports
|
| 27 |
+
|
| 28 |
+
# Logging
|
| 29 |
+
LOG_LEVEL=DEBUG
|
| 30 |
+
LOG_DIR=logs
|
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
database/scraper.db filter=lfs diff=lfs merge=lfs -text
|
app/__init__.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Application factory.
|
| 3 |
+
Create and configure the Flask app with all extensions, blueprints, and middleware.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
from flask import Flask
|
| 10 |
+
from flask_wtf.csrf import CSRFProtect
|
| 11 |
+
from flask_limiter import Limiter
|
| 12 |
+
from flask_limiter.util import get_remote_address
|
| 13 |
+
|
| 14 |
+
from app.config import get_config
|
| 15 |
+
from app.models import db
|
| 16 |
+
from app.middleware import register_middleware
|
| 17 |
+
from app.utils.logging_config import configure_logging
|
| 18 |
+
|
| 19 |
+
csrf = CSRFProtect()
|
| 20 |
+
limiter = Limiter(key_func=get_remote_address)
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def create_app(config_class=None) -> Flask:
    """Application factory — creates a fully configured Flask app.

    Args:
        config_class: Optional configuration object to load. When omitted
            (or falsy) the class returned by ``get_config()`` is used. The
            object must provide an ``init_app(app)`` hook.

    Returns:
        The Flask application with logging, extensions, middleware,
        blueprints and database tables set up.
    """
    app = Flask(
        __name__,
        template_folder="templates",
        static_folder="static",
    )

    # --- Load config ---
    cfg = config_class or get_config()
    app.config.from_object(cfg)
    cfg.init_app(app)

    # --- Logging ---
    configure_logging(
        log_level=app.config.get("LOG_LEVEL", "INFO"),
        log_dir=str(app.config.get("LOG_DIR", "logs")),
    )

    # --- Extensions ---
    db.init_app(app)
    csrf.init_app(app)
    limiter.init_app(app)

    # --- Middleware ---
    register_middleware(app)

    # --- Blueprints ---
    # Imported here rather than at module top, as in the original code.
    from app.routes import main_bp, jobs_bp

    app.register_blueprint(main_bp)
    app.register_blueprint(jobs_bp)

    # Apply rate limiting to API routes
    limiter.limit("60 per minute")(jobs_bp)

    # --- Database init ---
    with app.app_context():
        db.create_all()
        _seed_defaults()

    # The config classes do not define FLASK_ENV, so the old lookup always
    # reported "development". Prefer an explicit FLASK_ENV config key when
    # one exists, otherwise name the configuration that was actually loaded.
    environment = app.config.get("FLASK_ENV") or getattr(
        cfg, "__name__", type(cfg).__name__
    )
    logger.info("WebScraper Platform started [%s]", environment)
    return app
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _seed_defaults() -> None:
    """Insert the built-in application settings that are not stored yet.

    Called by ``create_app`` inside an application context. Settings whose
    key already exists are left untouched; one commit is issued at the end.
    """
    from app.models import AppSetting

    # (key, value, description) triples for the built-in settings.
    seed_rows = (
        ("max_concurrent_jobs", "5", "Maximum parallel scrape jobs"),
        ("default_delay", "1.0", "Default seconds between requests"),
        ("default_timeout", "30", "Default HTTP timeout in seconds"),
    )

    for setting_key, setting_value, setting_desc in seed_rows:
        already_stored = AppSetting.query.filter_by(key=setting_key).first()
        if already_stored:
            continue
        new_setting = AppSetting(
            key=setting_key, value=setting_value, description=setting_desc
        )
        db.session.add(new_setting)

    db.session.commit()
|
app/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
app/config/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .settings import get_config, BaseConfig, DevelopmentConfig, ProductionConfig, TestingConfig
|
| 2 |
+
|
| 3 |
+
__all__ = ["get_config", "BaseConfig", "DevelopmentConfig", "ProductionConfig", "TestingConfig"]
|
app/config/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (312 Bytes). View file
|
|
|
app/config/__pycache__/settings.cpython-310.pyc
ADDED
|
Binary file (3.51 kB). View file
|
|
|
app/config/settings.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Application configuration classes for different environments.
|
| 3 |
+
Uses environment variables with sensible defaults.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# app/config/settings.py -> parent (config) -> parent (app) -> parent (project root)
|
| 12 |
+
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean environment variable, accepting common spellings.

    ``1``, ``true``, ``yes`` and ``on`` (any case, surrounding whitespace
    ignored) count as True; any other value is False. An unset variable
    yields ``default``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


class BaseConfig:
    """Base configuration shared across all environments.

    Values are read from environment variables once, when the class body is
    executed at import time; the literals below are the fallbacks.
    """

    # Flask
    SECRET_KEY: str = os.getenv("SECRET_KEY", "change-me-in-production-32-chars!!")
    DEBUG: bool = False
    TESTING: bool = False

    # SQLAlchemy
    SQLALCHEMY_DATABASE_URI: str = os.getenv(
        "DATABASE_URL", f"sqlite:///{BASE_DIR / 'database' / 'scraper.db'}"
    )
    SQLALCHEMY_TRACK_MODIFICATIONS: bool = False
    SQLALCHEMY_ENGINE_OPTIONS: dict = {
        "pool_pre_ping": True,
        "pool_recycle": 300,
    }

    # CSRF
    # Parsed case-insensitively so "true" / "1" are not silently read as False.
    WTF_CSRF_ENABLED: bool = _env_flag("WTF_CSRF_ENABLED", True)
    WTF_CSRF_TIME_LIMIT: int = 3600

    # Session
    SESSION_COOKIE_SECURE: bool = _env_flag("SESSION_COOKIE_SECURE", False)
    SESSION_COOKIE_HTTPONLY: bool = True
    SESSION_COOKIE_SAMESITE: str = "Lax"
    PERMANENT_SESSION_LIFETIME: int = 86400

    # Rate limiting
    RATELIMIT_DEFAULT: str = os.getenv("RATELIMIT_DEFAULT", "200 per hour")
    RATELIMIT_STORAGE_URL: str = os.getenv("RATELIMIT_STORAGE_URL", "memory://")
    # Newer Flask-Limiter releases read RATELIMIT_STORAGE_URI; the *_URL
    # spelling above is kept for older releases and existing .env files.
    RATELIMIT_STORAGE_URI: str = os.getenv("RATELIMIT_STORAGE_URI", RATELIMIT_STORAGE_URL)
    RATELIMIT_HEADERS_ENABLED: bool = True

    # Scraping engine
    MAX_CONCURRENT_JOBS: int = int(os.getenv("MAX_CONCURRENT_JOBS", "5"))
    REQUEST_TIMEOUT: int = int(os.getenv("REQUEST_TIMEOUT", "30"))
    MAX_RETRIES: int = int(os.getenv("MAX_RETRIES", "3"))
    DEFAULT_DELAY: float = float(os.getenv("DEFAULT_DELAY", "1.0"))
    MAX_PAGES: int = int(os.getenv("MAX_PAGES", "50"))

    # Directories (resolved relative to the project root)
    EXPORT_DIR: Path = BASE_DIR / os.getenv("EXPORT_DIR", "exports")
    LOG_DIR: Path = BASE_DIR / os.getenv("LOG_DIR", "logs")

    # Logging
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    LOG_FORMAT: str = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"

    @classmethod
    def init_app(cls, app) -> None:
        """Hook for environment-specific initialization.

        Creates the export, log and database directories if they are missing.
        ``app`` is accepted for the hook signature but not used here.
        """
        # Ensure required directories exist
        cls.EXPORT_DIR.mkdir(parents=True, exist_ok=True)
        cls.LOG_DIR.mkdir(parents=True, exist_ok=True)
        (BASE_DIR / "database").mkdir(parents=True, exist_ok=True)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class DevelopmentConfig(BaseConfig):
    """Development settings: debug mode, DEBUG-level logs, non-secure cookies."""

    # Session cookies are not restricted to HTTPS.
    SESSION_COOKIE_SECURE: bool = False
    LOG_LEVEL: str = "DEBUG"
    DEBUG: bool = True
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class ProductionConfig(BaseConfig):
    """Production settings: debug off, secure cookies, WARNING-level logs."""

    LOG_LEVEL: str = "WARNING"
    SESSION_COOKIE_SECURE: bool = True
    DEBUG: bool = False

    # Base engine options plus explicit connection-pool sizing.
    SQLALCHEMY_ENGINE_OPTIONS: dict = {
        **BaseConfig.SQLALCHEMY_ENGINE_OPTIONS,
        "pool_size": 10,
        "max_overflow": 20,
    }
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class TestingConfig(BaseConfig):
    """Test settings: in-memory SQLite database and CSRF checks disabled."""

    SQLALCHEMY_DATABASE_URI: str = "sqlite:///:memory:"
    WTF_CSRF_ENABLED: bool = False
    DEBUG: bool = True
    TESTING: bool = True
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
config_map = {
|
| 98 |
+
"development": DevelopmentConfig,
|
| 99 |
+
"production": ProductionConfig,
|
| 100 |
+
"testing": TestingConfig,
|
| 101 |
+
"default": DevelopmentConfig,
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def get_config() -> type:
    """Return the configuration class selected by ``FLASK_ENV``.

    The variable is matched against ``config_map`` case-insensitively and
    with surrounding whitespace removed. An unset variable means
    ``"development"``. An unrecognised value falls back to the ``"default"``
    entry and emits a ``RuntimeWarning``, so a typo cannot silently enable
    the development (debug) configuration.
    """
    import warnings

    env = os.getenv("FLASK_ENV", "development").strip().lower()
    selected = config_map.get(env)
    if selected is None:
        selected = config_map["default"]
        warnings.warn(
            f"Unknown FLASK_ENV {env!r}; using {selected.__name__}",
            RuntimeWarning,
            stacklevel=2,
        )
    return selected
|
app/middleware/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .security import register_middleware
|
| 2 |
+
|
| 3 |
+
__all__ = ["register_middleware"]
|
app/middleware/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (228 Bytes). View file
|
|
|
app/middleware/__pycache__/security.cpython-310.pyc
ADDED
|
Binary file (2.96 kB). View file
|
|
|
app/middleware/security.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Flask middleware:
|
| 3 |
+
- Request timing
|
| 4 |
+
- Security headers (XSS, clickjacking, content-type sniffing)
|
| 5 |
+
- Global error handlers (404, 429, 500)
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import time
|
| 11 |
+
|
| 12 |
+
from flask import Flask, jsonify, request, render_template
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def register_middleware(app: Flask) -> None:
    """Register all middleware and error handlers on the Flask app.

    Installs:
      * a ``before_request`` hook that records a monotonic start time,
      * an ``after_request`` hook that adds timing and security headers and
        logs the request at DEBUG level,
      * handlers for 400, 404, 429 and 500 that answer with JSON or with the
        ``pages/error.html`` template.

    Args:
        app: The Flask application to decorate.
    """

    def wants_json() -> bool:
        """Return True when the client should receive a JSON error body."""
        if request.is_json:
            return True
        # Also honour an explicit "Accept: application/json". A plain "*/*"
        # or a browser's "text/html" preference still gets the HTML page.
        accepted = request.accept_mimetypes
        best = accepted.best_match(["application/json", "text/html"])
        return best == "application/json" and accepted[best] > accepted["text/html"]

    def error_response(code: int, payload: dict, message: str):
        """Build the JSON or HTML response shared by all error handlers."""
        if wants_json():
            return jsonify(payload), code
        return render_template("pages/error.html", code=code, message=message), code

    @app.before_request
    def start_timer() -> None:
        request._start_time = time.monotonic()

    @app.after_request
    def add_security_headers(response):
        # If the timer was never started the duration is reported as ~0.
        duration = time.monotonic() - getattr(request, "_start_time", time.monotonic())
        response.headers["X-Response-Time"] = f"{duration * 1000:.2f}ms"
        response.headers["X-Content-Type-Options"] = "nosniff"
        response.headers["X-Frame-Options"] = "SAMEORIGIN"
        # The legacy XSS auditor is deprecated and "1; mode=block" can itself
        # introduce vulnerabilities; OWASP recommends disabling it explicitly.
        response.headers["X-XSS-Protection"] = "0"
        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
        response.headers["Permissions-Policy"] = "geolocation=(), microphone=()"
        # HSTS is only sent when the app is configured for secure cookies.
        if app.config.get("SESSION_COOKIE_SECURE"):
            response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
        logger.debug(
            "%s %s -> %d (%.2fms)",
            request.method,
            request.path,
            response.status_code,
            duration * 1000,
        )
        return response

    @app.errorhandler(400)
    def bad_request(exc):
        return error_response(
            400, {"error": "Bad request", "detail": str(exc)}, "Bad Request"
        )

    @app.errorhandler(404)
    def not_found(exc):
        return error_response(404, {"error": "Not found"}, "Page Not Found")

    @app.errorhandler(429)
    def rate_limited(exc):
        return error_response(
            429,
            {"error": "Rate limit exceeded. Please slow down."},
            "Rate limit exceeded",
        )

    @app.errorhandler(500)
    def internal_error(exc):
        logger.exception("Internal server error: %s", exc)
        return error_response(
            500, {"error": "Internal server error"}, "Internal Server Error"
        )
|
app/models/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .models import (
|
| 2 |
+
db,
|
| 3 |
+
BaseModel,
|
| 4 |
+
User,
|
| 5 |
+
ScrapeJob,
|
| 6 |
+
ScrapeResult,
|
| 7 |
+
JobLog,
|
| 8 |
+
ExportRecord,
|
| 9 |
+
Schedule,
|
| 10 |
+
AppSetting,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
# Public API of the models package. "BaseModel" is imported above and is
# therefore exported as well, so star-imports and tooling can see it.
__all__ = [
    "db",
    "BaseModel",
    "User",
    "ScrapeJob",
    "ScrapeResult",
    "JobLog",
    "ExportRecord",
    "Schedule",
    "AppSetting",
]
|
app/models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (402 Bytes). View file
|
|
|
app/models/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (12.2 kB). View file
|
|
|
app/models/models.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SQLAlchemy ORM models with relationships, indexes, and cascading deletes.
|
| 3 |
+
All timestamps are UTC. All text fields are sanitized at the service layer.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
from datetime import datetime, timezone
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from flask_sqlalchemy import SQLAlchemy
|
| 12 |
+
from sqlalchemy import (
|
| 13 |
+
Boolean,
|
| 14 |
+
Column,
|
| 15 |
+
DateTime,
|
| 16 |
+
Float,
|
| 17 |
+
ForeignKey,
|
| 18 |
+
Index,
|
| 19 |
+
Integer,
|
| 20 |
+
String,
|
| 21 |
+
Text,
|
| 22 |
+
Enum,
|
| 23 |
+
UniqueConstraint,
|
| 24 |
+
)
|
| 25 |
+
from sqlalchemy.orm import relationship
|
| 26 |
+
|
| 27 |
+
db = SQLAlchemy()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class BaseModel(db.Model):
    """Abstract declarative base for the models in this module.

    ``__allow_unmapped__`` is enabled so subclasses can keep their legacy
    (non-``Mapped``) type annotations.
    """

    __allow_unmapped__ = True
    __abstract__ = True
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime`` (tzinfo removed)."""
    aware_now = datetime.now(tz=timezone.utc)
    return aware_now.replace(tzinfo=None)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Enumerations (stored as VARCHAR for SQLite compatibility)
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
JOB_STATUS = ("pending", "running", "completed", "failed", "cancelled")
|
| 45 |
+
SCRAPE_TYPE = ("static", "dynamic")
|
| 46 |
+
EXTRACTION_TYPE = ("text", "images", "links", "attributes", "table", "json_ld", "full_html")
|
| 47 |
+
EXPORT_FORMAT = ("json", "csv", "excel")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
# Models
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class User(BaseModel):
    """Simple user model — single-user by default, extendable to multi-user."""

    __tablename__ = "users"

    id: int = Column(Integer, primary_key=True)
    username: str = Column(String(80), nullable=False, unique=True, index=True)
    email: str = Column(String(120), nullable=False, unique=True, index=True)
    password_hash: str = Column(String(256), nullable=False)
    is_active: bool = Column(Boolean, nullable=False, default=True)
    is_admin: bool = Column(Boolean, nullable=False, default=False)
    api_key: Optional[str] = Column(String(64), unique=True, index=True)
    created_at: datetime = Column(DateTime, nullable=False, default=utcnow)
    updated_at: datetime = Column(
        DateTime, nullable=False, default=utcnow, onupdate=utcnow
    )

    # Relationships
    # NOTE(review): ScrapeJob.user_id and Schedule.user_id declare
    # ondelete="SET NULL", while these relationships cascade deletes at the
    # ORM level — confirm which behaviour is intended.
    jobs: list[ScrapeJob] = relationship(
        "ScrapeJob",
        back_populates="user",
        cascade="all, delete-orphan",
        lazy="select",
    )
    schedules: list[Schedule] = relationship(
        "Schedule",
        back_populates="user",
        cascade="all, delete-orphan",
        lazy="select",
    )

    def __repr__(self) -> str:
        return "<User {}>".format(self.username)

    def to_dict(self) -> dict:
        """Serialize the public fields; password hash and API key are omitted."""
        created = self.created_at.isoformat() if self.created_at else None
        return {
            "id": self.id,
            "username": self.username,
            "email": self.email,
            "is_active": self.is_active,
            "is_admin": self.is_admin,
            "created_at": created,
        }
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class ScrapeJob(BaseModel):
    """Represents a single scraping job with all configuration.

    Holds the target URL and selectors, the scraping options, run statistics
    and timestamps, plus relationships to results, logs and exports.
    """

    __tablename__ = "scrape_jobs"
    __table_args__ = (
        Index("ix_scrape_jobs_status", "status"),
        Index("ix_scrape_jobs_created_at", "created_at"),
        Index("ix_scrape_jobs_user_id", "user_id"),
    )

    id: int = Column(Integer, primary_key=True)
    user_id: Optional[int] = Column(Integer, ForeignKey("users.id", ondelete="SET NULL"), nullable=True)
    name: str = Column(String(200), nullable=False)
    url: str = Column(Text, nullable=False)

    # Selectors
    html_tag: Optional[str] = Column(String(100))
    css_selector: Optional[str] = Column(Text)
    xpath_selector: Optional[str] = Column(Text)
    attribute_name: Optional[str] = Column(String(100))

    # Configuration
    extraction_type: str = Column(
        Enum(*EXTRACTION_TYPE, name="extraction_type"), default="text", nullable=False
    )
    scrape_type: str = Column(
        Enum(*SCRAPE_TYPE, name="scrape_type"), default="static", nullable=False
    )
    status: str = Column(
        Enum(*JOB_STATUS, name="job_status"), default="pending", nullable=False
    )

    # Advanced options
    follow_pagination: bool = Column(Boolean, default=False)
    max_pages: int = Column(Integer, default=1)
    infinite_scroll: bool = Column(Boolean, default=False)
    scroll_count: int = Column(Integer, default=3)
    download_images: bool = Column(Boolean, default=False)
    custom_headers: Optional[str] = Column(Text)  # JSON string
    user_agent: Optional[str] = Column(Text)
    delay_seconds: float = Column(Float, default=1.0)
    timeout_seconds: int = Column(Integer, default=30)
    max_retries: int = Column(Integer, default=3)
    check_robots_txt: bool = Column(Boolean, default=True)
    deduplicate: bool = Column(Boolean, default=True)

    # Stats
    total_items: int = Column(Integer, default=0)
    pages_scraped: int = Column(Integer, default=0)
    error_count: int = Column(Integer, default=0)
    duration_seconds: Optional[float] = Column(Float)

    # Timestamps
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)
    updated_at: datetime = Column(DateTime, default=utcnow, onupdate=utcnow)
    started_at: Optional[datetime] = Column(DateTime)
    completed_at: Optional[datetime] = Column(DateTime)

    # Relationships
    user: Optional[User] = relationship("User", back_populates="jobs")
    results: list[ScrapeResult] = relationship(
        "ScrapeResult", back_populates="job", cascade="all, delete-orphan", lazy="select"
    )
    logs: list[JobLog] = relationship(
        "JobLog", back_populates="job", cascade="all, delete-orphan", lazy="select"
    )
    exports: list[ExportRecord] = relationship(
        "ExportRecord", back_populates="job", cascade="all, delete-orphan", lazy="select"
    )

    def __repr__(self) -> str:
        # A freshly constructed instance may not have a URL yet; repr must
        # not raise in that case.
        url_preview = (self.url or "")[:50]
        return f"<ScrapeJob {self.id} [{self.status}] {url_preview}>"

    @property
    def custom_headers_dict(self) -> dict:
        """Return ``custom_headers`` parsed from JSON.

        Returns an empty dict when the column is empty, is not valid JSON,
        or decodes to something other than a JSON object.
        """
        if not self.custom_headers:
            return {}
        try:
            parsed = json.loads(self.custom_headers)
        except (json.JSONDecodeError, TypeError):
            return {}
        # Valid JSON such as a list or a string is not a usable header map.
        return parsed if isinstance(parsed, dict) else {}

    def to_dict(self) -> dict:
        """Serialize the job's main fields; timestamps become ISO-8601 strings."""
        return {
            "id": self.id,
            "name": self.name,
            "url": self.url,
            "html_tag": self.html_tag,
            "css_selector": self.css_selector,
            "xpath_selector": self.xpath_selector,
            "extraction_type": self.extraction_type,
            "scrape_type": self.scrape_type,
            "status": self.status,
            "follow_pagination": self.follow_pagination,
            "max_pages": self.max_pages,
            "infinite_scroll": self.infinite_scroll,
            "total_items": self.total_items,
            "pages_scraped": self.pages_scraped,
            "error_count": self.error_count,
            "duration_seconds": self.duration_seconds,
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "started_at": self.started_at.isoformat() if self.started_at else None,
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
        }
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
class ScrapeResult(BaseModel):
    """Individual scraped data item linked to a job."""

    __tablename__ = "scrape_results"
    __table_args__ = (
        Index("ix_scrape_results_job_id", "job_id"),
        Index("ix_scrape_results_page_num", "page_num"),
        Index("ix_scrape_results_content_hash", "content_hash"),
    )

    id: int = Column(Integer, primary_key=True)
    job_id: int = Column(Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False)
    page_url: str = Column(Text, nullable=False)
    page_num: int = Column(Integer, default=1, nullable=False)
    item_index: int = Column(Integer, default=0)
    content: Optional[str] = Column(Text)
    content_type: str = Column(String(50), default="text")
    content_hash: Optional[str] = Column(String(64))
    # Stored in the "metadata" column; the attribute has a trailing underscore.
    metadata_: Optional[str] = Column("metadata", Text)  # JSON
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)

    # Relationship
    job: ScrapeJob = relationship("ScrapeJob", back_populates="results")

    def __repr__(self) -> str:
        return f"<ScrapeResult job={self.job_id} page={self.page_num} idx={self.item_index}>"

    @property
    def metadata_dict(self) -> dict:
        """Return ``metadata_`` parsed from JSON.

        Returns an empty dict when the column is empty, is not valid JSON,
        or decodes to something other than a JSON object.
        """
        if not self.metadata_:
            return {}
        try:
            parsed = json.loads(self.metadata_)
        except (json.JSONDecodeError, TypeError):
            return {}
        # Keep the declared ``dict`` contract for lists, strings, numbers...
        return parsed if isinstance(parsed, dict) else {}

    def to_dict(self) -> dict:
        """Serialize the result; ``metadata`` is the parsed ``metadata_dict``."""
        return {
            "id": self.id,
            "job_id": self.job_id,
            "page_url": self.page_url,
            "page_num": self.page_num,
            "item_index": self.item_index,
            "content": self.content,
            "content_type": self.content_type,
            "metadata": self.metadata_dict,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
class JobLog(BaseModel):
    """One structured log entry belonging to a scraping job."""

    __tablename__ = "job_logs"
    __table_args__ = (
        Index("ix_job_logs_job_id", "job_id"),
        Index("ix_job_logs_level", "level"),
        Index("ix_job_logs_created_at", "created_at"),
    )

    id: int = Column(Integer, primary_key=True)
    job_id: int = Column(
        Integer,
        ForeignKey("scrape_jobs.id", ondelete="CASCADE"),
        nullable=False,
    )
    level: str = Column(String(10), nullable=False, default="INFO")
    message: str = Column(Text, nullable=False)
    details: Optional[str] = Column(Text)  # JSON for structured extras
    created_at: datetime = Column(DateTime, nullable=False, default=utcnow)

    job: ScrapeJob = relationship("ScrapeJob", back_populates="logs")

    def to_dict(self) -> dict:
        """Serialize the entry; ``details`` is returned as the stored string."""
        created = self.created_at.isoformat() if self.created_at else None
        return {
            "id": self.id,
            "job_id": self.job_id,
            "level": self.level,
            "message": self.message,
            "details": self.details,
            "created_at": created,
        }
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
class ExportRecord(BaseModel):
    """Bookkeeping row for an export file generated from a job's results."""

    __tablename__ = "export_records"
    __table_args__ = (Index("ix_export_records_job_id", "job_id"),)

    id: int = Column(Integer, primary_key=True)
    job_id: int = Column(
        Integer,
        ForeignKey("scrape_jobs.id", ondelete="CASCADE"),
        nullable=False,
    )
    format: str = Column(Enum(*EXPORT_FORMAT, name="export_format"), nullable=False)
    filename: str = Column(String(255), nullable=False)
    filepath: str = Column(Text, nullable=False)
    file_size_bytes: Optional[int] = Column(Integer)
    row_count: int = Column(Integer, default=0)
    created_at: datetime = Column(DateTime, nullable=False, default=utcnow)

    job: ScrapeJob = relationship("ScrapeJob", back_populates="exports")

    def to_dict(self) -> dict:
        """Serialize the record; the stored ``filepath`` is not included."""
        created = self.created_at.isoformat() if self.created_at else None
        return {
            "id": self.id,
            "job_id": self.job_id,
            "format": self.format,
            "filename": self.filename,
            "file_size_bytes": self.file_size_bytes,
            "row_count": self.row_count,
            "created_at": created,
        }
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
class Schedule(BaseModel):
    """APScheduler-backed recurring scrape schedules."""

    __tablename__ = "schedules"
    __table_args__ = (Index("ix_schedules_user_id", "user_id"),)

    id: int = Column(Integer, primary_key=True)
    user_id: Optional[int] = Column(
        Integer,
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    name: str = Column(String(200), nullable=False)
    cron_expression: str = Column(String(100), nullable=False)  # e.g. "0 9 * * 1"
    job_config: str = Column(Text, nullable=False)  # JSON of ScrapeJob config
    is_active: bool = Column(Boolean, default=True, nullable=False)
    last_run_at: Optional[datetime] = Column(DateTime)
    next_run_at: Optional[datetime] = Column(DateTime)
    run_count: int = Column(Integer, default=0)
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)
    updated_at: datetime = Column(DateTime, default=utcnow, onupdate=utcnow)

    # Paired with the ``schedules`` relationship on User.
    user: Optional[User] = relationship("User", back_populates="schedules")

    @property
    def job_config_dict(self) -> dict:
        """Return ``job_config`` decoded from JSON, always as a dict.

        An empty dict is returned when the stored value is ``None``, is not
        valid JSON, or decodes to anything other than a JSON object (list,
        string, number, ...), so callers can rely on dict methods.
        """
        try:
            parsed = json.loads(self.job_config)
        except (json.JSONDecodeError, TypeError):
            return {}
        # Valid JSON is not necessarily an object; honour the ``-> dict`` contract.
        return parsed if isinstance(parsed, dict) else {}

    def to_dict(self) -> dict:
        """Return a JSON-serializable summary of this schedule.

        Datetime fields are rendered as ISO-8601 strings, or ``None`` when
        unset. ``user_id``, ``job_config`` and ``updated_at`` are not included.
        """
        return {
            "id": self.id,
            "name": self.name,
            "cron_expression": self.cron_expression,
            "is_active": self.is_active,
            "last_run_at": self.last_run_at.isoformat() if self.last_run_at else None,
            "next_run_at": self.next_run_at.isoformat() if self.next_run_at else None,
            "run_count": self.run_count,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
class AppSetting(BaseModel):
    """Key-value application settings stored in the DB."""

    __tablename__ = "app_settings"

    id: int = Column(Integer, primary_key=True)
    key: str = Column(String(100), unique=True, nullable=False, index=True)
    value: str = Column(Text, nullable=False)
    description: Optional[str] = Column(Text)
    updated_at: datetime = Column(DateTime, default=utcnow, onupdate=utcnow)

    @classmethod
    def get(cls, key: str, default: str = "") -> str:
        """Return the stored value for *key*, or *default* when no row exists."""
        row = cls.query.filter_by(key=key).first()
        return row.value if row else default

    @classmethod
    def set(cls, key: str, value: str, description: str = "") -> None:
        """Create or update the setting *key* and commit the session.

        For an existing row the value is always replaced; the description is
        replaced only when a non-empty *description* is passed, so the default
        argument never wipes an existing description.
        """
        row = cls.query.filter_by(key=key).first()
        if row:
            row.value = value
            if description:
                # A caller-supplied description must not be silently dropped.
                row.description = description
        else:
            row = cls(key=key, value=value, description=description)
            db.session.add(row)
        db.session.commit()
|
app/routes/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .main import main_bp
|
| 2 |
+
from .jobs import jobs_bp
|
| 3 |
+
|
| 4 |
+
__all__ = ["main_bp", "jobs_bp"]
|
app/routes/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (251 Bytes). View file
|
|
|
app/routes/__pycache__/jobs.cpython-310.pyc
ADDED
|
Binary file (7.28 kB). View file
|
|
|
app/routes/__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (1.11 kB). View file
|
|
|
app/routes/jobs.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Scrape job routes (UI + REST API).
|
| 3 |
+
Blueprint: 'jobs' — prefix: /jobs
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from flask import (
|
| 11 |
+
Blueprint,
|
| 12 |
+
abort,
|
| 13 |
+
current_app,
|
| 14 |
+
flash,
|
| 15 |
+
jsonify,
|
| 16 |
+
redirect,
|
| 17 |
+
render_template,
|
| 18 |
+
request,
|
| 19 |
+
send_file,
|
| 20 |
+
url_for,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
from app.services import (
|
| 24 |
+
create_job,
|
| 25 |
+
cancel_job,
|
| 26 |
+
delete_job,
|
| 27 |
+
execute_job_async,
|
| 28 |
+
get_job,
|
| 29 |
+
get_job_logs,
|
| 30 |
+
get_job_results,
|
| 31 |
+
list_jobs,
|
| 32 |
+
export_json,
|
| 33 |
+
export_csv,
|
| 34 |
+
export_excel,
|
| 35 |
+
get_job_exports,
|
| 36 |
+
)
|
| 37 |
+
from app.utils.validators import validate_job_data, sanitize_string
|
| 38 |
+
|
| 39 |
+
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
jobs_bp = Blueprint("jobs", __name__, url_prefix="/jobs")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# UI Routes
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@jobs_bp.route("/")
def list_view():
    """Render the jobs page for the requested page number and filters.

    Reads ``page`` (int, default 1), ``status`` and ``search`` from the query
    string and passes them to ``list_jobs`` with a fixed page size of 20.
    """
    query = request.args
    status_filter = query.get("status")
    search_term = query.get("search")
    pagination = list_jobs(
        page=query.get("page", 1, type=int),
        per_page=20,
        status=status_filter,
        search=search_term,
    )
    context = {
        "pagination": pagination,
        "status_filter": status_filter,
        "search": search_term,
    }
    return render_template("pages/jobs.html", **context)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@jobs_bp.route("/new", methods=["GET", "POST"])
def new_job():
    """Show the new-job form (GET) or validate, create and start a job (POST).

    On validation failure the form is re-rendered with the errors and the
    submitted data; on success the job is started asynchronously and the
    client is redirected to the job detail page.
    """
    if request.method != "POST":
        return render_template("pages/new_job.html", errors={}, form_data={})

    form = request.form
    data = dict(form.items())

    # Sanitize the free-text fields that were actually submitted.
    text_fields = (
        "name",
        "html_tag",
        "css_selector",
        "xpath_selector",
        "attribute_name",
        "user_agent",
    )
    for name in text_fields:
        if name in data:
            data[name] = sanitize_string(data[name])

    # Checkbox flags: True exactly when the field name is present in the form.
    checkbox_fields = (
        "follow_pagination",
        "infinite_scroll",
        "download_images",
        "check_robots_txt",
        "deduplicate",
    )
    for flag in checkbox_fields:
        data[flag] = flag in form

    valid, errors = validate_job_data(data)
    if not valid:
        return render_template("pages/new_job.html", errors=errors, form_data=data)

    job = create_job(data)
    # Pass the real app object (not the context-local proxy) to the async runner.
    execute_job_async(job.id, current_app._get_current_object())
    flash(f"Job #{job.id} started successfully!", "success")
    return redirect(url_for("jobs.detail", job_id=job.id))
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@jobs_bp.route("/<int:job_id>")
def detail(job_id: int):
    """Render the detail page for one job, or 404 when it does not exist.

    The results page number comes from the ``rpage`` query parameter
    (default 1, 50 per page); logs are always the first page of up to 200.
    """
    job = get_job(job_id)
    if not job:
        abort(404)

    results_page_num = request.args.get("rpage", 1, type=int)
    context = {
        "job": job,
        "results": get_job_results(job_id, page=results_page_num, per_page=50),
        "logs": get_job_logs(job_id, page=1, per_page=200),
        "exports": get_job_exports(job_id),
    }
    return render_template("pages/job_detail.html", **context)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@jobs_bp.route("/<int:job_id>/delete", methods=["POST"])
def delete(job_id: int):
    """Delete a job and return to the job list; 404 when ``delete_job`` reports failure."""
    deleted = delete_job(job_id)
    if not deleted:
        abort(404)

    flash(f"Job #{job_id} deleted.", "info")
    target = url_for("jobs.list_view")
    return redirect(target)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@jobs_bp.route("/<int:job_id>/cancel", methods=["POST"])
def cancel(job_id: int):
    """Try to cancel a job, flash the outcome, and return to its detail page."""
    cancelled = cancel_job(job_id)
    if cancelled:
        flash(f"Job #{job_id} cancelled.", "info")
    else:
        flash("Cannot cancel this job.", "warning")
    return redirect(url_for("jobs.detail", job_id=job_id))
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@jobs_bp.route("/<int:job_id>/rerun", methods=["POST"])
def rerun(job_id: int):
    """Reset a job to ``pending``, start it again, and show its detail page.

    Responds with 404 when the job does not exist.
    """
    job = get_job(job_id)
    if not job:
        abort(404)

    # Imported here rather than at module top, as in the rest of this module.
    from app.models import db

    job.status = "pending"
    db.session.commit()

    app = current_app._get_current_object()
    execute_job_async(job_id, app)

    flash(f"Job #{job_id} queued for re-run.", "success")
    return redirect(url_for("jobs.detail", job_id=job_id))
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# ---------------------------------------------------------------------------
|
| 139 |
+
# Export Routes
|
| 140 |
+
# ---------------------------------------------------------------------------
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@jobs_bp.route("/<int:job_id>/export/<string:fmt>")
def export(job_id: int, fmt: str):
    """Generate an export file for a job and send it as a download.

    Responds 404 for an unknown job and 400 for an unsupported ``fmt``. When
    the exporter returns no path, or the file is missing, a flash message is
    set and the client is redirected to the job detail page.
    """
    job = get_job(job_id)
    if not job:
        abort(404)

    # fmt -> (exporter callable, MIME type of the produced file)
    formats = {
        "json": (export_json, "application/json"),
        "csv": (export_csv, "text/csv"),
        "excel": (
            export_excel,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
    }
    if fmt not in formats:
        abort(400)

    exporter, mimetype = formats[fmt]
    filepath = exporter(job_id)

    file_ready = bool(filepath) and Path(filepath).exists()
    if not file_ready:
        flash("Export failed. No results to export.", "danger")
        return redirect(url_for("jobs.detail", job_id=job_id))

    return send_file(filepath, mimetype=mimetype, as_attachment=True)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
# ---------------------------------------------------------------------------
|
| 167 |
+
# REST API Routes
|
| 168 |
+
# ---------------------------------------------------------------------------
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
@jobs_bp.route("/api/jobs", methods=["GET"])
def api_list():
    """Return one page of jobs as JSON.

    Query parameters:
        page: page number, default 1.
        per_page: page size, default 20, clamped to the range 1..100.
        status, search: optional filters passed through to ``list_jobs``.

    The response carries the serialized jobs plus the pagination counters
    ``total``, ``page``, ``pages`` and ``per_page``.
    """
    page = request.args.get("page", 1, type=int)
    # Clamp both ends so zero or negative page sizes never reach the query layer.
    per_page = max(1, min(request.args.get("per_page", 20, type=int), 100))
    status = request.args.get("status")
    search = request.args.get("search")
    pagination = list_jobs(page=page, per_page=per_page, status=status, search=search)
    return jsonify(
        {
            "jobs": [j.to_dict() for j in pagination.items],
            "total": pagination.total,
            "page": pagination.page,
            "pages": pagination.pages,
            "per_page": pagination.per_page,
        }
    )
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
@jobs_bp.route("/api/jobs", methods=["POST"])
def api_create():
    """Create a job from a JSON object body and start it asynchronously.

    Returns:
        400 when the body is missing, unparsable, empty, or not a JSON object;
        422 with per-field details when validation fails;
        201 with the serialized job on success.
    """
    data = request.get_json(silent=True)
    # Valid JSON may still be a list/string/number; only a non-empty object is
    # a usable job description for the validator and the service layer.
    if not isinstance(data, dict) or not data:
        return jsonify({"error": "JSON body required"}), 400

    # NOTE(review): the HTML form route runs sanitize_string() over its text
    # fields before validation; this endpoint does not — confirm that is intended.
    valid, errors = validate_job_data(data)
    if not valid:
        return jsonify({"error": "Validation failed", "details": errors}), 422

    job = create_job(data)
    # Pass the real app object (not the context-local proxy) to the async runner.
    execute_job_async(job.id, current_app._get_current_object())
    return jsonify({"job": job.to_dict(), "message": "Job created and queued"}), 201
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
@jobs_bp.route("/api/jobs/<int:job_id>", methods=["GET"])
def api_get(job_id: int):
    """Return one job as JSON, or a 404 JSON error when it does not exist."""
    job = get_job(job_id)
    if job:
        payload = {"job": job.to_dict()}
        return jsonify(payload)
    return jsonify({"error": "Job not found"}), 404
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
@jobs_bp.route("/api/jobs/<int:job_id>", methods=["DELETE"])
def api_delete(job_id: int):
    """Delete a job; respond 200 on success or 404 when ``delete_job`` reports failure."""
    deleted = delete_job(job_id)
    if deleted:
        return jsonify({"message": f"Job {job_id} deleted"}), 200
    return jsonify({"error": "Job not found"}), 404
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
@jobs_bp.route("/api/jobs/<int:job_id>/results", methods=["GET"])
def api_results(job_id: int):
    """Return one page of a job's results as JSON.

    Query parameters:
        page: page number, default 1.
        per_page: page size, default 50, clamped to the range 1..500.

    Responds with a 404 JSON error when the job does not exist.
    """
    job = get_job(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404
    page = request.args.get("page", 1, type=int)
    # Clamp both ends so zero or negative page sizes never reach the query layer.
    per_page = max(1, min(request.args.get("per_page", 50, type=int), 500))
    pagination = get_job_results(job_id, page=page, per_page=per_page)
    return jsonify(
        {
            "results": [r.to_dict() for r in pagination.items],
            "total": pagination.total,
            "page": pagination.page,
            "pages": pagination.pages,
        }
    )
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
@jobs_bp.route("/api/jobs/<int:job_id>/logs", methods=["GET"])
def api_logs(job_id: int):
    """Return one page (100 entries) of a job's logs as JSON.

    The page number comes from the ``page`` query parameter (default 1).
    Responds with a 404 JSON error when the job does not exist.
    """
    if not get_job(job_id):
        return jsonify({"error": "Job not found"}), 404

    page_num = request.args.get("page", 1, type=int)
    pagination = get_job_logs(job_id, page=page_num, per_page=100)
    payload = {
        "logs": [entry.to_dict() for entry in pagination.items],
        "total": pagination.total,
    }
    return jsonify(payload)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
@jobs_bp.route("/api/jobs/<int:job_id>/status", methods=["GET"])
def api_status(job_id: int):
    """Lightweight polling endpoint: status and progress counters of one job.

    Responds with a 404 JSON error when the job does not exist.
    """
    job = get_job(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    snapshot = {
        "status": job.status,
        "total_items": job.total_items,
        "pages_scraped": job.pages_scraped,
        "error_count": job.error_count,
    }
    return jsonify(snapshot)
|
app/routes/main.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dashboard and main UI routes.
|
| 3 |
+
Blueprint: 'main' — prefix: /
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
from flask import Blueprint, render_template, jsonify
|
| 10 |
+
|
| 11 |
+
from app.services import get_dashboard_stats
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
main_bp = Blueprint("main", __name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@main_bp.route("/")
def index():
    """Render the dashboard page with the current dashboard statistics."""
    context = {"stats": get_dashboard_stats()}
    return render_template("pages/dashboard.html", **context)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@main_bp.route("/health")
def health():
    """Health check endpoint for load balancers and monitoring.

    Always answers HTTP 200 with a fixed JSON payload.
    """
    payload = {"status": "ok", "service": "WebScraper Platform"}
    return jsonify(payload), 200
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@main_bp.route("/metrics")
def metrics():
    """Return the dashboard statistics as JSON with HTTP 200."""
    body = jsonify(get_dashboard_stats())
    return body, 200
|
app/scrapers/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .engine import run_scrape, ScrapeRequest, ScrapeResponse, ScrapedItem
|
| 2 |
+
|
| 3 |
+
__all__ = ["run_scrape", "ScrapeRequest", "ScrapeResponse", "ScrapedItem"]
|
app/scrapers/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (283 Bytes). View file
|
|
|
app/scrapers/__pycache__/engine.cpython-310.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
app/scrapers/engine.py
ADDED
|
@@ -0,0 +1,572 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core scraping engine.
|
| 3 |
+
Handles static (Requests + BS4) and dynamic (Playwright) scraping.
|
| 4 |
+
Implements UA rotation, retry logic, robots.txt checking, pagination, and infinite scroll.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import hashlib
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import re
|
| 12 |
+
import time
|
| 13 |
+
import urllib.robotparser
|
| 14 |
+
from dataclasses import dataclass, field
|
| 15 |
+
from typing import Any, Generator, Optional
|
| 16 |
+
from urllib.parse import urljoin, urlparse
|
| 17 |
+
|
| 18 |
+
import requests
|
| 19 |
+
from bs4 import BeautifulSoup
|
| 20 |
+
from fake_useragent import UserAgent
|
| 21 |
+
from tenacity import (
|
| 22 |
+
retry,
|
| 23 |
+
retry_if_exception_type,
|
| 24 |
+
stop_after_attempt,
|
| 25 |
+
wait_exponential,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
logger = logging.getLogger(__name__)
|
| 29 |
+
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
# Data structures
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class ScrapeRequest:
    """Configuration for one scrape run; only ``url`` is required."""

    # --- Target ---
    url: str

    # --- Element selection (all optional) ---
    html_tag: Optional[str] = None
    css_selector: Optional[str] = None
    xpath_selector: Optional[str] = None
    attribute_name: Optional[str] = None

    # --- Mode ---
    # One of: text | images | links | attributes | table | json_ld | full_html
    extraction_type: str = "text"
    # One of: static | dynamic
    scrape_type: str = "static"

    # --- Paging and scrolling ---
    follow_pagination: bool = False
    max_pages: int = 1
    infinite_scroll: bool = False
    scroll_count: int = 3

    # --- Fetch options ---
    download_images: bool = False
    custom_headers: dict = field(default_factory=dict)  # fresh dict per instance
    user_agent: Optional[str] = None
    delay_seconds: float = 1.0
    timeout_seconds: int = 30
    max_retries: int = 3

    # --- Policy flags ---
    check_robots_txt: bool = True
    deduplicate: bool = True
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass
class ScrapedItem:
    """One extracted value together with where it was found."""

    # Extracted payload and its kind label
    content: str
    content_type: str

    # Origin of the item
    page_url: str
    page_num: int
    item_index: int

    # Hash string of ``content``
    content_hash: str

    # Optional extras; each instance gets its own dict
    metadata: dict = field(default_factory=dict)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@dataclass
class ScrapeResponse:
    """Aggregate outcome of a scrape run as returned by the engine."""

    # Everything that was extracted
    items: list[ScrapedItem]

    # Counters
    pages_scraped: int
    error_count: int

    # Error messages collected during the run
    errors: list[str]

    # Run duration in seconds
    duration_seconds: float
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
# User-Agent rotation
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
|
| 88 |
+
# Single fallback User-Agent, used both as fake_useragent's own fallback and
# when fake_useragent fails outright, so every path yields the same
# complete browser string.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Lazily created, process-wide UserAgent provider.
_ua_instance: Optional[UserAgent] = None


def _get_user_agent(preferred: Optional[str] = None) -> str:
    """Return the User-Agent string to send with a request.

    Args:
        preferred: Caller-supplied User-Agent; returned unchanged when truthy.

    Returns:
        ``preferred`` if given, otherwise a random User-Agent from the shared
        ``UserAgent`` instance, or ``_DEFAULT_USER_AGENT`` when that fails.
    """
    global _ua_instance
    if preferred:
        return preferred
    try:
        if _ua_instance is None:
            _ua_instance = UserAgent(fallback=_DEFAULT_USER_AGENT)
        return _ua_instance.random
    except Exception:
        # Deliberately best-effort: UA rotation must never abort a scrape.
        return _DEFAULT_USER_AGENT
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
# robots.txt checker
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _is_allowed_by_robots(url: str, user_agent: str = "*", timeout: float = 10.0) -> bool:
    """Check whether a URL is allowed by the target site's robots.txt.

    robots.txt is fetched with an explicit *timeout* so an unresponsive host
    cannot block the caller indefinitely. HTTP error statuses are treated the
    same way ``RobotFileParser.read()`` treats them: 401/403 disallow
    everything, other 4xx allow everything, and any other HTTP error leaves
    the rules unread, which disallows the URL.

    Args:
        url: Page URL about to be fetched.
        user_agent: User-Agent to evaluate the rules for.
        timeout: Seconds to wait for the robots.txt response.

    Returns:
        ``True`` when fetching is permitted, or when the check itself fails
        for a non-HTTP reason (network error, bad URL, undecodable body).
    """
    import urllib.error
    import urllib.request

    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        try:
            with urllib.request.urlopen(robots_url, timeout=timeout) as resp:
                raw = resp.read()
        except urllib.error.HTTPError as err:
            err.close()
            if err.code in (401, 403):
                return False
            # Remaining 4xx: no usable robots.txt, so allow; anything else: disallow.
            return 400 <= err.code < 500

        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.parse(raw.decode("utf-8").splitlines())
        return rp.can_fetch(user_agent, url)
    except Exception as exc:
        logger.warning("robots.txt check failed for %s: %s", url, exc)
        return True  # Fail open: a broken robots.txt check must not block the scrape
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
# Content hashing for deduplication
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _content_hash(content: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *content*.

    The text is encoded as UTF-8; characters that cannot be encoded are
    replaced rather than raising.
    """
    encoded = content.encode("utf-8", errors="replace")
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:16]
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ---------------------------------------------------------------------------
|
| 132 |
+
# HTML Parsers
|
| 133 |
+
# ---------------------------------------------------------------------------
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _parse_html(html: str) -> BeautifulSoup:
    """Parse *html* with the ``lxml`` parser, falling back to ``html.parser``.

    Any exception from the first attempt triggers the fallback; an exception
    from the fallback itself propagates to the caller.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        soup = BeautifulSoup(html, "html.parser")
    return soup
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _extract_with_css(soup: BeautifulSoup, selector: str) -> list[Any]:
    """Return whatever ``soup.select`` yields for the CSS *selector*."""
    matches = soup.select(selector)
    return matches
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _extract_with_xpath(html: str, xpath: str) -> list[str]:
    """Evaluate *xpath* against *html* via lxml and return the non-empty texts.

    Each result is reduced to stripped text: strings as-is, objects with
    ``text_content()`` via that method, objects with ``.text`` via that
    attribute. Anything else, and anything that ends up empty, is dropped.
    Any failure (lxml import, parsing, evaluation) is logged as a warning
    and yields an empty list.
    """

    def _as_text(node: Any) -> str:
        # Empty string means "nothing usable"; it is filtered out below.
        if isinstance(node, str):
            return node.strip()
        if hasattr(node, "text_content"):
            return node.text_content().strip()
        if hasattr(node, "text"):
            return (node.text or "").strip()
        return ""

    try:
        from lxml import etree

        root = etree.fromstring(html.encode(), parser=etree.HTMLParser())
        candidates = (_as_text(node) for node in root.xpath(xpath))
        return [text for text in candidates if text]
    except Exception as exc:
        logger.warning("XPath extraction failed: %s", exc)
        return []
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _extract_json_ld(soup: BeautifulSoup) -> list[dict]:
    """Decode every ``<script type="application/ld+json">`` tag in *soup*.

    Tags whose content is empty or not valid JSON are skipped silently.
    Results are returned in document order.
    """
    decoded: list[dict] = []
    scripts = soup.find_all("script", type="application/ld+json")
    for script in scripts:
        raw = script.string or ""
        try:
            decoded.append(json.loads(raw))
        except (json.JSONDecodeError, TypeError):
            continue
    return decoded
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _extract_tables(soup: BeautifulSoup) -> list[list[list[str]]]:
    """Return every ``<table>`` in *soup* as a list of rows of cell texts.

    A row holds the stripped text of its ``<th>``/``<td>`` cells. Rows with
    no cells and tables with no rows are left out.
    """
    extracted: list[list[list[str]]] = []
    for table in soup.find_all("table"):
        rows = [
            cells
            for tr in table.find_all("tr")
            if (cells := [cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])])
        ]
        if rows:
            extracted.append(rows)
    return extracted
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# ---------------------------------------------------------------------------
|
| 193 |
+
# Content extractors
|
| 194 |
+
# ---------------------------------------------------------------------------
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _extract_items(
    soup: BeautifulSoup,
    raw_html: str,
    req: ScrapeRequest,
    page_url: str,
    page_num: int,
) -> list[ScrapedItem]:
    """Route extraction to the appropriate extractor.

    Dispatch order:
      1. ``json_ld``, ``table`` and ``full_html`` extraction types work on the
         whole document and ignore any selector.
      2. If an XPath selector is set, its text results are returned with
         content type ``"text"`` regardless of ``req.extraction_type``.
      3. Otherwise elements are selected by CSS selector, else by tag name,
         else all elements, and each is reduced to a string according to
         ``req.extraction_type``. Elements yielding nothing are skipped.

    Args:
        soup: Parsed document.
        raw_html: Unparsed HTML of the same page (used for XPath).
        req: Scrape configuration.
        page_url: URL of the page; base for resolving relative links.
        page_num: Page number recorded on each item.

    Returns:
        The extracted items in document order. ``item_index`` is the position
        among the selected elements, so it can have gaps where elements were
        skipped.
    """
    kind = req.extraction_type

    if kind == "json_ld":
        return [
            _make_item(json.dumps(data, ensure_ascii=False), "json_ld", page_url, page_num, idx)
            for idx, data in enumerate(_extract_json_ld(soup))
        ]

    if kind == "table":
        return [
            _make_item(json.dumps(table, ensure_ascii=False), "table", page_url, page_num, idx)
            for idx, table in enumerate(_extract_tables(soup))
        ]

    if kind == "full_html":
        return [_make_item(str(soup), "html", page_url, page_num, 0)]

    if req.xpath_selector:
        texts = _extract_with_xpath(raw_html, req.xpath_selector)
        return [
            _make_item(text, "text", page_url, page_num, idx)
            for idx, text in enumerate(texts)
        ]

    # --- Resolve elements via CSS or Tag, falling back to every element ---
    if req.css_selector:
        elements = _extract_with_css(soup, req.css_selector)
    elif req.html_tag:
        elements = soup.find_all(req.html_tag)
    else:
        elements = soup.find_all(True)  # All elements

    items: list[ScrapedItem] = []
    for idx, el in enumerate(elements):
        content = _element_content(el, req, page_url)
        if content is None:
            continue
        content = content.strip()
        if not content:
            continue
        items.append(_make_item(content, kind, page_url, page_num, idx))
    return items


def _make_item(
    content: str,
    content_type: str,
    page_url: str,
    page_num: int,
    item_index: int,
) -> ScrapedItem:
    """Build a ScrapedItem whose ``content_hash`` is derived from *content*."""
    return ScrapedItem(
        content=content,
        content_type=content_type,
        page_url=page_url,
        page_num=page_num,
        item_index=item_index,
        content_hash=_content_hash(content),
    )


def _element_content(el: Any, req: ScrapeRequest, page_url: str) -> Optional[str]:
    """Return the extracted string for one element, or None when it has no target."""
    kind = req.extraction_type

    if kind == "links":
        # Use the element's own href if it is an <a>, else its first nested <a>.
        href = el.get("href", "") if el.name == "a" else ""
        if not href:
            anchor = el.find("a")
            href = anchor.get("href", "") if anchor else ""
        return urljoin(page_url, href) if href else None

    if kind == "images":
        # Use the element's own src if it is an <img>, else its first nested <img>.
        src = el.get("src", "") if el.name == "img" else ""
        if not src:
            image = el.find("img")
            src = image.get("src", "") if image else ""
        return urljoin(page_url, src) if src else None

    if kind == "attributes":
        if not req.attribute_name:
            return json.dumps(dict(el.attrs), ensure_ascii=False)
        value = el.get(req.attribute_name, "")
        # BeautifulSoup returns multi-valued attributes (class, rel, ...) as a
        # list; join them so the caller's .strip() cannot fail.
        if isinstance(value, (list, tuple)):
            value = " ".join(value)
        return value

    # "text" and any unrecognized extraction type: visible text of the element.
    return el.get_text(separator=" ", strip=True)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
# ---------------------------------------------------------------------------
|
| 324 |
+
# Static scraper (Requests + BeautifulSoup)
|
| 325 |
+
# ---------------------------------------------------------------------------
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
class StaticScraper:
    """Scrape pages over plain HTTP using one shared ``requests.Session``.

    Fetches are retried on connection errors and timeouts, the User-Agent
    header is re-resolved through ``_get_user_agent`` before every request,
    and pagination is followed by searching the fetched HTML for a "next" link.
    """

    def __init__(self, req: ScrapeRequest) -> None:
        """Keep the request and build a session that carries its headers."""
        self.req = req
        self.session = requests.Session()
        self._configure_session()

    def _configure_session(self) -> None:
        """Install browser-like default headers, then any caller overrides."""
        base_headers = {
            "User-Agent": _get_user_agent(self.req.user_agent),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        self.session.headers.update(base_headers)

        # Custom headers are applied last so they win over the defaults.
        overrides = self.req.custom_headers
        if overrides:
            self.session.headers.update(overrides)

    # NOTE(review): the attempt limit is fixed at 3 here; ``req.max_retries``
    # is not read anywhere in this class.
    @retry(
        retry=retry_if_exception_type((requests.ConnectionError, requests.Timeout)),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    def _fetch(self, url: str) -> requests.Response:
        """GET ``url`` (following redirects) and raise on a 4xx/5xx status."""
        resp = self.session.get(
            url,
            allow_redirects=True,
            timeout=self.req.timeout_seconds,
        )
        resp.raise_for_status()
        return resp

    def scrape(self) -> Generator[tuple[str, str, int], None, None]:
        """Yield ``(html, page_url, page_num)`` for every page fetched.

        ``page_url`` is the response's final URL (after redirects) and
        ``page_num`` starts at 1. Any fetch error is logged and re-raised.
        """
        target = self.req.url
        page_no = 1

        while target and page_no <= self.req.max_pages:
            try:
                # Re-resolve the User-Agent for each request.
                self.session.headers["User-Agent"] = _get_user_agent(self.req.user_agent)
                resp = self._fetch(target)
                markup = resp.text
                yield markup, resp.url, page_no

                keep_going = self.req.follow_pagination and page_no < self.req.max_pages
                if not keep_going:
                    break

                # Look for a link to the following page in the fetched HTML.
                candidate = _find_next_page(_parse_html(markup), resp.url)
                if not candidate or candidate == target:
                    break

                target = candidate
                page_no += 1
                time.sleep(self.req.delay_seconds)

            except requests.HTTPError as exc:
                logger.error("HTTP error fetching %s: %s", target, exc)
                raise
            except Exception as exc:
                logger.error("Error fetching %s: %s", target, exc)
                raise

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def _find_next_page(soup: BeautifulSoup, current_url: str) -> Optional[str]:
|
| 399 |
+
"""Heuristic: find next pagination link."""
|
| 400 |
+
patterns = [
|
| 401 |
+
"a[rel='next']",
|
| 402 |
+
"a.next",
|
| 403 |
+
"a.pagination-next",
|
| 404 |
+
"li.next a",
|
| 405 |
+
"a[aria-label='Next']",
|
| 406 |
+
".next-page a",
|
| 407 |
+
"#next a",
|
| 408 |
+
]
|
| 409 |
+
for sel in patterns:
|
| 410 |
+
el = soup.select_one(sel)
|
| 411 |
+
if el and el.get("href"):
|
| 412 |
+
return urljoin(current_url, el["href"])
|
| 413 |
+
|
| 414 |
+
# Fallback: look for links with text "next"
|
| 415 |
+
for a in soup.find_all("a", href=True):
|
| 416 |
+
text = a.get_text(strip=True).lower()
|
| 417 |
+
if text in ("next", "next »", "»", "›", "next page"):
|
| 418 |
+
return urljoin(current_url, a["href"])
|
| 419 |
+
|
| 420 |
+
return None
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
# ---------------------------------------------------------------------------
|
| 424 |
+
# Dynamic scraper (Playwright)
|
| 425 |
+
# ---------------------------------------------------------------------------
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
class DynamicScraper:
    """Playwright-based scraper for JS-rendered content with infinite scroll support."""

    def scrape(self, req: ScrapeRequest) -> Generator[tuple[str, str, int], None, None]:
        """Yield ``(html, page_url, page_num)`` for each rendered page.

        A headless Chromium instance is launched for the duration of the
        iteration. The browser context and the browser are closed in
        ``finally`` blocks, so they are released when a page fails or the
        consumer stops iterating early, not only on normal completion.

        Raises:
            RuntimeError: if Playwright cannot be imported.
            Exception: any Playwright error raised while loading a page is
                logged and re-raised.
        """
        try:
            from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
        except ImportError as exc:
            raise RuntimeError("Playwright not installed. Run: playwright install chromium") from exc

        ua = _get_user_agent(req.user_agent)

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
            try:
                context = browser.new_context(
                    user_agent=ua,
                    viewport={"width": 1920, "height": 1080},
                    extra_http_headers=req.custom_headers or {},
                )
                try:
                    page = context.new_page()
                    # Playwright timeouts are in milliseconds.
                    page.set_default_timeout(req.timeout_seconds * 1000)
                    yield from self._iter_pages(page, req, PWTimeout)
                finally:
                    context.close()
            finally:
                browser.close()

    def _iter_pages(self, page, req: ScrapeRequest, timeout_error) -> Generator[tuple[str, str, int], None, None]:
        """Navigate page by page, yielding the rendered HTML of each one.

        ``timeout_error`` is Playwright's timeout exception class; it is passed
        in because Playwright is imported lazily inside ``scrape``.
        """
        url = req.url
        page_num = 1

        while url and page_num <= req.max_pages:
            try:
                page.goto(url, wait_until="networkidle", timeout=req.timeout_seconds * 1000)

                if req.infinite_scroll:
                    _perform_infinite_scroll(page, req.scroll_count)

                html = page.content()
                final_url = page.url
                yield html, final_url, page_num

                if not req.follow_pagination or page_num >= req.max_pages:
                    break

                # Resolve the next page from the live DOM.
                next_url = _playwright_next_page(page, final_url)
                if not next_url or next_url == url:
                    break

                url = next_url
                page_num += 1
                time.sleep(req.delay_seconds)

            except timeout_error as exc:
                logger.error("Playwright timeout on %s: %s", url, exc)
                raise
            except Exception as exc:
                logger.error("Playwright error on %s: %s", url, exc)
                raise
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
def _perform_infinite_scroll(page, scroll_count: int) -> None:
|
| 487 |
+
"""Scroll to bottom repeatedly to trigger lazy loading."""
|
| 488 |
+
for _ in range(scroll_count):
|
| 489 |
+
prev_height = page.evaluate("document.body.scrollHeight")
|
| 490 |
+
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
| 491 |
+
page.wait_for_timeout(1500)
|
| 492 |
+
new_height = page.evaluate("document.body.scrollHeight")
|
| 493 |
+
if new_height == prev_height:
|
| 494 |
+
break
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
def _playwright_next_page(page, current_url: str) -> Optional[str]:
|
| 498 |
+
"""Try to find and return next page URL from Playwright page."""
|
| 499 |
+
selectors = ["a[rel='next']", "a.next", "li.next a", "[aria-label='Next']"]
|
| 500 |
+
for sel in selectors:
|
| 501 |
+
try:
|
| 502 |
+
el = page.query_selector(sel)
|
| 503 |
+
if el:
|
| 504 |
+
href = el.get_attribute("href")
|
| 505 |
+
if href:
|
| 506 |
+
return urljoin(current_url, href)
|
| 507 |
+
except Exception:
|
| 508 |
+
pass
|
| 509 |
+
return None
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
# ---------------------------------------------------------------------------
|
| 513 |
+
# Main engine entrypoint
|
| 514 |
+
# ---------------------------------------------------------------------------
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
def run_scrape(req: ScrapeRequest) -> ScrapeResponse:
    """
    Execute a scrape job and return structured results.

    Handles robots.txt, deduplication, pagination, and error accounting.

    The function does not raise for scrape failures: any exception raised
    while fetching or extracting is logged, recorded in ``errors`` and the
    items collected so far are still returned. The page generator and the
    scraper are always released in ``finally``, including on failure.
    """
    start_time = time.monotonic()
    items: list[ScrapedItem] = []
    errors: list[str] = []
    pages_scraped = 0
    seen_hashes: set[str] = set()

    # robots.txt
    if req.check_robots_txt and not _is_allowed_by_robots(req.url):
        return ScrapeResponse(
            items=[],
            pages_scraped=0,
            error_count=1,
            errors=[f"robots.txt disallows scraping: {req.url}"],
            duration_seconds=time.monotonic() - start_time,
        )

    scraper = None
    pages = None
    try:
        if req.scrape_type == "dynamic":
            scraper = DynamicScraper()
            pages = scraper.scrape(req)
        else:
            scraper = StaticScraper(req)
            pages = scraper.scrape()

        for html, page_url, page_num in pages:
            pages_scraped += 1
            soup = _parse_html(html)
            page_items = _extract_items(soup, html, req, page_url, page_num)

            for item in page_items:
                if req.deduplicate and item.content_hash in seen_hashes:
                    continue
                seen_hashes.add(item.content_hash)
                items.append(item)

    except Exception as exc:
        error_msg = f"Scrape failed: {type(exc).__name__}: {exc}"
        logger.exception(error_msg)
        errors.append(error_msg)

    finally:
        # Close the generator first so a suspended scraper can unwind its own
        # resources, then close the scraper itself if it exposes close().
        for resource in (pages, scraper):
            closer = getattr(resource, "close", None)
            if closer is None:
                continue
            try:
                closer()
            except Exception as exc:
                # Cleanup problems must not discard the collected results.
                logger.warning("Cleanup after scraping %s failed: %s", req.url, exc)

    return ScrapeResponse(
        items=items,
        pages_scraped=pages_scraped,
        error_count=len(errors),
        errors=errors,
        duration_seconds=time.monotonic() - start_time,
    )
|
app/services/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .job_service import (
|
| 2 |
+
create_job,
|
| 3 |
+
get_job,
|
| 4 |
+
list_jobs,
|
| 5 |
+
delete_job,
|
| 6 |
+
cancel_job,
|
| 7 |
+
execute_job_async,
|
| 8 |
+
get_job_logs,
|
| 9 |
+
get_job_results,
|
| 10 |
+
get_dashboard_stats,
|
| 11 |
+
)
|
| 12 |
+
from .export_service import export_json, export_csv, export_excel, get_job_exports
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
"create_job",
|
| 16 |
+
"get_job",
|
| 17 |
+
"list_jobs",
|
| 18 |
+
"delete_job",
|
| 19 |
+
"cancel_job",
|
| 20 |
+
"execute_job_async",
|
| 21 |
+
"get_job_logs",
|
| 22 |
+
"get_job_results",
|
| 23 |
+
"get_dashboard_stats",
|
| 24 |
+
"export_json",
|
| 25 |
+
"export_csv",
|
| 26 |
+
"export_excel",
|
| 27 |
+
"get_job_exports",
|
| 28 |
+
]
|
app/services/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (591 Bytes). View file
|
|
|
app/services/__pycache__/export_service.cpython-310.pyc
ADDED
|
Binary file (5.21 kB). View file
|
|
|
app/services/__pycache__/job_service.cpython-310.pyc
ADDED
|
Binary file (8.15 kB). View file
|
|
|
app/services/export_service.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Export service — generates JSON, CSV, and Excel files from scraped results.
|
| 3 |
+
Files are stored in the exports/ directory and tracked in the database.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import csv
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
from datetime import datetime, timezone
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
import openpyxl
|
| 16 |
+
from openpyxl.styles import Font, PatternFill, Alignment
|
| 17 |
+
|
| 18 |
+
from app.models import db, ScrapeJob, ScrapeResult, ExportRecord
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
EXPORT_DIR = Path("exports")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _ensure_export_dir() -> Path:
    """Create the export directory (with parents) if needed and return it."""
    target = EXPORT_DIR
    target.mkdir(exist_ok=True, parents=True)
    return target
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _get_results(job_id: int) -> list[ScrapeResult]:
    """Return every result row of a job, ordered by page then item index."""
    for_job = ScrapeResult.query.filter_by(job_id=job_id)
    ordered = for_job.order_by(ScrapeResult.page_num, ScrapeResult.item_index)
    return ordered.all()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _record_export(job_id: int, fmt: str, filename: str, filepath: str, row_count: int) -> ExportRecord:
    """Persist an ``ExportRecord`` describing a generated export file.

    The file size is read from disk; a missing file is recorded as 0 bytes.
    The record is committed before it is returned.
    """
    if os.path.exists(filepath):
        size_on_disk = os.path.getsize(filepath)
    else:
        size_on_disk = 0

    entry = ExportRecord(
        job_id=job_id,
        format=fmt,
        filename=filename,
        filepath=filepath,
        file_size_bytes=size_on_disk,
        row_count=row_count,
    )
    db.session.add(entry)
    db.session.commit()
    return entry
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def export_json(job_id: int) -> Optional[str]:
    """Write a job's results to a timestamped JSON file.

    Returns the path of the written file, or ``None`` when the job does not
    exist. The export is also registered through ``_record_export``.
    """
    job = ScrapeJob.query.get(job_id)
    if not job:
        return None

    rows = _get_results(job_id)
    out_dir = _ensure_export_dir()
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"job_{job_id}_{stamp}.json"
    filepath = str(out_dir / filename)

    payload = {
        "job": job.to_dict(),
        "total_items": len(rows),
        "exported_at": datetime.now(timezone.utc).isoformat(),
        "results": [row.to_dict() for row in rows],
    }

    # ensure_ascii=False keeps non-ASCII scraped text readable in the file.
    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)

    _record_export(job_id, "json", filename, filepath, len(rows))
    logger.info("JSON export for job %s: %s", job_id, filepath)
    return filepath
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def export_csv(job_id: int) -> Optional[str]:
    """Write a job's results to a timestamped CSV file.

    Returns the path of the written file, or ``None`` when the job does not
    exist. Only the columns listed in ``fieldnames`` are written; any other
    keys of a result's dict are ignored.
    """
    job = ScrapeJob.query.get(job_id)
    if not job:
        return None

    rows = _get_results(job_id)
    out_dir = _ensure_export_dir()
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"job_{job_id}_{stamp}.csv"
    filepath = str(out_dir / filename)

    fieldnames = ["id", "job_id", "page_num", "item_index", "page_url", "content_type", "content", "created_at"]

    def _as_csv_rows():
        # Drop the metadata blob; it has no column in the CSV layout.
        for result in rows:
            record = result.to_dict()
            record.pop("metadata", None)
            yield record

    # utf-8-sig writes a BOM so Excel detects the encoding.
    with open(filepath, "w", newline="", encoding="utf-8-sig") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(_as_csv_rows())

    _record_export(job_id, "csv", filename, filepath, len(rows))
    logger.info("CSV export for job %s: %s", job_id, filepath)
    return filepath
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _xlsx_safe(value):
    """Strip ASCII control characters that openpyxl refuses to write.

    Non-string values are returned unchanged. Tab, line feed and carriage
    return are kept.
    """
    import re  # local import: only this helper needs it

    if not isinstance(value, str):
        return value
    return re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", value)


def export_excel(job_id: int) -> Optional[str]:
    """Export results as a styled Excel (.xlsx) file.

    The workbook has a "Results" sheet (one row per scraped item) and a
    "Summary" sheet listing the job's fields. Returns the path of the written
    file, or ``None`` when the job does not exist.

    Scraped text may contain control characters, which openpyxl rejects with
    ``IllegalCharacterError``; they are stripped so one bad item cannot abort
    the whole export.
    """
    job = ScrapeJob.query.get(job_id)
    if not job:
        return None

    results = _get_results(job_id)
    export_dir = _ensure_export_dir()
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"job_{job_id}_{ts}.xlsx"
    filepath = str(export_dir / filename)

    wb = openpyxl.Workbook()

    # --- Results sheet ---
    ws = wb.active
    ws.title = "Results"

    header_fill = PatternFill("solid", fgColor="1A1A2E")
    header_font = Font(color="FFFFFF", bold=True)
    headers = ["#", "Page", "Index", "URL", "Type", "Content", "Scraped At"]

    for col_idx, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_idx, value=header)
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal="center")

    for row_idx, r in enumerate(results, 2):
        ws.cell(row=row_idx, column=1, value=r.id)
        ws.cell(row=row_idx, column=2, value=r.page_num)
        ws.cell(row=row_idx, column=3, value=r.item_index)
        ws.cell(row=row_idx, column=4, value=_xlsx_safe(r.page_url))
        ws.cell(row=row_idx, column=5, value=_xlsx_safe(r.content_type))
        # Sanitize first, then truncate to the Excel cell limit.
        ws.cell(row=row_idx, column=6, value=_xlsx_safe(r.content or "")[:32767])
        ws.cell(row=row_idx, column=7, value=r.created_at.isoformat() if r.created_at else "")

    ws.column_dimensions["D"].width = 40
    ws.column_dimensions["F"].width = 60

    # --- Summary sheet ---
    ws2 = wb.create_sheet("Summary")
    job_dict = job.to_dict()
    ws2.cell(1, 1, "Field").font = Font(bold=True)
    ws2.cell(1, 2, "Value").font = Font(bold=True)
    for i, (k, v) in enumerate(job_dict.items(), 2):
        ws2.cell(i, 1, _xlsx_safe(k))
        ws2.cell(i, 2, _xlsx_safe(str(v)))
    ws2.column_dimensions["A"].width = 25
    ws2.column_dimensions["B"].width = 50

    wb.save(filepath)
    _record_export(job_id, "excel", filename, filepath, len(results))
    logger.info("Excel export for job %s: %s", job_id, filepath)
    return filepath
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def get_job_exports(job_id: int) -> list[ExportRecord]:
    """Return the export records of a job, newest first."""
    for_job = ExportRecord.query.filter_by(job_id=job_id)
    newest_first = for_job.order_by(ExportRecord.created_at.desc())
    return newest_first.all()
|
app/services/job_service.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Service layer for scrape job management.
|
| 3 |
+
Handles job creation, execution, status updates, and result persistence.
|
| 4 |
+
Keeps routes thin and business logic centralized here.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import threading
|
| 11 |
+
from datetime import datetime, timezone
|
| 12 |
+
from typing import Any, Optional
|
| 13 |
+
|
| 14 |
+
from app.models import db, ScrapeJob, ScrapeResult, JobLog, ExportRecord
|
| 15 |
+
from app.scrapers.engine import ScrapeRequest, run_scrape, ScrapedItem
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def utcnow() -> datetime:
    """Return the current UTC time as a naive datetime (``tzinfo`` removed)."""
    aware_now = datetime.now(tz=timezone.utc)
    return aware_now.replace(tzinfo=None)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Job CRUD
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def create_job(data: dict, user_id: Optional[int] = None) -> ScrapeJob:
    """Create and persist a new ScrapeJob from validated form/API data.

    Args:
        data: Job settings. ``url`` is required; every other key falls back
            to a default. ``custom_headers`` may be a dict or a JSON string.
        user_id: Owner of the job, if any.

    Returns:
        The committed ``ScrapeJob`` in ``pending`` state.

    Raises:
        KeyError: if ``data`` has no ``url``.
    """
    custom_headers = data.get("custom_headers") or {}
    if isinstance(custom_headers, str):
        try:
            custom_headers = json.loads(custom_headers)
        except json.JSONDecodeError:
            custom_headers = {}
    # JSON such as "[1, 2]" or "3" decodes fine but is not a header mapping.
    if not isinstance(custom_headers, dict):
        custom_headers = {}

    url = data["url"]
    # ``or`` rather than a .get() default, so an empty or None name (for
    # example a blank form field) also gets the generated name.
    name = data.get("name") or f"Job - {url[:50]}"

    job = ScrapeJob(
        user_id=user_id,
        name=name,
        url=url,
        html_tag=data.get("html_tag") or None,
        css_selector=data.get("css_selector") or None,
        xpath_selector=data.get("xpath_selector") or None,
        attribute_name=data.get("attribute_name") or None,
        extraction_type=data.get("extraction_type", "text"),
        scrape_type=data.get("scrape_type", "static"),
        follow_pagination=bool(data.get("follow_pagination", False)),
        max_pages=int(data.get("max_pages", 1)),
        infinite_scroll=bool(data.get("infinite_scroll", False)),
        scroll_count=int(data.get("scroll_count", 3)),
        download_images=bool(data.get("download_images", False)),
        custom_headers=json.dumps(custom_headers),
        user_agent=data.get("user_agent") or None,
        delay_seconds=float(data.get("delay_seconds", 1.0)),
        timeout_seconds=int(data.get("timeout_seconds", 30)),
        max_retries=int(data.get("max_retries", 3)),
        check_robots_txt=bool(data.get("check_robots_txt", True)),
        deduplicate=bool(data.get("deduplicate", True)),
        status="pending",
    )
    db.session.add(job)
    db.session.commit()
    _add_log(job.id, "INFO", f"Job created: {job.name}")
    return job
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_job(job_id: int) -> Optional[ScrapeJob]:
    """Look a job up by primary key; the query's result is returned as is."""
    job = ScrapeJob.query.get(job_id)
    return job
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def list_jobs(
    page: int = 1,
    per_page: int = 20,
    status: Optional[str] = None,
    search: Optional[str] = None,
) -> Any:
    """Return one page of jobs, newest first.

    ``status`` restricts the listing to jobs in that state; ``search`` keeps
    jobs whose name or URL contains the text (case-insensitive). The value
    returned is whatever the query's ``paginate`` call produces.
    """
    query = ScrapeJob.query.order_by(ScrapeJob.created_at.desc())

    if status:
        query = query.filter(ScrapeJob.status == status)

    if search:
        name_matches = ScrapeJob.name.ilike(f"%{search}%")
        url_matches = ScrapeJob.url.ilike(f"%{search}%")
        query = query.filter(name_matches | url_matches)

    return query.paginate(page=page, per_page=per_page, error_out=False)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def delete_job(job_id: int) -> bool:
    """Delete a job; return ``True`` if it existed, ``False`` otherwise."""
    job = ScrapeJob.query.get(job_id)
    if job:
        db.session.delete(job)
        db.session.commit()
        return True
    return False
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def cancel_job(job_id: int) -> bool:
    """Mark a pending or running job as cancelled.

    Returns ``False`` when the job is missing or is in any other state;
    otherwise the new status is committed, a warning log entry is added and
    ``True`` is returned.
    """
    job = ScrapeJob.query.get(job_id)
    if not (job and job.status in ("pending", "running")):
        return False

    job.status = "cancelled"
    db.session.commit()
    _add_log(job_id, "WARNING", "Job cancelled by user")
    return True
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
# Execution
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def execute_job(job_id: int, app=None) -> None:
    """
    Run a scrape job synchronously.

    When ``app`` is given the job runs inside ``app.app_context()``, which is
    how it is meant to be used from a background thread; without it the job
    runs in whatever context the caller already has.
    """
    if not app:
        _run_job(job_id)
        return

    with app.app_context():
        _run_job(job_id)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def execute_job_async(job_id: int, app) -> threading.Thread:
    """Start ``execute_job`` on a daemon thread and return that thread."""
    worker = threading.Thread(
        target=execute_job,
        args=(job_id, app),
        daemon=True,
    )
    worker.start()
    return worker
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _run_job(job_id: int) -> None:
    """Run one scrape job end to end and persist its outcome.

    The job is marked ``running``, scraped through ``run_scrape``, its items
    are saved and its counters and final status are written back. Scrape
    errors reported by ``run_scrape`` are logged and make the job ``failed``.
    Any unexpected exception is caught here: the session is rolled back, the
    job is marked ``failed`` and the error is logged; nothing is re-raised.
    """
    job = ScrapeJob.query.get(job_id)
    if not job:
        logger.error("Job %s not found", job_id)
        return

    job.status = "running"
    job.started_at = utcnow()
    db.session.commit()

    _add_log(job_id, "INFO", f"Starting scrape: {job.url}")

    try:
        req = ScrapeRequest(
            url=job.url,
            html_tag=job.html_tag,
            css_selector=job.css_selector,
            xpath_selector=job.xpath_selector,
            attribute_name=job.attribute_name,
            extraction_type=job.extraction_type,
            scrape_type=job.scrape_type,
            follow_pagination=job.follow_pagination,
            max_pages=job.max_pages,
            infinite_scroll=job.infinite_scroll,
            scroll_count=job.scroll_count,
            download_images=job.download_images,
            custom_headers=job.custom_headers_dict,
            user_agent=job.user_agent,
            delay_seconds=job.delay_seconds,
            timeout_seconds=job.timeout_seconds,
            max_retries=job.max_retries,
            check_robots_txt=job.check_robots_txt,
            deduplicate=job.deduplicate,
        )

        response = run_scrape(req)

        # Persist results
        _save_results(job_id, response.items)

        # Log errors
        for err in response.errors:
            _add_log(job_id, "ERROR", err)

        # Reload the row so a cancellation issued while the scrape was
        # running is seen and not overwritten by the final status.
        db.session.refresh(job)
        if job.status != "cancelled":
            job.status = "completed" if response.error_count == 0 else "failed"
        job.total_items = len(response.items)
        job.pages_scraped = response.pages_scraped
        job.error_count = response.error_count
        job.duration_seconds = response.duration_seconds
        job.completed_at = utcnow()
        db.session.commit()

        _add_log(
            job_id,
            "INFO",
            f"Completed: {len(response.items)} items from {response.pages_scraped} pages "
            f"in {response.duration_seconds:.2f}s",
        )

    except Exception as exc:
        logger.exception("Job %s failed with unexpected error", job_id)
        # A failed flush/commit leaves the session unusable until it is
        # rolled back; without this the status update below would fail too.
        db.session.rollback()
        job.status = "failed"
        job.error_count = (job.error_count or 0) + 1
        job.completed_at = utcnow()
        db.session.commit()
        _add_log(job_id, "ERROR", f"Fatal error: {type(exc).__name__}: {exc}")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _save_results(job_id: int, items: list[ScrapedItem]) -> None:
    """Insert all scraped items for a job in one bulk operation.

    Does nothing for an empty list. Item metadata is stored as JSON text, or
    as ``None`` when the item has none.
    """
    if not items:
        return

    rows = []
    for item in items:
        encoded_meta = json.dumps(item.metadata) if item.metadata else None
        rows.append(
            ScrapeResult(
                job_id=job_id,
                page_url=item.page_url,
                page_num=item.page_num,
                item_index=item.item_index,
                content=item.content,
                content_type=item.content_type,
                content_hash=item.content_hash,
                metadata_=encoded_meta,
            )
        )

    db.session.bulk_save_objects(rows)
    db.session.commit()
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ---------------------------------------------------------------------------
|
| 221 |
+
# Logs
|
| 222 |
+
# ---------------------------------------------------------------------------
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def _add_log(job_id: int, level: str, message: str, details: Optional[dict] = None) -> None:
    """Persist one log entry for a job, best-effort.

    ``details`` is stored as JSON text when given. Failures never propagate:
    they are reported through the module logger and the session is rolled
    back so later database work on the same session is not poisoned by the
    failed commit.
    """
    try:
        log = JobLog(
            job_id=job_id,
            level=level,
            message=message,
            details=json.dumps(details) if details else None,
        )
        db.session.add(log)
        db.session.commit()
    except Exception as exc:
        logger.warning("Could not persist log for job %s: %s", job_id, exc)
        try:
            # Return the session to a usable state after the failed write.
            db.session.rollback()
        except Exception:
            logger.exception("Rollback after failed log write for job %s also failed", job_id)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def get_job_logs(job_id: int, page: int = 1, per_page: int = 100) -> Any:
    """Return one page of a job's log entries, oldest first."""
    for_job = JobLog.query.filter_by(job_id=job_id)
    oldest_first = for_job.order_by(JobLog.created_at.asc())
    return oldest_first.paginate(page=page, per_page=per_page, error_out=False)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def get_job_results(job_id: int, page: int = 1, per_page: int = 50) -> Any:
    """Return one page of a job's results, ordered by page then item index."""
    for_job = ScrapeResult.query.filter_by(job_id=job_id)
    ordered = for_job.order_by(ScrapeResult.page_num, ScrapeResult.item_index)
    return ordered.paginate(page=page, per_page=per_page, error_out=False)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
# ---------------------------------------------------------------------------
|
| 256 |
+
# Dashboard stats
|
| 257 |
+
# ---------------------------------------------------------------------------
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def get_dashboard_stats() -> dict:
    """Collect the counters shown on the dashboard.

    Returns job counts (total and per status), the sum of ``total_items``
    over all jobs, the five most recently created jobs as dicts, and the
    share of completed jobs as a percentage rounded to one decimal.
    """

    def _count_with_status(status: str) -> int:
        return ScrapeJob.query.filter_by(status=status).count()

    total = ScrapeJob.query.count()
    completed = _count_with_status("completed")
    failed = _count_with_status("failed")
    running = _count_with_status("running")
    pending = _count_with_status("pending")

    # SUM() yields NULL on an empty table; report 0 in that case.
    items_sum = db.session.query(db.func.sum(ScrapeJob.total_items)).scalar()
    total_items = items_sum or 0

    latest = ScrapeJob.query.order_by(ScrapeJob.created_at.desc()).limit(5).all()

    completed_pct = (completed / total * 100) if total else 0

    stats = {
        "total_jobs": total,
        "completed_jobs": completed,
        "failed_jobs": failed,
        "running_jobs": running,
        "pending_jobs": pending,
        "total_items_scraped": total_items,
        "recent_jobs": [job.to_dict() for job in latest],
        "success_rate": round(completed_pct, 1),
    }
    return stats
|
app/static/css/style.css
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ========================================
|
| 2 |
+
WebScraper.pro — Premium Dark Theme
|
| 3 |
+
======================================== */
|
| 4 |
+
|
| 5 |
+
/* --- CSS Variables --- */
|
| 6 |
+
:root {
|
| 7 |
+
--bg-primary: #0a0a12;
|
| 8 |
+
--bg-secondary: #12121e;
|
| 9 |
+
--bg-card: #1a1a2e;
|
| 10 |
+
--bg-card-hover: #1f1f35;
|
| 11 |
+
--bg-input: #16162a;
|
| 12 |
+
--border: #2a2a45;
|
| 13 |
+
--border-focus: #6c5ce7;
|
| 14 |
+
--text-primary: #e8e8f0;
|
| 15 |
+
--text-secondary: #9898b8;
|
| 16 |
+
--text-muted: #6868a0;
|
| 17 |
+
--accent: #6c5ce7;
|
| 18 |
+
--accent-glow: rgba(108, 92, 231, 0.4);
|
| 19 |
+
--success: #00cec9;
|
| 20 |
+
--warning: #fdcb6e;
|
| 21 |
+
--danger: #ff6b6b;
|
| 22 |
+
--info: #74b9ff;
|
| 23 |
+
--font: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
|
| 24 |
+
--radius: 12px;
|
| 25 |
+
--radius-sm: 8px;
|
| 26 |
+
--shadow: 0 4px 24px rgba(0,0,0,0.3);
|
| 27 |
+
--shadow-lg: 0 8px 40px rgba(0,0,0,0.4);
|
| 28 |
+
--transition: 0.25s cubic-bezier(0.4, 0, 0.2, 1);
|
| 29 |
+
--nav-height: 64px;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
/* --- Reset & Base --- */
|
| 33 |
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
| 34 |
+
html { scroll-behavior: smooth; }
|
| 35 |
+
body {
|
| 36 |
+
font-family: var(--font);
|
| 37 |
+
background: var(--bg-primary);
|
| 38 |
+
color: var(--text-primary);
|
| 39 |
+
line-height: 1.6;
|
| 40 |
+
min-height: 100vh;
|
| 41 |
+
display: flex;
|
| 42 |
+
flex-direction: column;
|
| 43 |
+
-webkit-font-smoothing: antialiased;
|
| 44 |
+
}
|
| 45 |
+
a { color: var(--accent); text-decoration: none; transition: color var(--transition); }
|
| 46 |
+
a:hover { color: #8b7ff0; }
|
| 47 |
+
|
| 48 |
+
/* --- Layout --- */
|
| 49 |
+
.container { max-width: 1280px; margin: 0 auto; padding: 0 24px; width: 100%; }
|
| 50 |
+
.main-content { flex: 1; padding: calc(var(--nav-height) + 32px) 0 48px; }
|
| 51 |
+
|
| 52 |
+
/* --- Navbar --- */
|
| 53 |
+
/* Fixed, translucent top bar; page content scrolls underneath it. */
.navbar {
  position: fixed; top: 0; left: 0; right: 0; z-index: 100;
  height: var(--nav-height);
  background: rgba(10, 10, 18, 0.85);
  /* Older Safari releases only recognise the prefixed property, so declare
     it before the standard one. */
  -webkit-backdrop-filter: blur(20px);
  backdrop-filter: blur(20px);
  border-bottom: 1px solid var(--border);
}
|
| 60 |
+
.nav-container {
|
| 61 |
+
max-width: 1280px; margin: 0 auto; padding: 0 24px;
|
| 62 |
+
display: flex; align-items: center; justify-content: space-between; height: 100%;
|
| 63 |
+
}
|
| 64 |
+
.nav-brand { display: flex; align-items: center; gap: 10px; font-weight: 700; font-size: 1.2rem; color: var(--text-primary); }
|
| 65 |
+
.brand-icon { font-size: 1.5rem; }
|
| 66 |
+
.brand-accent { color: var(--accent); }
|
| 67 |
+
.nav-links { display: flex; align-items: center; gap: 8px; }
|
| 68 |
+
.nav-link {
|
| 69 |
+
display: flex; align-items: center; gap: 6px; padding: 8px 16px;
|
| 70 |
+
border-radius: var(--radius-sm); color: var(--text-secondary);
|
| 71 |
+
font-size: 0.9rem; font-weight: 500; transition: all var(--transition);
|
| 72 |
+
}
|
| 73 |
+
.nav-link:hover, .nav-link.active { color: var(--text-primary); background: rgba(108, 92, 231, 0.1); }
|
| 74 |
+
.nav-link.active { color: var(--accent); }
|
| 75 |
+
.nav-icon { width: 18px; height: 18px; }
|
| 76 |
+
/* Primary call-to-action inside the navbar.
   `.nav-link:hover` and `.nav-link.active` are more specific than a lone
   `.btn-nav-primary`, so without the two extra selectors below they replace
   the gradient with the faint nav highlight, leaving white text on an almost
   transparent background while hovered or active. */
.btn-nav-primary,
.nav-link.btn-nav-primary:hover,
.nav-link.btn-nav-primary.active {
  background: linear-gradient(135deg, var(--accent), #8b5cf6);
  color: #fff !important; border-radius: var(--radius-sm);
}
.btn-nav-primary:hover { transform: translateY(-1px); box-shadow: 0 4px 15px var(--accent-glow); }
|
| 81 |
+
.nav-toggle { display: none; background: none; border: none; cursor: pointer; padding: 8px; }
|
| 82 |
+
.nav-toggle span { display: block; width: 22px; height: 2px; background: var(--text-primary); margin: 5px 0; transition: var(--transition); }
|
| 83 |
+
|
| 84 |
+
/* --- Page Header --- */
|
| 85 |
+
.page-header {
|
| 86 |
+
display: flex; align-items: flex-start; justify-content: space-between;
|
| 87 |
+
margin-bottom: 32px; flex-wrap: wrap; gap: 16px;
|
| 88 |
+
}
|
| 89 |
+
.page-title { font-size: 2rem; font-weight: 800; letter-spacing: -0.02em; }
|
| 90 |
+
.page-subtitle { color: var(--text-muted); margin-top: 4px; font-size: 0.95rem; }
|
| 91 |
+
.page-actions { display: flex; gap: 8px; flex-wrap: wrap; }
|
| 92 |
+
|
| 93 |
+
/* --- Cards --- */
|
| 94 |
+
.card {
|
| 95 |
+
background: var(--bg-card); border: 1px solid var(--border);
|
| 96 |
+
border-radius: var(--radius); margin-bottom: 24px;
|
| 97 |
+
transition: border-color var(--transition);
|
| 98 |
+
}
|
| 99 |
+
.card:hover { border-color: rgba(108, 92, 231, 0.3); }
|
| 100 |
+
.card-header {
|
| 101 |
+
display: flex; align-items: center; justify-content: space-between;
|
| 102 |
+
padding: 20px 24px; border-bottom: 1px solid var(--border);
|
| 103 |
+
}
|
| 104 |
+
.card-title { font-size: 1.1rem; font-weight: 600; }
|
| 105 |
+
.card-subtitle { color: var(--text-muted); font-size: 0.85rem; }
|
| 106 |
+
.card-body { padding: 24px; }
|
| 107 |
+
|
| 108 |
+
/* --- Stats Grid --- */
|
| 109 |
+
.stats-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 16px; margin-bottom: 32px; }
|
| 110 |
+
.stat-card {
|
| 111 |
+
background: var(--bg-card); border: 1px solid var(--border); border-radius: var(--radius);
|
| 112 |
+
padding: 24px; display: flex; align-items: center; gap: 16px;
|
| 113 |
+
transition: all var(--transition); cursor: default;
|
| 114 |
+
}
|
| 115 |
+
.stat-card:hover { transform: translateY(-3px); box-shadow: var(--shadow); }
|
| 116 |
+
.stat-icon { font-size: 2rem; }
|
| 117 |
+
.stat-value { font-size: 1.8rem; font-weight: 800; display: block; line-height: 1; }
|
| 118 |
+
.stat-label { font-size: 0.8rem; color: var(--text-muted); text-transform: uppercase; letter-spacing: 0.05em; margin-top: 4px; display: block; }
|
| 119 |
+
.stat-card-primary { border-left: 3px solid var(--accent); }
|
| 120 |
+
.stat-card-success { border-left: 3px solid var(--success); }
|
| 121 |
+
.stat-card-warning { border-left: 3px solid var(--warning); }
|
| 122 |
+
.stat-card-danger { border-left: 3px solid var(--danger); }
|
| 123 |
+
.stat-card-info { border-left: 3px solid var(--info); }
|
| 124 |
+
.stat-card-accent { border-left: 3px solid #a29bfe; }
|
| 125 |
+
|
| 126 |
+
/* --- Buttons --- */
|
| 127 |
+
.btn {
|
| 128 |
+
display: inline-flex; align-items: center; gap: 8px;
|
| 129 |
+
padding: 10px 20px; border-radius: var(--radius-sm);
|
| 130 |
+
font-family: var(--font); font-size: 0.9rem; font-weight: 600;
|
| 131 |
+
border: none; cursor: pointer; transition: all var(--transition);
|
| 132 |
+
text-decoration: none; line-height: 1.4;
|
| 133 |
+
}
|
| 134 |
+
.btn-primary { background: linear-gradient(135deg, var(--accent), #8b5cf6); color: #fff; }
|
| 135 |
+
.btn-primary:hover { transform: translateY(-1px); box-shadow: 0 4px 20px var(--accent-glow); color: #fff; }
|
| 136 |
+
.btn-secondary { background: var(--bg-card); color: var(--text-primary); border: 1px solid var(--border); }
|
| 137 |
+
.btn-secondary:hover { border-color: var(--accent); }
|
| 138 |
+
.btn-warning { background: rgba(253, 203, 110, 0.15); color: var(--warning); border: 1px solid rgba(253, 203, 110, 0.3); }
|
| 139 |
+
.btn-danger { background: rgba(255, 107, 107, 0.15); color: var(--danger); border: 1px solid rgba(255, 107, 107, 0.3); }
|
| 140 |
+
.btn-outline { background: transparent; color: var(--text-secondary); border: 1px solid var(--border); }
|
| 141 |
+
.btn-outline:hover { border-color: var(--accent); color: var(--accent); }
|
| 142 |
+
.btn-ghost { background: transparent; color: var(--text-secondary); }
|
| 143 |
+
.btn-ghost:hover { color: var(--text-primary); }
|
| 144 |
+
.btn-sm { padding: 6px 14px; font-size: 0.82rem; }
|
| 145 |
+
.btn-lg { padding: 14px 28px; font-size: 1rem; }
|
| 146 |
+
.btn-glow:hover { box-shadow: 0 0 30px var(--accent-glow), 0 4px 20px rgba(0,0,0,0.3); }
|
| 147 |
+
.btn-icon { width: 18px; height: 18px; }
|
| 148 |
+
.btn-icon-sm { background: none; border: none; cursor: pointer; padding: 6px; font-size: 1rem; border-radius: 6px; transition: var(--transition); }
|
| 149 |
+
.btn-icon-sm:hover { background: rgba(255,255,255,0.05); }
|
| 150 |
+
.btn-danger-ghost:hover { background: rgba(255, 107, 107, 0.1); }
|
| 151 |
+
|
| 152 |
+
/* --- Tables --- */
|
| 153 |
+
.table-responsive { overflow-x: auto; }
|
| 154 |
+
.table { width: 100%; border-collapse: collapse; font-size: 0.9rem; }
|
| 155 |
+
.table th {
|
| 156 |
+
padding: 12px 16px; text-align: left; font-weight: 600;
|
| 157 |
+
color: var(--text-muted); font-size: 0.78rem; text-transform: uppercase;
|
| 158 |
+
letter-spacing: 0.05em; border-bottom: 1px solid var(--border);
|
| 159 |
+
}
|
| 160 |
+
.table td { padding: 14px 16px; border-bottom: 1px solid rgba(42, 42, 69, 0.5); }
|
| 161 |
+
.table-row-hover:hover { background: rgba(108, 92, 231, 0.04); cursor: pointer; }
|
| 162 |
+
.table-link { color: var(--text-primary); font-weight: 500; }
|
| 163 |
+
.table-link:hover { color: var(--accent); }
|
| 164 |
+
.table-compact td { padding: 10px 14px; font-size: 0.85rem; }
|
| 165 |
+
.text-truncate { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
| 166 |
+
.text-muted { color: var(--text-muted); }
|
| 167 |
+
.content-cell { max-width: 400px; }
|
| 168 |
+
.content-preview { font-size: 0.83rem; color: var(--text-secondary); word-break: break-word; max-height: 60px; overflow: hidden; }
|
| 169 |
+
|
| 170 |
+
/* --- Badges --- */
|
| 171 |
+
.badge {
|
| 172 |
+
display: inline-flex; align-items: center; padding: 3px 10px;
|
| 173 |
+
border-radius: 20px; font-size: 0.75rem; font-weight: 600;
|
| 174 |
+
}
|
| 175 |
+
.badge-muted { background: rgba(152, 152, 184, 0.1); color: var(--text-muted); }
|
| 176 |
+
.badge-type { background: rgba(108, 92, 231, 0.12); color: var(--accent); }
|
| 177 |
+
.count-badge { background: rgba(108, 92, 231, 0.15); color: var(--accent); padding: 2px 10px; border-radius: 12px; font-size: 0.8rem; margin-left: 8px; }
|
| 178 |
+
|
| 179 |
+
/* --- Status Badges --- */
|
| 180 |
+
.status-badge {
|
| 181 |
+
display: inline-flex; align-items: center; gap: 6px;
|
| 182 |
+
padding: 4px 12px; border-radius: 20px; font-size: 0.78rem; font-weight: 600;
|
| 183 |
+
}
|
| 184 |
+
.status-dot { width: 7px; height: 7px; border-radius: 50%; }
|
| 185 |
+
.status-pending { background: rgba(253, 203, 110, 0.12); color: var(--warning); }
|
| 186 |
+
.status-pending .status-dot { background: var(--warning); }
|
| 187 |
+
.status-running { background: rgba(116, 185, 255, 0.12); color: var(--info); }
|
| 188 |
+
.status-running .status-dot { background: var(--info); animation: pulse 1.5s infinite; }
|
| 189 |
+
.status-completed { background: rgba(0, 206, 201, 0.12); color: var(--success); }
|
| 190 |
+
.status-completed .status-dot { background: var(--success); }
|
| 191 |
+
.status-failed { background: rgba(255, 107, 107, 0.12); color: var(--danger); }
|
| 192 |
+
.status-failed .status-dot { background: var(--danger); }
|
| 193 |
+
.status-cancelled { background: rgba(152, 152, 184, 0.12); color: var(--text-muted); }
|
| 194 |
+
.status-cancelled .status-dot { background: var(--text-muted); }
|
| 195 |
+
.status-lg { padding: 8px 18px; font-size: 0.85rem; }
|
| 196 |
+
|
| 197 |
+
@keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.4; } }
|
| 198 |
+
|
| 199 |
+
/* --- Status Bar --- */
|
| 200 |
+
.status-bar {
|
| 201 |
+
background: var(--bg-card); border: 1px solid var(--border);
|
| 202 |
+
border-radius: var(--radius); margin-bottom: 24px; overflow: hidden;
|
| 203 |
+
}
|
| 204 |
+
.status-bar-inner { display: flex; align-items: center; justify-content: space-between; padding: 20px 24px; flex-wrap: wrap; gap: 12px; }
|
| 205 |
+
.status-meta { display: flex; gap: 20px; color: var(--text-secondary); font-size: 0.9rem; flex-wrap: wrap; }
|
| 206 |
+
.progress-bar { height: 3px; background: var(--bg-input); }
|
| 207 |
+
.progress-bar-fill { height: 100%; background: linear-gradient(90deg, var(--accent), var(--info)); border-radius: 3px; }
|
| 208 |
+
.progress-animate { width: 100%; animation: progress-sweep 2s ease-in-out infinite; }
|
| 209 |
+
@keyframes progress-sweep {
|
| 210 |
+
0% { transform: translateX(-100%); }
|
| 211 |
+
100% { transform: translateX(100%); }
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
/* --- Detail Grid --- */
|
| 215 |
+
.detail-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 24px; }
|
| 216 |
+
.detail-list { display: flex; flex-direction: column; gap: 14px; }
|
| 217 |
+
.detail-item { display: flex; justify-content: space-between; align-items: flex-start; gap: 16px; }
|
| 218 |
+
.detail-key { font-size: 0.85rem; color: var(--text-muted); min-width: 120px; flex-shrink: 0; }
|
| 219 |
+
.detail-value { font-size: 0.9rem; text-align: right; word-break: break-all; }
|
| 220 |
+
.detail-value.code { font-family: 'Fira Code', monospace; background: var(--bg-input); padding: 2px 8px; border-radius: 4px; font-size: 0.82rem; }
|
| 221 |
+
.detail-value.link { color: var(--accent); }
|
| 222 |
+
|
| 223 |
+
/* --- Export Buttons --- */
|
| 224 |
+
.export-buttons { display: flex; gap: 12px; flex-wrap: wrap; }
|
| 225 |
+
.btn-export {
|
| 226 |
+
flex: 1; min-width: 100px; display: flex; flex-direction: column; align-items: center;
|
| 227 |
+
gap: 8px; padding: 20px; background: var(--bg-input); border: 1px solid var(--border);
|
| 228 |
+
border-radius: var(--radius-sm); color: var(--text-primary); font-weight: 600;
|
| 229 |
+
font-size: 0.9rem; cursor: pointer; transition: all var(--transition); text-decoration: none;
|
| 230 |
+
}
|
| 231 |
+
.btn-export:hover { border-color: var(--accent); transform: translateY(-2px); box-shadow: var(--shadow); color: var(--text-primary); }
|
| 232 |
+
.export-icon { font-size: 1.5rem; }
|
| 233 |
+
.export-history { margin-top: 20px; }
|
| 234 |
+
.export-history h4 { font-size: 0.85rem; color: var(--text-muted); margin-bottom: 10px; }
|
| 235 |
+
.export-item { display: flex; align-items: center; gap: 10px; padding: 8px 0; border-bottom: 1px solid rgba(42,42,69,0.3); font-size: 0.85rem; }
|
| 236 |
+
|
| 237 |
+
/* --- Logs --- */
|
| 238 |
+
.logs-container { font-family: 'Fira Code', 'Cascadia Code', monospace; font-size: 0.82rem; max-height: 400px; overflow-y: auto; }
|
| 239 |
+
.log-entry { display: flex; gap: 12px; padding: 6px 8px; border-radius: 4px; }
|
| 240 |
+
.log-entry:hover { background: rgba(255,255,255,0.02); }
|
| 241 |
+
.log-time { color: var(--text-muted); min-width: 65px; }
|
| 242 |
+
.log-level { font-weight: 700; min-width: 60px; }
|
| 243 |
+
.log-info .log-level { color: var(--info); }
|
| 244 |
+
.log-warning .log-level { color: var(--warning); }
|
| 245 |
+
.log-error .log-level { color: var(--danger); }
|
| 246 |
+
.log-debug .log-level { color: var(--text-muted); }
|
| 247 |
+
.log-message { color: var(--text-secondary); word-break: break-word; }
|
| 248 |
+
|
| 249 |
+
/* --- Forms --- */
|
| 250 |
+
.form-section { margin-bottom: 24px; }
|
| 251 |
+
.form-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
|
| 252 |
+
.form-full { grid-column: 1 / -1; }
|
| 253 |
+
.form-group { display: flex; flex-direction: column; gap: 6px; }
|
| 254 |
+
.form-label { font-size: 0.85rem; font-weight: 600; color: var(--text-secondary); }
|
| 255 |
+
.required { color: var(--danger); }
|
| 256 |
+
.form-input, .form-select, .form-textarea {
|
| 257 |
+
padding: 12px 16px; background: var(--bg-input); border: 1px solid var(--border);
|
| 258 |
+
border-radius: var(--radius-sm); color: var(--text-primary); font-family: var(--font);
|
| 259 |
+
font-size: 0.9rem; transition: all var(--transition); outline: none;
|
| 260 |
+
}
|
| 261 |
+
.form-input:focus, .form-select:focus, .form-textarea:focus {
|
| 262 |
+
border-color: var(--accent); box-shadow: 0 0 0 3px var(--accent-glow);
|
| 263 |
+
}
|
| 264 |
+
.form-input.form-error, .form-select.form-error { border-color: var(--danger); }
|
| 265 |
+
.form-error-text { color: var(--danger); font-size: 0.8rem; }
|
| 266 |
+
.form-hint { color: var(--text-muted); font-size: 0.78rem; }
|
| 267 |
+
.form-select { appearance: none; background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 12 12'%3E%3Cpath fill='%239898b8' d='M6 8L1 3h10z'/%3E%3C/svg%3E"); background-repeat: no-repeat; background-position: right 14px center; padding-right: 36px; }
|
| 268 |
+
.form-textarea { resize: vertical; min-height: 80px; }
|
| 269 |
+
|
| 270 |
+
/* --- Checkboxes --- */
|
| 271 |
+
.form-checkboxes { display: flex; flex-wrap: wrap; gap: 16px; margin-top: 20px; padding-top: 20px; border-top: 1px solid var(--border); }
|
| 272 |
+
.checkbox-label {
|
| 273 |
+
display: flex; align-items: center; gap: 10px; cursor: pointer;
|
| 274 |
+
font-size: 0.88rem; color: var(--text-secondary); user-select: none;
|
| 275 |
+
}
|
| 276 |
+
/* Hide the native checkbox visually but keep it focusable: `display: none`
   would remove it from the tab order, making the option unreachable by
   keyboard. Absolute positioning keeps it out of the label's flex layout. */
.checkbox-label input { position: absolute; width: 1px; height: 1px; margin: -1px; overflow: hidden; clip: rect(0 0 0 0); opacity: 0; }
/* Show keyboard focus on the custom box, since the real input is invisible. */
.checkbox-label input:focus-visible + .checkbox-custom { border-color: var(--border-focus); box-shadow: 0 0 0 3px var(--accent-glow); }
|
| 277 |
+
.checkbox-custom {
|
| 278 |
+
width: 20px; height: 20px; border: 2px solid var(--border); border-radius: 5px;
|
| 279 |
+
display: flex; align-items: center; justify-content: center; transition: all var(--transition);
|
| 280 |
+
}
|
| 281 |
+
.checkbox-label input:checked + .checkbox-custom {
|
| 282 |
+
background: var(--accent); border-color: var(--accent);
|
| 283 |
+
}
|
| 284 |
+
.checkbox-label input:checked + .checkbox-custom::after {
|
| 285 |
+
content: '✓'; color: #fff; font-size: 0.7rem; font-weight: 700;
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
/* --- Collapsible --- */
|
| 289 |
+
.collapsible { cursor: pointer; user-select: none; }
|
| 290 |
+
.collapse-icon { width: 20px; height: 20px; color: var(--text-muted); transition: transform var(--transition); }
|
| 291 |
+
.collapsible.collapsed .collapse-icon { transform: rotate(-90deg); }
|
| 292 |
+
|
| 293 |
+
/* --- Form Actions --- */
|
| 294 |
+
.form-actions { display: flex; justify-content: flex-end; gap: 12px; margin-top: 8px; }
|
| 295 |
+
|
| 296 |
+
/* --- Filter Bar --- */
|
| 297 |
+
.filter-bar { margin-bottom: 20px; }
|
| 298 |
+
.filter-bar .card-body, .filter-form { padding: 16px 20px; display: flex; align-items: flex-end; gap: 16px; flex-wrap: wrap; }
|
| 299 |
+
.filter-group { display: flex; flex-direction: column; gap: 4px; }
|
| 300 |
+
.filter-label { font-size: 0.75rem; font-weight: 600; color: var(--text-muted); text-transform: uppercase; letter-spacing: 0.05em; }
|
| 301 |
+
.filter-search { flex: 1; min-width: 200px; }
|
| 302 |
+
|
| 303 |
+
/* --- Pagination --- */
|
| 304 |
+
.pagination { display: flex; justify-content: center; gap: 6px; margin-top: 24px; padding-top: 20px; border-top: 1px solid var(--border); }
|
| 305 |
+
.pagination-link {
|
| 306 |
+
padding: 8px 14px; border-radius: var(--radius-sm); font-size: 0.85rem;
|
| 307 |
+
color: var(--text-secondary); background: var(--bg-input); border: 1px solid var(--border);
|
| 308 |
+
transition: all var(--transition);
|
| 309 |
+
}
|
| 310 |
+
.pagination-link:hover, .pagination-link.active { background: var(--accent); color: #fff; border-color: var(--accent); }
|
| 311 |
+
.pagination-ellipsis { padding: 8px 6px; color: var(--text-muted); }
|
| 312 |
+
|
| 313 |
+
/* --- Empty State --- */
|
| 314 |
+
.empty-state { text-align: center; padding: 60px 24px; }
|
| 315 |
+
.empty-icon { font-size: 3.5rem; margin-bottom: 16px; }
|
| 316 |
+
.empty-state h3 { font-size: 1.2rem; margin-bottom: 8px; }
|
| 317 |
+
.empty-state p { color: var(--text-muted); margin-bottom: 20px; }
|
| 318 |
+
.empty-state-sm { text-align: center; padding: 32px 16px; }
|
| 319 |
+
|
| 320 |
+
/* --- Error Page --- */
|
| 321 |
+
.error-page { display: flex; align-items: center; justify-content: center; min-height: 60vh; gap: 60px; }
|
| 322 |
+
/* Gradient-filled status code: the background is clipped to the glyphs and
   the text fill is made transparent so the gradient shows through. The
   standard `background-clip` is declared alongside the prefixed form. */
.error-code { font-size: 7rem; font-weight: 900; background: linear-gradient(135deg, var(--accent), var(--danger)); -webkit-background-clip: text; background-clip: text; -webkit-text-fill-color: transparent; line-height: 1; }
|
| 323 |
+
.error-title { font-size: 1.5rem; margin: 12px 0; }
|
| 324 |
+
.error-description { color: var(--text-muted); margin-bottom: 24px; max-width: 400px; }
|
| 325 |
+
.error-actions { display: flex; gap: 12px; }
|
| 326 |
+
.error-visual { font-size: 5rem; opacity: 0.15; }
|
| 327 |
+
|
| 328 |
+
/* --- Toast --- */
|
| 329 |
+
.toast-container { position: fixed; top: calc(var(--nav-height) + 16px); right: 24px; z-index: 200; display: flex; flex-direction: column; gap: 10px; }
|
| 330 |
+
.toast {
|
| 331 |
+
display: flex; align-items: center; gap: 12px; padding: 14px 20px;
|
| 332 |
+
background: var(--bg-card); border: 1px solid var(--border); border-radius: var(--radius-sm);
|
| 333 |
+
box-shadow: var(--shadow-lg); animation: slideIn 0.3s ease; min-width: 300px; max-width: 500px;
|
| 334 |
+
}
|
| 335 |
+
.toast-success { border-left: 3px solid var(--success); }
|
| 336 |
+
.toast-danger, .toast-error { border-left: 3px solid var(--danger); }
|
| 337 |
+
.toast-warning { border-left: 3px solid var(--warning); }
|
| 338 |
+
.toast-info { border-left: 3px solid var(--info); }
|
| 339 |
+
.toast-icon { font-size: 1.1rem; font-weight: 700; }
|
| 340 |
+
.toast-success .toast-icon { color: var(--success); }
|
| 341 |
+
.toast-danger .toast-icon, .toast-error .toast-icon { color: var(--danger); }
|
| 342 |
+
.toast-warning .toast-icon { color: var(--warning); }
/* The info variant has a coloured border like the others; give its icon the
   matching colour too so all four variants are consistent. */
.toast-info .toast-icon { color: var(--info); }
|
| 343 |
+
.toast-body { flex: 1; font-size: 0.9rem; }
|
| 344 |
+
.toast-close { background: none; border: none; color: var(--text-muted); cursor: pointer; font-size: 1.2rem; padding: 0 4px; }
|
| 345 |
+
@keyframes slideIn { from { transform: translateX(100%); opacity: 0; } to { transform: translateX(0); opacity: 1; } }
|
| 346 |
+
|
| 347 |
+
/* --- Footer --- */
|
| 348 |
+
.footer { border-top: 1px solid var(--border); padding: 24px 0; margin-top: auto; }
|
| 349 |
+
.footer-inner { display: flex; justify-content: space-between; align-items: center; }
|
| 350 |
+
.footer-text { font-size: 0.82rem; color: var(--text-muted); }
|
| 351 |
+
.footer-links { display: flex; gap: 16px; }
|
| 352 |
+
.footer-link { font-size: 0.82rem; color: var(--text-muted); }
|
| 353 |
+
.footer-link:hover { color: var(--accent); }
|
| 354 |
+
|
| 355 |
+
/* --- Inline form --- */
|
| 356 |
+
.inline-form { display: inline; }
|
| 357 |
+
|
| 358 |
+
/* --- Action Group --- */
|
| 359 |
+
.action-group { display: flex; gap: 4px; }
|
| 360 |
+
|
| 361 |
+
/* --- Responsive --- */
|
| 362 |
+
@media (max-width: 768px) {
|
| 363 |
+
.nav-links { display: none; position: fixed; top: var(--nav-height); left: 0; right: 0; background: var(--bg-secondary); flex-direction: column; padding: 16px; border-bottom: 1px solid var(--border); }
|
| 364 |
+
.nav-links.open { display: flex; }
|
| 365 |
+
.nav-toggle { display: block; }
|
| 366 |
+
.page-header { flex-direction: column; }
|
| 367 |
+
.stats-grid { grid-template-columns: repeat(2, 1fr); }
|
| 368 |
+
.form-grid { grid-template-columns: 1fr; }
|
| 369 |
+
.detail-grid { grid-template-columns: 1fr; }
|
| 370 |
+
.filter-form { flex-direction: column; align-items: stretch; }
|
| 371 |
+
.error-page { flex-direction: column; text-align: center; }
|
| 372 |
+
.footer-inner { flex-direction: column; gap: 12px; text-align: center; }
|
| 373 |
+
.status-bar-inner { flex-direction: column; align-items: flex-start; }
|
| 374 |
+
}
|
| 375 |
+
@media (max-width: 480px) {
|
| 376 |
+
.stats-grid { grid-template-columns: 1fr; }
|
| 377 |
+
.container { padding: 0 16px; }
|
| 378 |
+
.page-title { font-size: 1.5rem; }
|
| 379 |
+
}
|
app/static/js/app.js
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ========================================
|
| 2 |
+
WebScraper.pro — Interactive JS
|
| 3 |
+
======================================== */
|
| 4 |
+
|
| 5 |
+
// Page bootstrap: wires the mobile nav toggle, schedules toast auto-dismissal
// and plays the staggered entrance animation for dashboard stat cards.
document.addEventListener('DOMContentLoaded', () => {
    // --- Mobile Nav Toggle ---
    const toggle = document.getElementById('nav-toggle');
    const links = document.getElementById('nav-links');
    if (toggle && links) {
        toggle.addEventListener('click', () => links.classList.toggle('open'));
    }

    // --- Auto-dismiss Toasts ---
    document.querySelectorAll('.toast[data-auto-dismiss]').forEach(toast => {
        // Missing, non-numeric or zero delays fall back to five seconds.
        const delay = parseInt(toast.dataset.autoDismiss, 10) || 5000;
        setTimeout(() => {
            toast.style.animation = 'slideOut 0.3s ease forwards';
            // Remove the node once the 0.3s exit animation has played.
            setTimeout(() => toast.remove(), 300);
        }, delay);
    });

    // --- Stat Card Entrance Animation ---
    const ENTRANCE_MS = 500; // must match the transition duration below
    document.querySelectorAll('.stat-card').forEach((card, i) => {
        card.style.opacity = '0';
        card.style.transform = 'translateY(20px)';
        setTimeout(() => {
            card.style.transition = 'all 0.5s cubic-bezier(0.4, 0, 0.2, 1)';
            card.style.opacity = '1';
            card.style.transform = 'translateY(0)';
            // Inline styles out-rank stylesheet rules, so a leftover inline
            // `transform` would permanently cancel the `.stat-card:hover`
            // lift. Clear the overrides once the entrance has finished.
            setTimeout(() => {
                card.style.transition = '';
                card.style.opacity = '';
                card.style.transform = '';
            }, ENTRANCE_MS);
        }, 80 * i); // stagger: each card starts 80ms after the previous one
    });
});
|
| 33 |
+
|
| 34 |
+
// --- Collapsible Sections ---
|
| 35 |
+
/**
 * Show or hide the element with the given id and mirror that state on its
 * header via the `collapsed` class.
 *
 * The header is the element's previous sibling or, failing that, the first
 * `.collapsible` element found inside its parent. Visibility is tracked
 * through the inline `display` style only.
 *
 * @param {string} id - id of the section element to toggle.
 */
function toggleSection(id) {
    const section = document.getElementById(id);
    if (!section) return;

    const header = section.previousElementSibling
        || section.parentElement.querySelector('.collapsible');
    const wasHidden = section.style.display === 'none';

    section.style.display = wasHidden ? '' : 'none';
    // Header is "collapsed" exactly when the section has just been hidden.
    if (header) header.classList.toggle('collapsed', !wasHidden);
}
|
| 47 |
+
|
| 48 |
+
// --- Slide Out animation for toasts ---
|
| 49 |
+
// Register the `slideOut` keyframes used when a toast is dismissed by
// appending a <style> element to the document head.
const style = Object.assign(document.createElement('style'), {
    textContent: `@keyframes slideOut { to { transform: translateX(120%); opacity: 0; } }`,
});
document.head.appendChild(style);
|
app/templates/base.html
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en" data-theme="dark">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<meta name="description" content="WebScraper Platform — Professional web scraping with visual configuration, exports, and scheduling.">
|
| 7 |
+
<title>{% block title %}WebScraper Platform{% endblock %}</title>
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
|
| 11 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
|
| 12 |
+
{% block extra_head %}{% endblock %}
|
| 13 |
+
</head>
|
| 14 |
+
<body>
|
| 15 |
+
<!-- Navigation -->
|
| 16 |
+
<nav class="navbar" id="main-nav">
|
| 17 |
+
<div class="nav-container">
|
| 18 |
+
<a href="{{ url_for('main.index') }}" class="nav-brand">
|
| 19 |
+
<span class="brand-icon">🕷️</span>
|
| 20 |
+
<span class="brand-text">WebScraper<span class="brand-accent">.pro</span></span>
|
| 21 |
+
</a>
|
| 22 |
+
<div class="nav-links" id="nav-links">
|
| 23 |
+
<a href="{{ url_for('main.index') }}" class="nav-link {% if request.endpoint == 'main.index' %}active{% endif %}">
|
| 24 |
+
<svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"/><polyline points="9 22 9 12 15 12 15 22"/></svg>
|
| 25 |
+
Dashboard
|
| 26 |
+
</a>
|
| 27 |
+
<a href="{{ url_for('jobs.list_view') }}" class="nav-link {% if request.endpoint and request.endpoint.startswith('jobs.') and request.endpoint != 'jobs.new_job' %}active{% endif %}">
|
| 28 |
+
<svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="2" y="7" width="20" height="14" rx="2" ry="2"/><path d="M16 21V5a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"/></svg>
|
| 29 |
+
Jobs
|
| 30 |
+
</a>
|
| 31 |
+
<a href="{{ url_for('jobs.new_job') }}" class="nav-link btn-nav-primary {% if request.endpoint == 'jobs.new_job' %}active{% endif %}">
|
| 32 |
+
<svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="16"/><line x1="8" y1="12" x2="16" y2="12"/></svg>
|
| 33 |
+
New Scrape
|
| 34 |
+
</a>
|
| 35 |
+
</div>
|
| 36 |
+
<button class="nav-toggle" id="nav-toggle" aria-label="Toggle navigation">
|
| 37 |
+
<span></span><span></span><span></span>
|
| 38 |
+
</button>
|
| 39 |
+
</div>
|
| 40 |
+
</nav>
|
| 41 |
+
|
| 42 |
+
<!-- Flash Messages -->
|
| 43 |
+
{% with messages = get_flashed_messages(with_categories=true) %}
|
| 44 |
+
{% if messages %}
|
| 45 |
+
<div class="toast-container" id="toast-container">
|
| 46 |
+
{% for category, message in messages %}
|
| 47 |
+
<div class="toast toast-{{ category }}" data-auto-dismiss="5000">
|
| 48 |
+
<div class="toast-icon">
|
| 49 |
+
{% if category == 'success' %}✓{% elif category == 'danger' or category == 'error' %}✕{% elif category == 'warning' %}⚠{% else %}ℹ{% endif %}
|
| 50 |
+
</div>
|
| 51 |
+
<div class="toast-body">{{ message }}</div>
|
| 52 |
+
<!-- The visible label is only a "×" glyph, so give assistive technology a real name. -->
<button type="button" class="toast-close" aria-label="Dismiss notification" onclick="this.parentElement.remove()">×</button>
|
| 53 |
+
</div>
|
| 54 |
+
{% endfor %}
|
| 55 |
+
</div>
|
| 56 |
+
{% endif %}
|
| 57 |
+
{% endwith %}
|
| 58 |
+
|
| 59 |
+
<!-- Main Content -->
|
| 60 |
+
<main class="main-content">
|
| 61 |
+
<div class="container">
|
| 62 |
+
{% block content %}{% endblock %}
|
| 63 |
+
</div>
|
| 64 |
+
</main>
|
| 65 |
+
|
| 66 |
+
<!-- Footer -->
|
| 67 |
+
<footer class="footer">
|
| 68 |
+
<div class="container footer-inner">
|
| 69 |
+
<p class="footer-text">© 2026 WebScraper<span class="brand-accent">.pro</span> — Built with Flask & BeautifulSoup</p>
|
| 70 |
+
<div class="footer-links">
|
| 71 |
+
<a href="{{ url_for('main.health') }}" class="footer-link">Health</a>
|
| 72 |
+
<a href="{{ url_for('main.metrics') }}" class="footer-link">Metrics</a>
|
| 73 |
+
</div>
|
| 74 |
+
</div>
|
| 75 |
+
</footer>
|
| 76 |
+
|
| 77 |
+
<script src="{{ url_for('static', filename='js/app.js') }}"></script>
|
| 78 |
+
{% block extra_scripts %}{% endblock %}
|
| 79 |
+
</body>
|
| 80 |
+
</html>
|
app/templates/pages/dashboard.html
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}Dashboard — WebScraper.pro{% endblock %}
|
| 3 |
+
|
| 4 |
+
{% block content %}
|
| 5 |
+
<div class="page-header">
|
| 6 |
+
<div class="page-header-content">
|
| 7 |
+
<h1 class="page-title">Dashboard</h1>
|
| 8 |
+
<p class="page-subtitle">Monitor your scraping operations at a glance</p>
|
| 9 |
+
</div>
|
| 10 |
+
<a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary btn-glow">
|
| 11 |
+
<svg class="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="16"/><line x1="8" y1="12" x2="16" y2="12"/></svg>
|
| 12 |
+
New Scrape Job
|
| 13 |
+
</a>
|
| 14 |
+
</div>
|
| 15 |
+
|
| 16 |
+
<!-- Stats Grid -->
|
| 17 |
+
<div class="stats-grid" id="stats-grid">
|
| 18 |
+
<div class="stat-card stat-card-primary">
|
| 19 |
+
<div class="stat-icon">📊</div>
|
| 20 |
+
<div class="stat-info">
|
| 21 |
+
<span class="stat-value">{{ stats.total_jobs }}</span>
|
| 22 |
+
<span class="stat-label">Total Jobs</span>
|
| 23 |
+
</div>
|
| 24 |
+
</div>
|
| 25 |
+
<div class="stat-card stat-card-success">
|
| 26 |
+
<div class="stat-icon">✅</div>
|
| 27 |
+
<div class="stat-info">
|
| 28 |
+
<span class="stat-value">{{ stats.completed_jobs }}</span>
|
| 29 |
+
<span class="stat-label">Completed</span>
|
| 30 |
+
</div>
|
| 31 |
+
</div>
|
| 32 |
+
<div class="stat-card stat-card-warning">
|
| 33 |
+
<div class="stat-icon">⏳</div>
|
| 34 |
+
<div class="stat-info">
|
| 35 |
+
<span class="stat-value">{{ stats.running_jobs + stats.pending_jobs }}</span>
|
| 36 |
+
<span class="stat-label">In Progress</span>
|
| 37 |
+
</div>
|
| 38 |
+
</div>
|
| 39 |
+
<div class="stat-card stat-card-danger">
|
| 40 |
+
<div class="stat-icon">❌</div>
|
| 41 |
+
<div class="stat-info">
|
| 42 |
+
<span class="stat-value">{{ stats.failed_jobs }}</span>
|
| 43 |
+
<span class="stat-label">Failed</span>
|
| 44 |
+
</div>
|
| 45 |
+
</div>
|
| 46 |
+
<div class="stat-card stat-card-info">
|
| 47 |
+
<div class="stat-icon">📦</div>
|
| 48 |
+
<div class="stat-info">
|
| 49 |
+
<span class="stat-value">{{ stats.total_items_scraped }}</span>
|
| 50 |
+
<span class="stat-label">Items Scraped</span>
|
| 51 |
+
</div>
|
| 52 |
+
</div>
|
| 53 |
+
<div class="stat-card stat-card-accent">
|
| 54 |
+
<div class="stat-icon">🎯</div>
|
| 55 |
+
<div class="stat-info">
|
| 56 |
+
<span class="stat-value">{{ stats.success_rate }}%</span>
|
| 57 |
+
<span class="stat-label">Success Rate</span>
|
| 58 |
+
</div>
|
| 59 |
+
</div>
|
| 60 |
+
</div>
|
| 61 |
+
|
| 62 |
+
<!-- Recent Jobs -->
|
| 63 |
+
<div class="card" id="recent-jobs-card">
|
| 64 |
+
<div class="card-header">
|
| 65 |
+
<h2 class="card-title">Recent Jobs</h2>
|
| 66 |
+
<a href="{{ url_for('jobs.list_view') }}" class="btn btn-outline btn-sm">View All →</a>
|
| 67 |
+
</div>
|
| 68 |
+
<div class="card-body">
|
| 69 |
+
{% if stats.recent_jobs %}
|
| 70 |
+
<div class="table-responsive">
|
| 71 |
+
<table class="table" id="recent-jobs-table">
|
| 72 |
+
<thead>
|
| 73 |
+
<tr>
|
| 74 |
+
<th>ID</th>
|
| 75 |
+
<th>Name</th>
|
| 76 |
+
<th>URL</th>
|
| 77 |
+
<th>Status</th>
|
| 78 |
+
<th>Items</th>
|
| 79 |
+
<th>Duration</th>
|
| 80 |
+
<th>Created</th>
|
| 81 |
+
</tr>
|
| 82 |
+
</thead>
|
| 83 |
+
<tbody>
|
| 84 |
+
{% for job in stats.recent_jobs %}
|
| 85 |
+
{# Build the detail URL with url_for so the row keeps working if the jobs blueprint is re-prefixed. #}
<tr class="table-row-hover" onclick="window.location='{{ url_for('jobs.detail', job_id=job.id) }}'">
|
| 86 |
+
<td><span class="badge badge-muted">#{{ job.id }}</span></td>
|
| 87 |
+
<td class="text-truncate" style="max-width: 200px;">{{ job.name }}</td>
|
| 88 |
+
<td class="text-truncate text-muted" style="max-width: 250px;">{{ job.url }}</td>
|
| 89 |
+
<td>
|
| 90 |
+
<span class="status-badge status-{{ job.status }}">
|
| 91 |
+
<span class="status-dot"></span>
|
| 92 |
+
{{ job.status | capitalize }}
|
| 93 |
+
</span>
|
| 94 |
+
</td>
|
| 95 |
+
<td>{{ job.total_items }}</td>
|
| 96 |
+
<td>{% if job.duration_seconds %}{{ "%.1f"|format(job.duration_seconds) }}s{% else %}—{% endif %}</td>
|
| 97 |
+
<td class="text-muted">{{ job.created_at[:16] if job.created_at else '—' }}</td>
|
| 98 |
+
</tr>
|
| 99 |
+
{% endfor %}
|
| 100 |
+
</tbody>
|
| 101 |
+
</table>
|
| 102 |
+
</div>
|
| 103 |
+
{% else %}
|
| 104 |
+
<div class="empty-state">
|
| 105 |
+
<div class="empty-icon">🕸️</div>
|
| 106 |
+
<h3>No scrape jobs yet</h3>
|
| 107 |
+
<p>Create your first scraping job to get started.</p>
|
| 108 |
+
<a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary">Create Your First Job</a>
|
| 109 |
+
</div>
|
| 110 |
+
{% endif %}
|
| 111 |
+
</div>
|
| 112 |
+
</div>
|
| 113 |
+
{% endblock %}
|
app/templates/pages/error.html
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}{{ code }} — WebScraper.pro{% endblock %}
|
| 3 |
+
|
| 4 |
+
{% block content %}
|
| 5 |
+
<div class="error-page">
|
| 6 |
+
<div class="error-content">
|
| 7 |
+
<div class="error-code">{{ code }}</div>
|
| 8 |
+
<h1 class="error-title">{{ message }}</h1>
|
| 9 |
+
<p class="error-description">
|
| 10 |
+
{% if code == 404 %}
|
| 11 |
+
The page you're looking for doesn't exist or has been moved.
|
| 12 |
+
{% elif code == 429 %}
|
| 13 |
+
You've made too many requests. Please wait a moment and try again.
|
| 14 |
+
{% elif code == 500 %}
|
| 15 |
+
Something went wrong on our end. Our team has been notified.
|
| 16 |
+
{% else %}
|
| 17 |
+
An unexpected error occurred. Please try again.
|
| 18 |
+
{% endif %}
|
| 19 |
+
</p>
|
| 20 |
+
<div class="error-actions">
|
| 21 |
+
<a href="{{ url_for('main.index') }}" class="btn btn-primary">← Back to Dashboard</a>
|
| 22 |
+
<a href="{{ url_for('jobs.list_view') }}" class="btn btn-secondary">View Jobs</a>
|
| 23 |
+
</div>
|
| 24 |
+
</div>
|
| 25 |
+
<div class="error-visual">
|
| 26 |
+
<div class="error-spider">🕷️</div>
|
| 27 |
+
<div class="error-web"></div>
|
| 28 |
+
</div>
|
| 29 |
+
</div>
|
| 30 |
+
{% endblock %}
|
app/templates/pages/job_detail.html
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}Job #{{ job.id }} — WebScraper.pro{% endblock %}
|
| 3 |
+
|
| 4 |
+
{% block content %}
|
| 5 |
+
<div class="page-header">
|
| 6 |
+
<div class="page-header-content">
|
| 7 |
+
<h1 class="page-title">{{ job.name }}</h1>
|
| 8 |
+
<p class="page-subtitle">Job #{{ job.id }} — {{ job.url }}</p>
|
| 9 |
+
</div>
|
| 10 |
+
<div class="page-actions">
|
| 11 |
+
{% if job.status in ('pending', 'running') %}
|
| 12 |
+
<form method="post" action="{{ url_for('jobs.cancel', job_id=job.id) }}" class="inline-form">
|
| 13 |
+
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
| 14 |
+
<button type="submit" class="btn btn-warning" id="btn-cancel">⏸️ Cancel</button>
|
| 15 |
+
</form>
|
| 16 |
+
{% endif %}
|
| 17 |
+
{% if job.status in ('completed', 'failed', 'cancelled') %}
|
| 18 |
+
<form method="post" action="{{ url_for('jobs.rerun', job_id=job.id) }}" class="inline-form">
|
| 19 |
+
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
| 20 |
+
<button type="submit" class="btn btn-secondary" id="btn-rerun">🔄 Re-run</button>
|
| 21 |
+
</form>
|
| 22 |
+
{% endif %}
|
| 23 |
+
<form method="post" action="{{ url_for('jobs.delete', job_id=job.id) }}" class="inline-form" onsubmit="return confirm('Permanently delete this job and all results?')">
|
| 24 |
+
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
| 25 |
+
<button type="submit" class="btn btn-danger" id="btn-delete">🗑️ Delete</button>
|
| 26 |
+
</form>
|
| 27 |
+
</div>
|
| 28 |
+
</div>
|
| 29 |
+
|
| 30 |
+
<!-- Status Bar -->
|
| 31 |
+
<div class="status-bar status-bar-{{ job.status }}" id="job-status-bar" data-job-id="{{ job.id }}" data-status="{{ job.status }}">
|
| 32 |
+
<div class="status-bar-inner">
|
| 33 |
+
<span class="status-badge status-{{ job.status }} status-lg">
|
| 34 |
+
<span class="status-dot"></span>
|
| 35 |
+
{{ job.status | upper }}
|
| 36 |
+
</span>
|
| 37 |
+
<div class="status-meta">
|
| 38 |
+
<span>📦 <strong>{{ job.total_items }}</strong> items</span>
|
| 39 |
+
<span>📄 <strong>{{ job.pages_scraped }}</strong> pages</span>
|
| 40 |
+
{% if job.error_count %}<span>❌ <strong>{{ job.error_count }}</strong> errors</span>{% endif %}
|
| 41 |
+
{% if job.duration_seconds %}<span>⏱️ <strong>{{ "%.2f"|format(job.duration_seconds) }}</strong>s</span>{% endif %}
|
| 42 |
+
</div>
|
| 43 |
+
</div>
|
| 44 |
+
{% if job.status in ('pending', 'running') %}
|
| 45 |
+
<div class="progress-bar">
|
| 46 |
+
<div class="progress-bar-fill progress-animate"></div>
|
| 47 |
+
</div>
|
| 48 |
+
{% endif %}
|
| 49 |
+
</div>
|
| 50 |
+
|
| 51 |
+
<!-- Job Info -->
|
| 52 |
+
<div class="detail-grid">
|
| 53 |
+
<div class="card" id="job-config-card">
|
| 54 |
+
<div class="card-header">
|
| 55 |
+
<h2 class="card-title">⚙️ Configuration</h2>
|
| 56 |
+
</div>
|
| 57 |
+
<div class="card-body">
|
| 58 |
+
<div class="detail-list">
|
| 59 |
+
<div class="detail-item">
|
| 60 |
+
<span class="detail-key">URL</span>
|
| 61 |
+
<a href="{{ job.url }}" target="_blank" rel="noopener" class="detail-value link">{{ job.url }}</a>
|
| 62 |
+
</div>
|
| 63 |
+
<div class="detail-item">
|
| 64 |
+
<span class="detail-key">Scrape Type</span>
|
| 65 |
+
<span class="detail-value"><span class="badge badge-type">{{ job.scrape_type }}</span></span>
|
| 66 |
+
</div>
|
| 67 |
+
<div class="detail-item">
|
| 68 |
+
<span class="detail-key">Extraction</span>
|
| 69 |
+
<span class="detail-value">{{ job.extraction_type }}</span>
|
| 70 |
+
</div>
|
| 71 |
+
{% if job.css_selector %}
|
| 72 |
+
<div class="detail-item">
|
| 73 |
+
<span class="detail-key">CSS Selector</span>
|
| 74 |
+
<code class="detail-value code">{{ job.css_selector }}</code>
|
| 75 |
+
</div>
|
| 76 |
+
{% endif %}
|
| 77 |
+
{% if job.xpath_selector %}
|
| 78 |
+
<div class="detail-item">
|
| 79 |
+
<span class="detail-key">XPath</span>
|
| 80 |
+
<code class="detail-value code">{{ job.xpath_selector }}</code>
|
| 81 |
+
</div>
|
| 82 |
+
{% endif %}
|
| 83 |
+
{% if job.html_tag %}
|
| 84 |
+
<div class="detail-item">
|
| 85 |
+
<span class="detail-key">HTML Tag</span>
|
| 86 |
+
<code class="detail-value code">{{ job.html_tag }}</code>
|
| 87 |
+
</div>
|
| 88 |
+
{% endif %}
|
| 89 |
+
<div class="detail-item">
|
| 90 |
+
<span class="detail-key">Pagination</span>
|
| 91 |
+
<span class="detail-value">{{ 'Yes' if job.follow_pagination else 'No' }} (max {{ job.max_pages }} pages)</span>
|
| 92 |
+
</div>
|
| 93 |
+
<div class="detail-item">
|
| 94 |
+
<span class="detail-key">Delay</span>
|
| 95 |
+
<span class="detail-value">{{ job.delay_seconds }}s</span>
|
| 96 |
+
</div>
|
| 97 |
+
<div class="detail-item">
|
| 98 |
+
<span class="detail-key">Created</span>
|
| 99 |
+
<span class="detail-value">{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') if job.created_at else '—' }}</span>
|
| 100 |
+
</div>
|
| 101 |
+
</div>
|
| 102 |
+
</div>
|
| 103 |
+
</div>
|
| 104 |
+
|
| 105 |
+
<!-- Export -->
|
| 106 |
+
<div class="card" id="export-card">
|
| 107 |
+
<div class="card-header">
|
| 108 |
+
<h2 class="card-title">📤 Export Results</h2>
|
| 109 |
+
</div>
|
| 110 |
+
<div class="card-body">
|
| 111 |
+
{% if job.total_items > 0 %}
|
| 112 |
+
<div class="export-buttons">
|
| 113 |
+
<a href="{{ url_for('jobs.export', job_id=job.id, fmt='json') }}" class="btn btn-export" id="export-json">
|
| 114 |
+
<span class="export-icon">{ }</span>
|
| 115 |
+
<span>JSON</span>
|
| 116 |
+
</a>
|
| 117 |
+
<a href="{{ url_for('jobs.export', job_id=job.id, fmt='csv') }}" class="btn btn-export" id="export-csv">
|
| 118 |
+
<span class="export-icon">📊</span>
|
| 119 |
+
<span>CSV</span>
|
| 120 |
+
</a>
|
| 121 |
+
<a href="{{ url_for('jobs.export', job_id=job.id, fmt='excel') }}" class="btn btn-export" id="export-excel">
|
| 122 |
+
<span class="export-icon">📗</span>
|
| 123 |
+
<span>Excel</span>
|
| 124 |
+
</a>
|
| 125 |
+
</div>
|
| 126 |
+
{% if exports %}
|
| 127 |
+
<div class="export-history">
|
| 128 |
+
<h4>Previous Exports</h4>
|
| 129 |
+
{% for exp in exports %}
|
| 130 |
+
<div class="export-item">
|
| 131 |
+
<span class="badge badge-type">{{ exp.format }}</span>
|
| 132 |
+
<span>{{ exp.filename }}</span>
|
| 133 |
+
<span class="text-muted">{{ exp.row_count }} rows</span>
|
| 134 |
+
</div>
|
| 135 |
+
{% endfor %}
|
| 136 |
+
</div>
|
| 137 |
+
{% endif %}
|
| 138 |
+
{% else %}
|
| 139 |
+
<div class="empty-state-sm">
|
| 140 |
+
<p class="text-muted">No results to export yet.</p>
|
| 141 |
+
</div>
|
| 142 |
+
{% endif %}
|
| 143 |
+
</div>
|
| 144 |
+
</div>
|
| 145 |
+
</div>
|
| 146 |
+
|
| 147 |
+
<!-- Results -->
|
| 148 |
+
<div class="card" id="results-card">
|
| 149 |
+
<div class="card-header">
|
| 150 |
+
<h2 class="card-title">📦 Scraped Results <span class="count-badge">{{ results.total }}</span></h2>
|
| 151 |
+
</div>
|
| 152 |
+
<div class="card-body">
|
| 153 |
+
{% if results.items %}
|
| 154 |
+
<div class="table-responsive">
|
| 155 |
+
<table class="table table-compact" id="results-table">
|
| 156 |
+
<thead>
|
| 157 |
+
<tr>
|
| 158 |
+
<th>#</th>
|
| 159 |
+
<th>Page</th>
|
| 160 |
+
<th>Type</th>
|
| 161 |
+
<th>Content</th>
|
| 162 |
+
<th>URL</th>
|
| 163 |
+
</tr>
|
| 164 |
+
</thead>
|
| 165 |
+
<tbody>
|
| 166 |
+
{% for r in results.items %}
|
| 167 |
+
<tr>
|
| 168 |
+
<td>{{ r.item_index }}</td>
|
| 169 |
+
<td>{{ r.page_num }}</td>
|
| 170 |
+
<td><span class="badge badge-muted">{{ r.content_type }}</span></td>
|
| 171 |
+
<td class="content-cell">
|
| 172 |
+
<div class="content-preview">{{ r.content[:300] if r.content else '—' }}{% if r.content and r.content|length > 300 %}…{% endif %}</div>
|
| 173 |
+
</td>
|
| 174 |
+
<td class="text-truncate text-muted" style="max-width:180px;">{{ r.page_url }}</td>
|
| 175 |
+
</tr>
|
| 176 |
+
{% endfor %}
|
| 177 |
+
</tbody>
|
| 178 |
+
</table>
|
| 179 |
+
</div>
|
| 180 |
+
{% if results.pages > 1 %}
|
| 181 |
+
<div class="pagination">
|
| 182 |
+
{% if results.has_prev %}
|
| 183 |
+
<a href="{{ url_for('jobs.detail', job_id=job.id, rpage=results.prev_num) }}" class="pagination-link">← Prev</a>
|
| 184 |
+
{% endif %}
|
| 185 |
+
{% for p in results.iter_pages(left_edge=1, right_edge=1, left_current=2, right_current=2) %}
|
| 186 |
+
{% if p %}
|
| 187 |
+
<a href="{{ url_for('jobs.detail', job_id=job.id, rpage=p) }}" class="pagination-link {% if p == results.page %}active{% endif %}">{{ p }}</a>
|
| 188 |
+
{% else %}
|
| 189 |
+
<span class="pagination-ellipsis">…</span>
|
| 190 |
+
{% endif %}
|
| 191 |
+
{% endfor %}
|
| 192 |
+
{% if results.has_next %}
|
| 193 |
+
<a href="{{ url_for('jobs.detail', job_id=job.id, rpage=results.next_num) }}" class="pagination-link">Next →</a>
|
| 194 |
+
{% endif %}
|
| 195 |
+
</div>
|
| 196 |
+
{% endif %}
|
| 197 |
+
{% else %}
|
| 198 |
+
<div class="empty-state-sm">
|
| 199 |
+
<p class="text-muted">No results scraped yet.</p>
|
| 200 |
+
</div>
|
| 201 |
+
{% endif %}
|
| 202 |
+
</div>
|
| 203 |
+
</div>
|
| 204 |
+
|
| 205 |
+
<!-- Logs -->
|
| 206 |
+
<div class="card" id="logs-card">
|
| 207 |
+
<div class="card-header collapsible" onclick="toggleSection('logs-body')">
|
| 208 |
+
<h2 class="card-title">📋 Job Logs <span class="count-badge">{{ logs.total }}</span></h2>
|
| 209 |
+
<svg class="collapse-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="6 9 12 15 18 9"/></svg>
|
| 210 |
+
</div>
|
| 211 |
+
<div class="card-body" id="logs-body">
|
| 212 |
+
{% if logs.items %}
|
| 213 |
+
<div class="logs-container">
|
| 214 |
+
{% for log in logs.items %}
|
| 215 |
+
<div class="log-entry log-{{ log.level | lower }}">
|
| 216 |
+
<span class="log-time">{{ log.created_at.strftime('%H:%M:%S') if log.created_at else '' }}</span>
|
| 217 |
+
<span class="log-level">{{ log.level }}</span>
|
| 218 |
+
<span class="log-message">{{ log.message }}</span>
|
| 219 |
+
</div>
|
| 220 |
+
{% endfor %}
|
| 221 |
+
</div>
|
| 222 |
+
{% else %}
|
| 223 |
+
<p class="text-muted">No logs recorded.</p>
|
| 224 |
+
{% endif %}
|
| 225 |
+
</div>
|
| 226 |
+
</div>
|
| 227 |
+
{% endblock %}
|
| 228 |
+
|
| 229 |
+
{% block extra_scripts %}
|
| 230 |
+
{% if job.status in ('pending', 'running') %}
|
| 231 |
+
<script>
|
| 232 |
+
// Poll the job status endpoint while the job is active.
//
// This script is only rendered while the job is pending or running, so the
// poller always starts. On every tick it refreshes the live counters in the
// status bar; as soon as the reported status differs from the one the page
// was rendered with, it stops polling and reloads so the server re-renders
// the results, logs and action buttons.
(function () {
  const statusBar = document.getElementById('job-status-bar');
  if (!statusBar) return;

  // The job id and render-time status are read from the status bar's
  // data-job-id / data-status attributes rather than templated into the
  // script, which keeps this block plain JavaScript.
  const jobId = statusBar.dataset.jobId;
  const initialStatus = statusBar.dataset.status;
  const POLL_INTERVAL_MS = 2000;

  // Coerce counters to numbers before they are written into innerHTML.
  const toCount = (value) => Number(value) || 0;

  const timer = setInterval(async () => {
    try {
      const res = await fetch(`/jobs/api/jobs/${jobId}/status`);

      // Ignore error responses (e.g. rate limiting): their body carries no
      // job status and must not be mistaken for a status change.
      if (!res.ok) return;

      const data = await res.json();

      if (data.status !== initialStatus) {
        // Stop polling before navigating so no further tick fires.
        clearInterval(timer);
        window.location.reload();
        return;
      }

      // Update live counts
      const meta = statusBar.querySelector('.status-meta');
      if (meta) {
        const errorCount = toCount(data.error_count);
        meta.innerHTML = `
          <span>📦 <strong>${toCount(data.total_items)}</strong> items</span>
          <span>📄 <strong>${toCount(data.pages_scraped)}</strong> pages</span>
          ${errorCount ? `<span>❌ <strong>${errorCount}</strong> errors</span>` : ''}
        `;
      }
    } catch (e) {
      /* transient network or parse failure: retry on the next tick */
    }
  }, POLL_INTERVAL_MS);
})();
|
| 260 |
+
</script>
|
| 261 |
+
{% endif %}
|
| 262 |
+
{% endblock %}
|
app/templates/pages/jobs.html
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}Jobs — WebScraper.pro{% endblock %}
|
| 3 |
+
|
| 4 |
+
{% block content %}
|
| 5 |
+
<div class="page-header">
|
| 6 |
+
<div class="page-header-content">
|
| 7 |
+
<h1 class="page-title">Scrape Jobs</h1>
|
| 8 |
+
<p class="page-subtitle">Manage and monitor all your scraping tasks</p>
|
| 9 |
+
</div>
|
| 10 |
+
<a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary btn-glow">
|
| 11 |
+
<svg class="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="16"/><line x1="8" y1="12" x2="16" y2="12"/></svg>
|
| 12 |
+
New Scrape Job
|
| 13 |
+
</a>
|
| 14 |
+
</div>
|
| 15 |
+
|
| 16 |
+
<!-- Filters -->
|
| 17 |
+
<div class="card filter-bar" id="job-filters">
|
| 18 |
+
<form method="get" class="filter-form">
|
| 19 |
+
<div class="filter-group">
|
| 20 |
+
<label class="filter-label">Status</label>
|
| 21 |
+
<select name="status" class="form-select" id="filter-status">
|
| 22 |
+
<option value="">All Statuses</option>
|
| 23 |
+
<option value="pending" {% if status_filter == 'pending' %}selected{% endif %}>Pending</option>
|
| 24 |
+
<option value="running" {% if status_filter == 'running' %}selected{% endif %}>Running</option>
|
| 25 |
+
<option value="completed" {% if status_filter == 'completed' %}selected{% endif %}>Completed</option>
|
| 26 |
+
<option value="failed" {% if status_filter == 'failed' %}selected{% endif %}>Failed</option>
|
| 27 |
+
<option value="cancelled" {% if status_filter == 'cancelled' %}selected{% endif %}>Cancelled</option>
|
| 28 |
+
</select>
|
| 29 |
+
</div>
|
| 30 |
+
<div class="filter-group filter-search">
|
| 31 |
+
<label class="filter-label">Search</label>
|
| 32 |
+
<input type="text" name="search" class="form-input" placeholder="Search by name or URL..." value="{{ search or '' }}" id="filter-search">
|
| 33 |
+
</div>
|
| 34 |
+
<button type="submit" class="btn btn-secondary" id="filter-submit">Filter</button>
|
| 35 |
+
{% if status_filter or search %}
|
| 36 |
+
<a href="{{ url_for('jobs.list_view') }}" class="btn btn-ghost" id="filter-clear">Clear</a>
|
| 37 |
+
{% endif %}
|
| 38 |
+
</form>
|
| 39 |
+
</div>
|
| 40 |
+
|
| 41 |
+
<!-- Jobs Table -->
|
| 42 |
+
<div class="card" id="jobs-list-card">
|
| 43 |
+
<div class="card-body">
|
| 44 |
+
{% if pagination.items %}
|
| 45 |
+
<div class="table-responsive">
|
| 46 |
+
<table class="table" id="jobs-table">
|
| 47 |
+
<thead>
|
| 48 |
+
<tr>
|
| 49 |
+
<th>ID</th>
|
| 50 |
+
<th>Name</th>
|
| 51 |
+
<th>URL</th>
|
| 52 |
+
<th>Type</th>
|
| 53 |
+
<th>Status</th>
|
| 54 |
+
<th>Items</th>
|
| 55 |
+
<th>Duration</th>
|
| 56 |
+
<th>Created</th>
|
| 57 |
+
<th>Actions</th>
|
| 58 |
+
</tr>
|
| 59 |
+
</thead>
|
| 60 |
+
<tbody>
|
| 61 |
+
{% for job in pagination.items %}
|
| 62 |
+
<tr class="table-row-hover">
|
| 63 |
+
<td><span class="badge badge-muted">#{{ job.id }}</span></td>
|
| 64 |
+
<td>
|
| 65 |
+
<a href="{{ url_for('jobs.detail', job_id=job.id) }}" class="table-link">{{ job.name }}</a>
|
| 66 |
+
</td>
|
| 67 |
+
<td class="text-truncate text-muted" style="max-width: 220px;" title="{{ job.url }}">{{ job.url }}</td>
|
| 68 |
+
<td><span class="badge badge-type">{{ job.scrape_type }}</span></td>
|
| 69 |
+
<td>
|
| 70 |
+
<span class="status-badge status-{{ job.status }}">
|
| 71 |
+
<span class="status-dot"></span>
|
| 72 |
+
{{ job.status | capitalize }}
|
| 73 |
+
</span>
|
| 74 |
+
</td>
|
| 75 |
+
<td>{{ job.total_items }}</td>
|
| 76 |
+
<td>{% if job.duration_seconds %}{{ "%.1f"|format(job.duration_seconds) }}s{% else %}—{% endif %}</td>
|
| 77 |
+
<td class="text-muted">{{ job.created_at.strftime('%Y-%m-%d %H:%M') if job.created_at else '—' }}</td>
|
| 78 |
+
<td>
|
| 79 |
+
<div class="action-group">
|
| 80 |
+
<a href="{{ url_for('jobs.detail', job_id=job.id) }}" class="btn btn-icon-sm" title="View Details">👁️</a>
|
| 81 |
+
<form method="post" action="{{ url_for('jobs.delete', job_id=job.id) }}" class="inline-form" onsubmit="return confirm('Delete job #{{ job.id }}?')">
|
| 82 |
+
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
| 83 |
+
<button type="submit" class="btn btn-icon-sm btn-danger-ghost" title="Delete">🗑️</button>
|
| 84 |
+
</form>
|
| 85 |
+
</div>
|
| 86 |
+
</td>
|
| 87 |
+
</tr>
|
| 88 |
+
{% endfor %}
|
| 89 |
+
</tbody>
|
| 90 |
+
</table>
|
| 91 |
+
</div>
|
| 92 |
+
|
| 93 |
+
<!-- Pagination -->
|
| 94 |
+
{% if pagination.pages > 1 %}
|
| 95 |
+
<div class="pagination" id="jobs-pagination">
|
| 96 |
+
{% if pagination.has_prev %}
|
| 97 |
+
<a href="{{ url_for('jobs.list_view', page=pagination.prev_num, status=status_filter, search=search) }}" class="pagination-link">← Prev</a>
|
| 98 |
+
{% endif %}
|
| 99 |
+
{% for p in pagination.iter_pages(left_edge=1, right_edge=1, left_current=2, right_current=2) %}
|
| 100 |
+
{% if p %}
|
| 101 |
+
<a href="{{ url_for('jobs.list_view', page=p, status=status_filter, search=search) }}" class="pagination-link {% if p == pagination.page %}active{% endif %}">{{ p }}</a>
|
| 102 |
+
{% else %}
|
| 103 |
+
<span class="pagination-ellipsis">…</span>
|
| 104 |
+
{% endif %}
|
| 105 |
+
{% endfor %}
|
| 106 |
+
{% if pagination.has_next %}
|
| 107 |
+
<a href="{{ url_for('jobs.list_view', page=pagination.next_num, status=status_filter, search=search) }}" class="pagination-link">Next →</a>
|
| 108 |
+
{% endif %}
|
| 109 |
+
</div>
|
| 110 |
+
{% endif %}
|
| 111 |
+
|
| 112 |
+
{% else %}
|
| 113 |
+
<div class="empty-state">
|
| 114 |
+
<div class="empty-icon">🔍</div>
|
| 115 |
+
<h3>No jobs found</h3>
|
| 116 |
+
<p>{% if status_filter or search %}Try adjusting your filters.{% else %}Create your first scraping job to get started.{% endif %}</p>
|
| 117 |
+
<a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary">Create New Job</a>
|
| 118 |
+
</div>
|
| 119 |
+
{% endif %}
|
| 120 |
+
</div>
|
| 121 |
+
</div>
|
| 122 |
+
{% endblock %}
|
app/templates/pages/new_job.html
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
{% block title %}New Scrape Job — WebScraper.pro{% endblock %}
|
| 3 |
+
|
| 4 |
+
{% block content %}
|
| 5 |
+
<div class="page-header">
|
| 6 |
+
<div class="page-header-content">
|
| 7 |
+
<h1 class="page-title">Create Scrape Job</h1>
|
| 8 |
+
<p class="page-subtitle">Configure and launch a new web scraping task</p>
|
| 9 |
+
</div>
|
| 10 |
+
</div>
|
| 11 |
+
|
| 12 |
+
<form method="post" action="{{ url_for('jobs.new_job') }}" id="new-job-form" class="job-form">
|
| 13 |
+
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
|
| 14 |
+
|
| 15 |
+
<!-- Basic Config -->
|
| 16 |
+
<div class="card form-section" id="section-basic">
|
| 17 |
+
<div class="card-header">
|
| 18 |
+
<h2 class="card-title">🎯 Basic Configuration</h2>
|
| 19 |
+
</div>
|
| 20 |
+
<div class="card-body">
|
| 21 |
+
<div class="form-grid">
|
| 22 |
+
<div class="form-group form-full">
|
| 23 |
+
<label for="name" class="form-label">Job Name</label>
|
| 24 |
+
<input type="text" name="name" id="name" class="form-input {% if errors.get('name') %}form-error{% endif %}" placeholder="e.g. Product Prices - Amazon" value="{{ form_data.get('name', '') }}">
|
| 25 |
+
{% if errors.get('name') %}<span class="form-error-text">{{ errors.name }}</span>{% endif %}
|
| 26 |
+
</div>
|
| 27 |
+
|
| 28 |
+
<div class="form-group form-full">
|
| 29 |
+
<label for="url" class="form-label">Target URL <span class="required">*</span></label>
|
| 30 |
+
<input type="url" name="url" id="url" class="form-input {% if errors.get('url') %}form-error{% endif %}" placeholder="https://example.com/page" value="{{ form_data.get('url', '') }}" required>
|
| 31 |
+
{% if errors.get('url') %}<span class="form-error-text">{{ errors.url }}</span>{% endif %}
|
| 32 |
+
</div>
|
| 33 |
+
|
| 34 |
+
<div class="form-group">
|
| 35 |
+
<label for="scrape_type" class="form-label">Scrape Type</label>
|
| 36 |
+
<select name="scrape_type" id="scrape_type" class="form-select">
|
| 37 |
+
<option value="static" {% if form_data.get('scrape_type') == 'static' %}selected{% endif %}>Static (Requests + BS4)</option>
|
| 38 |
+
<option value="dynamic" {% if form_data.get('scrape_type') == 'dynamic' %}selected{% endif %}>Dynamic (Playwright)</option>
|
| 39 |
+
</select>
|
| 40 |
+
<span class="form-hint">Use Dynamic for JS-rendered pages</span>
|
| 41 |
+
</div>
|
| 42 |
+
|
| 43 |
+
<div class="form-group">
|
| 44 |
+
<label for="extraction_type" class="form-label">Extraction Type</label>
|
| 45 |
+
<select name="extraction_type" id="extraction_type" class="form-select {% if errors.get('extraction_type') %}form-error{% endif %}">
|
| 46 |
+
<option value="text" {% if form_data.get('extraction_type') == 'text' %}selected{% endif %}>Text Content</option>
|
| 47 |
+
<option value="links" {% if form_data.get('extraction_type') == 'links' %}selected{% endif %}>Links (URLs)</option>
|
| 48 |
+
<option value="images" {% if form_data.get('extraction_type') == 'images' %}selected{% endif %}>Image URLs</option>
|
| 49 |
+
<option value="attributes" {% if form_data.get('extraction_type') == 'attributes' %}selected{% endif %}>HTML Attributes</option>
|
| 50 |
+
<option value="table" {% if form_data.get('extraction_type') == 'table' %}selected{% endif %}>Table Data</option>
|
| 51 |
+
<option value="json_ld" {% if form_data.get('extraction_type') == 'json_ld' %}selected{% endif %}>JSON-LD Schema</option>
|
| 52 |
+
<option value="full_html" {% if form_data.get('extraction_type') == 'full_html' %}selected{% endif %}>Full HTML</option>
|
| 53 |
+
</select>
|
| 54 |
+
{% if errors.get('extraction_type') %}<span class="form-error-text">{{ errors.extraction_type }}</span>{% endif %}
|
| 55 |
+
</div>
|
| 56 |
+
</div>
|
| 57 |
+
</div>
|
| 58 |
+
</div>
|
| 59 |
+
|
| 60 |
+
<!-- Selectors -->
|
| 61 |
+
<div class="card form-section" id="section-selectors">
|
| 62 |
+
<div class="card-header">
|
| 63 |
+
<h2 class="card-title">🔎 Selectors</h2>
|
| 64 |
+
<span class="card-subtitle">Define what to extract (leave blank for all elements)</span>
|
| 65 |
+
</div>
|
| 66 |
+
<div class="card-body">
|
| 67 |
+
<div class="form-grid">
|
| 68 |
+
<div class="form-group">
|
| 69 |
+
<label for="css_selector" class="form-label">CSS Selector</label>
|
| 70 |
+
<input type="text" name="css_selector" id="css_selector" class="form-input {% if errors.get('css_selector') %}form-error{% endif %}" placeholder="e.g. .product-card h2" value="{{ form_data.get('css_selector', '') }}">
|
| 71 |
+
{% if errors.get('css_selector') %}<span class="form-error-text">{{ errors.css_selector }}</span>{% endif %}
|
| 72 |
+
</div>
|
| 73 |
+
|
| 74 |
+
<div class="form-group">
|
| 75 |
+
<label for="xpath_selector" class="form-label">XPath Selector</label>
|
| 76 |
+
<input type="text" name="xpath_selector" id="xpath_selector" class="form-input {% if errors.get('xpath_selector') %}form-error{% endif %}" placeholder="e.g. //div[@class='item']/h2" value="{{ form_data.get('xpath_selector', '') }}">
|
| 77 |
+
{% if errors.get('xpath_selector') %}<span class="form-error-text">{{ errors.xpath_selector }}</span>{% endif %}
|
| 78 |
+
</div>
|
| 79 |
+
|
| 80 |
+
<div class="form-group">
|
| 81 |
+
<label for="html_tag" class="form-label">HTML Tag</label>
|
| 82 |
+
<input type="text" name="html_tag" id="html_tag" class="form-input" placeholder="e.g. h2, p, div" value="{{ form_data.get('html_tag', '') }}">
|
| 83 |
+
</div>
|
| 84 |
+
|
| 85 |
+
<div class="form-group">
|
| 86 |
+
<label for="attribute_name" class="form-label">Attribute Name</label>
|
| 87 |
+
<input type="text" name="attribute_name" id="attribute_name" class="form-input" placeholder="e.g. href, src, data-id" value="{{ form_data.get('attribute_name', '') }}">
|
| 88 |
+
<span class="form-hint">Used when extraction type is "Attributes"</span>
|
| 89 |
+
</div>
|
| 90 |
+
</div>
|
| 91 |
+
</div>
|
| 92 |
+
</div>
|
| 93 |
+
|
| 94 |
+
<!-- Advanced Options -->
|
| 95 |
+
<div class="card form-section" id="section-advanced">
|
| 96 |
+
<div class="card-header collapsible" onclick="toggleSection('advanced-body')">
|
| 97 |
+
<h2 class="card-title">⚙️ Advanced Options</h2>
|
| 98 |
+
<svg class="collapse-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="6 9 12 15 18 9"/></svg>
|
| 99 |
+
</div>
|
| 100 |
+
<div class="card-body" id="advanced-body">
|
| 101 |
+
<div class="form-grid">
|
| 102 |
+
<div class="form-group">
|
| 103 |
+
<label for="delay_seconds" class="form-label">Delay Between Requests (s)</label>
|
| 104 |
+
<input type="number" name="delay_seconds" id="delay_seconds" class="form-input {% if errors.get('delay_seconds') %}form-error{% endif %}" value="{{ form_data.get('delay_seconds', '1.0') }}" step="0.1" min="0" max="60">
|
| 105 |
+
{% if errors.get('delay_seconds') %}<span class="form-error-text">{{ errors.delay_seconds }}</span>{% endif %}
|
| 106 |
+
</div>
|
| 107 |
+
|
| 108 |
+
<div class="form-group">
|
| 109 |
+
<label for="timeout_seconds" class="form-label">Timeout (s)</label>
|
| 110 |
+
<input type="number" name="timeout_seconds" id="timeout_seconds" class="form-input" value="{{ form_data.get('timeout_seconds', '30') }}" min="5" max="120">
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
<div class="form-group">
|
| 114 |
+
<label for="max_retries" class="form-label">Max Retries</label>
|
| 115 |
+
<input type="number" name="max_retries" id="max_retries" class="form-input" value="{{ form_data.get('max_retries', '3') }}" min="0" max="10">
|
| 116 |
+
</div>
|
| 117 |
+
|
| 118 |
+
<div class="form-group">
|
| 119 |
+
<label for="max_pages" class="form-label">Max Pages</label>
|
| 120 |
+
<input type="number" name="max_pages" id="max_pages" class="form-input {% if errors.get('max_pages') %}form-error{% endif %}" value="{{ form_data.get('max_pages', '1') }}" min="1" max="200">
|
| 121 |
+
{% if errors.get('max_pages') %}<span class="form-error-text">{{ errors.max_pages }}</span>{% endif %}
|
| 122 |
+
</div>
|
| 123 |
+
|
| 124 |
+
<div class="form-group">
|
| 125 |
+
<label for="scroll_count" class="form-label">Scroll Count</label>
|
| 126 |
+
<input type="number" name="scroll_count" id="scroll_count" class="form-input" value="{{ form_data.get('scroll_count', '3') }}" min="1" max="50">
|
| 127 |
+
<span class="form-hint">For infinite scroll pages</span>
|
| 128 |
+
</div>
|
| 129 |
+
|
| 130 |
+
<div class="form-group">
|
| 131 |
+
<label for="user_agent" class="form-label">Custom User Agent</label>
|
| 132 |
+
<input type="text" name="user_agent" id="user_agent" class="form-input" placeholder="Leave blank for random UA" value="{{ form_data.get('user_agent', '') }}">
|
| 133 |
+
</div>
|
| 134 |
+
|
| 135 |
+
<div class="form-group form-full">
|
| 136 |
+
<label for="custom_headers" class="form-label">Custom Headers (JSON)</label>
|
| 137 |
+
<textarea name="custom_headers" id="custom_headers" class="form-textarea" rows="3" placeholder='{"Accept": "text/html", "Cookie": "session=abc123"}'>{{ form_data.get('custom_headers', '') }}</textarea>
|
| 138 |
+
</div>
|
| 139 |
+
</div>
|
| 140 |
+
|
| 141 |
+
<!-- Checkboxes -->
|
| 142 |
+
<div class="form-checkboxes">
|
| 143 |
+
<label class="checkbox-label">
|
| 144 |
+
<input type="checkbox" name="follow_pagination" {% if form_data.get('follow_pagination') %}checked{% endif %}>
|
| 145 |
+
<span class="checkbox-custom"></span>
|
| 146 |
+
Follow Pagination
|
| 147 |
+
</label>
|
| 148 |
+
<label class="checkbox-label">
|
| 149 |
+
<input type="checkbox" name="infinite_scroll" {% if form_data.get('infinite_scroll') %}checked{% endif %}>
|
| 150 |
+
<span class="checkbox-custom"></span>
|
| 151 |
+
Infinite Scroll
|
| 152 |
+
</label>
|
| 153 |
+
<label class="checkbox-label">
|
| 154 |
+
<input type="checkbox" name="check_robots_txt" {% if form_data.get('check_robots_txt', True) %}checked{% endif %}>
|
| 155 |
+
<span class="checkbox-custom"></span>
|
| 156 |
+
Respect robots.txt
|
| 157 |
+
</label>
|
| 158 |
+
<label class="checkbox-label">
|
| 159 |
+
<input type="checkbox" name="deduplicate" {% if form_data.get('deduplicate', True) %}checked{% endif %}>
|
| 160 |
+
<span class="checkbox-custom"></span>
|
| 161 |
+
Deduplicate Results
|
| 162 |
+
</label>
|
| 163 |
+
<label class="checkbox-label">
|
| 164 |
+
<input type="checkbox" name="download_images" {% if form_data.get('download_images') %}checked{% endif %}>
|
| 165 |
+
<span class="checkbox-custom"></span>
|
| 166 |
+
Download Images
|
| 167 |
+
</label>
|
| 168 |
+
</div>
|
| 169 |
+
</div>
|
| 170 |
+
</div>
|
| 171 |
+
|
| 172 |
+
<!-- Submit -->
|
| 173 |
+
<div class="form-actions">
|
| 174 |
+
<a href="{{ url_for('jobs.list_view') }}" class="btn btn-ghost">Cancel</a>
|
| 175 |
+
<button type="submit" class="btn btn-primary btn-glow btn-lg" id="submit-job">
|
| 176 |
+
<svg class="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polygon points="5 3 19 12 5 21 5 3"/></svg>
|
| 177 |
+
Launch Scrape Job
|
| 178 |
+
</button>
|
| 179 |
+
</div>
|
| 180 |
+
</form>
|
| 181 |
+
{% endblock %}
|
app/utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .validators import validate_url, validate_job_data, sanitize_string
|
| 2 |
+
from .logging_config import configure_logging
|
| 3 |
+
|
| 4 |
+
__all__ = ["validate_url", "validate_job_data", "sanitize_string", "configure_logging"]
|
app/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (347 Bytes). View file
|
|
|
app/utils/__pycache__/logging_config.cpython-310.pyc
ADDED
|
Binary file (1.51 kB). View file
|
|
|
app/utils/__pycache__/validators.cpython-310.pyc
ADDED
|
Binary file (3.43 kB). View file
|
|
|
app/utils/logging_config.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Structured logging configuration.
|
| 3 |
+
Sets up file + console handlers with proper formatting.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import logging.handlers
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def configure_logging(log_level: str = "INFO", log_dir: str = "logs") -> None:
    """Configure the root logger with console and rotating file handlers.

    Installs three handlers on the root logger: a stdout stream handler, a
    rotating ``app.log`` (10 MB, 5 backups) and a rotating ``errors.log``
    (5 MB, 3 backups) that only records ERROR and above.

    Args:
        log_level: Name of a standard ``logging`` level, case-insensitive.
            Names that do not resolve to a numeric level fall back to INFO.
        log_dir: Directory that receives the log files; created if missing.
    """
    log_path = Path(log_dir)
    log_path.mkdir(parents=True, exist_ok=True)

    # getattr() alone could hand back a non-level attribute of the logging
    # module (e.g. BASIC_FORMAT is a str), which setLevel() would reject.
    level = getattr(logging, log_level.upper(), None)
    if not isinstance(level, int):
        level = logging.INFO

    fmt = logging.Formatter(
        "%(asctime)s [%(levelname)-8s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Root logger
    root = logging.getLogger()
    root.setLevel(level)
    # Detach AND close previous handlers so a repeated call does not leave
    # the old log files open.
    for stale in list(root.handlers):
        root.removeHandler(stale)
        stale.close()

    # Console handler
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(fmt)
    console.setLevel(level)
    root.addHandler(console)

    # Rotating file handler
    file_handler = logging.handlers.RotatingFileHandler(
        log_path / "app.log",
        maxBytes=10 * 1024 * 1024,  # 10 MB
        backupCount=5,
        encoding="utf-8",
    )
    file_handler.setFormatter(fmt)
    file_handler.setLevel(level)
    root.addHandler(file_handler)

    # Error-only file
    error_handler = logging.handlers.RotatingFileHandler(
        log_path / "errors.log",
        maxBytes=5 * 1024 * 1024,  # 5 MB
        backupCount=3,
        encoding="utf-8",
    )
    error_handler.setFormatter(fmt)
    error_handler.setLevel(logging.ERROR)
    root.addHandler(error_handler)

    # Suppress noisy libs
    for noisy in ("urllib3", "requests", "playwright"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
|
app/utils/validators.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Input validation and sanitization utilities.
|
| 3 |
+
All user-supplied strings pass through here before entering business logic.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
from typing import Any, Optional
|
| 9 |
+
from urllib.parse import urlparse
|
| 10 |
+
|
| 11 |
+
import bleach
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Allowed HTML tags for any rich-text fields (none for our use case)
|
| 15 |
+
ALLOWED_TAGS: list[str] = []
|
| 16 |
+
ALLOWED_ATTRS: dict = {}
|
| 17 |
+
|
| 18 |
+
# Patterns that flag unsafe content in CSS / XPath selectors (markup characters, "javascript:" URLs)
|
| 19 |
+
_CSS_UNSAFE_PATTERN = re.compile(r"[<>\"']|javascript:", re.IGNORECASE)
|
| 20 |
+
_XPATH_UNSAFE_PATTERN = re.compile(r"[<>]|javascript:", re.IGNORECASE)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def sanitize_string(value: Any, max_length: int = 500) -> str:
    """Return *value* as cleaned text, limited to *max_length* characters.

    Falsy input yields an empty string. Anything else is converted with
    ``str()``, run through ``bleach.clean`` using the module's ALLOWED_TAGS /
    ALLOWED_ATTRS with ``strip=True``, cut to *max_length*, and finally
    stripped of surrounding whitespace.
    """
    if value:
        text = str(value)
        scrubbed = bleach.clean(
            text,
            tags=ALLOWED_TAGS,
            attributes=ALLOWED_ATTRS,
            strip=True,
        )
        # Truncate first, then trim, so the result never ends in whitespace.
        return scrubbed[:max_length].strip()
    return ""
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def validate_url(url: str) -> tuple[bool, str]:
    """Validate a scrape target URL.

    Accepts only http/https URLs with a host, and rejects hosts that are
    ``localhost`` or an IP literal outside the globally routable space
    (loopback, RFC 1918 private ranges, link-local such as 169.254.169.254,
    unspecified, reserved, multicast, and the IPv6 equivalents).

    Args:
        url: The user-supplied URL; surrounding whitespace is ignored.

    Returns:
        ``(True, "")`` when the URL is acceptable, otherwise
        ``(False, message)`` with a user-facing error message.

    NOTE(review): only the literal host is inspected. A DNS name that
    resolves to an internal address is not detected here; that would need a
    check at request time.
    """
    import ipaddress

    if not url:
        return False, "URL is required."
    url = url.strip()

    try:
        parsed = urlparse(url)
        # Normalise a fully-qualified trailing dot ("localhost.").
        host = (parsed.hostname or "").rstrip(".").lower()
    except ValueError:
        # urlparse raises ValueError on malformed input such as "http://[::1".
        return False, "Invalid URL format."

    if parsed.scheme not in ("http", "https"):
        return False, "URL must use http or https."
    if not parsed.netloc or not host:
        return False, "URL must include a valid domain."

    internal = (False, "Requests to internal addresses are not allowed.")

    # Block localhost/private IPs (SSRF prevention)
    if host == "localhost" or host.endswith(".localhost"):
        return internal

    try:
        address = ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal, i.e. a regular host name such as "10.example.com".
        return True, ""

    # is_global is False for private, loopback, link-local, unspecified,
    # reserved and shared address space; multicast is rejected explicitly.
    if not address.is_global or address.is_multicast:
        return internal
    return True, ""
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def validate_css_selector(selector: Optional[str]) -> tuple[bool, str]:
    """Check an optional CSS selector against the unsafe-content pattern.

    An empty or missing selector is valid. Returns ``(is_valid, error_message)``.
    """
    flagged = bool(selector) and _CSS_UNSAFE_PATTERN.search(selector) is not None
    if flagged:
        return False, "CSS selector contains unsafe characters."
    return True, ""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def validate_xpath(xpath: Optional[str]) -> tuple[bool, str]:
    """Check an optional XPath expression against the unsafe-content pattern.

    An empty or missing expression is valid. Returns ``(is_valid, error_message)``.
    """
    flagged = bool(xpath) and _XPATH_UNSAFE_PATTERN.search(xpath) is not None
    if flagged:
        return False, "XPath contains unsafe characters."
    return True, ""
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def validate_job_data(data: dict) -> tuple[bool, dict]:
    """Validate the submitted fields of a scrape job.

    Checks the URL, both selectors, the extraction and scrape types, and the
    numeric ``max_pages`` / ``delay_seconds`` settings.

    Returns:
        ``(is_valid, errors)`` where ``errors`` maps a field name to its
        message and is empty when every check passes.
    """
    problems: dict[str, str] = {}

    # Fields handled by a dedicated validator returning (ok, message).
    delegated = (
        ("url", validate_url, data.get("url", "")),
        ("css_selector", validate_css_selector, data.get("css_selector")),
        ("xpath_selector", validate_xpath, data.get("xpath_selector")),
    )
    for field, check, raw in delegated:
        ok, reason = check(raw)
        if not ok:
            problems[field] = reason

    # Enumerated choices.
    allowed_extractions = ("text", "images", "links", "attributes", "table", "json_ld", "full_html")
    if data.get("extraction_type", "text") not in allowed_extractions:
        problems["extraction_type"] = f"Must be one of: {', '.join(allowed_extractions)}"

    if data.get("scrape_type", "static") not in ("static", "dynamic"):
        problems["scrape_type"] = "Must be 'static' or 'dynamic'."

    # Numeric settings: conversion failure and range failure report differently.
    try:
        page_limit = int(data.get("max_pages", 1))
    except (TypeError, ValueError):
        problems["max_pages"] = "max_pages must be an integer."
    else:
        if not (1 <= page_limit <= 200):
            problems["max_pages"] = "max_pages must be between 1 and 200."

    try:
        pause = float(data.get("delay_seconds", 1.0))
    except (TypeError, ValueError):
        problems["delay_seconds"] = "delay_seconds must be a number."
    else:
        # Written as a chained comparison so NaN is rejected as out of range.
        if not (0 <= pause <= 60):
            problems["delay_seconds"] = "delay_seconds must be between 0 and 60."

    return not problems, problems
|
database/scraper.db
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78dfc516cfcade3e0ab261153191e2958ef3ac2ec9589c8f8ac1606a414dc293
|
| 3 |
+
size 114688
|
env.example
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Application
|
| 2 |
+
FLASK_ENV=development
|
| 3 |
+
SECRET_KEY=dev-secret-key-change-in-production-use-32-chars
|
| 4 |
+
DEBUG=True
|
| 5 |
+
|
| 6 |
+
# Database
|
| 7 |
+
DATABASE_URL=sqlite:///database/scraper.db
|
| 8 |
+
|
| 9 |
+
# Security
|
| 10 |
+
WTF_CSRF_ENABLED=True
|
| 11 |
+
SESSION_COOKIE_SECURE=False
|
| 12 |
+
SESSION_COOKIE_HTTPONLY=True
|
| 13 |
+
SESSION_COOKIE_SAMESITE=Lax
|
| 14 |
+
|
| 15 |
+
# Rate Limiting
|
| 16 |
+
RATELIMIT_DEFAULT=100 per hour
|
| 17 |
+
RATELIMIT_STORAGE_URL=memory://
|
| 18 |
+
|
| 19 |
+
# Scraping
|
| 20 |
+
MAX_CONCURRENT_JOBS=5
|
| 21 |
+
REQUEST_TIMEOUT=30
|
| 22 |
+
MAX_RETRIES=3
|
| 23 |
+
DEFAULT_DELAY=1.0
|
| 24 |
+
|
| 25 |
+
# Exports
|
| 26 |
+
EXPORT_DIR=exports
|
| 27 |
+
|
| 28 |
+
# Logging
|
| 29 |
+
LOG_LEVEL=DEBUG
|
| 30 |
+
LOG_DIR=logs
|
logs/app.log
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-05-17 14:44:10 [INFO ] app: WebScraper Platform started [development]
|
| 2 |
+
2026-05-17 14:44:10 [INFO ] werkzeug: [31m[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.[0m
|
| 3 |
+
* Running on http://127.0.0.1:5000
|
| 4 |
+
2026-05-17 14:44:10 [INFO ] werkzeug: [33mPress CTRL+C to quit[0m
|
| 5 |
+
2026-05-17 14:44:10 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
|
| 6 |
+
2026-05-17 14:44:11 [INFO ] app: WebScraper Platform started [development]
|
| 7 |
+
2026-05-17 14:44:11 [WARNING ] werkzeug: * Debugger is active!
|
| 8 |
+
2026-05-17 14:44:11 [INFO ] werkzeug: * Debugger PIN: 590-942-950
|
| 9 |
+
2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET / -> 200 (94.00ms)
|
| 10 |
+
2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET / HTTP/1.1" 200 -
|
| 11 |
+
2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 200 (0.00ms)
|
| 12 |
+
2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET /static/css/style.css HTTP/1.1" 200 -
|
| 13 |
+
2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 200 (0.00ms)
|
| 14 |
+
2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET /static/js/app.js HTTP/1.1" 200 -
|
| 15 |
+
2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET /favicon.ico -> 404 (0.00ms)
|
| 16 |
+
2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
|
| 17 |
+
2026-05-17 14:44:44 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (94.00ms)
|
| 18 |
+
2026-05-17 14:44:44 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:44] "GET /jobs/new HTTP/1.1" 200 -
|
| 19 |
+
2026-05-17 14:44:44 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 20 |
+
2026-05-17 14:44:44 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:44] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 21 |
+
2026-05-17 14:44:44 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 22 |
+
2026-05-17 14:44:44 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:44] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 23 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (1).py', reloading
|
| 24 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (2).py', reloading
|
| 25 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (3).py', reloading
|
| 26 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (4).py', reloading
|
| 27 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (5).py', reloading
|
| 28 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (6).py', reloading
|
| 29 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (7).py', reloading
|
| 30 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__.py', reloading
|
| 31 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\engine.py', reloading
|
| 32 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\models.py', reloading
|
| 33 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\job_service.py', reloading
|
| 34 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\export_service.py', reloading
|
| 35 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\security.py', reloading
|
| 36 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\main.py', reloading
|
| 37 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\jobs.py', reloading
|
| 38 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\validators.py', reloading
|
| 39 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\logging_config.py', reloading
|
| 40 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\settings.py', reloading
|
| 41 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\pygments\\plugin.py', reloading
|
| 42 |
+
2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\dns\\rdtypes\\IN\\A.py', reloading
|
| 43 |
+
2026-05-17 14:45:19 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
|
| 44 |
+
2026-05-17 14:45:20 [INFO ] app: WebScraper Platform started [development]
|
| 45 |
+
2026-05-17 14:45:20 [WARNING ] werkzeug: * Debugger is active!
|
| 46 |
+
2026-05-17 14:45:20 [INFO ] werkzeug: * Debugger PIN: 590-942-950
|
| 47 |
+
2026-05-17 14:46:31 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (62.00ms)
|
| 48 |
+
2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "[32mPOST /jobs/new HTTP/1.1[0m" 302 -
|
| 49 |
+
2026-05-17 14:46:31 [DEBUG ] app.middleware.security: GET /jobs/1 -> 200 (78.00ms)
|
| 50 |
+
2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "GET /jobs/1 HTTP/1.1" 200 -
|
| 51 |
+
2026-05-17 14:46:31 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 52 |
+
2026-05-17 14:46:31 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 53 |
+
2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 54 |
+
2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 55 |
+
2026-05-17 14:46:34 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/1/status -> 200 (0.00ms)
|
| 56 |
+
2026-05-17 14:46:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:34] "GET /jobs/api/jobs/1/status HTTP/1.1" 200 -
|
| 57 |
+
2026-05-17 14:46:36 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/1/status -> 200 (0.00ms)
|
| 58 |
+
2026-05-17 14:46:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:36] "GET /jobs/api/jobs/1/status HTTP/1.1" 200 -
|
| 59 |
+
2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/1/status -> 200 (0.00ms)
|
| 60 |
+
2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "GET /jobs/api/jobs/1/status HTTP/1.1" 200 -
|
| 61 |
+
2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /jobs/1 -> 200 (0.00ms)
|
| 62 |
+
2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "GET /jobs/1 HTTP/1.1" 200 -
|
| 63 |
+
2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 64 |
+
2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 65 |
+
2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 66 |
+
2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 67 |
+
2026-05-17 14:47:34 [DEBUG ] app.middleware.security: GET / -> 200 (62.00ms)
|
| 68 |
+
2026-05-17 14:47:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:34] "GET / HTTP/1.1" 200 -
|
| 69 |
+
2026-05-17 14:47:34 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 70 |
+
2026-05-17 14:47:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:34] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 71 |
+
2026-05-17 14:47:34 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 72 |
+
2026-05-17 14:47:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:34] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 73 |
+
2026-05-17 14:47:55 [DEBUG ] app.middleware.security: GET /jobs/1 -> 200 (16.00ms)
|
| 74 |
+
2026-05-17 14:47:55 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:55] "GET /jobs/1 HTTP/1.1" 200 -
|
| 75 |
+
2026-05-17 14:47:55 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 76 |
+
2026-05-17 14:47:55 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:55] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 77 |
+
2026-05-17 14:47:55 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 78 |
+
2026-05-17 14:47:55 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:55] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 79 |
+
2026-05-17 14:48:14 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (31.00ms)
|
| 80 |
+
2026-05-17 14:48:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:14] "GET /jobs/new HTTP/1.1" 200 -
|
| 81 |
+
2026-05-17 14:48:14 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 82 |
+
2026-05-17 14:48:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:14] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 83 |
+
2026-05-17 14:48:14 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 84 |
+
2026-05-17 14:48:15 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:15] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 85 |
+
2026-05-17 14:48:38 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (31.00ms)
|
| 86 |
+
2026-05-17 14:48:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:38] "[32mPOST /jobs/new HTTP/1.1[0m" 302 -
|
| 87 |
+
2026-05-17 14:48:38 [DEBUG ] app.middleware.security: GET /jobs/2 -> 200 (79.00ms)
|
| 88 |
+
2026-05-17 14:48:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:38] "GET /jobs/2 HTTP/1.1" 200 -
|
| 89 |
+
2026-05-17 14:48:39 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 90 |
+
2026-05-17 14:48:39 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:39] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 91 |
+
2026-05-17 14:48:39 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 92 |
+
2026-05-17 14:48:39 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:39] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 93 |
+
2026-05-17 14:48:41 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/2/status -> 200 (15.00ms)
|
| 94 |
+
2026-05-17 14:48:41 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:41] "GET /jobs/api/jobs/2/status HTTP/1.1" 200 -
|
| 95 |
+
2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/2/status -> 200 (16.00ms)
|
| 96 |
+
2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "GET /jobs/api/jobs/2/status HTTP/1.1" 200 -
|
| 97 |
+
2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /jobs/2 -> 200 (47.00ms)
|
| 98 |
+
2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "GET /jobs/2 HTTP/1.1" 200 -
|
| 99 |
+
2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 100 |
+
2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 101 |
+
2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 102 |
+
2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 103 |
+
2026-05-17 14:48:53 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
|
| 104 |
+
2026-05-17 14:48:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:53] "GET /jobs/new HTTP/1.1" 200 -
|
| 105 |
+
2026-05-17 14:48:53 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 106 |
+
2026-05-17 14:48:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:53] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 107 |
+
2026-05-17 14:48:53 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 108 |
+
2026-05-17 14:48:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:53] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 109 |
+
2026-05-17 14:49:14 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (32.00ms)
|
| 110 |
+
2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "[32mPOST /jobs/new HTTP/1.1[0m" 302 -
|
| 111 |
+
2026-05-17 14:49:14 [DEBUG ] app.middleware.security: GET /jobs/3 -> 200 (47.00ms)
|
| 112 |
+
2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "GET /jobs/3 HTTP/1.1" 200 -
|
| 113 |
+
2026-05-17 14:49:14 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 114 |
+
2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 115 |
+
2026-05-17 14:49:14 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 116 |
+
2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 117 |
+
2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/3/status -> 200 (0.00ms)
|
| 118 |
+
2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "GET /jobs/api/jobs/3/status HTTP/1.1" 200 -
|
| 119 |
+
2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /jobs/3 -> 200 (47.00ms)
|
| 120 |
+
2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "GET /jobs/3 HTTP/1.1" 200 -
|
| 121 |
+
2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (15.00ms)
|
| 122 |
+
2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 123 |
+
2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 124 |
+
2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 125 |
+
2026-05-17 14:50:47 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\app\\scrapers\\engine.py', reloading
|
| 126 |
+
2026-05-17 14:50:47 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\app\\scrapers\\engine.py', reloading
|
| 127 |
+
2026-05-17 14:50:47 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
|
| 128 |
+
2026-05-17 14:50:48 [INFO ] app: WebScraper Platform started [development]
|
| 129 |
+
2026-05-17 14:50:48 [WARNING ] werkzeug: * Debugger is active!
|
| 130 |
+
2026-05-17 14:50:48 [INFO ] werkzeug: * Debugger PIN: 590-942-950
|
| 131 |
+
2026-05-17 14:51:04 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\brotli.py', reloading
|
| 132 |
+
2026-05-17 14:51:04 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\brotli.py', reloading
|
| 133 |
+
2026-05-17 14:51:05 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
|
| 134 |
+
2026-05-17 14:51:06 [INFO ] app: WebScraper Platform started [development]
|
| 135 |
+
2026-05-17 14:51:06 [WARNING ] werkzeug: * Debugger is active!
|
| 136 |
+
2026-05-17 14:51:06 [INFO ] werkzeug: * Debugger PIN: 590-942-950
|
| 137 |
+
2026-05-17 14:52:00 [INFO ] app: WebScraper Platform started [development]
|
| 138 |
+
2026-05-17 14:52:00 [INFO ] werkzeug: [31m[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.[0m
|
| 139 |
+
* Running on http://127.0.0.1:5000
|
| 140 |
+
2026-05-17 14:52:00 [INFO ] werkzeug: [33mPress CTRL+C to quit[0m
|
| 141 |
+
2026-05-17 14:52:00 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
|
| 142 |
+
2026-05-17 14:52:01 [INFO ] app: WebScraper Platform started [development]
|
| 143 |
+
2026-05-17 14:52:01 [WARNING ] werkzeug: * Debugger is active!
|
| 144 |
+
2026-05-17 14:52:01 [INFO ] werkzeug: * Debugger PIN: 590-942-950
|
| 145 |
+
2026-05-17 14:52:12 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (31.00ms)
|
| 146 |
+
2026-05-17 14:52:12 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:52:12] "GET /jobs/new HTTP/1.1" 200 -
|
| 147 |
+
2026-05-17 14:52:12 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 148 |
+
2026-05-17 14:52:12 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:52:12] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 149 |
+
2026-05-17 14:52:12 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 150 |
+
2026-05-17 14:52:12 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:52:12] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 151 |
+
2026-05-17 14:53:26 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (171.00ms)
|
| 152 |
+
2026-05-17 14:53:26 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:26] "[32mPOST /jobs/new HTTP/1.1[0m" 302 -
|
| 153 |
+
2026-05-17 14:53:27 [DEBUG ] app.middleware.security: GET /jobs/4 -> 200 (250.00ms)
|
| 154 |
+
2026-05-17 14:53:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:27] "GET /jobs/4 HTTP/1.1" 200 -
|
| 155 |
+
2026-05-17 14:53:27 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (16.00ms)
|
| 156 |
+
2026-05-17 14:53:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:27] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 157 |
+
2026-05-17 14:53:27 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 158 |
+
2026-05-17 14:53:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:27] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 159 |
+
2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/4/status -> 200 (15.00ms)
|
| 160 |
+
2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "GET /jobs/api/jobs/4/status HTTP/1.1" 200 -
|
| 161 |
+
2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /jobs/4 -> 200 (15.00ms)
|
| 162 |
+
2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "GET /jobs/4 HTTP/1.1" 200 -
|
| 163 |
+
2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 164 |
+
2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 165 |
+
2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (16.00ms)
|
| 166 |
+
2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 167 |
+
2026-05-17 14:53:31 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/4/status -> 200 (0.00ms)
|
| 168 |
+
2026-05-17 14:53:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:31] "GET /jobs/api/jobs/4/status HTTP/1.1" 200 -
|
| 169 |
+
2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/4/status -> 200 (15.00ms)
|
| 170 |
+
2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "GET /jobs/api/jobs/4/status HTTP/1.1" 200 -
|
| 171 |
+
2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /jobs/4 -> 200 (16.00ms)
|
| 172 |
+
2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "GET /jobs/4 HTTP/1.1" 200 -
|
| 173 |
+
2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (15.00ms)
|
| 174 |
+
2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 175 |
+
2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 176 |
+
2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 177 |
+
2026-05-17 14:57:40 [INFO ] app: WebScraper Platform started [development]
|
| 178 |
+
2026-05-17 14:57:40 [INFO ] werkzeug: [31m[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.[0m
|
| 179 |
+
* Running on http://127.0.0.1:5000
|
| 180 |
+
2026-05-17 14:57:40 [INFO ] werkzeug: [33mPress CTRL+C to quit[0m
|
| 181 |
+
2026-05-17 14:57:40 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
|
| 182 |
+
2026-05-17 14:57:41 [INFO ] app: WebScraper Platform started [development]
|
| 183 |
+
2026-05-17 14:57:41 [WARNING ] werkzeug: * Debugger is active!
|
| 184 |
+
2026-05-17 14:57:41 [INFO ] werkzeug: * Debugger PIN: 590-942-950
|
| 185 |
+
2026-05-17 14:57:53 [DEBUG ] app.middleware.security: GET / -> 200 (63.00ms)
|
| 186 |
+
2026-05-17 14:57:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:53] "GET / HTTP/1.1" 200 -
|
| 187 |
+
2026-05-17 14:57:53 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 200 (0.00ms)
|
| 188 |
+
2026-05-17 14:57:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:53] "GET /static/css/style.css HTTP/1.1" 200 -
|
| 189 |
+
2026-05-17 14:57:53 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 200 (0.00ms)
|
| 190 |
+
2026-05-17 14:57:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:53] "GET /static/js/app.js HTTP/1.1" 200 -
|
| 191 |
+
2026-05-17 14:57:54 [DEBUG ] app.middleware.security: GET /favicon.ico -> 404 (15.00ms)
|
| 192 |
+
2026-05-17 14:57:54 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:54] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
|
| 193 |
+
2026-05-17 14:58:06 [DEBUG ] app.middleware.security: GET /jobs/ -> 200 (47.00ms)
|
| 194 |
+
2026-05-17 14:58:06 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:06] "GET /jobs/ HTTP/1.1" 200 -
|
| 195 |
+
2026-05-17 14:58:06 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 196 |
+
2026-05-17 14:58:06 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:06] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 197 |
+
2026-05-17 14:58:06 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 198 |
+
2026-05-17 14:58:06 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:06] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 199 |
+
2026-05-17 14:58:10 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (16.00ms)
|
| 200 |
+
2026-05-17 14:58:10 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:10] "GET /jobs/new HTTP/1.1" 200 -
|
| 201 |
+
2026-05-17 14:58:10 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 202 |
+
2026-05-17 14:58:10 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:10] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 203 |
+
2026-05-17 14:58:10 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 204 |
+
2026-05-17 14:58:10 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:10] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 205 |
+
2026-05-17 14:58:27 [DEBUG ] app.middleware.security: GET / -> 200 (15.00ms)
|
| 206 |
+
2026-05-17 14:58:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:27] "GET / HTTP/1.1" 200 -
|
| 207 |
+
2026-05-17 14:58:27 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 208 |
+
2026-05-17 14:58:27 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 209 |
+
2026-05-17 14:58:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:27] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 210 |
+
2026-05-17 14:58:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:27] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 211 |
+
2026-05-17 14:58:32 [DEBUG ] app.middleware.security: GET /jobs/ -> 200 (16.00ms)
|
| 212 |
+
2026-05-17 14:58:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:32] "GET /jobs/ HTTP/1.1" 200 -
|
| 213 |
+
2026-05-17 14:58:32 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 214 |
+
2026-05-17 14:58:32 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 215 |
+
2026-05-17 14:58:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:32] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 216 |
+
2026-05-17 14:58:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:32] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 217 |
+
2026-05-17 14:58:34 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
|
| 218 |
+
2026-05-17 14:58:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:34] "GET /jobs/new HTTP/1.1" 200 -
|
| 219 |
+
2026-05-17 14:58:34 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 220 |
+
2026-05-17 14:58:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:34] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 221 |
+
2026-05-17 14:58:34 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 222 |
+
2026-05-17 14:58:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:34] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 223 |
+
2026-05-17 14:58:36 [DEBUG ] app.middleware.security: GET / -> 200 (0.00ms)
|
| 224 |
+
2026-05-17 14:58:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:36] "GET / HTTP/1.1" 200 -
|
| 225 |
+
2026-05-17 14:58:36 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 226 |
+
2026-05-17 14:58:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:36] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 227 |
+
2026-05-17 14:58:36 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 228 |
+
2026-05-17 14:58:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:36] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 229 |
+
2026-05-17 14:58:38 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
|
| 230 |
+
2026-05-17 14:58:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:38] "GET /jobs/new HTTP/1.1" 200 -
|
| 231 |
+
2026-05-17 14:58:38 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 232 |
+
2026-05-17 14:58:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:38] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 233 |
+
2026-05-17 14:58:38 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 234 |
+
2026-05-17 14:58:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:38] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 235 |
+
2026-05-17 14:58:49 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
|
| 236 |
+
2026-05-17 14:58:49 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:49] "GET /jobs/new HTTP/1.1" 200 -
|
| 237 |
+
2026-05-17 14:58:49 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 238 |
+
2026-05-17 14:58:49 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 239 |
+
2026-05-17 14:58:49 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:49] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 240 |
+
2026-05-17 14:58:49 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:49] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 241 |
+
2026-05-17 14:59:09 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (31.00ms)
|
| 242 |
+
2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "[32mPOST /jobs/new HTTP/1.1[0m" 302 -
|
| 243 |
+
2026-05-17 14:59:09 [DEBUG ] app.middleware.security: GET /jobs/5 -> 200 (78.00ms)
|
| 244 |
+
2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "GET /jobs/5 HTTP/1.1" 200 -
|
| 245 |
+
2026-05-17 14:59:09 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 246 |
+
2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 247 |
+
2026-05-17 14:59:09 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 248 |
+
2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 249 |
+
2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/5/status -> 200 (16.00ms)
|
| 250 |
+
2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "GET /jobs/api/jobs/5/status HTTP/1.1" 200 -
|
| 251 |
+
2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /jobs/5 -> 200 (15.00ms)
|
| 252 |
+
2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "GET /jobs/5 HTTP/1.1" 200 -
|
| 253 |
+
2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (16.00ms)
|
| 254 |
+
2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 255 |
+
2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 256 |
+
2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
| 257 |
+
2026-05-17 14:59:42 [DEBUG ] app.middleware.security: GET / -> 200 (16.00ms)
|
| 258 |
+
2026-05-17 14:59:42 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:42] "GET / HTTP/1.1" 200 -
|
| 259 |
+
2026-05-17 14:59:42 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
|
| 260 |
+
2026-05-17 14:59:42 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:42] "[36mGET /static/css/style.css HTTP/1.1[0m" 304 -
|
| 261 |
+
2026-05-17 14:59:42 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
|
| 262 |
+
2026-05-17 14:59:42 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:42] "[36mGET /static/js/app.js HTTP/1.1[0m" 304 -
|
logs/errors.log
ADDED
|
File without changes
|