LovnishVerma committed on
Commit
50dca14
·
verified ·
1 Parent(s): 4518e69

Upload 51 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env +30 -0
  2. .gitattributes +1 -0
  3. app/__init__.py +80 -0
  4. app/__pycache__/__init__.cpython-310.pyc +0 -0
  5. app/config/__init__.py +3 -0
  6. app/config/__pycache__/__init__.cpython-310.pyc +0 -0
  7. app/config/__pycache__/settings.cpython-310.pyc +0 -0
  8. app/config/settings.py +107 -0
  9. app/middleware/__init__.py +3 -0
  10. app/middleware/__pycache__/__init__.cpython-310.pyc +0 -0
  11. app/middleware/__pycache__/security.cpython-310.pyc +0 -0
  12. app/middleware/security.py +67 -0
  13. app/models/__init__.py +22 -0
  14. app/models/__pycache__/__init__.cpython-310.pyc +0 -0
  15. app/models/__pycache__/models.cpython-310.pyc +0 -0
  16. app/models/models.py +371 -0
  17. app/routes/__init__.py +4 -0
  18. app/routes/__pycache__/__init__.cpython-310.pyc +0 -0
  19. app/routes/__pycache__/jobs.cpython-310.pyc +0 -0
  20. app/routes/__pycache__/main.cpython-310.pyc +0 -0
  21. app/routes/jobs.py +265 -0
  22. app/routes/main.py +34 -0
  23. app/scrapers/__init__.py +3 -0
  24. app/scrapers/__pycache__/__init__.cpython-310.pyc +0 -0
  25. app/scrapers/__pycache__/engine.cpython-310.pyc +0 -0
  26. app/scrapers/engine.py +572 -0
  27. app/services/__init__.py +28 -0
  28. app/services/__pycache__/__init__.cpython-310.pyc +0 -0
  29. app/services/__pycache__/export_service.cpython-310.pyc +0 -0
  30. app/services/__pycache__/job_service.cpython-310.pyc +0 -0
  31. app/services/export_service.py +165 -0
  32. app/services/job_service.py +281 -0
  33. app/static/css/style.css +379 -0
  34. app/static/js/app.js +51 -0
  35. app/templates/base.html +80 -0
  36. app/templates/pages/dashboard.html +113 -0
  37. app/templates/pages/error.html +30 -0
  38. app/templates/pages/job_detail.html +262 -0
  39. app/templates/pages/jobs.html +122 -0
  40. app/templates/pages/new_job.html +181 -0
  41. app/utils/__init__.py +4 -0
  42. app/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  43. app/utils/__pycache__/logging_config.cpython-310.pyc +0 -0
  44. app/utils/__pycache__/validators.cpython-310.pyc +0 -0
  45. app/utils/logging_config.py +60 -0
  46. app/utils/validators.py +113 -0
  47. database/scraper.db +3 -0
  48. env.example +30 -0
  49. logs/app.log +262 -0
  50. logs/errors.log +0 -0
.env ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Application
2
+ FLASK_ENV=development
3
+ SECRET_KEY=dev-secret-key-change-in-production-use-32-chars
4
+ DEBUG=True
5
+
6
+ # Database (leave unset to use default SQLite in project/database/)
7
+ # DATABASE_URL=sqlite:///database/scraper.db
8
+
9
+ # Security
10
+ WTF_CSRF_ENABLED=True
11
+ SESSION_COOKIE_SECURE=False
12
+ SESSION_COOKIE_HTTPONLY=True
13
+ SESSION_COOKIE_SAMESITE=Lax
14
+
15
+ # Rate Limiting
16
+ RATELIMIT_DEFAULT=100 per hour
17
+ RATELIMIT_STORAGE_URL=memory://
18
+
19
+ # Scraping
20
+ MAX_CONCURRENT_JOBS=5
21
+ REQUEST_TIMEOUT=30
22
+ MAX_RETRIES=3
23
+ DEFAULT_DELAY=1.0
24
+
25
+ # Exports
26
+ EXPORT_DIR=exports
27
+
28
+ # Logging
29
+ LOG_LEVEL=DEBUG
30
+ LOG_DIR=logs
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ database/scraper.db filter=lfs diff=lfs merge=lfs -text
app/__init__.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Application factory.
3
+ Create and configure the Flask app with all extensions, blueprints, and middleware.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+
9
+ from flask import Flask
10
+ from flask_wtf.csrf import CSRFProtect
11
+ from flask_limiter import Limiter
12
+ from flask_limiter.util import get_remote_address
13
+
14
+ from app.config import get_config
15
+ from app.models import db
16
+ from app.middleware import register_middleware
17
+ from app.utils.logging_config import configure_logging
18
+
19
+ csrf = CSRFProtect()
20
+ limiter = Limiter(key_func=get_remote_address)
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def create_app(config_class=None) -> Flask:
    """Application factory — creates a fully configured Flask app.

    Args:
        config_class: Optional configuration class. When omitted (or falsy),
            the class returned by ``get_config()`` is used. The class must
            expose an ``init_app(app)`` hook.

    Returns:
        The configured Flask application.
    """
    import os  # local import: only needed for the startup log line below

    app = Flask(
        __name__,
        template_folder="templates",
        static_folder="static",
    )

    # --- Load config ---
    cfg = config_class or get_config()
    app.config.from_object(cfg)
    cfg.init_app(app)

    # --- Logging ---
    configure_logging(
        log_level=app.config.get("LOG_LEVEL", "INFO"),
        log_dir=str(app.config.get("LOG_DIR", "logs")),
    )

    # --- Extensions ---
    db.init_app(app)
    csrf.init_app(app)
    limiter.init_app(app)

    # --- Middleware ---
    register_middleware(app)

    # --- Blueprints ---
    # Imported here rather than at module top, as in the original layout.
    from app.routes import main_bp, jobs_bp
    app.register_blueprint(main_bp)
    app.register_blueprint(jobs_bp)

    # Apply rate limiting to API routes
    limiter.limit("60 per minute")(jobs_bp)

    # --- Database init ---
    with app.app_context():
        db.create_all()
        _seed_defaults()

    # The bundled config classes do not define FLASK_ENV, so looking it up in
    # app.config alone always fell back to "development". Prefer an explicit
    # config value if a custom class provides one, else use the environment
    # variable (the same one get_config() consults).
    env_name = app.config.get("FLASK_ENV") or os.getenv("FLASK_ENV", "development")
    logger.info("WebScraper Platform started [%s]", env_name)
    return app
67
+
68
+
69
def _seed_defaults() -> None:
    """Insert the built-in AppSetting rows that are not in the database yet.

    Existing rows are left untouched; a single commit is issued at the end.
    Must run inside an application context.
    """
    from app.models import AppSetting

    seed_rows = (
        ("max_concurrent_jobs", "5", "Maximum parallel scrape jobs"),
        ("default_delay", "1.0", "Default seconds between requests"),
        ("default_timeout", "30", "Default HTTP timeout in seconds"),
    )
    for setting_key, setting_value, setting_desc in seed_rows:
        already_present = AppSetting.query.filter_by(key=setting_key).first()
        if already_present:
            continue
        new_row = AppSetting(
            key=setting_key,
            value=setting_value,
            description=setting_desc,
        )
        db.session.add(new_row)
    db.session.commit()
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.3 kB). View file
 
app/config/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .settings import get_config, BaseConfig, DevelopmentConfig, ProductionConfig, TestingConfig
2
+
3
+ __all__ = ["get_config", "BaseConfig", "DevelopmentConfig", "ProductionConfig", "TestingConfig"]
app/config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (312 Bytes). View file
 
app/config/__pycache__/settings.cpython-310.pyc ADDED
Binary file (3.51 kB). View file
 
app/config/settings.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Application configuration classes for different environments.
3
+ Uses environment variables with sensible defaults.
4
+ """
5
+ import os
6
+ from pathlib import Path
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+
11
+ # app/config/settings.py -> parent (config) -> parent (app) -> parent (project root)
12
+ BASE_DIR = Path(__file__).resolve().parent.parent.parent
13
+
14
+
15
def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean environment variable, tolerating case and common spellings.

    Returns *default* when the variable is unset; otherwise True only for
    "1", "true", "yes" or "on" (case-insensitive, surrounding space ignored).
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("1", "true", "yes", "on")


class BaseConfig:
    """Base configuration shared across all environments.

    Values are resolved once, at import time, from environment variables
    (after ``load_dotenv()`` has run) with the defaults shown below.
    """

    # Flask
    # NOTE(review): the fallback key is a placeholder; a real SECRET_KEY should
    # be supplied through the environment outside of local development.
    SECRET_KEY: str = os.getenv("SECRET_KEY", "change-me-in-production-32-chars!!")
    DEBUG: bool = False
    TESTING: bool = False

    # SQLAlchemy
    SQLALCHEMY_DATABASE_URI: str = os.getenv(
        "DATABASE_URL", f"sqlite:///{BASE_DIR / 'database' / 'scraper.db'}"
    )
    SQLALCHEMY_TRACK_MODIFICATIONS: bool = False
    SQLALCHEMY_ENGINE_OPTIONS: dict = {
        "pool_pre_ping": True,
        "pool_recycle": 300,
    }

    # CSRF
    # Parsed leniently: previously only the exact string "True" enabled the
    # flag, so "true" or "1" silently turned CSRF protection off.
    WTF_CSRF_ENABLED: bool = _env_flag("WTF_CSRF_ENABLED", True)
    WTF_CSRF_TIME_LIMIT: int = 3600

    # Session
    SESSION_COOKIE_SECURE: bool = _env_flag("SESSION_COOKIE_SECURE", False)
    SESSION_COOKIE_HTTPONLY: bool = True
    SESSION_COOKIE_SAMESITE: str = "Lax"
    PERMANENT_SESSION_LIFETIME: int = 86400

    # Rate limiting
    RATELIMIT_DEFAULT: str = os.getenv("RATELIMIT_DEFAULT", "200 per hour")
    RATELIMIT_STORAGE_URL: str = os.getenv("RATELIMIT_STORAGE_URL", "memory://")
    # Newer Flask-Limiter releases read RATELIMIT_STORAGE_URI; the *_URL key is
    # kept above for backward compatibility and mirrored here.
    RATELIMIT_STORAGE_URI: str = RATELIMIT_STORAGE_URL
    RATELIMIT_HEADERS_ENABLED: bool = True

    # Scraping engine
    MAX_CONCURRENT_JOBS: int = int(os.getenv("MAX_CONCURRENT_JOBS", "5"))
    REQUEST_TIMEOUT: int = int(os.getenv("REQUEST_TIMEOUT", "30"))
    MAX_RETRIES: int = int(os.getenv("MAX_RETRIES", "3"))
    DEFAULT_DELAY: float = float(os.getenv("DEFAULT_DELAY", "1.0"))
    MAX_PAGES: int = int(os.getenv("MAX_PAGES", "50"))

    # Directories (resolved relative to the project root)
    EXPORT_DIR: Path = BASE_DIR / os.getenv("EXPORT_DIR", "exports")
    LOG_DIR: Path = BASE_DIR / os.getenv("LOG_DIR", "logs")

    # Logging
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
    LOG_FORMAT: str = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"

    @classmethod
    def init_app(cls, app) -> None:
        """Hook for environment-specific initialization.

        Creates the export, log and database directories if they are missing.
        The *app* argument is accepted for subclasses; it is not used here.
        """
        # Ensure required directories exist
        cls.EXPORT_DIR.mkdir(parents=True, exist_ok=True)
        cls.LOG_DIR.mkdir(parents=True, exist_ok=True)
        (BASE_DIR / "database").mkdir(parents=True, exist_ok=True)
70
+
71
+
72
class DevelopmentConfig(BaseConfig):
    """Local development settings: debug on, verbose logs, plain-HTTP cookies."""

    # Cookies must work over http://localhost, so the Secure flag stays off.
    SESSION_COOKIE_SECURE: bool = False
    LOG_LEVEL: str = "DEBUG"
    DEBUG: bool = True
76
+
77
+
78
class ProductionConfig(BaseConfig):
    """Production settings: debug off, secure cookies, quieter logging."""

    DEBUG: bool = False
    LOG_LEVEL: str = "WARNING"
    SESSION_COOKIE_SECURE: bool = True

    # Same liveness/recycle options as the base class plus explicit pool sizing.
    # NOTE(review): pool_size/max_overflow are connection-pool options — confirm
    # the production DATABASE_URL uses a backend/pool that accepts them.
    SQLALCHEMY_ENGINE_OPTIONS: dict = dict(
        pool_pre_ping=True,
        pool_recycle=300,
        pool_size=10,
        max_overflow=20,
    )
88
+
89
+
90
class TestingConfig(BaseConfig):
    """Test-suite settings: in-memory SQLite, CSRF checks disabled."""

    # Each run starts with an empty in-memory database.
    SQLALCHEMY_DATABASE_URI: str = "sqlite:///:memory:"
    WTF_CSRF_ENABLED: bool = False
    DEBUG: bool = True
    TESTING: bool = True
95
+
96
+
97
+ config_map = {
98
+ "development": DevelopmentConfig,
99
+ "production": ProductionConfig,
100
+ "testing": TestingConfig,
101
+ "default": DevelopmentConfig,
102
+ }
103
+
104
+
105
def get_config() -> type:
    """Return the config class selected by the FLASK_ENV environment variable.

    The name is matched case-insensitively against ``config_map``; an unset or
    unrecognised value yields ``DevelopmentConfig``.
    """
    requested = os.getenv("FLASK_ENV", "development").lower()
    if requested in config_map:
        return config_map[requested]
    return DevelopmentConfig
app/middleware/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .security import register_middleware
2
+
3
+ __all__ = ["register_middleware"]
app/middleware/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (228 Bytes). View file
 
app/middleware/__pycache__/security.cpython-310.pyc ADDED
Binary file (2.96 kB). View file
 
app/middleware/security.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Flask middleware:
3
+ - Request timing
4
+ - Security headers (XSS, clickjacking, content-type sniffing)
5
+ - Global error handlers (404, 429, 500)
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import time
11
+
12
+ from flask import Flask, jsonify, request, render_template
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def register_middleware(app: Flask) -> None:
    """Attach request timing, security headers and error handlers to *app*.

    Registers one before-request hook, one after-request hook and error
    handlers for HTTP 400, 404, 429 and 500.
    """

    # Headers that are added to every response with a fixed value.
    static_headers = (
        ("X-Content-Type-Options", "nosniff"),
        ("X-Frame-Options", "SAMEORIGIN"),
        ("X-XSS-Protection", "1; mode=block"),
        ("Referrer-Policy", "strict-origin-when-cross-origin"),
        ("Permissions-Policy", "geolocation=(), microphone=()"),
    )

    def _error_response(status: int, json_body: dict, html_message: str):
        """Answer JSON requests with JSON, everything else with the error page."""
        if request.is_json:
            return jsonify(json_body), status
        page = render_template("pages/error.html", code=status, message=html_message)
        return page, status

    @app.before_request
    def start_timer() -> None:
        # Remember when the request started so the after-request hook can time it.
        request._start_time = time.monotonic()

    @app.after_request
    def add_security_headers(response):
        finished = time.monotonic()
        # Falls back to "now" when the before-request hook did not run.
        started = getattr(request, "_start_time", time.monotonic())
        elapsed_ms = (finished - started) * 1000

        response.headers["X-Response-Time"] = f"{elapsed_ms:.2f}ms"
        for header_name, header_value in static_headers:
            response.headers[header_name] = header_value
        # HSTS is only sent when the app is configured for secure cookies.
        if app.config.get("SESSION_COOKIE_SECURE"):
            response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"

        logger.debug(
            "%s %s -> %d (%.2fms)",
            request.method,
            request.path,
            response.status_code,
            elapsed_ms,
        )
        return response

    @app.errorhandler(400)
    def bad_request(exc):
        return _error_response(
            400, {"error": "Bad request", "detail": str(exc)}, "Bad Request"
        )

    @app.errorhandler(404)
    def not_found(exc):
        return _error_response(404, {"error": "Not found"}, "Page Not Found")

    @app.errorhandler(429)
    def rate_limited(exc):
        return _error_response(
            429,
            {"error": "Rate limit exceeded. Please slow down."},
            "Rate limit exceeded",
        )

    @app.errorhandler(500)
    def internal_error(exc):
        logger.exception("Internal server error: %s", exc)
        return _error_response(
            500, {"error": "Internal server error"}, "Internal Server Error"
        )
app/models/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .models import (
2
+ db,
3
+ BaseModel,
4
+ User,
5
+ ScrapeJob,
6
+ ScrapeResult,
7
+ JobLog,
8
+ ExportRecord,
9
+ Schedule,
10
+ AppSetting,
11
+ )
12
+
13
+ __all__ = [
14
+ "db",
15
+ "User",
16
+ "ScrapeJob",
17
+ "ScrapeResult",
18
+ "JobLog",
19
+ "ExportRecord",
20
+ "Schedule",
21
+ "AppSetting",
22
+ ]
app/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (402 Bytes). View file
 
app/models/__pycache__/models.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
app/models/models.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SQLAlchemy ORM models with relationships, indexes, and cascading deletes.
3
+ All timestamps are UTC. All text fields are sanitized at the service layer.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from datetime import datetime, timezone
9
+ from typing import Optional
10
+
11
+ from flask_sqlalchemy import SQLAlchemy
12
+ from sqlalchemy import (
13
+ Boolean,
14
+ Column,
15
+ DateTime,
16
+ Float,
17
+ ForeignKey,
18
+ Index,
19
+ Integer,
20
+ String,
21
+ Text,
22
+ Enum,
23
+ UniqueConstraint,
24
+ )
25
+ from sqlalchemy.orm import relationship
26
+
27
+ db = SQLAlchemy()
28
+
29
+
30
class BaseModel(db.Model):
    """Abstract parent for every model in this module.

    Sets ``__allow_unmapped__`` so the plain (non-``Mapped``) type annotations
    used on the columns below are accepted.
    """

    __allow_unmapped__ = True
    __abstract__ = True
34
+
35
+
36
def utcnow() -> datetime:
    """Return the current UTC time as a naive datetime (tzinfo removed)."""
    aware_now = datetime.now(tz=timezone.utc)
    return aware_now.replace(tzinfo=None)
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Enumerations (stored as VARCHAR for SQLite compatibility)
42
+ # ---------------------------------------------------------------------------
43
+
44
+ JOB_STATUS = ("pending", "running", "completed", "failed", "cancelled")
45
+ SCRAPE_TYPE = ("static", "dynamic")
46
+ EXTRACTION_TYPE = ("text", "images", "links", "attributes", "table", "json_ld", "full_html")
47
+ EXPORT_FORMAT = ("json", "csv", "excel")
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Models
52
+ # ---------------------------------------------------------------------------
53
+
54
+
55
class User(BaseModel):
    """Account record; owns scrape jobs and schedules."""

    __tablename__ = "users"

    id: int = Column(Integer, primary_key=True)
    username: str = Column(String(80), unique=True, nullable=False, index=True)
    email: str = Column(String(120), unique=True, nullable=False, index=True)
    password_hash: str = Column(String(256), nullable=False)
    is_active: bool = Column(Boolean, default=True, nullable=False)
    is_admin: bool = Column(Boolean, default=False, nullable=False)
    api_key: Optional[str] = Column(String(64), unique=True, index=True)
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)
    updated_at: datetime = Column(
        DateTime, default=utcnow, onupdate=utcnow, nullable=False
    )

    # Children are removed together with the user (cascade="all, delete-orphan").
    jobs: list[ScrapeJob] = relationship(
        "ScrapeJob",
        back_populates="user",
        cascade="all, delete-orphan",
        lazy="select",
    )
    schedules: list[Schedule] = relationship(
        "Schedule",
        back_populates="user",
        cascade="all, delete-orphan",
        lazy="select",
    )

    def __repr__(self) -> str:
        return f"<User {self.username}>"

    def to_dict(self) -> dict:
        """Serialize public fields; password hash and API key are left out."""
        created = self.created_at.isoformat() if self.created_at else None
        payload = {
            "id": self.id,
            "username": self.username,
            "email": self.email,
            "is_active": self.is_active,
            "is_admin": self.is_admin,
        }
        payload["created_at"] = created
        return payload
90
+
91
+
92
class ScrapeJob(BaseModel):
    """Represents a single scraping job with all configuration.

    Holds the target URL, the selectors and extraction options, run statistics
    and timestamps, plus relationships to results, logs and exports (all of
    which are deleted with the job).
    """

    __tablename__ = "scrape_jobs"
    __table_args__ = (
        Index("ix_scrape_jobs_status", "status"),
        Index("ix_scrape_jobs_created_at", "created_at"),
        Index("ix_scrape_jobs_user_id", "user_id"),
    )

    id: int = Column(Integer, primary_key=True)
    user_id: Optional[int] = Column(Integer, ForeignKey("users.id", ondelete="SET NULL"), nullable=True)
    name: str = Column(String(200), nullable=False)
    url: str = Column(Text, nullable=False)

    # Selectors
    html_tag: Optional[str] = Column(String(100))
    css_selector: Optional[str] = Column(Text)
    xpath_selector: Optional[str] = Column(Text)
    attribute_name: Optional[str] = Column(String(100))

    # Configuration
    extraction_type: str = Column(
        Enum(*EXTRACTION_TYPE, name="extraction_type"), default="text", nullable=False
    )
    scrape_type: str = Column(
        Enum(*SCRAPE_TYPE, name="scrape_type"), default="static", nullable=False
    )
    status: str = Column(
        Enum(*JOB_STATUS, name="job_status"), default="pending", nullable=False
    )

    # Advanced options
    follow_pagination: bool = Column(Boolean, default=False)
    max_pages: int = Column(Integer, default=1)
    infinite_scroll: bool = Column(Boolean, default=False)
    scroll_count: int = Column(Integer, default=3)
    download_images: bool = Column(Boolean, default=False)
    custom_headers: Optional[str] = Column(Text)  # JSON string
    user_agent: Optional[str] = Column(Text)
    delay_seconds: float = Column(Float, default=1.0)
    timeout_seconds: int = Column(Integer, default=30)
    max_retries: int = Column(Integer, default=3)
    check_robots_txt: bool = Column(Boolean, default=True)
    deduplicate: bool = Column(Boolean, default=True)

    # Stats
    total_items: int = Column(Integer, default=0)
    pages_scraped: int = Column(Integer, default=0)
    error_count: int = Column(Integer, default=0)
    duration_seconds: Optional[float] = Column(Float)

    # Timestamps
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)
    updated_at: datetime = Column(DateTime, default=utcnow, onupdate=utcnow)
    started_at: Optional[datetime] = Column(DateTime)
    completed_at: Optional[datetime] = Column(DateTime)

    # Relationships
    user: Optional[User] = relationship("User", back_populates="jobs")
    results: list[ScrapeResult] = relationship(
        "ScrapeResult", back_populates="job", cascade="all, delete-orphan", lazy="select"
    )
    logs: list[JobLog] = relationship(
        "JobLog", back_populates="job", cascade="all, delete-orphan", lazy="select"
    )
    exports: list[ExportRecord] = relationship(
        "ExportRecord", back_populates="job", cascade="all, delete-orphan", lazy="select"
    )

    def __repr__(self) -> str:
        # url may still be None on an instance that has not been populated yet;
        # repr() must not raise in that case.
        return f"<ScrapeJob {self.id} [{self.status}] {(self.url or '')[:50]}>"

    @property
    def custom_headers_dict(self) -> dict:
        """Return ``custom_headers`` decoded from JSON, always as a dict.

        An empty/missing value, undecodable JSON, or JSON that is valid but not
        an object (e.g. ``null`` or a list) all yield ``{}``.
        """
        if not self.custom_headers:
            return {}
        try:
            parsed = json.loads(self.custom_headers)
        except (json.JSONDecodeError, TypeError):
            return {}
        # json.loads can legitimately return None, a list, a str, ... — only a
        # mapping satisfies the declared return type.
        return parsed if isinstance(parsed, dict) else {}

    def to_dict(self) -> dict:
        """Serialize the job's main fields; timestamps become ISO-8601 strings or None."""
        return {
            "id": self.id,
            "name": self.name,
            "url": self.url,
            "html_tag": self.html_tag,
            "css_selector": self.css_selector,
            "xpath_selector": self.xpath_selector,
            "extraction_type": self.extraction_type,
            "scrape_type": self.scrape_type,
            "status": self.status,
            "follow_pagination": self.follow_pagination,
            "max_pages": self.max_pages,
            "infinite_scroll": self.infinite_scroll,
            "total_items": self.total_items,
            "pages_scraped": self.pages_scraped,
            "error_count": self.error_count,
            "duration_seconds": self.duration_seconds,
            "created_at": self.created_at.isoformat() if self.created_at else None,
            "started_at": self.started_at.isoformat() if self.started_at else None,
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
        }
196
+
197
+
198
class ScrapeResult(BaseModel):
    """Individual scraped data item linked to a job."""

    __tablename__ = "scrape_results"
    __table_args__ = (
        Index("ix_scrape_results_job_id", "job_id"),
        Index("ix_scrape_results_page_num", "page_num"),
        Index("ix_scrape_results_content_hash", "content_hash"),
    )

    id: int = Column(Integer, primary_key=True)
    job_id: int = Column(Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False)
    page_url: str = Column(Text, nullable=False)
    page_num: int = Column(Integer, default=1, nullable=False)
    item_index: int = Column(Integer, default=0)
    content: Optional[str] = Column(Text)
    content_type: str = Column(String(50), default="text")
    content_hash: Optional[str] = Column(String(64))
    # The attribute carries a trailing underscore; the database column is
    # named "metadata".
    metadata_: Optional[str] = Column("metadata", Text)  # JSON
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)

    # Relationship
    job: ScrapeJob = relationship("ScrapeJob", back_populates="results")

    def __repr__(self) -> str:
        return f"<ScrapeResult job={self.job_id} page={self.page_num} idx={self.item_index}>"

    @property
    def metadata_dict(self) -> dict:
        """Return ``metadata_`` decoded from JSON, always as a dict.

        An empty/missing value, undecodable JSON, or JSON that is valid but not
        an object all yield ``{}``.
        """
        if not self.metadata_:
            return {}
        try:
            parsed = json.loads(self.metadata_)
        except (json.JSONDecodeError, TypeError):
            return {}
        # Valid JSON is not necessarily an object; keep the dict contract.
        return parsed if isinstance(parsed, dict) else {}

    def to_dict(self) -> dict:
        """Serialize the result; ``metadata`` is the decoded dict, not the raw JSON."""
        return {
            "id": self.id,
            "job_id": self.job_id,
            "page_url": self.page_url,
            "page_num": self.page_num,
            "item_index": self.item_index,
            "content": self.content,
            "content_type": self.content_type,
            "metadata": self.metadata_dict,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
246
+
247
+
248
class JobLog(BaseModel):
    """One log line recorded for a scrape job."""

    __tablename__ = "job_logs"
    __table_args__ = (
        Index("ix_job_logs_job_id", "job_id"),
        Index("ix_job_logs_level", "level"),
        Index("ix_job_logs_created_at", "created_at"),
    )

    id: int = Column(Integer, primary_key=True)
    job_id: int = Column(
        Integer,
        ForeignKey("scrape_jobs.id", ondelete="CASCADE"),
        nullable=False,
    )
    level: str = Column(String(10), default="INFO", nullable=False)
    message: str = Column(Text, nullable=False)
    details: Optional[str] = Column(Text)  # JSON for structured extras
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)

    job: ScrapeJob = relationship("ScrapeJob", back_populates="logs")

    def to_dict(self) -> dict:
        """Serialize the entry; ``details`` is returned as the stored string."""
        stamp = self.created_at.isoformat() if self.created_at else None
        payload = {
            "id": self.id,
            "job_id": self.job_id,
            "level": self.level,
            "message": self.message,
            "details": self.details,
        }
        payload["created_at"] = stamp
        return payload
276
+
277
+
278
class ExportRecord(BaseModel):
    """Bookkeeping row for an export file produced from a job's results."""

    __tablename__ = "export_records"
    __table_args__ = (Index("ix_export_records_job_id", "job_id"),)

    id: int = Column(Integer, primary_key=True)
    job_id: int = Column(
        Integer,
        ForeignKey("scrape_jobs.id", ondelete="CASCADE"),
        nullable=False,
    )
    format: str = Column(Enum(*EXPORT_FORMAT, name="export_format"), nullable=False)
    filename: str = Column(String(255), nullable=False)
    filepath: str = Column(Text, nullable=False)
    file_size_bytes: Optional[int] = Column(Integer)
    row_count: int = Column(Integer, default=0)
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)

    job: ScrapeJob = relationship("ScrapeJob", back_populates="exports")

    def to_dict(self) -> dict:
        """Serialize the record; the on-disk ``filepath`` is not included."""
        stamp = self.created_at.isoformat() if self.created_at else None
        payload = {
            "id": self.id,
            "job_id": self.job_id,
            "format": self.format,
            "filename": self.filename,
            "file_size_bytes": self.file_size_bytes,
            "row_count": self.row_count,
        }
        payload["created_at"] = stamp
        return payload
305
+
306
+
307
class Schedule(BaseModel):
    """APScheduler-backed recurring scrape schedules."""

    __tablename__ = "schedules"
    __table_args__ = (Index("ix_schedules_user_id", "user_id"),)

    id: int = Column(Integer, primary_key=True)
    user_id: Optional[int] = Column(Integer, ForeignKey("users.id", ondelete="SET NULL"), nullable=True)
    name: str = Column(String(200), nullable=False)
    cron_expression: str = Column(String(100), nullable=False)  # e.g. "0 9 * * 1"
    job_config: str = Column(Text, nullable=False)  # JSON of ScrapeJob config
    is_active: bool = Column(Boolean, default=True, nullable=False)
    last_run_at: Optional[datetime] = Column(DateTime)
    next_run_at: Optional[datetime] = Column(DateTime)
    run_count: int = Column(Integer, default=0)
    created_at: datetime = Column(DateTime, default=utcnow, nullable=False)
    updated_at: datetime = Column(DateTime, default=utcnow, onupdate=utcnow)

    user: Optional[User] = relationship("User", back_populates="schedules")

    @property
    def job_config_dict(self) -> dict:
        """Return ``job_config`` decoded from JSON, always as a dict.

        Undecodable input, a ``None`` value, or JSON that is valid but not an
        object all yield ``{}``.
        """
        try:
            parsed = json.loads(self.job_config)
        except (json.JSONDecodeError, TypeError):
            return {}
        # Valid JSON is not necessarily an object; keep the dict contract.
        return parsed if isinstance(parsed, dict) else {}

    def to_dict(self) -> dict:
        """Serialize the schedule; timestamps become ISO-8601 strings or None."""
        return {
            "id": self.id,
            "name": self.name,
            "cron_expression": self.cron_expression,
            "is_active": self.is_active,
            "last_run_at": self.last_run_at.isoformat() if self.last_run_at else None,
            "next_run_at": self.next_run_at.isoformat() if self.next_run_at else None,
            "run_count": self.run_count,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
345
+
346
+
347
class AppSetting(BaseModel):
    """Key-value application settings stored in the DB."""

    __tablename__ = "app_settings"

    id: int = Column(Integer, primary_key=True)
    key: str = Column(String(100), unique=True, nullable=False, index=True)
    value: str = Column(Text, nullable=False)
    description: Optional[str] = Column(Text)
    updated_at: datetime = Column(DateTime, default=utcnow, onupdate=utcnow)

    @classmethod
    def get(cls, key: str, default: str = "") -> str:
        """Return the stored value for *key*, or *default* when no row exists."""
        row = cls.query.filter_by(key=key).first()
        return row.value if row else default

    @classmethod
    def set(cls, key: str, value: str, description: str = "") -> None:
        """Create or update the setting *key* and commit the session.

        Args:
            key: Setting name.
            value: New value to store.
            description: Optional human-readable description. When non-empty it
                is stored on new rows and also replaces the description of an
                existing row; an empty string leaves an existing description
                unchanged.
        """
        row = cls.query.filter_by(key=key).first()
        if row:
            row.value = value
            # Previously a description passed for an existing key was dropped.
            if description:
                row.description = description
        else:
            row = cls(key=key, value=value, description=description)
            db.session.add(row)
        db.session.commit()
app/routes/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .main import main_bp
2
+ from .jobs import jobs_bp
3
+
4
+ __all__ = ["main_bp", "jobs_bp"]
app/routes/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (251 Bytes). View file
 
app/routes/__pycache__/jobs.cpython-310.pyc ADDED
Binary file (7.28 kB). View file
 
app/routes/__pycache__/main.cpython-310.pyc ADDED
Binary file (1.11 kB). View file
 
app/routes/jobs.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scrape job routes (UI + REST API).
3
+ Blueprint: 'jobs' — prefix: /jobs
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ from flask import (
11
+ Blueprint,
12
+ abort,
13
+ current_app,
14
+ flash,
15
+ jsonify,
16
+ redirect,
17
+ render_template,
18
+ request,
19
+ send_file,
20
+ url_for,
21
+ )
22
+
23
+ from app.services import (
24
+ create_job,
25
+ cancel_job,
26
+ delete_job,
27
+ execute_job_async,
28
+ get_job,
29
+ get_job_logs,
30
+ get_job_results,
31
+ list_jobs,
32
+ export_json,
33
+ export_csv,
34
+ export_excel,
35
+ get_job_exports,
36
+ )
37
+ from app.utils.validators import validate_job_data, sanitize_string
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ jobs_bp = Blueprint("jobs", __name__, url_prefix="/jobs")
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # UI Routes
46
+ # ---------------------------------------------------------------------------
47
+
48
+
49
@jobs_bp.route("/")
def list_view():
    """Render the job list page.

    Query parameters: ``page`` (int, default 1), ``status`` and ``search``
    (both optional); they are passed through to ``list_jobs``.
    """
    query_args = request.args
    current_page = query_args.get("page", 1, type=int)
    status_value = query_args.get("status")
    search_term = query_args.get("search")

    job_page = list_jobs(
        page=current_page,
        per_page=20,
        status=status_value,
        search=search_term,
    )
    template_context = {
        "pagination": job_page,
        "status_filter": status_value,
        "search": search_term,
    }
    return render_template("pages/jobs.html", **template_context)
61
+
62
+
63
@jobs_bp.route("/new", methods=["GET", "POST"])
def new_job():
    """Show the new-job form (GET) or validate, create and start a job (POST).

    On validation failure the form is re-rendered with the errors and the
    submitted data; on success the user is redirected to the job's detail page.
    """
    if request.method != "POST":
        return render_template("pages/new_job.html", errors={}, form_data={})

    submitted = request.form
    data = dict(submitted.items())

    # Sanitize string fields
    text_fields = (
        "name",
        "html_tag",
        "css_selector",
        "xpath_selector",
        "attribute_name",
        "user_agent",
    )
    for text_field in text_fields:
        if text_field in data:
            data[text_field] = sanitize_string(data[text_field])

    # Checkboxes: a box counts as ticked when its name is present in the form.
    checkbox_fields = (
        "follow_pagination",
        "infinite_scroll",
        "download_images",
        "check_robots_txt",
        "deduplicate",
    )
    for checkbox in checkbox_fields:
        data[checkbox] = checkbox in submitted

    valid, errors = validate_job_data(data)
    if not valid:
        return render_template("pages/new_job.html", errors=errors, form_data=data)

    job = create_job(data)
    # Hand the real app object (not the proxy) to the async runner.
    flask_app = current_app._get_current_object()
    execute_job_async(job.id, flask_app)
    flash(f"Job #{job.id} started successfully!", "success")
    return redirect(url_for("jobs.detail", job_id=job.id))
89
+
90
+
91
@jobs_bp.route("/<int:job_id>")
def detail(job_id: int):
    """Render the detail page for one job; aborts with 404 when get_job returns nothing."""
    job = get_job(job_id)
    if not job:
        abort(404)

    # "rpage" selects the results page; logs are always the first page of 200.
    results_page_number = request.args.get("rpage", 1, type=int)
    context = {
        "job": job,
        "results": get_job_results(job_id, page=results_page_number, per_page=50),
        "logs": get_job_logs(job_id, page=1, per_page=200),
        "exports": get_job_exports(job_id),
    }
    return render_template("pages/job_detail.html", **context)
106
+
107
+
108
@jobs_bp.route("/<int:job_id>/delete", methods=["POST"])
def delete(job_id: int):
    """Delete a job and return to the list; aborts with 404 when delete_job reports failure."""
    was_deleted = delete_job(job_id)
    if not was_deleted:
        abort(404)

    flash(f"Job #{job_id} deleted.", "info")
    list_url = url_for("jobs.list_view")
    return redirect(list_url)
114
+
115
+
116
@jobs_bp.route("/<int:job_id>/cancel", methods=["POST"])
def cancel(job_id: int):
    """Ask cancel_job to cancel the job, flash the outcome and return to the detail page."""
    if cancel_job(job_id):
        message, category = f"Job #{job_id} cancelled.", "info"
    else:
        message, category = "Cannot cancel this job.", "warning"

    flash(message, category)
    return redirect(url_for("jobs.detail", job_id=job_id))
123
+
124
+
125
@jobs_bp.route("/<int:job_id>/rerun", methods=["POST"])
def rerun(job_id: int):
    """Reset a job to "pending", commit, and start it again; 404 when the job is missing."""
    job = get_job(job_id)
    if not job:
        abort(404)

    job.status = "pending"
    # Imported locally, as in the original route.
    from app.models import db

    db.session.commit()

    app = current_app._get_current_object()
    execute_job_async(job_id, app)

    flash(f"Job #{job_id} queued for re-run.", "success")
    detail_url = url_for("jobs.detail", job_id=job_id)
    return redirect(detail_url)
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # Export Routes
140
+ # ---------------------------------------------------------------------------
141
+
142
+
143
@jobs_bp.route("/<int:job_id>/export/<string:fmt>")
def export(job_id: int, fmt: str):
    """Generate an export file for a job and send it as a download.

    Args:
        job_id: Identifier of the job whose results are exported.
        fmt: One of "json", "csv" or "excel".

    Returns:
        The file as an attachment, or a redirect to the job detail page
        (with a flash message) when the exporter produced no file.

    Aborts with 404 when the job does not exist and with 400 for an
    unsupported format.
    """
    job = get_job(job_id)
    if not job:
        abort(404)

    exporters = {"json": export_json, "csv": export_csv, "excel": export_excel}
    if fmt not in exporters:
        abort(400)

    filepath = exporters[fmt](job_id)
    if not filepath or not Path(filepath).exists():
        flash("Export failed. No results to export.", "danger")
        return redirect(url_for("jobs.detail", job_id=job_id))

    mime_map = {
        "json": "application/json",
        "csv": "text/csv",
        "excel": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    }
    # Flask's send_file resolves *relative* paths against the application
    # root, while the existence check above is relative to the working
    # directory. Sending the absolute path makes both refer to the same file.
    absolute_path = Path(filepath).resolve()
    return send_file(absolute_path, mimetype=mime_map[fmt], as_attachment=True)
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # REST API Routes
168
+ # ---------------------------------------------------------------------------
169
+
170
+
171
@jobs_bp.route("/api/jobs", methods=["GET"])
def api_list():
    """Return one page of jobs as JSON.

    Query parameters:
        page: 1-based page number (default 1; values below 1 are treated as 1).
        per_page: page size (default 20; clamped to the range 1..100).
        status, search: optional filters passed through to list_jobs.
    """
    # Clamp both bounds: the original only capped the upper limit, so zero or
    # negative values were forwarded to the pagination layer unchanged.
    page = max(1, request.args.get("page", 1, type=int))
    per_page = max(1, min(request.args.get("per_page", 20, type=int), 100))
    status = request.args.get("status")
    search = request.args.get("search")

    pagination = list_jobs(page=page, per_page=per_page, status=status, search=search)
    return jsonify(
        {
            "jobs": [j.to_dict() for j in pagination.items],
            "total": pagination.total,
            "page": pagination.page,
            "pages": pagination.pages,
            "per_page": pagination.per_page,
        }
    )
187
+
188
+
189
@jobs_bp.route("/api/jobs", methods=["POST"])
def api_create():
    """Create a job from a JSON object body and start it.

    Returns:
        201 with the created job on success,
        400 when the body is missing, not valid JSON, empty, or not a JSON object,
        422 with validation details when validate_job_data rejects the payload.
    """
    data = request.get_json(silent=True)
    # A JSON array/string/number is truthy but is not a job description;
    # reject it here instead of letting key lookups fail further down.
    if not isinstance(data, dict) or not data:
        return jsonify({"error": "JSON body required"}), 400

    valid, errors = validate_job_data(data)
    if not valid:
        return jsonify({"error": "Validation failed", "details": errors}), 422

    job = create_job(data)
    execute_job_async(job.id, current_app._get_current_object())
    return jsonify({"job": job.to_dict(), "message": "Job created and queued"}), 201
202
+
203
+
204
@jobs_bp.route("/api/jobs/<int:job_id>", methods=["GET"])
def api_get(job_id: int):
    """Return a single job as JSON, or a 404 JSON error when it does not exist."""
    job = get_job(job_id)
    if job:
        payload = {"job": job.to_dict()}
        return jsonify(payload)
    return jsonify({"error": "Job not found"}), 404
210
+
211
+
212
@jobs_bp.route("/api/jobs/<int:job_id>", methods=["DELETE"])
def api_delete(job_id: int):
    """Delete a job; responds 200 on success and 404 when delete_job reports failure."""
    if delete_job(job_id):
        confirmation = {"message": f"Job {job_id} deleted"}
        return jsonify(confirmation), 200
    return jsonify({"error": "Job not found"}), 404
217
+
218
+
219
@jobs_bp.route("/api/jobs/<int:job_id>/results", methods=["GET"])
def api_results(job_id: int):
    """Return one page of a job's results as JSON.

    Query parameters:
        page: 1-based page number (default 1; values below 1 are treated as 1).
        per_page: page size (default 50; clamped to the range 1..500).

    Responds with a 404 JSON error when the job does not exist.
    """
    job = get_job(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    # Clamp both bounds so zero/negative values never reach the pagination layer.
    page = max(1, request.args.get("page", 1, type=int))
    per_page = max(1, min(request.args.get("per_page", 50, type=int), 500))

    pagination = get_job_results(job_id, page=page, per_page=per_page)
    return jsonify(
        {
            "results": [r.to_dict() for r in pagination.items],
            "total": pagination.total,
            "page": pagination.page,
            "pages": pagination.pages,
        }
    )
235
+
236
+
237
@jobs_bp.route("/api/jobs/<int:job_id>/logs", methods=["GET"])
def api_logs(job_id: int):
    """Return one page (100 entries) of a job's log records as JSON; 404 if the job is missing."""
    if not get_job(job_id):
        return jsonify({"error": "Job not found"}), 404

    requested_page = request.args.get("page", 1, type=int)
    pagination = get_job_logs(job_id, page=requested_page, per_page=100)

    serialized = [entry.to_dict() for entry in pagination.items]
    return jsonify({"logs": serialized, "total": pagination.total})
250
+
251
+
252
@jobs_bp.route("/api/jobs/<int:job_id>/status", methods=["GET"])
def api_status(job_id: int):
    """Lightweight polling endpoint: report a job's status and progress counters."""
    job = get_job(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    # Only the fields needed for live progress are returned.
    snapshot = {
        "status": job.status,
        "total_items": job.total_items,
        "pages_scraped": job.pages_scraped,
        "error_count": job.error_count,
    }
    return jsonify(snapshot)
app/routes/main.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dashboard and main UI routes.
3
+ Blueprint: 'main' — prefix: /
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+
9
+ from flask import Blueprint, render_template, jsonify
10
+
11
+ from app.services import get_dashboard_stats
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ main_bp = Blueprint("main", __name__)
16
+
17
+
18
@main_bp.route("/")
def index():
    """Render the dashboard page with the statistics from get_dashboard_stats."""
    context = {"stats": get_dashboard_stats()}
    return render_template("pages/dashboard.html", **context)
22
+
23
+
24
@main_bp.route("/health")
def health():
    """Health check endpoint for load balancers and monitoring; always answers 200."""
    payload = {"status": "ok", "service": "WebScraper Platform"}
    return jsonify(payload), 200
28
+
29
+
30
@main_bp.route("/metrics")
def metrics():
    """Expose the dashboard statistics as a JSON document."""
    return jsonify(get_dashboard_stats()), 200
app/scrapers/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .engine import run_scrape, ScrapeRequest, ScrapeResponse, ScrapedItem
2
+
3
+ __all__ = ["run_scrape", "ScrapeRequest", "ScrapeResponse", "ScrapedItem"]
app/scrapers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (283 Bytes). View file
 
app/scrapers/__pycache__/engine.cpython-310.pyc ADDED
Binary file (14 kB). View file
 
app/scrapers/engine.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core scraping engine.
3
+ Handles static (Requests + BS4) and dynamic (Playwright) scraping.
4
+ Implements UA rotation, retry logic, robots.txt checking, pagination, and infinite scroll.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ import json
10
+ import logging
11
+ import re
12
+ import time
13
+ import urllib.robotparser
14
+ from dataclasses import dataclass, field
15
+ from typing import Any, Generator, Optional
16
+ from urllib.parse import urljoin, urlparse
17
+
18
+ import requests
19
+ from bs4 import BeautifulSoup
20
+ from fake_useragent import UserAgent
21
+ from tenacity import (
22
+ retry,
23
+ retry_if_exception_type,
24
+ stop_after_attempt,
25
+ wait_exponential,
26
+ )
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Data structures
32
+ # ---------------------------------------------------------------------------
33
+
34
+
35
@dataclass
class ScrapeRequest:
    """Configuration of a single scrape run, consumed by the scraping engine."""

    # --- Target and element selection ---
    url: str
    html_tag: Optional[str] = None
    css_selector: Optional[str] = None
    xpath_selector: Optional[str] = None
    attribute_name: Optional[str] = None

    # --- What to extract and how to fetch it ---
    # extraction_type: text | images | links | attributes | table | json_ld | full_html
    extraction_type: str = "text"
    # scrape_type: static | dynamic
    scrape_type: str = "static"

    # --- Paging and scrolling ---
    follow_pagination: bool = False
    max_pages: int = 1
    infinite_scroll: bool = False
    scroll_count: int = 3
    download_images: bool = False

    # --- HTTP behaviour ---
    custom_headers: dict = field(default_factory=dict)
    user_agent: Optional[str] = None
    delay_seconds: float = 1.0
    timeout_seconds: int = 30
    max_retries: int = 3

    # --- Policy switches ---
    check_robots_txt: bool = True
    deduplicate: bool = True
58
+
59
+
60
@dataclass
class ScrapedItem:
    """One extracted value together with where it was found."""

    # Extracted payload and its kind.
    content: str
    content_type: str

    # Location of the item: source page URL, page number, index on that page.
    page_url: str
    page_num: int
    item_index: int

    # Short digest of the content, used for deduplication.
    content_hash: str

    # Free-form extra data; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)
71
+
72
+
73
@dataclass
class ScrapeResponse:
    """Aggregate outcome of one engine run."""

    # Collected items.
    items: list[ScrapedItem]
    # Number of pages that were fetched and parsed.
    pages_scraped: int
    # Error bookkeeping: a count plus the corresponding messages.
    error_count: int
    errors: list[str]
    # Wall-clock duration of the run in seconds.
    duration_seconds: float
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # User-Agent rotation
86
+ # ---------------------------------------------------------------------------
87
+
88
+ _ua_instance: Optional[UserAgent] = None
89
+
90
+
91
+ def _get_user_agent(preferred: Optional[str] = None) -> str:
92
+ global _ua_instance
93
+ if preferred:
94
+ return preferred
95
+ try:
96
+ if _ua_instance is None:
97
+ _ua_instance = UserAgent(fallback="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
98
+ return _ua_instance.random
99
+ except Exception:
100
+ return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # robots.txt checker
105
+ # ---------------------------------------------------------------------------
106
+
107
+
108
def _is_allowed_by_robots(url: str, user_agent: str = "*") -> bool:
    """Report whether the site's robots.txt permits fetching ``url``.

    The robots.txt location is derived from the scheme and host of ``url``.
    If anything goes wrong while fetching or parsing it, a warning is logged
    and the URL is treated as allowed.
    """
    try:
        parts = urlparse(url)
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
        parser.read()
        allowed = parser.can_fetch(user_agent, url)
    except Exception as exc:
        logger.warning("robots.txt check failed for %s: %s", url, exc)
        # Fail open: a broken or unreachable robots.txt does not block the scrape.
        return True
    return allowed
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Content hashing for deduplication
124
+ # ---------------------------------------------------------------------------
125
+
126
+
127
+ def _content_hash(content: str) -> str:
128
+ return hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()[:16]
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # HTML Parsers
133
+ # ---------------------------------------------------------------------------
134
+
135
+
136
def _parse_html(html: str) -> BeautifulSoup:
    """Parse ``html`` with the lxml backend, retrying with html.parser if that raises."""
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Any failure with lxml triggers a second attempt with the built-in parser.
        soup = BeautifulSoup(html, "html.parser")
    return soup
141
+
142
+
143
+ def _extract_with_css(soup: BeautifulSoup, selector: str) -> list[Any]:
144
+ return soup.select(selector)
145
+
146
+
147
def _extract_with_xpath(html: str, xpath: str) -> list[str]:
    """Evaluate an XPath expression against ``html`` and return the matched text.

    Args:
        html: Raw HTML source of the page.
        xpath: XPath expression to evaluate.

    Returns:
        Non-empty, stripped strings, one per XPath result. Element results
        contribute their full text (including text of nested children).
        An empty list is returned, and a warning logged, on any failure
        (lxml missing, unparsable input, invalid expression).
    """
    try:
        from lxml import etree

        tree = etree.fromstring(html.encode(), parser=etree.HTMLParser())
        results = tree.xpath(xpath)

        # XPath functions such as string(...) or count(...) return a scalar,
        # not a list; wrap it so a string is not iterated character by character.
        if not isinstance(results, list):
            results = [results]

        texts = []
        for r in results:
            if isinstance(r, str):
                texts.append(r.strip())
            elif hasattr(r, "text_content"):
                # Only lxml.html elements provide text_content().
                texts.append(r.text_content().strip())
            elif hasattr(r, "itertext"):
                # Plain etree elements (what etree.HTMLParser produces): gather
                # text from the whole subtree. ``r.text`` alone would stop at
                # the first child element and silently drop nested text.
                joined = "".join(r.itertext()) or (r.text or "")
                texts.append(joined.strip())
            elif hasattr(r, "text"):
                texts.append((r.text or "").strip())
            else:
                # Scalar number/boolean results.
                texts.append(str(r).strip())
        return [t for t in texts if t]
    except Exception as exc:
        logger.warning("XPath extraction failed: %s", exc)
        return []
166
+
167
+
168
+ def _extract_json_ld(soup: BeautifulSoup) -> list[dict]:
169
+ results = []
170
+ for tag in soup.find_all("script", type="application/ld+json"):
171
+ try:
172
+ data = json.loads(tag.string or "")
173
+ results.append(data)
174
+ except (json.JSONDecodeError, TypeError):
175
+ pass
176
+ return results
177
+
178
+
179
+ def _extract_tables(soup: BeautifulSoup) -> list[list[list[str]]]:
180
+ tables = []
181
+ for table in soup.find_all("table"):
182
+ rows = []
183
+ for tr in table.find_all("tr"):
184
+ cells = [td.get_text(strip=True) for td in tr.find_all(["th", "td"])]
185
+ if cells:
186
+ rows.append(cells)
187
+ if rows:
188
+ tables.append(rows)
189
+ return tables
190
+
191
+
192
+ # ---------------------------------------------------------------------------
193
+ # Content extractors
194
+ # ---------------------------------------------------------------------------
195
+
196
+
197
def _element_content(el, extraction_type: str, attribute_name: Optional[str], page_url: str) -> str:
    """Return the raw content of one element for the given extraction type ("" to skip)."""
    if extraction_type in ("links", "images"):
        tag_name, attr = ("a", "href") if extraction_type == "links" else ("img", "src")
        # Use the element itself when it is the right tag, else its first such descendant.
        target = el.get(attr, "") if el.name == tag_name else ""
        if not target:
            nested = el.find(tag_name)
            target = nested.get(attr, "") if nested else ""
        return urljoin(page_url, target) if target else ""

    if extraction_type == "attributes":
        if not attribute_name:
            return json.dumps(dict(el.attrs), ensure_ascii=False)
        value = el.get(attribute_name, "")
        # BeautifulSoup returns multi-valued attributes (class, rel, ...) as a
        # list; join them so the caller's .strip() does not fail.
        if isinstance(value, (list, tuple)):
            return " ".join(str(part) for part in value)
        return "" if value is None else str(value)

    # "text" and any unrecognised extraction type both yield the element text.
    return el.get_text(separator=" ", strip=True)


def _extract_items(
    soup: BeautifulSoup,
    raw_html: str,
    req: ScrapeRequest,
    page_url: str,
    page_num: int,
) -> list[ScrapedItem]:
    """Extract items from one parsed page according to ``req``.

    Args:
        soup: Parsed document.
        raw_html: Unparsed HTML of the same page (used for XPath evaluation).
        req: Request describing extraction type and selectors.
        page_url: URL the page was loaded from; used to absolutise links/images.
        page_num: Page number recorded on every item.

    Returns:
        Items in document order. Precedence: the whole-page extraction types
        (json_ld, table, full_html) first, then an XPath selector, then a CSS
        selector, then a tag name, and finally every element in the document.
        Elements that yield empty content are skipped.
    """

    def make_item(content: str, content_type: str, index: int) -> ScrapedItem:
        return ScrapedItem(
            content=content,
            content_type=content_type,
            page_url=page_url,
            page_num=page_num,
            item_index=index,
            content_hash=_content_hash(content),
        )

    kind = req.extraction_type

    # --- Whole-page extraction types ignore the selectors entirely ---
    if kind == "json_ld":
        return [
            make_item(json.dumps(data, ensure_ascii=False), "json_ld", idx)
            for idx, data in enumerate(_extract_json_ld(soup))
        ]

    if kind == "table":
        return [
            make_item(json.dumps(table, ensure_ascii=False), "table", t_idx)
            for t_idx, table in enumerate(_extract_tables(soup))
        ]

    if kind == "full_html":
        return [make_item(str(soup), "html", 0)]

    # --- XPath results are always recorded as plain text ---
    if req.xpath_selector:
        return [
            make_item(text, "text", idx)
            for idx, text in enumerate(_extract_with_xpath(raw_html, req.xpath_selector))
        ]

    # --- Resolve elements via CSS selector, tag name, or everything ---
    if req.css_selector:
        elements = _extract_with_css(soup, req.css_selector)
    elif req.html_tag:
        elements = soup.find_all(req.html_tag)
    else:
        elements = soup.find_all(True)  # All elements

    items: list[ScrapedItem] = []
    for idx, el in enumerate(elements):
        content = _element_content(el, kind, req.attribute_name, page_url).strip()
        if not content:
            continue
        # item_index keeps the element's position, so skipped elements leave gaps.
        items.append(make_item(content, kind, idx))

    return items
321
+
322
+
323
+ # ---------------------------------------------------------------------------
324
+ # Static scraper (Requests + BeautifulSoup)
325
+ # ---------------------------------------------------------------------------
326
+
327
+
328
class StaticScraper:
    """HTTP scraper using Requests with retry, UA rotation, and timeout handling."""

    def __init__(self, req: ScrapeRequest) -> None:
        self.req = req
        self.session = requests.Session()
        self._configure_session()

    def _configure_session(self) -> None:
        """Install browser-like default headers, then overlay the request's custom headers."""
        ua = _get_user_agent(self.req.user_agent)
        self.session.headers.update(
            {
                "User-Agent": ua,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }
        )
        if self.req.custom_headers:
            self.session.headers.update(self.req.custom_headers)

    def _fetch_once(self, url: str) -> requests.Response:
        """Perform a single GET and raise requests.HTTPError on a 4xx/5xx status."""
        response = self.session.get(url, timeout=self.req.timeout_seconds, allow_redirects=True)
        response.raise_for_status()
        return response

    def _fetch(self, url: str) -> requests.Response:
        """GET ``url``, retrying connection errors and timeouts with exponential backoff.

        The number of attempts comes from ``req.max_retries`` (at least one
        attempt is always made); the default of 3 matches the previously
        hard-coded limit. HTTP error statuses are not retried. The last
        exception is re-raised when all attempts fail.
        """
        attempts = max(1, int(self.req.max_retries or 1))
        # Built per call because the attempt limit depends on the request,
        # which is not known when the class body is evaluated.
        fetch_with_retry = retry(
            retry=retry_if_exception_type((requests.ConnectionError, requests.Timeout)),
            stop=stop_after_attempt(attempts),
            wait=wait_exponential(multiplier=1, min=1, max=10),
            reraise=True,
        )(self._fetch_once)
        return fetch_with_retry(url)

    def scrape(self) -> Generator[tuple[str, str, int], None, None]:
        """Yields (html, page_url, page_num) for each page.

        Follows "next" links while ``follow_pagination`` is set, up to
        ``max_pages``, sleeping ``delay_seconds`` between pages. Fetch errors
        are logged and re-raised to the caller.
        """
        url = self.req.url
        page_num = 1

        while url and page_num <= self.req.max_pages:
            try:
                # Rotate UA per request
                self.session.headers["User-Agent"] = _get_user_agent(self.req.user_agent)
                response = self._fetch(url)
                html = response.text
                # response.url is the final URL after redirects.
                yield html, response.url, page_num

                if not self.req.follow_pagination or page_num >= self.req.max_pages:
                    break

                # Find next page link
                soup = _parse_html(html)
                next_url = _find_next_page(soup, response.url)
                # Stop when there is no next link or it points back at this page.
                if not next_url or next_url == url:
                    break

                url = next_url
                page_num += 1
                time.sleep(self.req.delay_seconds)

            except requests.HTTPError as exc:
                logger.error("HTTP error fetching %s: %s", url, exc)
                raise
            except Exception as exc:
                logger.error("Error fetching %s: %s", url, exc)
                raise

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()
396
+
397
+
398
+ def _find_next_page(soup: BeautifulSoup, current_url: str) -> Optional[str]:
399
+ """Heuristic: find next pagination link."""
400
+ patterns = [
401
+ "a[rel='next']",
402
+ "a.next",
403
+ "a.pagination-next",
404
+ "li.next a",
405
+ "a[aria-label='Next']",
406
+ ".next-page a",
407
+ "#next a",
408
+ ]
409
+ for sel in patterns:
410
+ el = soup.select_one(sel)
411
+ if el and el.get("href"):
412
+ return urljoin(current_url, el["href"])
413
+
414
+ # Fallback: look for links with text "next"
415
+ for a in soup.find_all("a", href=True):
416
+ text = a.get_text(strip=True).lower()
417
+ if text in ("next", "next »", "»", "›", "next page"):
418
+ return urljoin(current_url, a["href"])
419
+
420
+ return None
421
+
422
+
423
+ # ---------------------------------------------------------------------------
424
+ # Dynamic scraper (Playwright)
425
+ # ---------------------------------------------------------------------------
426
+
427
+
428
class DynamicScraper:
    """Playwright-based scraper for JS-rendered content with infinite scroll support."""

    def scrape(self, req: ScrapeRequest) -> Generator[tuple[str, str, int], None, None]:
        """Yield (html, final_url, page_num) for each rendered page.

        Launches headless Chromium, loads ``req.url`` (waiting for network
        idle), optionally scrolls, and follows "next" links while
        ``follow_pagination`` is set, up to ``max_pages``.

        Raises:
            RuntimeError: if Playwright cannot be imported.
            Exception: navigation/timeout errors are logged and re-raised.
        """
        try:
            from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
        except ImportError as exc:
            # Chain the cause so the real import failure stays visible.
            raise RuntimeError("Playwright not installed. Run: playwright install chromium") from exc

        ua = _get_user_agent(req.user_agent)
        timeout_ms = req.timeout_seconds * 1000  # Playwright timeouts are passed in milliseconds

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
            # try/finally so the context and browser are closed even when a
            # page fails or the consumer stops iterating early.
            try:
                context = browser.new_context(
                    user_agent=ua,
                    viewport={"width": 1920, "height": 1080},
                    extra_http_headers=req.custom_headers or {},
                )
                try:
                    page = context.new_page()
                    page.set_default_timeout(timeout_ms)

                    url = req.url
                    page_num = 1

                    while url and page_num <= req.max_pages:
                        try:
                            page.goto(url, wait_until="networkidle", timeout=timeout_ms)

                            if req.infinite_scroll:
                                _perform_infinite_scroll(page, req.scroll_count)

                            html = page.content()
                            final_url = page.url
                            yield html, final_url, page_num

                            if not req.follow_pagination or page_num >= req.max_pages:
                                break

                            # Read the next page's URL from the current DOM.
                            next_url = _playwright_next_page(page, final_url)
                            # Stop when there is no next link or it points back at this page.
                            if not next_url or next_url == url:
                                break

                            url = next_url
                            page_num += 1
                            time.sleep(req.delay_seconds)

                        except PWTimeout as exc:
                            logger.error("Playwright timeout on %s: %s", url, exc)
                            raise
                        except Exception as exc:
                            logger.error("Playwright error on %s: %s", url, exc)
                            raise
                finally:
                    context.close()
            finally:
                browser.close()
484
+
485
+
486
+ def _perform_infinite_scroll(page, scroll_count: int) -> None:
487
+ """Scroll to bottom repeatedly to trigger lazy loading."""
488
+ for _ in range(scroll_count):
489
+ prev_height = page.evaluate("document.body.scrollHeight")
490
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
491
+ page.wait_for_timeout(1500)
492
+ new_height = page.evaluate("document.body.scrollHeight")
493
+ if new_height == prev_height:
494
+ break
495
+
496
+
497
+ def _playwright_next_page(page, current_url: str) -> Optional[str]:
498
+ """Try to find and return next page URL from Playwright page."""
499
+ selectors = ["a[rel='next']", "a.next", "li.next a", "[aria-label='Next']"]
500
+ for sel in selectors:
501
+ try:
502
+ el = page.query_selector(sel)
503
+ if el:
504
+ href = el.get_attribute("href")
505
+ if href:
506
+ return urljoin(current_url, href)
507
+ except Exception:
508
+ pass
509
+ return None
510
+
511
+
512
+ # ---------------------------------------------------------------------------
513
+ # Main engine entrypoint
514
+ # ---------------------------------------------------------------------------
515
+
516
+
517
def run_scrape(req: ScrapeRequest) -> ScrapeResponse:
    """
    Execute a scrape job and return structured results.

    Handles robots.txt, deduplication, pagination, and error accounting.
    Exceptions raised while scraping are caught, logged and reported in the
    response's ``errors`` list; items collected before the failure are kept.

    Args:
        req: Configuration of the run.

    Returns:
        A ScrapeResponse with the collected items, the number of pages
        processed, the errors encountered and the elapsed time in seconds.
    """
    start_time = time.monotonic()
    items: list[ScrapedItem] = []
    errors: list[str] = []
    pages_scraped = 0
    seen_hashes: set[str] = set()

    # robots.txt: only the start URL is checked, before any page is fetched.
    if req.check_robots_txt and not _is_allowed_by_robots(req.url):
        return ScrapeResponse(
            items=[],
            pages_scraped=0,
            error_count=1,
            errors=[f"robots.txt disallows scraping: {req.url}"],
            duration_seconds=time.monotonic() - start_time,
        )

    scraper = None
    try:
        if req.scrape_type == "dynamic":
            scraper = DynamicScraper()
            pages = scraper.scrape(req)
        else:
            # Any other scrape_type falls back to the static scraper.
            scraper = StaticScraper(req)
            pages = scraper.scrape()

        for html, page_url, page_num in pages:
            pages_scraped += 1
            soup = _parse_html(html)
            page_items = _extract_items(soup, html, req, page_url, page_num)

            for item in page_items:
                if req.deduplicate and item.content_hash in seen_hashes:
                    continue
                seen_hashes.add(item.content_hash)
                items.append(item)

    except Exception as exc:
        error_msg = f"Scrape failed: {type(exc).__name__}: {exc}"
        logger.exception(error_msg)
        errors.append(error_msg)
    finally:
        # Release the scraper's resources on success *and* failure; previously
        # the session stayed open whenever scraping raised.
        close = getattr(scraper, "close", None)
        if callable(close):
            close()

    return ScrapeResponse(
        items=items,
        pages_scraped=pages_scraped,
        error_count=len(errors),
        errors=errors,
        duration_seconds=time.monotonic() - start_time,
    )
app/services/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .job_service import (
2
+ create_job,
3
+ get_job,
4
+ list_jobs,
5
+ delete_job,
6
+ cancel_job,
7
+ execute_job_async,
8
+ get_job_logs,
9
+ get_job_results,
10
+ get_dashboard_stats,
11
+ )
12
+ from .export_service import export_json, export_csv, export_excel, get_job_exports
13
+
14
+ __all__ = [
15
+ "create_job",
16
+ "get_job",
17
+ "list_jobs",
18
+ "delete_job",
19
+ "cancel_job",
20
+ "execute_job_async",
21
+ "get_job_logs",
22
+ "get_job_results",
23
+ "get_dashboard_stats",
24
+ "export_json",
25
+ "export_csv",
26
+ "export_excel",
27
+ "get_job_exports",
28
+ ]
app/services/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (591 Bytes). View file
 
app/services/__pycache__/export_service.cpython-310.pyc ADDED
Binary file (5.21 kB). View file
 
app/services/__pycache__/job_service.cpython-310.pyc ADDED
Binary file (8.15 kB). View file
 
app/services/export_service.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Export service — generates JSON, CSV, and Excel files from scraped results.
3
+ Files are stored in the exports/ directory and tracked in the database.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import csv
8
+ import json
9
+ import logging
10
+ import os
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ import openpyxl
16
+ from openpyxl.styles import Font, PatternFill, Alignment
17
+
18
+ from app.models import db, ScrapeJob, ScrapeResult, ExportRecord
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ EXPORT_DIR = Path("exports")
23
+
24
+
25
def _ensure_export_dir() -> Path:
    """Create EXPORT_DIR (and any missing parents) if needed and return it."""
    target = EXPORT_DIR
    target.mkdir(parents=True, exist_ok=True)
    return target
28
+
29
+
30
def _get_results(job_id: int) -> list[ScrapeResult]:
    """Load all results of a job, ordered by page number and then item index."""
    query = ScrapeResult.query.filter_by(job_id=job_id)
    ordered = query.order_by(ScrapeResult.page_num, ScrapeResult.item_index)
    return ordered.all()
36
+
37
+
38
def _record_export(job_id: int, fmt: str, filename: str, filepath: str, row_count: int) -> ExportRecord:
    """Persist an ExportRecord describing a generated export file and return it."""
    # A file that does not exist is recorded with size 0.
    if os.path.exists(filepath):
        size_bytes = os.path.getsize(filepath)
    else:
        size_bytes = 0

    details = {
        "job_id": job_id,
        "format": fmt,
        "filename": filename,
        "filepath": filepath,
        "file_size_bytes": size_bytes,
        "row_count": row_count,
    }
    record = ExportRecord(**details)
    db.session.add(record)
    db.session.commit()
    return record
51
+
52
+
53
def export_json(job_id: int) -> Optional[str]:
    """Write a job and its results to a JSON file; return the path, or None if the job is missing."""
    job = ScrapeJob.query.get(job_id)
    if not job:
        return None

    results = _get_results(job_id)
    export_dir = _ensure_export_dir()

    # A UTC timestamp (second resolution) is part of the file name.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"job_{job_id}_{stamp}.json"
    filepath = str(export_dir / filename)

    document = {
        "job": job.to_dict(),
        "total_items": len(results),
        "exported_at": datetime.now(timezone.utc).isoformat(),
        "results": [result.to_dict() for result in results],
    }
    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(document, handle, ensure_ascii=False, indent=2)

    _record_export(job_id, "json", filename, filepath, len(results))
    logger.info("JSON export for job %s: %s", job_id, filepath)
    return filepath
78
+
79
+
80
def export_csv(job_id: int) -> Optional[str]:
    """Write a job's results to a CSV file; return the path, or None if the job is missing."""
    job = ScrapeJob.query.get(job_id)
    if not job:
        return None

    results = _get_results(job_id)
    export_dir = _ensure_export_dir()

    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"job_{job_id}_{stamp}.csv"
    filepath = str(export_dir / filename)

    columns = [
        "id",
        "job_id",
        "page_num",
        "item_index",
        "page_url",
        "content_type",
        "content",
        "created_at",
    ]

    # "utf-8-sig" writes a BOM for Excel compatibility.
    with open(filepath, "w", newline="", encoding="utf-8-sig") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, extrasaction="ignore")
        writer.writeheader()
        for result in results:
            record = result.to_dict()
            record.pop("metadata", None)  # metadata is not exported as a column
            writer.writerow(record)

    _record_export(job_id, "csv", filename, filepath, len(results))
    logger.info("CSV export for job %s: %s", job_id, filepath)
    return filepath
105
+
106
+
107
def export_excel(job_id: int) -> Optional[str]:
    """Export results as a styled Excel (.xlsx) file.

    The workbook has a "Results" sheet (one row per result) and a "Summary"
    sheet listing the fields of ``job.to_dict()``.

    Args:
        job_id: Identifier of the job to export.

    Returns:
        Path of the written file, or None when the job does not exist.
    """
    import re  # local import: only this function needs it

    job = ScrapeJob.query.get(job_id)
    if not job:
        return None

    # openpyxl refuses to write strings containing ASCII control characters
    # (it raises IllegalCharacterError). Scraped content can contain them, so
    # strip them instead of failing the whole export.
    illegal_chars = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

    def clean(value):
        return illegal_chars.sub("", value) if isinstance(value, str) else value

    results = _get_results(job_id)
    export_dir = _ensure_export_dir()
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"job_{job_id}_{ts}.xlsx"
    filepath = str(export_dir / filename)

    wb = openpyxl.Workbook()

    # --- Results sheet ---
    ws = wb.active
    ws.title = "Results"

    header_fill = PatternFill("solid", fgColor="1A1A2E")
    header_font = Font(color="FFFFFF", bold=True)
    headers = ["#", "Page", "Index", "URL", "Type", "Content", "Scraped At"]

    for col_idx, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_idx, value=header)
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal="center")

    # Data rows start at row 2, below the header.
    for row_idx, r in enumerate(results, 2):
        ws.cell(row=row_idx, column=1, value=r.id)
        ws.cell(row=row_idx, column=2, value=r.page_num)
        ws.cell(row=row_idx, column=3, value=r.item_index)
        ws.cell(row=row_idx, column=4, value=clean(r.page_url))
        ws.cell(row=row_idx, column=5, value=clean(r.content_type))
        ws.cell(row=row_idx, column=6, value=clean(r.content or "")[:32767])  # Excel cell limit
        ws.cell(row=row_idx, column=7, value=r.created_at.isoformat() if r.created_at else "")

    ws.column_dimensions["D"].width = 40
    ws.column_dimensions["F"].width = 60

    # --- Summary sheet ---
    ws2 = wb.create_sheet("Summary")
    job_dict = job.to_dict()
    ws2.cell(1, 1, "Field").font = Font(bold=True)
    ws2.cell(1, 2, "Value").font = Font(bold=True)
    for i, (k, v) in enumerate(job_dict.items(), 2):
        ws2.cell(i, 1, clean(k))
        ws2.cell(i, 2, clean(str(v)))
    ws2.column_dimensions["A"].width = 25
    ws2.column_dimensions["B"].width = 50

    wb.save(filepath)
    _record_export(job_id, "excel", filename, filepath, len(results))
    logger.info("Excel export for job %s: %s", job_id, filepath)
    return filepath
162
+
163
+
164
def get_job_exports(job_id: int) -> list[ExportRecord]:
    """Return the export records of a job, newest first."""
    newest_first = ExportRecord.created_at.desc()
    query = ExportRecord.query.filter_by(job_id=job_id)
    return query.order_by(newest_first).all()
app/services/job_service.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Service layer for scrape job management.
3
+ Handles job creation, execution, status updates, and result persistence.
4
+ Keeps routes thin and business logic centralized here.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ import threading
11
+ from datetime import datetime, timezone
12
+ from typing import Any, Optional
13
+
14
+ from app.models import db, ScrapeJob, ScrapeResult, JobLog, ExportRecord
15
+ from app.scrapers.engine import ScrapeRequest, run_scrape, ScrapedItem
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def utcnow() -> datetime:
    """Return the current UTC time as a naive datetime (tzinfo stripped)."""
    aware_now = datetime.now(tz=timezone.utc)
    return aware_now.replace(tzinfo=None)
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Job CRUD
26
+ # ---------------------------------------------------------------------------
27
+
28
+
29
def create_job(data: dict, user_id: Optional[int] = None) -> ScrapeJob:
    """Create and persist a new ScrapeJob from validated form/API data.

    Args:
        data: Job settings. ``url`` is required; every other key is optional
            and falls back to the default used below.
        user_id: Optional owner id stored on the job.

    Returns:
        The committed ScrapeJob, created with status ``"pending"``.

    Raises:
        KeyError: If ``data`` has no ``url`` entry.
        ValueError, TypeError: If a numeric setting cannot be converted by
            ``int()`` / ``float()``.
    """
    custom_headers = data.get("custom_headers") or {}
    if isinstance(custom_headers, str):
        try:
            custom_headers = json.loads(custom_headers)
        except json.JSONDecodeError:
            custom_headers = {}
    # Valid JSON that is not an object (e.g. "[1, 2]" or "42") is not a header map.
    if not isinstance(custom_headers, dict):
        custom_headers = {}

    url = data["url"]
    # `or` instead of a .get() default so a blank or None name also gets the fallback.
    name = data.get("name") or f"Job - {url[:50]}"

    job = ScrapeJob(
        user_id=user_id,
        name=name,
        url=url,
        html_tag=data.get("html_tag") or None,
        css_selector=data.get("css_selector") or None,
        xpath_selector=data.get("xpath_selector") or None,
        attribute_name=data.get("attribute_name") or None,
        extraction_type=data.get("extraction_type", "text"),
        scrape_type=data.get("scrape_type", "static"),
        follow_pagination=bool(data.get("follow_pagination", False)),
        max_pages=int(data.get("max_pages", 1)),
        infinite_scroll=bool(data.get("infinite_scroll", False)),
        scroll_count=int(data.get("scroll_count", 3)),
        download_images=bool(data.get("download_images", False)),
        custom_headers=json.dumps(custom_headers),
        user_agent=data.get("user_agent") or None,
        delay_seconds=float(data.get("delay_seconds", 1.0)),
        timeout_seconds=int(data.get("timeout_seconds", 30)),
        max_retries=int(data.get("max_retries", 3)),
        check_robots_txt=bool(data.get("check_robots_txt", True)),
        deduplicate=bool(data.get("deduplicate", True)),
        status="pending",
    )
    db.session.add(job)
    db.session.commit()
    _add_log(job.id, "INFO", f"Job created: {job.name}")
    return job
66
+
67
+
68
def get_job(job_id: int) -> Optional[ScrapeJob]:
    """Look up a ScrapeJob by primary key and return the lookup result."""
    job = ScrapeJob.query.get(job_id)
    return job
70
+
71
+
72
def list_jobs(
    page: int = 1,
    per_page: int = 20,
    status: Optional[str] = None,
    search: Optional[str] = None,
) -> Any:
    """Return one page of jobs, newest first, optionally filtered.

    Args:
        page: Page number passed to ``paginate``.
        per_page: Page size passed to ``paginate``.
        status: When truthy, keep only jobs whose status equals this value.
        search: When truthy, keep only jobs whose name or URL contains this
            text (case-insensitive, matched literally).

    Returns:
        The pagination object produced by ``Query.paginate`` (called with
        ``error_out=False``).
    """
    q = ScrapeJob.query.order_by(ScrapeJob.created_at.desc())
    if status:
        q = q.filter(ScrapeJob.status == status)
    if search:
        # Escape LIKE metacharacters so "%" and "_" typed by the user match
        # literally instead of acting as wildcards.
        escape = "/"
        literal = (
            search.replace(escape, escape * 2)
            .replace("%", escape + "%")
            .replace("_", escape + "_")
        )
        pattern = f"%{literal}%"
        q = q.filter(
            ScrapeJob.name.ilike(pattern, escape=escape)
            | ScrapeJob.url.ilike(pattern, escape=escape)
        )
    return q.paginate(page=page, per_page=per_page, error_out=False)
86
+
87
+
88
def delete_job(job_id: int) -> bool:
    """Delete the job with the given id.

    Returns True once the deletion has been committed, or False when the
    lookup finds no job.
    """
    target = ScrapeJob.query.get(job_id)
    if target:
        db.session.delete(target)
        db.session.commit()
        return True
    return False
95
+
96
+
97
def cancel_job(job_id: int) -> bool:
    """Mark a pending or running job as cancelled.

    Returns True when the status was changed (and a WARNING log entry was
    written), or False when the job is missing or in any other state.
    """
    cancellable_states = ("pending", "running")
    job = ScrapeJob.query.get(job_id)
    if job and job.status in cancellable_states:
        job.status = "cancelled"
        db.session.commit()
        _add_log(job_id, "WARNING", "Job cancelled by user")
        return True
    return False
105
+
106
+
107
+ # ---------------------------------------------------------------------------
108
+ # Execution
109
+ # ---------------------------------------------------------------------------
110
+
111
+
112
def execute_job(job_id: int, app=None) -> None:
    """
    Run a scrape job to completion in the calling thread.

    When ``app`` is given the job runs inside ``app.app_context()``;
    otherwise it runs directly in whatever context the caller already has.
    """
    if not app:
        _run_job(job_id)
        return
    with app.app_context():
        _run_job(job_id)
122
+
123
+
124
def execute_job_async(job_id: int, app) -> threading.Thread:
    """Start ``execute_job(job_id, app)`` on a daemon thread and return that thread."""
    worker = threading.Thread(
        target=execute_job,
        args=(job_id, app),
        daemon=True,
    )
    worker.start()
    return worker
129
+
130
+
131
def _run_job(job_id: int) -> None:
    """Run one scrape job and persist its outcome.

    Marks the job ``running``, builds a ScrapeRequest from the stored job
    settings, runs the scrape, saves the items, logs the reported errors and
    finally records the status and statistics.  Any exception is logged and
    the job is marked ``failed`` instead of propagating to the caller.

    A job whose status was changed to ``cancelled`` while the scrape was in
    progress keeps that status; its statistics are still recorded.
    """
    job = ScrapeJob.query.get(job_id)
    if not job:
        logger.error("Job %s not found", job_id)
        return

    job.status = "running"
    job.started_at = utcnow()
    db.session.commit()

    _add_log(job_id, "INFO", f"Starting scrape: {job.url}")

    try:
        req = ScrapeRequest(
            url=job.url,
            html_tag=job.html_tag,
            css_selector=job.css_selector,
            xpath_selector=job.xpath_selector,
            attribute_name=job.attribute_name,
            extraction_type=job.extraction_type,
            scrape_type=job.scrape_type,
            follow_pagination=job.follow_pagination,
            max_pages=job.max_pages,
            infinite_scroll=job.infinite_scroll,
            scroll_count=job.scroll_count,
            download_images=job.download_images,
            custom_headers=job.custom_headers_dict,
            user_agent=job.user_agent,
            delay_seconds=job.delay_seconds,
            timeout_seconds=job.timeout_seconds,
            max_retries=job.max_retries,
            check_robots_txt=job.check_robots_txt,
            deduplicate=job.deduplicate,
        )

        response = run_scrape(req)

        # Persist results
        _save_results(job_id, response.items)

        # Log errors
        for err in response.errors:
            _add_log(job_id, "ERROR", err)

        # Re-read the row first: cancel_job() may have committed "cancelled"
        # from another thread while the scrape was running, and that status
        # must not be silently overwritten.
        db.session.refresh(job)
        if job.status != "cancelled":
            job.status = "completed" if response.error_count == 0 else "failed"
        job.total_items = len(response.items)
        job.pages_scraped = response.pages_scraped
        job.error_count = response.error_count
        job.duration_seconds = response.duration_seconds
        job.completed_at = utcnow()
        db.session.commit()

        _add_log(
            job_id,
            "INFO",
            f"Completed: {len(response.items)} items from {response.pages_scraped} pages "
            f"in {response.duration_seconds:.2f}s",
        )

    except Exception as exc:
        logger.exception("Job %s failed with unexpected error", job_id)
        # A failed flush/commit leaves the session unusable until it is rolled
        # back; without this the commit below would raise as well and the job
        # would stay "running" forever.
        db.session.rollback()
        try:
            if job.status != "cancelled":
                job.status = "failed"
            job.error_count = (job.error_count or 0) + 1
            job.completed_at = utcnow()
            db.session.commit()
        except Exception:
            # Best effort only: never let failure bookkeeping escape the worker.
            logger.exception("Could not record failure for job %s", job_id)
            db.session.rollback()
        _add_log(job_id, "ERROR", f"Fatal error: {type(exc).__name__}: {exc}")
197
+
198
+
199
def _save_results(job_id: int, items: list[ScrapedItem]) -> None:
    """Store scraped items as ScrapeResult rows with one bulk save and commit.

    Does nothing when ``items`` is empty.
    """
    if not items:
        return

    rows = []
    for scraped in items:
        # Metadata is stored as a JSON string; falsy metadata is stored as None.
        serialized_meta = json.dumps(scraped.metadata) if scraped.metadata else None
        rows.append(
            ScrapeResult(
                job_id=job_id,
                page_url=scraped.page_url,
                page_num=scraped.page_num,
                item_index=scraped.item_index,
                content=scraped.content,
                content_type=scraped.content_type,
                content_hash=scraped.content_hash,
                metadata_=serialized_meta,
            )
        )

    db.session.bulk_save_objects(rows)
    db.session.commit()
218
+
219
+
220
+ # ---------------------------------------------------------------------------
221
+ # Logs
222
+ # ---------------------------------------------------------------------------
223
+
224
+
225
def _add_log(job_id: int, level: str, message: str, details: Optional[dict] = None) -> None:
    """Persist one JobLog entry for a job (best effort).

    Args:
        job_id: Id of the job the entry belongs to.
        level: Level label stored on the entry (e.g. "INFO", "WARNING", "ERROR").
        message: Log text.
        details: Optional extra data, stored as a JSON string when truthy.

    A failure to build or store the entry is reported through the module
    logger and then swallowed, so logging problems do not abort the caller.
    """
    try:
        log = JobLog(
            job_id=job_id,
            level=level,
            message=message,
            details=json.dumps(details) if details else None,
        )
        db.session.add(log)
        db.session.commit()
    except Exception as exc:
        logger.warning("Could not persist log for job %s: %s", job_id, exc)
        # Discard the failed transaction; otherwise the caller's next commit
        # on this session would fail because of the error swallowed here.
        db.session.rollback()
237
+
238
+
239
def get_job_logs(job_id: int, page: int = 1, per_page: int = 100) -> Any:
    """Return one page of a job's log entries, oldest first (``error_out=False``)."""
    oldest_first = JobLog.created_at.asc()
    entries = JobLog.query.filter_by(job_id=job_id).order_by(oldest_first)
    return entries.paginate(page=page, per_page=per_page, error_out=False)
245
+
246
+
247
def get_job_results(job_id: int, page: int = 1, per_page: int = 50) -> Any:
    """Return one page of a job's results, ordered by page number then item index."""
    results = ScrapeResult.query.filter_by(job_id=job_id)
    ordered = results.order_by(ScrapeResult.page_num, ScrapeResult.item_index)
    return ordered.paginate(page=page, per_page=per_page, error_out=False)
253
+
254
+
255
+ # ---------------------------------------------------------------------------
256
+ # Dashboard stats
257
+ # ---------------------------------------------------------------------------
258
+
259
+
260
def get_dashboard_stats() -> dict:
    """Collect the aggregate numbers shown on the dashboard.

    Returns a dict with the total job count, per-status counts for
    completed/failed/running/pending jobs, the sum of ``total_items`` over
    all jobs (0 when the sum is empty), the five most recently created jobs
    as dicts, and the completed-to-total percentage rounded to one decimal
    (0 when there are no jobs).
    """

    def _count_with_status(status: str) -> int:
        return ScrapeJob.query.filter_by(status=status).count()

    total = ScrapeJob.query.count()
    completed = _count_with_status("completed")
    failed = _count_with_status("failed")
    running = _count_with_status("running")
    pending = _count_with_status("pending")

    items_sum = db.session.query(db.func.sum(ScrapeJob.total_items)).scalar()
    newest_first = ScrapeJob.created_at.desc()
    latest_jobs = ScrapeJob.query.order_by(newest_first).limit(5).all()

    if total:
        success_pct = completed / total * 100
    else:
        success_pct = 0

    stats = {
        "total_jobs": total,
        "completed_jobs": completed,
        "failed_jobs": failed,
        "running_jobs": running,
        "pending_jobs": pending,
        "total_items_scraped": items_sum or 0,
        "recent_jobs": [recent.to_dict() for recent in latest_jobs],
        "success_rate": round(success_pct, 1),
    }
    return stats
app/static/css/style.css ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ========================================
2
+ WebScraper.pro — Premium Dark Theme
3
+ ======================================== */
4
+
5
+ /* --- CSS Variables --- */
6
+ :root {
7
+ --bg-primary: #0a0a12;
8
+ --bg-secondary: #12121e;
9
+ --bg-card: #1a1a2e;
10
+ --bg-card-hover: #1f1f35;
11
+ --bg-input: #16162a;
12
+ --border: #2a2a45;
13
+ --border-focus: #6c5ce7;
14
+ --text-primary: #e8e8f0;
15
+ --text-secondary: #9898b8;
16
+ --text-muted: #6868a0;
17
+ --accent: #6c5ce7;
18
+ --accent-glow: rgba(108, 92, 231, 0.4);
19
+ --success: #00cec9;
20
+ --warning: #fdcb6e;
21
+ --danger: #ff6b6b;
22
+ --info: #74b9ff;
23
+ --font: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
24
+ --radius: 12px;
25
+ --radius-sm: 8px;
26
+ --shadow: 0 4px 24px rgba(0,0,0,0.3);
27
+ --shadow-lg: 0 8px 40px rgba(0,0,0,0.4);
28
+ --transition: 0.25s cubic-bezier(0.4, 0, 0.2, 1);
29
+ --nav-height: 64px;
30
+ }
31
+
32
+ /* --- Reset & Base --- */
33
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
34
+ html { scroll-behavior: smooth; }
35
+ body {
36
+ font-family: var(--font);
37
+ background: var(--bg-primary);
38
+ color: var(--text-primary);
39
+ line-height: 1.6;
40
+ min-height: 100vh;
41
+ display: flex;
42
+ flex-direction: column;
43
+ -webkit-font-smoothing: antialiased;
44
+ }
45
+ a { color: var(--accent); text-decoration: none; transition: color var(--transition); }
46
+ a:hover { color: #8b7ff0; }
47
+
48
+ /* --- Layout --- */
49
+ .container { max-width: 1280px; margin: 0 auto; padding: 0 24px; width: 100%; }
50
+ .main-content { flex: 1; padding: calc(var(--nav-height) + 32px) 0 48px; }
51
+
52
+ /* --- Navbar --- */
53
.navbar {
    position: fixed; top: 0; left: 0; right: 0; z-index: 100;
    height: var(--nav-height);
    background: rgba(10, 10, 18, 0.85);
    /* Prefixed form first: older Safari releases only recognise -webkit-backdrop-filter */
    -webkit-backdrop-filter: blur(20px);
    backdrop-filter: blur(20px);
    border-bottom: 1px solid var(--border);
}
60
+ .nav-container {
61
+ max-width: 1280px; margin: 0 auto; padding: 0 24px;
62
+ display: flex; align-items: center; justify-content: space-between; height: 100%;
63
+ }
64
+ .nav-brand { display: flex; align-items: center; gap: 10px; font-weight: 700; font-size: 1.2rem; color: var(--text-primary); }
65
+ .brand-icon { font-size: 1.5rem; }
66
+ .brand-accent { color: var(--accent); }
67
+ .nav-links { display: flex; align-items: center; gap: 8px; }
68
+ .nav-link {
69
+ display: flex; align-items: center; gap: 6px; padding: 8px 16px;
70
+ border-radius: var(--radius-sm); color: var(--text-secondary);
71
+ font-size: 0.9rem; font-weight: 500; transition: all var(--transition);
72
+ }
73
+ .nav-link:hover, .nav-link.active { color: var(--text-primary); background: rgba(108, 92, 231, 0.1); }
74
+ .nav-link.active { color: var(--accent); }
75
+ .nav-icon { width: 18px; height: 18px; }
76
+ .btn-nav-primary {
77
+ background: linear-gradient(135deg, var(--accent), #8b5cf6);
78
+ color: #fff !important; border-radius: var(--radius-sm);
79
+ }
80
+ .btn-nav-primary:hover { transform: translateY(-1px); box-shadow: 0 4px 15px var(--accent-glow); }
81
+ .nav-toggle { display: none; background: none; border: none; cursor: pointer; padding: 8px; }
82
+ .nav-toggle span { display: block; width: 22px; height: 2px; background: var(--text-primary); margin: 5px 0; transition: var(--transition); }
83
+
84
+ /* --- Page Header --- */
85
+ .page-header {
86
+ display: flex; align-items: flex-start; justify-content: space-between;
87
+ margin-bottom: 32px; flex-wrap: wrap; gap: 16px;
88
+ }
89
+ .page-title { font-size: 2rem; font-weight: 800; letter-spacing: -0.02em; }
90
+ .page-subtitle { color: var(--text-muted); margin-top: 4px; font-size: 0.95rem; }
91
+ .page-actions { display: flex; gap: 8px; flex-wrap: wrap; }
92
+
93
+ /* --- Cards --- */
94
+ .card {
95
+ background: var(--bg-card); border: 1px solid var(--border);
96
+ border-radius: var(--radius); margin-bottom: 24px;
97
+ transition: border-color var(--transition);
98
+ }
99
+ .card:hover { border-color: rgba(108, 92, 231, 0.3); }
100
+ .card-header {
101
+ display: flex; align-items: center; justify-content: space-between;
102
+ padding: 20px 24px; border-bottom: 1px solid var(--border);
103
+ }
104
+ .card-title { font-size: 1.1rem; font-weight: 600; }
105
+ .card-subtitle { color: var(--text-muted); font-size: 0.85rem; }
106
+ .card-body { padding: 24px; }
107
+
108
+ /* --- Stats Grid --- */
109
+ .stats-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 16px; margin-bottom: 32px; }
110
+ .stat-card {
111
+ background: var(--bg-card); border: 1px solid var(--border); border-radius: var(--radius);
112
+ padding: 24px; display: flex; align-items: center; gap: 16px;
113
+ transition: all var(--transition); cursor: default;
114
+ }
115
+ .stat-card:hover { transform: translateY(-3px); box-shadow: var(--shadow); }
116
+ .stat-icon { font-size: 2rem; }
117
+ .stat-value { font-size: 1.8rem; font-weight: 800; display: block; line-height: 1; }
118
+ .stat-label { font-size: 0.8rem; color: var(--text-muted); text-transform: uppercase; letter-spacing: 0.05em; margin-top: 4px; display: block; }
119
+ .stat-card-primary { border-left: 3px solid var(--accent); }
120
+ .stat-card-success { border-left: 3px solid var(--success); }
121
+ .stat-card-warning { border-left: 3px solid var(--warning); }
122
+ .stat-card-danger { border-left: 3px solid var(--danger); }
123
+ .stat-card-info { border-left: 3px solid var(--info); }
124
+ .stat-card-accent { border-left: 3px solid #a29bfe; }
125
+
126
+ /* --- Buttons --- */
127
+ .btn {
128
+ display: inline-flex; align-items: center; gap: 8px;
129
+ padding: 10px 20px; border-radius: var(--radius-sm);
130
+ font-family: var(--font); font-size: 0.9rem; font-weight: 600;
131
+ border: none; cursor: pointer; transition: all var(--transition);
132
+ text-decoration: none; line-height: 1.4;
133
+ }
134
+ .btn-primary { background: linear-gradient(135deg, var(--accent), #8b5cf6); color: #fff; }
135
+ .btn-primary:hover { transform: translateY(-1px); box-shadow: 0 4px 20px var(--accent-glow); color: #fff; }
136
+ .btn-secondary { background: var(--bg-card); color: var(--text-primary); border: 1px solid var(--border); }
137
+ .btn-secondary:hover { border-color: var(--accent); }
138
+ .btn-warning { background: rgba(253, 203, 110, 0.15); color: var(--warning); border: 1px solid rgba(253, 203, 110, 0.3); }
139
+ .btn-danger { background: rgba(255, 107, 107, 0.15); color: var(--danger); border: 1px solid rgba(255, 107, 107, 0.3); }
140
+ .btn-outline { background: transparent; color: var(--text-secondary); border: 1px solid var(--border); }
141
+ .btn-outline:hover { border-color: var(--accent); color: var(--accent); }
142
+ .btn-ghost { background: transparent; color: var(--text-secondary); }
143
+ .btn-ghost:hover { color: var(--text-primary); }
144
+ .btn-sm { padding: 6px 14px; font-size: 0.82rem; }
145
+ .btn-lg { padding: 14px 28px; font-size: 1rem; }
146
+ .btn-glow:hover { box-shadow: 0 0 30px var(--accent-glow), 0 4px 20px rgba(0,0,0,0.3); }
147
+ .btn-icon { width: 18px; height: 18px; }
148
+ .btn-icon-sm { background: none; border: none; cursor: pointer; padding: 6px; font-size: 1rem; border-radius: 6px; transition: var(--transition); }
149
+ .btn-icon-sm:hover { background: rgba(255,255,255,0.05); }
150
+ .btn-danger-ghost:hover { background: rgba(255, 107, 107, 0.1); }
151
+
152
+ /* --- Tables --- */
153
+ .table-responsive { overflow-x: auto; }
154
+ .table { width: 100%; border-collapse: collapse; font-size: 0.9rem; }
155
+ .table th {
156
+ padding: 12px 16px; text-align: left; font-weight: 600;
157
+ color: var(--text-muted); font-size: 0.78rem; text-transform: uppercase;
158
+ letter-spacing: 0.05em; border-bottom: 1px solid var(--border);
159
+ }
160
+ .table td { padding: 14px 16px; border-bottom: 1px solid rgba(42, 42, 69, 0.5); }
161
+ .table-row-hover:hover { background: rgba(108, 92, 231, 0.04); cursor: pointer; }
162
+ .table-link { color: var(--text-primary); font-weight: 500; }
163
+ .table-link:hover { color: var(--accent); }
164
+ .table-compact td { padding: 10px 14px; font-size: 0.85rem; }
165
+ .text-truncate { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
166
+ .text-muted { color: var(--text-muted); }
167
+ .content-cell { max-width: 400px; }
168
+ .content-preview { font-size: 0.83rem; color: var(--text-secondary); word-break: break-word; max-height: 60px; overflow: hidden; }
169
+
170
+ /* --- Badges --- */
171
+ .badge {
172
+ display: inline-flex; align-items: center; padding: 3px 10px;
173
+ border-radius: 20px; font-size: 0.75rem; font-weight: 600;
174
+ }
175
+ .badge-muted { background: rgba(152, 152, 184, 0.1); color: var(--text-muted); }
176
+ .badge-type { background: rgba(108, 92, 231, 0.12); color: var(--accent); }
177
+ .count-badge { background: rgba(108, 92, 231, 0.15); color: var(--accent); padding: 2px 10px; border-radius: 12px; font-size: 0.8rem; margin-left: 8px; }
178
+
179
+ /* --- Status Badges --- */
180
+ .status-badge {
181
+ display: inline-flex; align-items: center; gap: 6px;
182
+ padding: 4px 12px; border-radius: 20px; font-size: 0.78rem; font-weight: 600;
183
+ }
184
+ .status-dot { width: 7px; height: 7px; border-radius: 50%; }
185
+ .status-pending { background: rgba(253, 203, 110, 0.12); color: var(--warning); }
186
+ .status-pending .status-dot { background: var(--warning); }
187
+ .status-running { background: rgba(116, 185, 255, 0.12); color: var(--info); }
188
+ .status-running .status-dot { background: var(--info); animation: pulse 1.5s infinite; }
189
+ .status-completed { background: rgba(0, 206, 201, 0.12); color: var(--success); }
190
+ .status-completed .status-dot { background: var(--success); }
191
+ .status-failed { background: rgba(255, 107, 107, 0.12); color: var(--danger); }
192
+ .status-failed .status-dot { background: var(--danger); }
193
+ .status-cancelled { background: rgba(152, 152, 184, 0.12); color: var(--text-muted); }
194
+ .status-cancelled .status-dot { background: var(--text-muted); }
195
+ .status-lg { padding: 8px 18px; font-size: 0.85rem; }
196
+
197
+ @keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.4; } }
198
+
199
+ /* --- Status Bar --- */
200
+ .status-bar {
201
+ background: var(--bg-card); border: 1px solid var(--border);
202
+ border-radius: var(--radius); margin-bottom: 24px; overflow: hidden;
203
+ }
204
+ .status-bar-inner { display: flex; align-items: center; justify-content: space-between; padding: 20px 24px; flex-wrap: wrap; gap: 12px; }
205
+ .status-meta { display: flex; gap: 20px; color: var(--text-secondary); font-size: 0.9rem; flex-wrap: wrap; }
206
+ .progress-bar { height: 3px; background: var(--bg-input); }
207
+ .progress-bar-fill { height: 100%; background: linear-gradient(90deg, var(--accent), var(--info)); border-radius: 3px; }
208
+ .progress-animate { width: 100%; animation: progress-sweep 2s ease-in-out infinite; }
209
+ @keyframes progress-sweep {
210
+ 0% { transform: translateX(-100%); }
211
+ 100% { transform: translateX(100%); }
212
+ }
213
+
214
+ /* --- Detail Grid --- */
215
+ .detail-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 24px; }
216
+ .detail-list { display: flex; flex-direction: column; gap: 14px; }
217
+ .detail-item { display: flex; justify-content: space-between; align-items: flex-start; gap: 16px; }
218
+ .detail-key { font-size: 0.85rem; color: var(--text-muted); min-width: 120px; flex-shrink: 0; }
219
+ .detail-value { font-size: 0.9rem; text-align: right; word-break: break-all; }
220
+ .detail-value.code { font-family: 'Fira Code', monospace; background: var(--bg-input); padding: 2px 8px; border-radius: 4px; font-size: 0.82rem; }
221
+ .detail-value.link { color: var(--accent); }
222
+
223
+ /* --- Export Buttons --- */
224
+ .export-buttons { display: flex; gap: 12px; flex-wrap: wrap; }
225
+ .btn-export {
226
+ flex: 1; min-width: 100px; display: flex; flex-direction: column; align-items: center;
227
+ gap: 8px; padding: 20px; background: var(--bg-input); border: 1px solid var(--border);
228
+ border-radius: var(--radius-sm); color: var(--text-primary); font-weight: 600;
229
+ font-size: 0.9rem; cursor: pointer; transition: all var(--transition); text-decoration: none;
230
+ }
231
+ .btn-export:hover { border-color: var(--accent); transform: translateY(-2px); box-shadow: var(--shadow); color: var(--text-primary); }
232
+ .export-icon { font-size: 1.5rem; }
233
+ .export-history { margin-top: 20px; }
234
+ .export-history h4 { font-size: 0.85rem; color: var(--text-muted); margin-bottom: 10px; }
235
+ .export-item { display: flex; align-items: center; gap: 10px; padding: 8px 0; border-bottom: 1px solid rgba(42,42,69,0.3); font-size: 0.85rem; }
236
+
237
+ /* --- Logs --- */
238
+ .logs-container { font-family: 'Fira Code', 'Cascadia Code', monospace; font-size: 0.82rem; max-height: 400px; overflow-y: auto; }
239
+ .log-entry { display: flex; gap: 12px; padding: 6px 8px; border-radius: 4px; }
240
+ .log-entry:hover { background: rgba(255,255,255,0.02); }
241
+ .log-time { color: var(--text-muted); min-width: 65px; }
242
+ .log-level { font-weight: 700; min-width: 60px; }
243
+ .log-info .log-level { color: var(--info); }
244
+ .log-warning .log-level { color: var(--warning); }
245
+ .log-error .log-level { color: var(--danger); }
246
+ .log-debug .log-level { color: var(--text-muted); }
247
+ .log-message { color: var(--text-secondary); word-break: break-word; }
248
+
249
+ /* --- Forms --- */
250
+ .form-section { margin-bottom: 24px; }
251
+ .form-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
252
+ .form-full { grid-column: 1 / -1; }
253
+ .form-group { display: flex; flex-direction: column; gap: 6px; }
254
+ .form-label { font-size: 0.85rem; font-weight: 600; color: var(--text-secondary); }
255
+ .required { color: var(--danger); }
256
+ .form-input, .form-select, .form-textarea {
257
+ padding: 12px 16px; background: var(--bg-input); border: 1px solid var(--border);
258
+ border-radius: var(--radius-sm); color: var(--text-primary); font-family: var(--font);
259
+ font-size: 0.9rem; transition: all var(--transition); outline: none;
260
+ }
261
+ .form-input:focus, .form-select:focus, .form-textarea:focus {
262
+ border-color: var(--accent); box-shadow: 0 0 0 3px var(--accent-glow);
263
+ }
264
+ .form-input.form-error, .form-select.form-error { border-color: var(--danger); }
265
+ .form-error-text { color: var(--danger); font-size: 0.8rem; }
266
+ .form-hint { color: var(--text-muted); font-size: 0.78rem; }
267
+ .form-select { appearance: none; background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 12 12'%3E%3Cpath fill='%239898b8' d='M6 8L1 3h10z'/%3E%3C/svg%3E"); background-repeat: no-repeat; background-position: right 14px center; padding-right: 36px; }
268
+ .form-textarea { resize: vertical; min-height: 80px; }
269
+
270
+ /* --- Checkboxes --- */
271
+ .form-checkboxes { display: flex; flex-wrap: wrap; gap: 16px; margin-top: 20px; padding-top: 20px; border-top: 1px solid var(--border); }
272
.checkbox-label {
    display: flex; align-items: center; gap: 10px; cursor: pointer;
    font-size: 0.88rem; color: var(--text-secondary); user-select: none;
    position: relative; /* anchors the visually hidden native input below */
}
/* Hide the native checkbox visually but keep it focusable and exposed to
   assistive technology (display: none would remove it from the tab order). */
.checkbox-label input {
    position: absolute; width: 1px; height: 1px; margin: 0;
    opacity: 0; pointer-events: none;
}
.checkbox-custom {
    width: 20px; height: 20px; border: 2px solid var(--border); border-radius: 5px;
    display: flex; align-items: center; justify-content: center; transition: all var(--transition);
}
/* Keyboard focus indicator, mirrored onto the custom box */
.checkbox-label input:focus-visible + .checkbox-custom {
    border-color: var(--border-focus); box-shadow: 0 0 0 3px var(--accent-glow);
}
.checkbox-label input:checked + .checkbox-custom {
    background: var(--accent); border-color: var(--accent);
}
.checkbox-label input:checked + .checkbox-custom::after {
    content: '✓'; color: #fff; font-size: 0.7rem; font-weight: 700;
}
287
+
288
+ /* --- Collapsible --- */
289
+ .collapsible { cursor: pointer; user-select: none; }
290
+ .collapse-icon { width: 20px; height: 20px; color: var(--text-muted); transition: transform var(--transition); }
291
+ .collapsible.collapsed .collapse-icon { transform: rotate(-90deg); }
292
+
293
+ /* --- Form Actions --- */
294
+ .form-actions { display: flex; justify-content: flex-end; gap: 12px; margin-top: 8px; }
295
+
296
+ /* --- Filter Bar --- */
297
+ .filter-bar { margin-bottom: 20px; }
298
+ .filter-bar .card-body, .filter-form { padding: 16px 20px; display: flex; align-items: flex-end; gap: 16px; flex-wrap: wrap; }
299
+ .filter-group { display: flex; flex-direction: column; gap: 4px; }
300
+ .filter-label { font-size: 0.75rem; font-weight: 600; color: var(--text-muted); text-transform: uppercase; letter-spacing: 0.05em; }
301
+ .filter-search { flex: 1; min-width: 200px; }
302
+
303
+ /* --- Pagination --- */
304
+ .pagination { display: flex; justify-content: center; gap: 6px; margin-top: 24px; padding-top: 20px; border-top: 1px solid var(--border); }
305
+ .pagination-link {
306
+ padding: 8px 14px; border-radius: var(--radius-sm); font-size: 0.85rem;
307
+ color: var(--text-secondary); background: var(--bg-input); border: 1px solid var(--border);
308
+ transition: all var(--transition);
309
+ }
310
+ .pagination-link:hover, .pagination-link.active { background: var(--accent); color: #fff; border-color: var(--accent); }
311
+ .pagination-ellipsis { padding: 8px 6px; color: var(--text-muted); }
312
+
313
+ /* --- Empty State --- */
314
+ .empty-state { text-align: center; padding: 60px 24px; }
315
+ .empty-icon { font-size: 3.5rem; margin-bottom: 16px; }
316
+ .empty-state h3 { font-size: 1.2rem; margin-bottom: 8px; }
317
+ .empty-state p { color: var(--text-muted); margin-bottom: 20px; }
318
+ .empty-state-sm { text-align: center; padding: 32px 16px; }
319
+
320
+ /* --- Error Page --- */
321
+ .error-page { display: flex; align-items: center; justify-content: center; min-height: 60vh; gap: 60px; }
322
/* Gradient-filled text: the standard background-clip is declared next to the prefixed one */
.error-code { font-size: 7rem; font-weight: 900; background: linear-gradient(135deg, var(--accent), var(--danger)); -webkit-background-clip: text; background-clip: text; -webkit-text-fill-color: transparent; line-height: 1; }
323
+ .error-title { font-size: 1.5rem; margin: 12px 0; }
324
+ .error-description { color: var(--text-muted); margin-bottom: 24px; max-width: 400px; }
325
+ .error-actions { display: flex; gap: 12px; }
326
+ .error-visual { font-size: 5rem; opacity: 0.15; }
327
+
328
+ /* --- Toast --- */
329
+ .toast-container { position: fixed; top: calc(var(--nav-height) + 16px); right: 24px; z-index: 200; display: flex; flex-direction: column; gap: 10px; }
330
+ .toast {
331
+ display: flex; align-items: center; gap: 12px; padding: 14px 20px;
332
+ background: var(--bg-card); border: 1px solid var(--border); border-radius: var(--radius-sm);
333
+ box-shadow: var(--shadow-lg); animation: slideIn 0.3s ease; min-width: 300px; max-width: 500px;
334
+ }
335
+ .toast-success { border-left: 3px solid var(--success); }
336
+ .toast-danger, .toast-error { border-left: 3px solid var(--danger); }
337
+ .toast-warning { border-left: 3px solid var(--warning); }
338
+ .toast-info { border-left: 3px solid var(--info); }
339
+ .toast-icon { font-size: 1.1rem; font-weight: 700; }
340
+ .toast-success .toast-icon { color: var(--success); }
341
+ .toast-danger .toast-icon, .toast-error .toast-icon { color: var(--danger); }
342
+ .toast-warning .toast-icon { color: var(--warning); }
343
+ .toast-body { flex: 1; font-size: 0.9rem; }
344
+ .toast-close { background: none; border: none; color: var(--text-muted); cursor: pointer; font-size: 1.2rem; padding: 0 4px; }
345
+ @keyframes slideIn { from { transform: translateX(100%); opacity: 0; } to { transform: translateX(0); opacity: 1; } }
346
+
347
+ /* --- Footer --- */
348
+ .footer { border-top: 1px solid var(--border); padding: 24px 0; margin-top: auto; }
349
+ .footer-inner { display: flex; justify-content: space-between; align-items: center; }
350
+ .footer-text { font-size: 0.82rem; color: var(--text-muted); }
351
+ .footer-links { display: flex; gap: 16px; }
352
+ .footer-link { font-size: 0.82rem; color: var(--text-muted); }
353
+ .footer-link:hover { color: var(--accent); }
354
+
355
+ /* --- Inline form --- */
356
+ .inline-form { display: inline; }
357
+
358
+ /* --- Action Group --- */
359
+ .action-group { display: flex; gap: 4px; }
360
+
361
+ /* --- Responsive --- */
362
+ @media (max-width: 768px) {
363
+ .nav-links { display: none; position: fixed; top: var(--nav-height); left: 0; right: 0; background: var(--bg-secondary); flex-direction: column; padding: 16px; border-bottom: 1px solid var(--border); }
364
+ .nav-links.open { display: flex; }
365
+ .nav-toggle { display: block; }
366
+ .page-header { flex-direction: column; }
367
+ .stats-grid { grid-template-columns: repeat(2, 1fr); }
368
+ .form-grid { grid-template-columns: 1fr; }
369
+ .detail-grid { grid-template-columns: 1fr; }
370
+ .filter-form { flex-direction: column; align-items: stretch; }
371
+ .error-page { flex-direction: column; text-align: center; }
372
+ .footer-inner { flex-direction: column; gap: 12px; text-align: center; }
373
+ .status-bar-inner { flex-direction: column; align-items: flex-start; }
374
+ }
375
+ @media (max-width: 480px) {
376
+ .stats-grid { grid-template-columns: 1fr; }
377
+ .container { padding: 0 16px; }
378
+ .page-title { font-size: 1.5rem; }
379
+ }
app/static/js/app.js ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ========================================
2
+ WebScraper.pro — Interactive JS
3
+ ======================================== */
4
+
5
/**
 * Page bootstrap: wires up the mobile navigation toggle, auto-dismissing
 * toasts and the stat-card entrance animation once the DOM is parsed.
 */
document.addEventListener('DOMContentLoaded', () => {
    // --- Mobile Nav Toggle ---
    const toggle = document.getElementById('nav-toggle');
    const links = document.getElementById('nav-links');
    if (toggle && links) {
        // Expose the open/closed state to assistive technology; the button
        // itself only carries an aria-label in the template.
        toggle.setAttribute('aria-controls', 'nav-links');
        toggle.setAttribute('aria-expanded', 'false');
        toggle.addEventListener('click', () => {
            const isOpen = links.classList.toggle('open');
            toggle.setAttribute('aria-expanded', String(isOpen));
        });
    }

    // --- Auto-dismiss Toasts ---
    document.querySelectorAll('.toast[data-auto-dismiss]').forEach((toast) => {
        // data-auto-dismiss holds the delay in ms; fall back to 5 s when it
        // is missing or not a number.
        const delay = parseInt(toast.dataset.autoDismiss, 10) || 5000;
        setTimeout(() => {
            // The user may already have removed the toast via its close button.
            if (!toast.isConnected) return;
            toast.style.animation = 'slideOut 0.3s ease forwards';
            // Remove the node once the 0.3 s slide-out animation has finished.
            setTimeout(() => toast.remove(), 300);
        }, delay);
    });

    // --- Stat Card Entrance Animation ---
    // Skip the staggered fade-in entirely for users who asked for reduced motion.
    const prefersReducedMotion =
        typeof window.matchMedia === 'function' &&
        window.matchMedia('(prefers-reduced-motion: reduce)').matches;
    if (!prefersReducedMotion) {
        document.querySelectorAll('.stat-card').forEach((card, i) => {
            card.style.opacity = '0';
            card.style.transform = 'translateY(20px)';
            // Stagger each card by 80 ms so they appear one after another.
            setTimeout(() => {
                card.style.transition = 'all 0.5s cubic-bezier(0.4, 0, 0.2, 1)';
                card.style.opacity = '1';
                card.style.transform = 'translateY(0)';
            }, 80 * i);
        });
    }
});
33
+
34
+ // --- Collapsible Sections ---
35
/**
 * Toggle the visibility of a collapsible section.
 *
 * Flips the inline `display` of the element with the given id between
 * hidden ('none') and its stylesheet default (''), and keeps the
 * `collapsed` class on the section's header in sync. The header is the
 * element's previous sibling, or failing that the first `.collapsible`
 * element inside its parent.
 *
 * @param {string} id - id of the section element to show or hide.
 */
function toggleSection(id) {
    const section = document.getElementById(id);
    if (!section) {
        return;
    }

    let header = section.previousElementSibling;
    if (!header) {
        header = section.parentElement.querySelector('.collapsible');
    }

    // Only the inline style is inspected, so a section starts out "shown".
    const wasHidden = section.style.display === 'none';
    section.style.display = wasHidden ? '' : 'none';

    if (header) {
        // collapsed <=> the section has just been hidden.
        header.classList.toggle('collapsed', !wasHidden);
    }
}
47
+
48
+ // --- Slide Out animation for toasts ---
49
// Register the `slideOut` keyframes at load time; the toast auto-dismiss
// code refers to this animation by name.
const style = document.createElement('style');
style.appendChild(
    document.createTextNode('@keyframes slideOut { to { transform: translateX(120%); opacity: 0; } }')
);
document.head.appendChild(style);
app/templates/base.html ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en" data-theme="dark">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <meta name="description" content="WebScraper Platform — Professional web scraping with visual configuration, exports, and scheduling.">
7
+ <title>{% block title %}WebScraper Platform{% endblock %}</title>
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&display=swap" rel="stylesheet">
11
+ <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
12
+ {% block extra_head %}{% endblock %}
13
+ </head>
14
+ <body>
15
+ <!-- Navigation -->
16
+ <nav class="navbar" id="main-nav">
17
+ <div class="nav-container">
18
+ <a href="{{ url_for('main.index') }}" class="nav-brand">
19
+ <span class="brand-icon">🕷️</span>
20
+ <span class="brand-text">WebScraper<span class="brand-accent">.pro</span></span>
21
+ </a>
22
+ <div class="nav-links" id="nav-links">
23
+ <a href="{{ url_for('main.index') }}" class="nav-link {% if request.endpoint == 'main.index' %}active{% endif %}">
24
+ <svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"/><polyline points="9 22 9 12 15 12 15 22"/></svg>
25
+ Dashboard
26
+ </a>
27
+ <a href="{{ url_for('jobs.list_view') }}" class="nav-link {% if request.endpoint and request.endpoint.startswith('jobs.') and request.endpoint != 'jobs.new_job' %}active{% endif %}">
28
+ <svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="2" y="7" width="20" height="14" rx="2" ry="2"/><path d="M16 21V5a2 2 0 0 0-2-2h-4a2 2 0 0 0-2 2v16"/></svg>
29
+ Jobs
30
+ </a>
31
+ <a href="{{ url_for('jobs.new_job') }}" class="nav-link btn-nav-primary {% if request.endpoint == 'jobs.new_job' %}active{% endif %}">
32
+ <svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="16"/><line x1="8" y1="12" x2="16" y2="12"/></svg>
33
+ New Scrape
34
+ </a>
35
+ </div>
36
+ <button class="nav-toggle" id="nav-toggle" aria-label="Toggle navigation">
37
+ <span></span><span></span><span></span>
38
+ </button>
39
+ </div>
40
+ </nav>
41
+
42
+ <!-- Flash Messages -->
43
+ {% with messages = get_flashed_messages(with_categories=true) %}
44
+ {% if messages %}
45
+ <div class="toast-container" id="toast-container">
46
+ {% for category, message in messages %}
47
+ <div class="toast toast-{{ category }}" data-auto-dismiss="5000">
48
+ <div class="toast-icon">
49
+ {% if category == 'success' %}✓{% elif category == 'danger' or category == 'error' %}✕{% elif category == 'warning' %}⚠{% else %}ℹ{% endif %}
50
+ </div>
51
+ <div class="toast-body">{{ message }}</div>
52
+ <button type="button" class="toast-close" aria-label="Dismiss notification" onclick="this.parentElement.remove()">×</button>
53
+ </div>
54
+ {% endfor %}
55
+ </div>
56
+ {% endif %}
57
+ {% endwith %}
58
+
59
+ <!-- Main Content -->
60
+ <main class="main-content">
61
+ <div class="container">
62
+ {% block content %}{% endblock %}
63
+ </div>
64
+ </main>
65
+
66
+ <!-- Footer -->
67
+ <footer class="footer">
68
+ <div class="container footer-inner">
69
+ <p class="footer-text">© 2026 WebScraper<span class="brand-accent">.pro</span> — Built with Flask &amp; BeautifulSoup</p>
70
+ <div class="footer-links">
71
+ <a href="{{ url_for('main.health') }}" class="footer-link">Health</a>
72
+ <a href="{{ url_for('main.metrics') }}" class="footer-link">Metrics</a>
73
+ </div>
74
+ </div>
75
+ </footer>
76
+
77
+ <script src="{{ url_for('static', filename='js/app.js') }}"></script>
78
+ {% block extra_scripts %}{% endblock %}
79
+ </body>
80
+ </html>
app/templates/pages/dashboard.html ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "base.html" %}
2
+ {% block title %}Dashboard — WebScraper.pro{% endblock %}
3
+
4
+ {% block content %}
5
+ <div class="page-header">
6
+ <div class="page-header-content">
7
+ <h1 class="page-title">Dashboard</h1>
8
+ <p class="page-subtitle">Monitor your scraping operations at a glance</p>
9
+ </div>
10
+ <a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary btn-glow">
11
+ <svg class="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="16"/><line x1="8" y1="12" x2="16" y2="12"/></svg>
12
+ New Scrape Job
13
+ </a>
14
+ </div>
15
+
16
+ <!-- Stats Grid -->
17
+ <div class="stats-grid" id="stats-grid">
18
+ <div class="stat-card stat-card-primary">
19
+ <div class="stat-icon">📊</div>
20
+ <div class="stat-info">
21
+ <span class="stat-value">{{ stats.total_jobs }}</span>
22
+ <span class="stat-label">Total Jobs</span>
23
+ </div>
24
+ </div>
25
+ <div class="stat-card stat-card-success">
26
+ <div class="stat-icon">✅</div>
27
+ <div class="stat-info">
28
+ <span class="stat-value">{{ stats.completed_jobs }}</span>
29
+ <span class="stat-label">Completed</span>
30
+ </div>
31
+ </div>
32
+ <div class="stat-card stat-card-warning">
33
+ <div class="stat-icon">⏳</div>
34
+ <div class="stat-info">
35
+ <span class="stat-value">{{ stats.running_jobs + stats.pending_jobs }}</span>
36
+ <span class="stat-label">In Progress</span>
37
+ </div>
38
+ </div>
39
+ <div class="stat-card stat-card-danger">
40
+ <div class="stat-icon">❌</div>
41
+ <div class="stat-info">
42
+ <span class="stat-value">{{ stats.failed_jobs }}</span>
43
+ <span class="stat-label">Failed</span>
44
+ </div>
45
+ </div>
46
+ <div class="stat-card stat-card-info">
47
+ <div class="stat-icon">📦</div>
48
+ <div class="stat-info">
49
+ <span class="stat-value">{{ stats.total_items_scraped }}</span>
50
+ <span class="stat-label">Items Scraped</span>
51
+ </div>
52
+ </div>
53
+ <div class="stat-card stat-card-accent">
54
+ <div class="stat-icon">🎯</div>
55
+ <div class="stat-info">
56
+ <span class="stat-value">{{ stats.success_rate }}%</span>
57
+ <span class="stat-label">Success Rate</span>
58
+ </div>
59
+ </div>
60
+ </div>
61
+
62
+ <!-- Recent Jobs -->
63
+ <div class="card" id="recent-jobs-card">
64
+ <div class="card-header">
65
+ <h2 class="card-title">Recent Jobs</h2>
66
+ <a href="{{ url_for('jobs.list_view') }}" class="btn btn-outline btn-sm">View All →</a>
67
+ </div>
68
+ <div class="card-body">
69
+ {% if stats.recent_jobs %}
70
+ <div class="table-responsive">
71
+ <table class="table" id="recent-jobs-table">
72
+ <thead>
73
+ <tr>
74
+ <th>ID</th>
75
+ <th>Name</th>
76
+ <th>URL</th>
77
+ <th>Status</th>
78
+ <th>Items</th>
79
+ <th>Duration</th>
80
+ <th>Created</th>
81
+ </tr>
82
+ </thead>
83
+ <tbody>
84
+ {% for job in stats.recent_jobs %}
85
+ <tr class="table-row-hover" onclick="window.location='{{ url_for('jobs.detail', job_id=job.id) }}'">
86
+ <td><span class="badge badge-muted">#{{ job.id }}</span></td>
87
+ <td class="text-truncate" style="max-width: 200px;">{{ job.name }}</td>
88
+ <td class="text-truncate text-muted" style="max-width: 250px;">{{ job.url }}</td>
89
+ <td>
90
+ <span class="status-badge status-{{ job.status }}">
91
+ <span class="status-dot"></span>
92
+ {{ job.status | capitalize }}
93
+ </span>
94
+ </td>
95
+ <td>{{ job.total_items }}</td>
96
+ <td>{% if job.duration_seconds %}{{ "%.1f"|format(job.duration_seconds) }}s{% else %}—{% endif %}</td>
97
+ <td class="text-muted">{{ job.created_at[:16] if job.created_at else '—' }}</td>
98
+ </tr>
99
+ {% endfor %}
100
+ </tbody>
101
+ </table>
102
+ </div>
103
+ {% else %}
104
+ <div class="empty-state">
105
+ <div class="empty-icon">🕸️</div>
106
+ <h3>No scrape jobs yet</h3>
107
+ <p>Create your first scraping job to get started.</p>
108
+ <a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary">Create Your First Job</a>
109
+ </div>
110
+ {% endif %}
111
+ </div>
112
+ </div>
113
+ {% endblock %}
app/templates/pages/error.html ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "base.html" %}
2
+ {% block title %}{{ code }} — WebScraper.pro{% endblock %}
3
+
4
+ {% block content %}
5
+ <div class="error-page">
6
+ <div class="error-content">
7
+ <div class="error-code">{{ code }}</div>
8
+ <h1 class="error-title">{{ message }}</h1>
9
+ <p class="error-description">
10
+ {% if code == 404 %}
11
+ The page you're looking for doesn't exist or has been moved.
12
+ {% elif code == 429 %}
13
+ You've made too many requests. Please wait a moment and try again.
14
+ {% elif code == 500 %}
15
+ Something went wrong on our end. Our team has been notified.
16
+ {% else %}
17
+ An unexpected error occurred. Please try again.
18
+ {% endif %}
19
+ </p>
20
+ <div class="error-actions">
21
+ <a href="{{ url_for('main.index') }}" class="btn btn-primary">← Back to Dashboard</a>
22
+ <a href="{{ url_for('jobs.list_view') }}" class="btn btn-secondary">View Jobs</a>
23
+ </div>
24
+ </div>
25
+ <div class="error-visual">
26
+ <div class="error-spider">🕷️</div>
27
+ <div class="error-web"></div>
28
+ </div>
29
+ </div>
30
+ {% endblock %}
app/templates/pages/job_detail.html ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "base.html" %}
2
+ {% block title %}Job #{{ job.id }} — WebScraper.pro{% endblock %}
3
+
4
+ {% block content %}
5
+ <div class="page-header">
6
+ <div class="page-header-content">
7
+ <h1 class="page-title">{{ job.name }}</h1>
8
+ <p class="page-subtitle">Job #{{ job.id }} — {{ job.url }}</p>
9
+ </div>
10
+ <div class="page-actions">
11
+ {% if job.status in ('pending', 'running') %}
12
+ <form method="post" action="{{ url_for('jobs.cancel', job_id=job.id) }}" class="inline-form">
13
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
14
+ <button type="submit" class="btn btn-warning" id="btn-cancel">⏸️ Cancel</button>
15
+ </form>
16
+ {% endif %}
17
+ {% if job.status in ('completed', 'failed', 'cancelled') %}
18
+ <form method="post" action="{{ url_for('jobs.rerun', job_id=job.id) }}" class="inline-form">
19
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
20
+ <button type="submit" class="btn btn-secondary" id="btn-rerun">🔄 Re-run</button>
21
+ </form>
22
+ {% endif %}
23
+ <form method="post" action="{{ url_for('jobs.delete', job_id=job.id) }}" class="inline-form" onsubmit="return confirm('Permanently delete this job and all results?')">
24
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
25
+ <button type="submit" class="btn btn-danger" id="btn-delete">🗑️ Delete</button>
26
+ </form>
27
+ </div>
28
+ </div>
29
+
30
+ <!-- Status Bar -->
31
+ <div class="status-bar status-bar-{{ job.status }}" id="job-status-bar" data-job-id="{{ job.id }}" data-status="{{ job.status }}">
32
+ <div class="status-bar-inner">
33
+ <span class="status-badge status-{{ job.status }} status-lg">
34
+ <span class="status-dot"></span>
35
+ {{ job.status | upper }}
36
+ </span>
37
+ <div class="status-meta">
38
+ <span>📦 <strong>{{ job.total_items }}</strong> items</span>
39
+ <span>📄 <strong>{{ job.pages_scraped }}</strong> pages</span>
40
+ {% if job.error_count %}<span>❌ <strong>{{ job.error_count }}</strong> errors</span>{% endif %}
41
+ {% if job.duration_seconds %}<span>⏱️ <strong>{{ "%.2f"|format(job.duration_seconds) }}</strong>s</span>{% endif %}
42
+ </div>
43
+ </div>
44
+ {% if job.status in ('pending', 'running') %}
45
+ <div class="progress-bar">
46
+ <div class="progress-bar-fill progress-animate"></div>
47
+ </div>
48
+ {% endif %}
49
+ </div>
50
+
51
+ <!-- Job Info -->
52
+ <div class="detail-grid">
53
+ <div class="card" id="job-config-card">
54
+ <div class="card-header">
55
+ <h2 class="card-title">⚙️ Configuration</h2>
56
+ </div>
57
+ <div class="card-body">
58
+ <div class="detail-list">
59
+ <div class="detail-item">
60
+ <span class="detail-key">URL</span>
61
+ <a href="{{ job.url }}" target="_blank" rel="noopener" class="detail-value link">{{ job.url }}</a>
62
+ </div>
63
+ <div class="detail-item">
64
+ <span class="detail-key">Scrape Type</span>
65
+ <span class="detail-value"><span class="badge badge-type">{{ job.scrape_type }}</span></span>
66
+ </div>
67
+ <div class="detail-item">
68
+ <span class="detail-key">Extraction</span>
69
+ <span class="detail-value">{{ job.extraction_type }}</span>
70
+ </div>
71
+ {% if job.css_selector %}
72
+ <div class="detail-item">
73
+ <span class="detail-key">CSS Selector</span>
74
+ <code class="detail-value code">{{ job.css_selector }}</code>
75
+ </div>
76
+ {% endif %}
77
+ {% if job.xpath_selector %}
78
+ <div class="detail-item">
79
+ <span class="detail-key">XPath</span>
80
+ <code class="detail-value code">{{ job.xpath_selector }}</code>
81
+ </div>
82
+ {% endif %}
83
+ {% if job.html_tag %}
84
+ <div class="detail-item">
85
+ <span class="detail-key">HTML Tag</span>
86
+ <code class="detail-value code">{{ job.html_tag }}</code>
87
+ </div>
88
+ {% endif %}
89
+ <div class="detail-item">
90
+ <span class="detail-key">Pagination</span>
91
+ <span class="detail-value">{{ 'Yes' if job.follow_pagination else 'No' }} (max {{ job.max_pages }} pages)</span>
92
+ </div>
93
+ <div class="detail-item">
94
+ <span class="detail-key">Delay</span>
95
+ <span class="detail-value">{{ job.delay_seconds }}s</span>
96
+ </div>
97
+ <div class="detail-item">
98
+ <span class="detail-key">Created</span>
99
+ <span class="detail-value">{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') if job.created_at else '—' }}</span>
100
+ </div>
101
+ </div>
102
+ </div>
103
+ </div>
104
+
105
+ <!-- Export -->
106
+ <div class="card" id="export-card">
107
+ <div class="card-header">
108
+ <h2 class="card-title">📤 Export Results</h2>
109
+ </div>
110
+ <div class="card-body">
111
+ {% if job.total_items > 0 %}
112
+ <div class="export-buttons">
113
+ <a href="{{ url_for('jobs.export', job_id=job.id, fmt='json') }}" class="btn btn-export" id="export-json">
114
+ <span class="export-icon">{ }</span>
115
+ <span>JSON</span>
116
+ </a>
117
+ <a href="{{ url_for('jobs.export', job_id=job.id, fmt='csv') }}" class="btn btn-export" id="export-csv">
118
+ <span class="export-icon">📊</span>
119
+ <span>CSV</span>
120
+ </a>
121
+ <a href="{{ url_for('jobs.export', job_id=job.id, fmt='excel') }}" class="btn btn-export" id="export-excel">
122
+ <span class="export-icon">📗</span>
123
+ <span>Excel</span>
124
+ </a>
125
+ </div>
126
+ {% if exports %}
127
+ <div class="export-history">
128
+ <h4>Previous Exports</h4>
129
+ {% for exp in exports %}
130
+ <div class="export-item">
131
+ <span class="badge badge-type">{{ exp.format }}</span>
132
+ <span>{{ exp.filename }}</span>
133
+ <span class="text-muted">{{ exp.row_count }} rows</span>
134
+ </div>
135
+ {% endfor %}
136
+ </div>
137
+ {% endif %}
138
+ {% else %}
139
+ <div class="empty-state-sm">
140
+ <p class="text-muted">No results to export yet.</p>
141
+ </div>
142
+ {% endif %}
143
+ </div>
144
+ </div>
145
+ </div>
146
+
147
+ <!-- Results -->
148
+ <div class="card" id="results-card">
149
+ <div class="card-header">
150
+ <h2 class="card-title">📦 Scraped Results <span class="count-badge">{{ results.total }}</span></h2>
151
+ </div>
152
+ <div class="card-body">
153
+ {% if results.items %}
154
+ <div class="table-responsive">
155
+ <table class="table table-compact" id="results-table">
156
+ <thead>
157
+ <tr>
158
+ <th>#</th>
159
+ <th>Page</th>
160
+ <th>Type</th>
161
+ <th>Content</th>
162
+ <th>URL</th>
163
+ </tr>
164
+ </thead>
165
+ <tbody>
166
+ {% for r in results.items %}
167
+ <tr>
168
+ <td>{{ r.item_index }}</td>
169
+ <td>{{ r.page_num }}</td>
170
+ <td><span class="badge badge-muted">{{ r.content_type }}</span></td>
171
+ <td class="content-cell">
172
+ <div class="content-preview">{{ r.content[:300] if r.content else '—' }}{% if r.content and r.content|length > 300 %}…{% endif %}</div>
173
+ </td>
174
+ <td class="text-truncate text-muted" style="max-width:180px;">{{ r.page_url }}</td>
175
+ </tr>
176
+ {% endfor %}
177
+ </tbody>
178
+ </table>
179
+ </div>
180
+ {% if results.pages > 1 %}
181
+ <div class="pagination">
182
+ {% if results.has_prev %}
183
+ <a href="{{ url_for('jobs.detail', job_id=job.id, rpage=results.prev_num) }}" class="pagination-link">← Prev</a>
184
+ {% endif %}
185
+ {% for p in results.iter_pages(left_edge=1, right_edge=1, left_current=2, right_current=2) %}
186
+ {% if p %}
187
+ <a href="{{ url_for('jobs.detail', job_id=job.id, rpage=p) }}" class="pagination-link {% if p == results.page %}active{% endif %}">{{ p }}</a>
188
+ {% else %}
189
+ <span class="pagination-ellipsis">…</span>
190
+ {% endif %}
191
+ {% endfor %}
192
+ {% if results.has_next %}
193
+ <a href="{{ url_for('jobs.detail', job_id=job.id, rpage=results.next_num) }}" class="pagination-link">Next →</a>
194
+ {% endif %}
195
+ </div>
196
+ {% endif %}
197
+ {% else %}
198
+ <div class="empty-state-sm">
199
+ <p class="text-muted">No results scraped yet.</p>
200
+ </div>
201
+ {% endif %}
202
+ </div>
203
+ </div>
204
+
205
+ <!-- Logs -->
206
+ <div class="card" id="logs-card">
207
+ <div class="card-header collapsible" onclick="toggleSection('logs-body')">
208
+ <h2 class="card-title">📋 Job Logs <span class="count-badge">{{ logs.total }}</span></h2>
209
+ <svg class="collapse-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="6 9 12 15 18 9"/></svg>
210
+ </div>
211
+ <div class="card-body" id="logs-body">
212
+ {% if logs.items %}
213
+ <div class="logs-container">
214
+ {% for log in logs.items %}
215
+ <div class="log-entry log-{{ log.level | lower }}">
216
+ <span class="log-time">{{ log.created_at.strftime('%H:%M:%S') if log.created_at else '' }}</span>
217
+ <span class="log-level">{{ log.level }}</span>
218
+ <span class="log-message">{{ log.message }}</span>
219
+ </div>
220
+ {% endfor %}
221
+ </div>
222
+ {% else %}
223
+ <p class="text-muted">No logs recorded.</p>
224
+ {% endif %}
225
+ </div>
226
+ </div>
227
+ {% endblock %}
228
+
229
+ {% block extra_scripts %}
230
+ {% if job.status in ('pending', 'running') %}
231
+ <script>
232
+ // Poll for status updates
233
(function () {
    // Live-refresh the status bar while the job is still active. The
    // surrounding template only emits this script for 'pending'/'running'
    // jobs, so no extra "is the job active?" check is needed here.
    const statusBar = document.getElementById('job-status-bar');
    if (!statusBar) return;

    // The job id and the status at render time come from the data-job-id /
    // data-status attributes the template sets on #job-status-bar.
    const { jobId, status: initialStatus } = statusBar.dataset;
    const POLL_INTERVAL_MS = 2000;

    // Coerce API values to integers before interpolating them into markup.
    const count = (value) => Number.parseInt(value, 10) || 0;

    const timer = setInterval(async () => {
        try {
            const res = await fetch(`/jobs/api/jobs/${encodeURIComponent(jobId)}/status`);
            // Non-2xx response: skip this tick and retry on the next one.
            if (!res.ok) return;
            const data = await res.json();

            if (data.status !== initialStatus) {
                // The status changed: stop polling and let the server
                // re-render the whole page (actions, results, logs).
                clearInterval(timer);
                window.location.reload();
                return;
            }

            // Update live counts
            const meta = statusBar.querySelector('.status-meta');
            if (meta) {
                const errors = count(data.error_count);
                meta.innerHTML = `
                    <span>📦 <strong>${count(data.total_items)}</strong> items</span>
                    <span>📄 <strong>${count(data.pages_scraped)}</strong> pages</span>
                    ${errors ? `<span>❌ <strong>${errors}</strong> errors</span>` : ''}
                `;
            }
        } catch (e) {
            /* best-effort: network/JSON errors are ignored and retried next tick */
        }
    }, POLL_INTERVAL_MS);
})();
260
+ </script>
261
+ {% endif %}
262
+ {% endblock %}
app/templates/pages/jobs.html ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "base.html" %}
2
+ {% block title %}Jobs — WebScraper.pro{% endblock %}
3
+
4
+ {% block content %}
5
+ <div class="page-header">
6
+ <div class="page-header-content">
7
+ <h1 class="page-title">Scrape Jobs</h1>
8
+ <p class="page-subtitle">Manage and monitor all your scraping tasks</p>
9
+ </div>
10
+ <a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary btn-glow">
11
+ <svg class="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="10"/><line x1="12" y1="8" x2="12" y2="16"/><line x1="8" y1="12" x2="16" y2="12"/></svg>
12
+ New Scrape Job
13
+ </a>
14
+ </div>
15
+
16
+ <!-- Filters -->
17
+ <div class="card filter-bar" id="job-filters">
18
+ <form method="get" class="filter-form">
19
+ <div class="filter-group">
20
+ <label class="filter-label">Status</label>
21
+ <select name="status" class="form-select" id="filter-status">
22
+ <option value="">All Statuses</option>
23
+ <option value="pending" {% if status_filter == 'pending' %}selected{% endif %}>Pending</option>
24
+ <option value="running" {% if status_filter == 'running' %}selected{% endif %}>Running</option>
25
+ <option value="completed" {% if status_filter == 'completed' %}selected{% endif %}>Completed</option>
26
+ <option value="failed" {% if status_filter == 'failed' %}selected{% endif %}>Failed</option>
27
+ <option value="cancelled" {% if status_filter == 'cancelled' %}selected{% endif %}>Cancelled</option>
28
+ </select>
29
+ </div>
30
+ <div class="filter-group filter-search">
31
+ <label class="filter-label">Search</label>
32
+ <input type="text" name="search" class="form-input" placeholder="Search by name or URL..." value="{{ search or '' }}" id="filter-search">
33
+ </div>
34
+ <button type="submit" class="btn btn-secondary" id="filter-submit">Filter</button>
35
+ {% if status_filter or search %}
36
+ <a href="{{ url_for('jobs.list_view') }}" class="btn btn-ghost" id="filter-clear">Clear</a>
37
+ {% endif %}
38
+ </form>
39
+ </div>
40
+
41
+ <!-- Jobs Table -->
42
+ <div class="card" id="jobs-list-card">
43
+ <div class="card-body">
44
+ {% if pagination.items %}
45
+ <div class="table-responsive">
46
+ <table class="table" id="jobs-table">
47
+ <thead>
48
+ <tr>
49
+ <th>ID</th>
50
+ <th>Name</th>
51
+ <th>URL</th>
52
+ <th>Type</th>
53
+ <th>Status</th>
54
+ <th>Items</th>
55
+ <th>Duration</th>
56
+ <th>Created</th>
57
+ <th>Actions</th>
58
+ </tr>
59
+ </thead>
60
+ <tbody>
61
+ {% for job in pagination.items %}
62
+ <tr class="table-row-hover">
63
+ <td><span class="badge badge-muted">#{{ job.id }}</span></td>
64
+ <td>
65
+ <a href="{{ url_for('jobs.detail', job_id=job.id) }}" class="table-link">{{ job.name }}</a>
66
+ </td>
67
+ <td class="text-truncate text-muted" style="max-width: 220px;" title="{{ job.url }}">{{ job.url }}</td>
68
+ <td><span class="badge badge-type">{{ job.scrape_type }}</span></td>
69
+ <td>
70
+ <span class="status-badge status-{{ job.status }}">
71
+ <span class="status-dot"></span>
72
+ {{ job.status | capitalize }}
73
+ </span>
74
+ </td>
75
+ <td>{{ job.total_items }}</td>
76
+ <td>{% if job.duration_seconds %}{{ "%.1f"|format(job.duration_seconds) }}s{% else %}—{% endif %}</td>
77
+ <td class="text-muted">{{ job.created_at.strftime('%Y-%m-%d %H:%M') if job.created_at else '—' }}</td>
78
+ <td>
79
+ <div class="action-group">
80
+ <a href="{{ url_for('jobs.detail', job_id=job.id) }}" class="btn btn-icon-sm" title="View Details">👁️</a>
81
+ <form method="post" action="{{ url_for('jobs.delete', job_id=job.id) }}" class="inline-form" onsubmit="return confirm('Delete job #{{ job.id }}?')">
82
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
83
+ <button type="submit" class="btn btn-icon-sm btn-danger-ghost" title="Delete">🗑️</button>
84
+ </form>
85
+ </div>
86
+ </td>
87
+ </tr>
88
+ {% endfor %}
89
+ </tbody>
90
+ </table>
91
+ </div>
92
+
93
+ <!-- Pagination -->
94
+ {% if pagination.pages > 1 %}
95
+ <div class="pagination" id="jobs-pagination">
96
+ {% if pagination.has_prev %}
97
+ <a href="{{ url_for('jobs.list_view', page=pagination.prev_num, status=status_filter, search=search) }}" class="pagination-link">← Prev</a>
98
+ {% endif %}
99
+ {% for p in pagination.iter_pages(left_edge=1, right_edge=1, left_current=2, right_current=2) %}
100
+ {% if p %}
101
+ <a href="{{ url_for('jobs.list_view', page=p, status=status_filter, search=search) }}" class="pagination-link {% if p == pagination.page %}active{% endif %}">{{ p }}</a>
102
+ {% else %}
103
+ <span class="pagination-ellipsis">…</span>
104
+ {% endif %}
105
+ {% endfor %}
106
+ {% if pagination.has_next %}
107
+ <a href="{{ url_for('jobs.list_view', page=pagination.next_num, status=status_filter, search=search) }}" class="pagination-link">Next →</a>
108
+ {% endif %}
109
+ </div>
110
+ {% endif %}
111
+
112
+ {% else %}
113
+ <div class="empty-state">
114
+ <div class="empty-icon">🔍</div>
115
+ <h3>No jobs found</h3>
116
+ <p>{% if status_filter or search %}Try adjusting your filters.{% else %}Create your first scraping job to get started.{% endif %}</p>
117
+ <a href="{{ url_for('jobs.new_job') }}" class="btn btn-primary">Create New Job</a>
118
+ </div>
119
+ {% endif %}
120
+ </div>
121
+ </div>
122
+ {% endblock %}
app/templates/pages/new_job.html ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "base.html" %}
2
+ {% block title %}New Scrape Job — WebScraper.pro{% endblock %}
3
+
4
+ {% block content %}
5
+ <div class="page-header">
6
+ <div class="page-header-content">
7
+ <h1 class="page-title">Create Scrape Job</h1>
8
+ <p class="page-subtitle">Configure and launch a new web scraping task</p>
9
+ </div>
10
+ </div>
11
+
12
+ <form method="post" action="{{ url_for('jobs.new_job') }}" id="new-job-form" class="job-form">
13
+ <input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
14
+
15
+ <!-- Basic Config -->
16
+ <div class="card form-section" id="section-basic">
17
+ <div class="card-header">
18
+ <h2 class="card-title">🎯 Basic Configuration</h2>
19
+ </div>
20
+ <div class="card-body">
21
+ <div class="form-grid">
22
+ <div class="form-group form-full">
23
+ <label for="name" class="form-label">Job Name</label>
24
+ <input type="text" name="name" id="name" class="form-input {% if errors.get('name') %}form-error{% endif %}" placeholder="e.g. Product Prices - Amazon" value="{{ form_data.get('name', '') }}">
25
+ {% if errors.get('name') %}<span class="form-error-text">{{ errors.name }}</span>{% endif %}
26
+ </div>
27
+
28
+ <div class="form-group form-full">
29
+ <label for="url" class="form-label">Target URL <span class="required">*</span></label>
30
+ <input type="url" name="url" id="url" class="form-input {% if errors.get('url') %}form-error{% endif %}" placeholder="https://example.com/page" value="{{ form_data.get('url', '') }}" required>
31
+ {% if errors.get('url') %}<span class="form-error-text">{{ errors.url }}</span>{% endif %}
32
+ </div>
33
+
34
+ <div class="form-group">
35
+ <label for="scrape_type" class="form-label">Scrape Type</label>
36
+ <select name="scrape_type" id="scrape_type" class="form-select">
37
+ <option value="static" {% if form_data.get('scrape_type') == 'static' %}selected{% endif %}>Static (Requests + BS4)</option>
38
+ <option value="dynamic" {% if form_data.get('scrape_type') == 'dynamic' %}selected{% endif %}>Dynamic (Playwright)</option>
39
+ </select>
40
+ <span class="form-hint">Use Dynamic for JS-rendered pages</span>
41
+ </div>
42
+
43
+ <div class="form-group">
44
+ <label for="extraction_type" class="form-label">Extraction Type</label>
45
+ <select name="extraction_type" id="extraction_type" class="form-select {% if errors.get('extraction_type') %}form-error{% endif %}">
46
+ <option value="text" {% if form_data.get('extraction_type') == 'text' %}selected{% endif %}>Text Content</option>
47
+ <option value="links" {% if form_data.get('extraction_type') == 'links' %}selected{% endif %}>Links (URLs)</option>
48
+ <option value="images" {% if form_data.get('extraction_type') == 'images' %}selected{% endif %}>Image URLs</option>
49
+ <option value="attributes" {% if form_data.get('extraction_type') == 'attributes' %}selected{% endif %}>HTML Attributes</option>
50
+ <option value="table" {% if form_data.get('extraction_type') == 'table' %}selected{% endif %}>Table Data</option>
51
+ <option value="json_ld" {% if form_data.get('extraction_type') == 'json_ld' %}selected{% endif %}>JSON-LD Schema</option>
52
+ <option value="full_html" {% if form_data.get('extraction_type') == 'full_html' %}selected{% endif %}>Full HTML</option>
53
+ </select>
54
+ {% if errors.get('extraction_type') %}<span class="form-error-text">{{ errors.extraction_type }}</span>{% endif %}
55
+ </div>
56
+ </div>
57
+ </div>
58
+ </div>
59
+
60
+ <!-- Selectors -->
61
+ <div class="card form-section" id="section-selectors">
62
+ <div class="card-header">
63
+ <h2 class="card-title">🔎 Selectors</h2>
64
+ <span class="card-subtitle">Define what to extract (leave blank for all elements)</span>
65
+ </div>
66
+ <div class="card-body">
67
+ <div class="form-grid">
68
+ <div class="form-group">
69
+ <label for="css_selector" class="form-label">CSS Selector</label>
70
+ <input type="text" name="css_selector" id="css_selector" class="form-input {% if errors.get('css_selector') %}form-error{% endif %}" placeholder="e.g. .product-card h2" value="{{ form_data.get('css_selector', '') }}">
71
+ {% if errors.get('css_selector') %}<span class="form-error-text">{{ errors.css_selector }}</span>{% endif %}
72
+ </div>
73
+
74
+ <div class="form-group">
75
+ <label for="xpath_selector" class="form-label">XPath Selector</label>
76
+ <input type="text" name="xpath_selector" id="xpath_selector" class="form-input {% if errors.get('xpath_selector') %}form-error{% endif %}" placeholder="e.g. //div[@class='item']/h2" value="{{ form_data.get('xpath_selector', '') }}">
77
+ {% if errors.get('xpath_selector') %}<span class="form-error-text">{{ errors.xpath_selector }}</span>{% endif %}
78
+ </div>
79
+
80
+ <div class="form-group">
81
+ <label for="html_tag" class="form-label">HTML Tag</label>
82
+ <input type="text" name="html_tag" id="html_tag" class="form-input" placeholder="e.g. h2, p, div" value="{{ form_data.get('html_tag', '') }}">
83
+ </div>
84
+
85
+ <div class="form-group">
86
+ <label for="attribute_name" class="form-label">Attribute Name</label>
87
+ <input type="text" name="attribute_name" id="attribute_name" class="form-input" placeholder="e.g. href, src, data-id" value="{{ form_data.get('attribute_name', '') }}">
88
+ <span class="form-hint">Used when extraction type is "Attributes"</span>
89
+ </div>
90
+ </div>
91
+ </div>
92
+ </div>
93
+
94
+ <!-- Advanced Options -->
95
+ <div class="card form-section" id="section-advanced">
96
+ <div class="card-header collapsible" onclick="toggleSection('advanced-body')">
97
+ <h2 class="card-title">⚙️ Advanced Options</h2>
98
+ <svg class="collapse-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="6 9 12 15 18 9"/></svg>
99
+ </div>
100
+ <div class="card-body" id="advanced-body">
101
+ <div class="form-grid">
102
+ <div class="form-group">
103
+ <label for="delay_seconds" class="form-label">Delay Between Requests (s)</label>
104
+ <input type="number" name="delay_seconds" id="delay_seconds" class="form-input {% if errors.get('delay_seconds') %}form-error{% endif %}" value="{{ form_data.get('delay_seconds', '1.0') }}" step="0.1" min="0" max="60">
105
+ {% if errors.get('delay_seconds') %}<span class="form-error-text">{{ errors.delay_seconds }}</span>{% endif %}
106
+ </div>
107
+
108
+ <div class="form-group">
109
+ <label for="timeout_seconds" class="form-label">Timeout (s)</label>
110
+ <input type="number" name="timeout_seconds" id="timeout_seconds" class="form-input" value="{{ form_data.get('timeout_seconds', '30') }}" min="5" max="120">
111
+ </div>
112
+
113
+ <div class="form-group">
114
+ <label for="max_retries" class="form-label">Max Retries</label>
115
+ <input type="number" name="max_retries" id="max_retries" class="form-input" value="{{ form_data.get('max_retries', '3') }}" min="0" max="10">
116
+ </div>
117
+
118
+ <div class="form-group">
119
+ <label for="max_pages" class="form-label">Max Pages</label>
120
+ <input type="number" name="max_pages" id="max_pages" class="form-input {% if errors.get('max_pages') %}form-error{% endif %}" value="{{ form_data.get('max_pages', '1') }}" min="1" max="200">
121
+ {% if errors.get('max_pages') %}<span class="form-error-text">{{ errors.max_pages }}</span>{% endif %}
122
+ </div>
123
+
124
+ <div class="form-group">
125
+ <label for="scroll_count" class="form-label">Scroll Count</label>
126
+ <input type="number" name="scroll_count" id="scroll_count" class="form-input" value="{{ form_data.get('scroll_count', '3') }}" min="1" max="50">
127
+ <span class="form-hint">For infinite scroll pages</span>
128
+ </div>
129
+
130
+ <div class="form-group">
131
+ <label for="user_agent" class="form-label">Custom User Agent</label>
132
+ <input type="text" name="user_agent" id="user_agent" class="form-input" placeholder="Leave blank for random UA" value="{{ form_data.get('user_agent', '') }}">
133
+ </div>
134
+
135
+ <div class="form-group form-full">
136
+ <label for="custom_headers" class="form-label">Custom Headers (JSON)</label>
137
+ <textarea name="custom_headers" id="custom_headers" class="form-textarea" rows="3" placeholder='{"Accept": "text/html", "Cookie": "session=abc123"}'>{{ form_data.get('custom_headers', '') }}</textarea>
138
+ </div>
139
+ </div>
140
+
141
+ <!-- Checkboxes -->
142
+ <div class="form-checkboxes">
143
+ <label class="checkbox-label">
144
+ <input type="checkbox" name="follow_pagination" {% if form_data.get('follow_pagination') %}checked{% endif %}>
145
+ <span class="checkbox-custom"></span>
146
+ Follow Pagination
147
+ </label>
148
+ <label class="checkbox-label">
149
+ <input type="checkbox" name="infinite_scroll" {% if form_data.get('infinite_scroll') %}checked{% endif %}>
150
+ <span class="checkbox-custom"></span>
151
+ Infinite Scroll
152
+ </label>
153
+ <label class="checkbox-label">
154
+ <input type="checkbox" name="check_robots_txt" {% if form_data.get('check_robots_txt', True) %}checked{% endif %}>
155
+ <span class="checkbox-custom"></span>
156
+ Respect robots.txt
157
+ </label>
158
+ <label class="checkbox-label">
159
+ <input type="checkbox" name="deduplicate" {% if form_data.get('deduplicate', True) %}checked{% endif %}>
160
+ <span class="checkbox-custom"></span>
161
+ Deduplicate Results
162
+ </label>
163
+ <label class="checkbox-label">
164
+ <input type="checkbox" name="download_images" {% if form_data.get('download_images') %}checked{% endif %}>
165
+ <span class="checkbox-custom"></span>
166
+ Download Images
167
+ </label>
168
+ </div>
169
+ </div>
170
+ </div>
171
+
172
+ <!-- Submit -->
173
+ <div class="form-actions">
174
+ <a href="{{ url_for('jobs.list_view') }}" class="btn btn-ghost">Cancel</a>
175
+ <button type="submit" class="btn btn-primary btn-glow btn-lg" id="submit-job">
176
+ <svg class="btn-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polygon points="5 3 19 12 5 21 5 3"/></svg>
177
+ Launch Scrape Job
178
+ </button>
179
+ </div>
180
+ </form>
181
+ {% endblock %}
app/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .validators import validate_url, validate_job_data, sanitize_string
2
+ from .logging_config import configure_logging
3
+
4
+ __all__ = ["validate_url", "validate_job_data", "sanitize_string", "configure_logging"]
app/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (347 Bytes). View file
 
app/utils/__pycache__/logging_config.cpython-310.pyc ADDED
Binary file (1.51 kB). View file
 
app/utils/__pycache__/validators.cpython-310.pyc ADDED
Binary file (3.43 kB). View file
 
app/utils/logging_config.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Structured logging configuration.
3
+ Sets up file + console handlers with proper formatting.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import logging.handlers
9
+ import sys
10
+ from pathlib import Path
11
+
12
+
13
def configure_logging(log_level: str = "INFO", log_dir: str = "logs") -> None:
    """Configure the root logger with console and rotating file handlers.

    Three handlers are installed on the root logger, all sharing one format:

    * a console handler writing to ``sys.stdout`` at ``log_level``;
    * ``<log_dir>/app.log`` at ``log_level`` (10 MB per file, 5 backups);
    * ``<log_dir>/errors.log`` at ``ERROR`` and above (5 MB per file, 3 backups).

    Handlers already attached to the root logger are removed *and closed*, so
    calling this function more than once does not leak open log files.

    Args:
        log_level: Name of a standard logging level, case-insensitive
            (e.g. ``"debug"``). Anything that does not resolve to an integer
            level on the ``logging`` module falls back to ``INFO``.
        log_dir: Directory for the log files; created (with parents) if it
            does not exist.
    """
    log_path = Path(log_dir)
    log_path.mkdir(parents=True, exist_ok=True)

    # getattr() may find a non-level attribute (e.g. "BASIC_FORMAT" is a
    # string), which setLevel() would reject; only accept real integer levels.
    level = getattr(logging, str(log_level).upper(), None)
    if not isinstance(level, int) or isinstance(level, bool):
        level = logging.INFO

    fmt = logging.Formatter(
        "%(asctime)s [%(levelname)-8s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Root logger
    root = logging.getLogger()
    root.setLevel(level)
    # Detach and close the previous handlers. Merely clearing the list would
    # leave their file streams open for the lifetime of the process.
    for stale in list(root.handlers):
        root.removeHandler(stale)
        stale.close()

    # Console handler
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(fmt)
    console.setLevel(level)
    root.addHandler(console)

    # Rotating file handler
    file_handler = logging.handlers.RotatingFileHandler(
        log_path / "app.log",
        maxBytes=10 * 1024 * 1024,  # 10 MB
        backupCount=5,
        encoding="utf-8",
    )
    file_handler.setFormatter(fmt)
    file_handler.setLevel(level)
    root.addHandler(file_handler)

    # Error-only file
    error_handler = logging.handlers.RotatingFileHandler(
        log_path / "errors.log",
        maxBytes=5 * 1024 * 1024,  # 5 MB
        backupCount=3,
        encoding="utf-8",
    )
    error_handler.setFormatter(fmt)
    error_handler.setLevel(logging.ERROR)
    root.addHandler(error_handler)

    # Suppress noisy libs
    for noisy in ("urllib3", "requests", "playwright"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
app/utils/validators.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Input validation and sanitization utilities.
3
+ All user-supplied strings pass through here before entering business logic.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ from typing import Any, Optional
9
+ from urllib.parse import urlparse
10
+
11
+ import bleach
12
+
13
+
14
# Allow-lists handed to bleach by sanitize_string(). Both are empty because no
# field in this application accepts rich text.
ALLOWED_TAGS: list[str] = []
ALLOWED_ATTRS: dict = {}

# Patterns that match *unsafe* selector content (a match means "reject").
# CSS selectors may not contain angle brackets, quotes or "javascript:";
# XPath expressions may contain quotes but not angle brackets or "javascript:".
_CSS_UNSAFE_PATTERN = re.compile(r"[<>\"']|javascript:", flags=re.IGNORECASE)
_XPATH_UNSAFE_PATTERN = re.compile(r"[<>]|javascript:", flags=re.IGNORECASE)
21
+
22
+
23
def sanitize_string(value: Any, max_length: int = 500) -> str:
    """Return *value* as cleaned text, at most *max_length* characters long.

    Falsy input (``None``, ``""``, ``0`` ...) yields an empty string. Any
    other value is converted with ``str()``, run through ``bleach.clean`` with
    the module's empty tag/attribute allow-lists and ``strip=True``, cut to
    ``max_length`` characters and finally stripped of surrounding whitespace.
    """
    if value:
        without_markup = bleach.clean(
            str(value),
            tags=ALLOWED_TAGS,
            attributes=ALLOWED_ATTRS,
            strip=True,
        )
        # Truncate first, then trim: whitespace exposed by the cut is removed.
        truncated = without_markup[:max_length]
        return truncated.strip()
    return ""
29
+
30
+
31
def _is_internal_host(host: str) -> bool:
    """Return True when *host* is a localhost name or a non-public IP literal."""
    # Function-scope imports keep this block self-contained.
    import ipaddress
    import socket

    host = host.rstrip(".").lower()
    if host == "localhost" or host.endswith(".localhost"):
        return True

    candidate = host.split("%", 1)[0]  # drop an IPv6 zone id such as "%eth0"
    try:
        addr = ipaddress.ip_address(candidate)
    except ValueError:
        # Not a canonical literal. inet_aton() also understands the legacy
        # IPv4 spellings ("127.1", "2130706433") that HTTP clients may accept.
        try:
            addr = ipaddress.IPv4Address(socket.inet_aton(candidate))
        except (OSError, ValueError):
            return False  # an ordinary DNS name

    # Judge "::ffff:a.b.c.d" by the IPv4 address it carries.
    mapped = getattr(addr, "ipv4_mapped", None)
    if mapped is not None:
        addr = mapped

    return (
        addr.is_private
        or addr.is_loopback
        or addr.is_link_local
        or addr.is_unspecified
        or addr.is_reserved
        or addr.is_multicast
    )


def validate_url(url: str) -> tuple[bool, str]:
    """Validate a scrape target URL.

    The URL must be a string using the ``http`` or ``https`` scheme, must
    contain a host, and the host must not be a localhost name or an IP
    literal in a loopback, private, link-local, unspecified, reserved or
    multicast range (SSRF prevention).

    NOTE: only the literal host is inspected; no DNS lookup is performed, so
    a public name that resolves to an internal address is not detected here.

    Args:
        url: The user-supplied URL; surrounding whitespace is ignored.

    Returns:
        ``(True, "")`` when the URL is acceptable, otherwise
        ``(False, message)`` with a user-facing error message.
    """
    if not url:
        return False, "URL is required."
    if not isinstance(url, str):
        return False, "Invalid URL format."

    url = url.strip()
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed input such as an unterminated "[".
        return False, "Invalid URL format."

    if parsed.scheme not in ("http", "https"):
        return False, "URL must use http or https."
    if not parsed.netloc or not host:
        return False, "URL must include a valid domain."
    # Block localhost/private IPs (SSRF prevention)
    if _is_internal_host(host):
        return False, "Requests to internal addresses are not allowed."
    return True, ""
50
+
51
+
52
def validate_css_selector(selector: Optional[str]) -> tuple[bool, str]:
    """Check an optional CSS selector for unsafe content.

    An empty or missing selector is valid. Otherwise the selector is rejected
    when the module's CSS unsafe-character pattern matches anywhere in it.

    Returns:
        ``(True, "")`` when acceptable, else ``(False, error_message)``.
    """
    if selector and _CSS_UNSAFE_PATTERN.search(selector):
        return False, "CSS selector contains unsafe characters."
    return True, ""
58
+
59
+
60
def validate_xpath(xpath: Optional[str]) -> tuple[bool, str]:
    """Check an optional XPath expression for unsafe content.

    An empty or missing expression is valid. Otherwise the expression is
    rejected when the module's XPath unsafe-character pattern matches
    anywhere in it.

    Returns:
        ``(True, "")`` when acceptable, else ``(False, error_message)``.
    """
    is_unsafe = bool(xpath) and _XPATH_UNSAFE_PATTERN.search(xpath) is not None
    return (False, "XPath contains unsafe characters.") if is_unsafe else (True, "")
66
+
67
+
68
def validate_job_data(data: dict) -> tuple[bool, dict]:
    """Validate the submitted fields of a scrape job.

    Checks, in order: ``url``, ``css_selector``, ``xpath_selector``,
    ``extraction_type``, ``scrape_type``, ``max_pages`` (integer, 1-200) and
    ``delay_seconds`` (number, 0-60). Missing optional fields fall back to
    their defaults before being checked.

    Args:
        data: Mapping of field name to raw submitted value.

    Returns:
        ``(is_valid, errors)`` where ``errors`` maps each failing field name
        to a user-facing message and ``is_valid`` is True when it is empty.
    """
    problems: dict[str, str] = {}

    # Fields that have a dedicated validator returning (ok, message).
    delegated = (
        ("url", validate_url, data.get("url", "")),
        ("css_selector", validate_css_selector, data.get("css_selector")),
        ("xpath_selector", validate_xpath, data.get("xpath_selector")),
    )
    for field, checker, raw in delegated:
        ok, message = checker(raw)
        if not ok:
            problems[field] = message

    # Enumerated choices.
    valid_extractions = ("text", "images", "links", "attributes", "table", "json_ld", "full_html")
    if data.get("extraction_type", "text") not in valid_extractions:
        problems["extraction_type"] = f"Must be one of: {', '.join(valid_extractions)}"

    if data.get("scrape_type", "static") not in ("static", "dynamic"):
        problems["scrape_type"] = "Must be 'static' or 'dynamic'."

    # Numeric limits: a conversion failure and an out-of-range value are
    # reported with different messages.
    try:
        page_limit = int(data.get("max_pages", 1))
    except (TypeError, ValueError):
        problems["max_pages"] = "max_pages must be an integer."
    else:
        if not 1 <= page_limit <= 200:
            problems["max_pages"] = "max_pages must be between 1 and 200."

    try:
        pause = float(data.get("delay_seconds", 1.0))
    except (TypeError, ValueError):
        problems["delay_seconds"] = "delay_seconds must be a number."
    else:
        # Written as a negated chained comparison so NaN is rejected as well.
        if not 0 <= pause <= 60:
            problems["delay_seconds"] = "delay_seconds must be between 0 and 60."

    return not problems, problems
database/scraper.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78dfc516cfcade3e0ab261153191e2958ef3ac2ec9589c8f8ac1606a414dc293
3
+ size 114688
env.example ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Application
2
+ FLASK_ENV=development
3
+ SECRET_KEY=dev-secret-key-change-in-production-use-32-chars
4
+ DEBUG=True
5
+
6
+ # Database
7
+ DATABASE_URL=sqlite:///database/scraper.db
8
+
9
+ # Security
10
+ WTF_CSRF_ENABLED=True
11
+ SESSION_COOKIE_SECURE=False
12
+ SESSION_COOKIE_HTTPONLY=True
13
+ SESSION_COOKIE_SAMESITE=Lax
14
+
15
+ # Rate Limiting
16
+ RATELIMIT_DEFAULT=100 per hour
17
+ RATELIMIT_STORAGE_URL=memory://
18
+
19
+ # Scraping
20
+ MAX_CONCURRENT_JOBS=5
21
+ REQUEST_TIMEOUT=30
22
+ MAX_RETRIES=3
23
+ DEFAULT_DELAY=1.0
24
+
25
+ # Exports
26
+ EXPORT_DIR=exports
27
+
28
+ # Logging
29
+ LOG_LEVEL=DEBUG
30
+ LOG_DIR=logs
logs/app.log ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-17 14:44:10 [INFO ] app: WebScraper Platform started [development]
2
+ 2026-05-17 14:44:10 [INFO ] werkzeug: WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
3
+ * Running on http://127.0.0.1:5000
4
+ 2026-05-17 14:44:10 [INFO ] werkzeug: Press CTRL+C to quit
5
+ 2026-05-17 14:44:10 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
6
+ 2026-05-17 14:44:11 [INFO ] app: WebScraper Platform started [development]
7
+ 2026-05-17 14:44:11 [WARNING ] werkzeug: * Debugger is active!
8
+ 2026-05-17 14:44:11 [INFO ] werkzeug: * Debugger PIN: 590-942-950
9
+ 2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET / -> 200 (94.00ms)
10
+ 2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET / HTTP/1.1" 200 -
11
+ 2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 200 (0.00ms)
12
+ 2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET /static/css/style.css HTTP/1.1" 200 -
13
+ 2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 200 (0.00ms)
14
+ 2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET /static/js/app.js HTTP/1.1" 200 -
15
+ 2026-05-17 14:44:32 [DEBUG ] app.middleware.security: GET /favicon.ico -> 404 (0.00ms)
16
+ 2026-05-17 14:44:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:32] "GET /favicon.ico HTTP/1.1" 404 -
17
+ 2026-05-17 14:44:44 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (94.00ms)
18
+ 2026-05-17 14:44:44 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:44] "GET /jobs/new HTTP/1.1" 200 -
19
+ 2026-05-17 14:44:44 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
20
+ 2026-05-17 14:44:44 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:44] "GET /static/css/style.css HTTP/1.1" 304 -
21
+ 2026-05-17 14:44:44 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
22
+ 2026-05-17 14:44:44 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:44:44] "GET /static/js/app.js HTTP/1.1" 304 -
23
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (1).py', reloading
24
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (2).py', reloading
25
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (3).py', reloading
26
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (4).py', reloading
27
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (5).py', reloading
28
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (6).py', reloading
29
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__ (7).py', reloading
30
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\__init__.py', reloading
31
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\engine.py', reloading
32
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\models.py', reloading
33
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\job_service.py', reloading
34
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\export_service.py', reloading
35
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\security.py', reloading
36
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\main.py', reloading
37
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\jobs.py', reloading
38
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\validators.py', reloading
39
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\logging_config.py', reloading
40
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\settings.py', reloading
41
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\pygments\\plugin.py', reloading
42
+ 2026-05-17 14:45:18 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\dns\\rdtypes\\IN\\A.py', reloading
43
+ 2026-05-17 14:45:19 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
44
+ 2026-05-17 14:45:20 [INFO ] app: WebScraper Platform started [development]
45
+ 2026-05-17 14:45:20 [WARNING ] werkzeug: * Debugger is active!
46
+ 2026-05-17 14:45:20 [INFO ] werkzeug: * Debugger PIN: 590-942-950
47
+ 2026-05-17 14:46:31 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (62.00ms)
48
+ 2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "POST /jobs/new HTTP/1.1" 302 -
49
+ 2026-05-17 14:46:31 [DEBUG ] app.middleware.security: GET /jobs/1 -> 200 (78.00ms)
50
+ 2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "GET /jobs/1 HTTP/1.1" 200 -
51
+ 2026-05-17 14:46:31 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
52
+ 2026-05-17 14:46:31 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
53
+ 2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "GET /static/js/app.js HTTP/1.1" 304 -
54
+ 2026-05-17 14:46:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:31] "GET /static/css/style.css HTTP/1.1" 304 -
55
+ 2026-05-17 14:46:34 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/1/status -> 200 (0.00ms)
56
+ 2026-05-17 14:46:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:34] "GET /jobs/api/jobs/1/status HTTP/1.1" 200 -
57
+ 2026-05-17 14:46:36 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/1/status -> 200 (0.00ms)
58
+ 2026-05-17 14:46:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:36] "GET /jobs/api/jobs/1/status HTTP/1.1" 200 -
59
+ 2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/1/status -> 200 (0.00ms)
60
+ 2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "GET /jobs/api/jobs/1/status HTTP/1.1" 200 -
61
+ 2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /jobs/1 -> 200 (0.00ms)
62
+ 2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "GET /jobs/1 HTTP/1.1" 200 -
63
+ 2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
64
+ 2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "GET /static/css/style.css HTTP/1.1" 304 -
65
+ 2026-05-17 14:46:38 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
66
+ 2026-05-17 14:46:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:46:38] "GET /static/js/app.js HTTP/1.1" 304 -
67
+ 2026-05-17 14:47:34 [DEBUG ] app.middleware.security: GET / -> 200 (62.00ms)
68
+ 2026-05-17 14:47:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:34] "GET / HTTP/1.1" 200 -
69
+ 2026-05-17 14:47:34 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
70
+ 2026-05-17 14:47:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:34] "GET /static/css/style.css HTTP/1.1" 304 -
71
+ 2026-05-17 14:47:34 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
72
+ 2026-05-17 14:47:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:34] "GET /static/js/app.js HTTP/1.1" 304 -
73
+ 2026-05-17 14:47:55 [DEBUG ] app.middleware.security: GET /jobs/1 -> 200 (16.00ms)
74
+ 2026-05-17 14:47:55 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:55] "GET /jobs/1 HTTP/1.1" 200 -
75
+ 2026-05-17 14:47:55 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
76
+ 2026-05-17 14:47:55 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:55] "GET /static/css/style.css HTTP/1.1" 304 -
77
+ 2026-05-17 14:47:55 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
78
+ 2026-05-17 14:47:55 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:47:55] "GET /static/js/app.js HTTP/1.1" 304 -
79
+ 2026-05-17 14:48:14 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (31.00ms)
80
+ 2026-05-17 14:48:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:14] "GET /jobs/new HTTP/1.1" 200 -
81
+ 2026-05-17 14:48:14 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
82
+ 2026-05-17 14:48:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:14] "GET /static/css/style.css HTTP/1.1" 304 -
83
+ 2026-05-17 14:48:14 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
84
+ 2026-05-17 14:48:15 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:15] "GET /static/js/app.js HTTP/1.1" 304 -
85
+ 2026-05-17 14:48:38 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (31.00ms)
86
+ 2026-05-17 14:48:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:38] "POST /jobs/new HTTP/1.1" 302 -
87
+ 2026-05-17 14:48:38 [DEBUG ] app.middleware.security: GET /jobs/2 -> 200 (79.00ms)
88
+ 2026-05-17 14:48:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:38] "GET /jobs/2 HTTP/1.1" 200 -
89
+ 2026-05-17 14:48:39 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
90
+ 2026-05-17 14:48:39 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:39] "GET /static/css/style.css HTTP/1.1" 304 -
91
+ 2026-05-17 14:48:39 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
92
+ 2026-05-17 14:48:39 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:39] "GET /static/js/app.js HTTP/1.1" 304 -
93
+ 2026-05-17 14:48:41 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/2/status -> 200 (15.00ms)
94
+ 2026-05-17 14:48:41 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:41] "GET /jobs/api/jobs/2/status HTTP/1.1" 200 -
95
+ 2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/2/status -> 200 (16.00ms)
96
+ 2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "GET /jobs/api/jobs/2/status HTTP/1.1" 200 -
97
+ 2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /jobs/2 -> 200 (47.00ms)
98
+ 2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "GET /jobs/2 HTTP/1.1" 200 -
99
+ 2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
100
+ 2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "GET /static/css/style.css HTTP/1.1" 304 -
101
+ 2026-05-17 14:48:43 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
102
+ 2026-05-17 14:48:43 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:43] "GET /static/js/app.js HTTP/1.1" 304 -
103
+ 2026-05-17 14:48:53 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
104
+ 2026-05-17 14:48:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:53] "GET /jobs/new HTTP/1.1" 200 -
105
+ 2026-05-17 14:48:53 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
106
+ 2026-05-17 14:48:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:53] "GET /static/css/style.css HTTP/1.1" 304 -
107
+ 2026-05-17 14:48:53 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
108
+ 2026-05-17 14:48:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:48:53] "GET /static/js/app.js HTTP/1.1" 304 -
109
+ 2026-05-17 14:49:14 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (32.00ms)
110
+ 2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "POST /jobs/new HTTP/1.1" 302 -
111
+ 2026-05-17 14:49:14 [DEBUG ] app.middleware.security: GET /jobs/3 -> 200 (47.00ms)
112
+ 2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "GET /jobs/3 HTTP/1.1" 200 -
113
+ 2026-05-17 14:49:14 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
114
+ 2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "GET /static/css/style.css HTTP/1.1" 304 -
115
+ 2026-05-17 14:49:14 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
116
+ 2026-05-17 14:49:14 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:14] "GET /static/js/app.js HTTP/1.1" 304 -
117
+ 2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/3/status -> 200 (0.00ms)
118
+ 2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "GET /jobs/api/jobs/3/status HTTP/1.1" 200 -
119
+ 2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /jobs/3 -> 200 (47.00ms)
120
+ 2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "GET /jobs/3 HTTP/1.1" 200 -
121
+ 2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (15.00ms)
122
+ 2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "GET /static/css/style.css HTTP/1.1" 304 -
123
+ 2026-05-17 14:49:16 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
124
+ 2026-05-17 14:49:16 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:49:16] "GET /static/js/app.js HTTP/1.1" 304 -
125
+ 2026-05-17 14:50:47 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\app\\scrapers\\engine.py', reloading
126
+ 2026-05-17 14:50:47 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\Desktop\\webscraping\\app\\scrapers\\engine.py', reloading
127
+ 2026-05-17 14:50:47 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
128
+ 2026-05-17 14:50:48 [INFO ] app: WebScraper Platform started [development]
129
+ 2026-05-17 14:50:48 [WARNING ] werkzeug: * Debugger is active!
130
+ 2026-05-17 14:50:48 [INFO ] werkzeug: * Debugger PIN: 590-942-950
131
+ 2026-05-17 14:51:04 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\brotli.py', reloading
132
+ 2026-05-17 14:51:04 [INFO ] werkzeug: * Detected change in 'C:\\Users\\princ\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\brotli.py', reloading
133
+ 2026-05-17 14:51:05 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
134
+ 2026-05-17 14:51:06 [INFO ] app: WebScraper Platform started [development]
135
+ 2026-05-17 14:51:06 [WARNING ] werkzeug: * Debugger is active!
136
+ 2026-05-17 14:51:06 [INFO ] werkzeug: * Debugger PIN: 590-942-950
137
+ 2026-05-17 14:52:00 [INFO ] app: WebScraper Platform started [development]
138
+ 2026-05-17 14:52:00 [INFO ] werkzeug: WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
139
+ * Running on http://127.0.0.1:5000
140
+ 2026-05-17 14:52:00 [INFO ] werkzeug: Press CTRL+C to quit
141
+ 2026-05-17 14:52:00 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
142
+ 2026-05-17 14:52:01 [INFO ] app: WebScraper Platform started [development]
143
+ 2026-05-17 14:52:01 [WARNING ] werkzeug: * Debugger is active!
144
+ 2026-05-17 14:52:01 [INFO ] werkzeug: * Debugger PIN: 590-942-950
145
+ 2026-05-17 14:52:12 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (31.00ms)
146
+ 2026-05-17 14:52:12 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:52:12] "GET /jobs/new HTTP/1.1" 200 -
147
+ 2026-05-17 14:52:12 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
148
+ 2026-05-17 14:52:12 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:52:12] "GET /static/css/style.css HTTP/1.1" 304 -
149
+ 2026-05-17 14:52:12 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
150
+ 2026-05-17 14:52:12 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:52:12] "GET /static/js/app.js HTTP/1.1" 304 -
151
+ 2026-05-17 14:53:26 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (171.00ms)
152
+ 2026-05-17 14:53:26 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:26] "POST /jobs/new HTTP/1.1" 302 -
153
+ 2026-05-17 14:53:27 [DEBUG ] app.middleware.security: GET /jobs/4 -> 200 (250.00ms)
154
+ 2026-05-17 14:53:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:27] "GET /jobs/4 HTTP/1.1" 200 -
155
+ 2026-05-17 14:53:27 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (16.00ms)
156
+ 2026-05-17 14:53:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:27] "GET /static/css/style.css HTTP/1.1" 304 -
157
+ 2026-05-17 14:53:27 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
158
+ 2026-05-17 14:53:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:27] "GET /static/js/app.js HTTP/1.1" 304 -
159
+ 2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/4/status -> 200 (15.00ms)
160
+ 2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "GET /jobs/api/jobs/4/status HTTP/1.1" 200 -
161
+ 2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /jobs/4 -> 200 (15.00ms)
162
+ 2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "GET /jobs/4 HTTP/1.1" 200 -
163
+ 2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
164
+ 2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "GET /static/css/style.css HTTP/1.1" 304 -
165
+ 2026-05-17 14:53:29 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (16.00ms)
166
+ 2026-05-17 14:53:29 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:29] "GET /static/js/app.js HTTP/1.1" 304 -
167
+ 2026-05-17 14:53:31 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/4/status -> 200 (0.00ms)
168
+ 2026-05-17 14:53:31 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:31] "GET /jobs/api/jobs/4/status HTTP/1.1" 200 -
169
+ 2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/4/status -> 200 (15.00ms)
170
+ 2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "GET /jobs/api/jobs/4/status HTTP/1.1" 200 -
171
+ 2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /jobs/4 -> 200 (16.00ms)
172
+ 2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "GET /jobs/4 HTTP/1.1" 200 -
173
+ 2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (15.00ms)
174
+ 2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "GET /static/css/style.css HTTP/1.1" 304 -
175
+ 2026-05-17 14:53:33 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
176
+ 2026-05-17 14:53:33 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:53:33] "GET /static/js/app.js HTTP/1.1" 304 -
177
+ 2026-05-17 14:57:40 [INFO ] app: WebScraper Platform started [development]
178
+ 2026-05-17 14:57:40 [INFO ] werkzeug: WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
179
+ * Running on http://127.0.0.1:5000
180
+ 2026-05-17 14:57:40 [INFO ] werkzeug: Press CTRL+C to quit
181
+ 2026-05-17 14:57:40 [INFO ] werkzeug: * Restarting with watchdog (windowsapi)
182
+ 2026-05-17 14:57:41 [INFO ] app: WebScraper Platform started [development]
183
+ 2026-05-17 14:57:41 [WARNING ] werkzeug: * Debugger is active!
184
+ 2026-05-17 14:57:41 [INFO ] werkzeug: * Debugger PIN: 590-942-950
185
+ 2026-05-17 14:57:53 [DEBUG ] app.middleware.security: GET / -> 200 (63.00ms)
186
+ 2026-05-17 14:57:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:53] "GET / HTTP/1.1" 200 -
187
+ 2026-05-17 14:57:53 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 200 (0.00ms)
188
+ 2026-05-17 14:57:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:53] "GET /static/css/style.css HTTP/1.1" 200 -
189
+ 2026-05-17 14:57:53 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 200 (0.00ms)
190
+ 2026-05-17 14:57:53 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:53] "GET /static/js/app.js HTTP/1.1" 200 -
191
+ 2026-05-17 14:57:54 [DEBUG ] app.middleware.security: GET /favicon.ico -> 404 (15.00ms)
192
+ 2026-05-17 14:57:54 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:57:54] "GET /favicon.ico HTTP/1.1" 404 -
193
+ 2026-05-17 14:58:06 [DEBUG ] app.middleware.security: GET /jobs/ -> 200 (47.00ms)
194
+ 2026-05-17 14:58:06 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:06] "GET /jobs/ HTTP/1.1" 200 -
195
+ 2026-05-17 14:58:06 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
196
+ 2026-05-17 14:58:06 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:06] "GET /static/css/style.css HTTP/1.1" 304 -
197
+ 2026-05-17 14:58:06 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
198
+ 2026-05-17 14:58:06 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:06] "GET /static/js/app.js HTTP/1.1" 304 -
199
+ 2026-05-17 14:58:10 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (16.00ms)
200
+ 2026-05-17 14:58:10 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:10] "GET /jobs/new HTTP/1.1" 200 -
201
+ 2026-05-17 14:58:10 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
202
+ 2026-05-17 14:58:10 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:10] "GET /static/css/style.css HTTP/1.1" 304 -
203
+ 2026-05-17 14:58:10 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
204
+ 2026-05-17 14:58:10 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:10] "GET /static/js/app.js HTTP/1.1" 304 -
205
+ 2026-05-17 14:58:27 [DEBUG ] app.middleware.security: GET / -> 200 (15.00ms)
206
+ 2026-05-17 14:58:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:27] "GET / HTTP/1.1" 200 -
207
+ 2026-05-17 14:58:27 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
208
+ 2026-05-17 14:58:27 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
209
+ 2026-05-17 14:58:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:27] "GET /static/css/style.css HTTP/1.1" 304 -
210
+ 2026-05-17 14:58:27 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:27] "GET /static/js/app.js HTTP/1.1" 304 -
211
+ 2026-05-17 14:58:32 [DEBUG ] app.middleware.security: GET /jobs/ -> 200 (16.00ms)
212
+ 2026-05-17 14:58:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:32] "GET /jobs/ HTTP/1.1" 200 -
213
+ 2026-05-17 14:58:32 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
214
+ 2026-05-17 14:58:32 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
215
+ 2026-05-17 14:58:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:32] "GET /static/css/style.css HTTP/1.1" 304 -
216
+ 2026-05-17 14:58:32 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:32] "GET /static/js/app.js HTTP/1.1" 304 -
217
+ 2026-05-17 14:58:34 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
218
+ 2026-05-17 14:58:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:34] "GET /jobs/new HTTP/1.1" 200 -
219
+ 2026-05-17 14:58:34 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
220
+ 2026-05-17 14:58:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:34] "GET /static/css/style.css HTTP/1.1" 304 -
221
+ 2026-05-17 14:58:34 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
222
+ 2026-05-17 14:58:34 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:34] "GET /static/js/app.js HTTP/1.1" 304 -
223
+ 2026-05-17 14:58:36 [DEBUG ] app.middleware.security: GET / -> 200 (0.00ms)
224
+ 2026-05-17 14:58:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:36] "GET / HTTP/1.1" 200 -
225
+ 2026-05-17 14:58:36 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
226
+ 2026-05-17 14:58:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:36] "GET /static/css/style.css HTTP/1.1" 304 -
227
+ 2026-05-17 14:58:36 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
228
+ 2026-05-17 14:58:36 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:36] "GET /static/js/app.js HTTP/1.1" 304 -
229
+ 2026-05-17 14:58:38 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
230
+ 2026-05-17 14:58:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:38] "GET /jobs/new HTTP/1.1" 200 -
231
+ 2026-05-17 14:58:38 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
232
+ 2026-05-17 14:58:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:38] "GET /static/css/style.css HTTP/1.1" 304 -
233
+ 2026-05-17 14:58:38 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
234
+ 2026-05-17 14:58:38 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:38] "GET /static/js/app.js HTTP/1.1" 304 -
235
+ 2026-05-17 14:58:49 [DEBUG ] app.middleware.security: GET /jobs/new -> 200 (0.00ms)
236
+ 2026-05-17 14:58:49 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:49] "GET /jobs/new HTTP/1.1" 200 -
237
+ 2026-05-17 14:58:49 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
238
+ 2026-05-17 14:58:49 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
239
+ 2026-05-17 14:58:49 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:49] "GET /static/css/style.css HTTP/1.1" 304 -
240
+ 2026-05-17 14:58:49 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:58:49] "GET /static/js/app.js HTTP/1.1" 304 -
241
+ 2026-05-17 14:59:09 [DEBUG ] app.middleware.security: POST /jobs/new -> 302 (31.00ms)
242
+ 2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "POST /jobs/new HTTP/1.1" 302 -
243
+ 2026-05-17 14:59:09 [DEBUG ] app.middleware.security: GET /jobs/5 -> 200 (78.00ms)
244
+ 2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "GET /jobs/5 HTTP/1.1" 200 -
245
+ 2026-05-17 14:59:09 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
246
+ 2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "GET /static/css/style.css HTTP/1.1" 304 -
247
+ 2026-05-17 14:59:09 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
248
+ 2026-05-17 14:59:09 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:09] "GET /static/js/app.js HTTP/1.1" 304 -
249
+ 2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /jobs/api/jobs/5/status -> 200 (16.00ms)
250
+ 2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "GET /jobs/api/jobs/5/status HTTP/1.1" 200 -
251
+ 2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /jobs/5 -> 200 (15.00ms)
252
+ 2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "GET /jobs/5 HTTP/1.1" 200 -
253
+ 2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (16.00ms)
254
+ 2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "GET /static/css/style.css HTTP/1.1" 304 -
255
+ 2026-05-17 14:59:11 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
256
+ 2026-05-17 14:59:11 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:11] "GET /static/js/app.js HTTP/1.1" 304 -
257
+ 2026-05-17 14:59:42 [DEBUG ] app.middleware.security: GET / -> 200 (16.00ms)
258
+ 2026-05-17 14:59:42 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:42] "GET / HTTP/1.1" 200 -
259
+ 2026-05-17 14:59:42 [DEBUG ] app.middleware.security: GET /static/css/style.css -> 304 (0.00ms)
260
+ 2026-05-17 14:59:42 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:42] "GET /static/css/style.css HTTP/1.1" 304 -
261
+ 2026-05-17 14:59:42 [DEBUG ] app.middleware.security: GET /static/js/app.js -> 304 (0.00ms)
262
+ 2026-05-17 14:59:42 [INFO ] werkzeug: 127.0.0.1 - - [17/May/2026 14:59:42] "GET /static/js/app.js HTTP/1.1" 304 -
logs/errors.log ADDED
File without changes