File size: 1,884 Bytes
69a077e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
from typing import Dict, List, Optional

from dotenv import load_dotenv
from pydantic import BaseModel


# Populate the process environment from a .env file (if one is found) before
# the config classes below are defined: DatabaseConfig calls os.getenv() in
# its class body, so this call has to come first.
load_dotenv()

class DatabaseConfig(BaseModel):
    """Connection settings for the MongoDB and Neo4j backends.

    Every default is read from the process environment once, when this class
    body is executed at import time (after the module-level ``load_dotenv()``
    call), not each time an instance is created.

    ``os.getenv`` returns ``None`` for a variable that is not set, and pydantic
    does not validate field defaults unless asked to. Each field is therefore
    annotated ``Optional[str]`` so the annotation matches what an instance can
    actually hold. Callers should check for ``None`` before connecting.
    """

    # Value of the `mongo_uri` environment variable, or None when unset.
    mongo_uri: Optional[str] = os.getenv("mongo_uri")
    # Value of the `mongo_db` environment variable, or None when unset.
    mongo_db: Optional[str] = os.getenv("mongo_db")
    # Value of the `neo4j_uri` environment variable, or None when unset.
    neo4j_uri: Optional[str] = os.getenv("neo4j_uri")
    # Value of the `neo4j_user` environment variable, or None when unset.
    neo4j_user: Optional[str] = os.getenv("neo4j_user")
    # Value of the `neo4j_password` environment variable, or None when unset.
    neo4j_password: Optional[str] = os.getenv("neo4j_password")

class ScrapingConfig(BaseModel):
    """Default knobs for fetching pages.

    All values are plain defaults; nothing here is read from the environment.
    """

    # NOTE(review): presumably milliseconds -- confirm against the consumer.
    timeout: int = 30_000
    # Selector the scraper is expected to wait for before reading the page.
    wait_for_selector: str = "body"
    # Whether the browser is run without a visible window.
    headless: bool = True
    # User-Agent string presented to remote sites.
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    # Upper bound on retry attempts.
    max_retries: int = 3
    # NOTE(review): presumably seconds between requests -- confirm at call site.
    delay_between_requests: float = 1.0

class ExtractionConfig(BaseModel):
    """Default rules for pulling content out of a fetched page."""

    # CSS selectors treated as content-bearing.
    content_selectors: List[str] = [
        "article",
        "main",
        ".content",
        "#content",
        ".post",
        ".article-body",
        "p",
        "h1",
        "h2",
        "h3",
    ]
    # CSS selectors whose elements are to be ignored.
    ignore_selectors: List[str] = [
        "script",
        "style",
        "nav",
        "footer",
        "header",
        ".advertisement",
        ".ads",
        ".sidebar",
    ]
    # Minimum text length threshold (unit not shown here; presumably characters).
    min_text_length: int = 50
    # Feature toggles for image and link extraction.
    extract_images: bool = True
    extract_links: bool = True

class Settings:
    """Container bundling the database, scraping and extraction sections."""

    def __init__(self):
        """Create every configuration section from its declared defaults."""
        self.database = DatabaseConfig()
        self.scraping = ScrapingConfig()
        self.extraction = ExtractionConfig()

    def update_from_env(self):
        """Overwrite database fields with non-empty environment values.

        A variable that is unset, or set to an empty string, leaves the
        corresponding field exactly as it was.
        """
        # Each environment variable shares its name with the field it feeds.
        for field_name in (
            "mongo_uri",
            "mongo_db",
            "neo4j_uri",
            "neo4j_user",
            "neo4j_password",
        ):
            env_value = os.getenv(field_name)
            if env_value:
                setattr(self.database, field_name, env_value)

# Module-level settings object, built once when this module is imported.
# update_from_env() then overwrites database fields with any non-empty
# environment values, so it must run after construction.
settings = Settings()
settings.update_from_env()