Spaces:
Sleeping
Sleeping
import os
from typing import Dict, List, Optional

from dotenv import load_dotenv
from pydantic import BaseModel, Field
# Populate os.environ from a local .env file (if one is found) so that the
# os.getenv() lookups further down can see those values.
load_dotenv()
class DatabaseConfig(BaseModel):
    """Connection settings for MongoDB and Neo4j, sourced from the environment.

    Every field defaults to the environment variable of the same name. The
    lookup happens each time an instance is created (not once at import
    time), so changes made to ``os.environ`` after this module was imported
    are picked up by new instances. Values passed explicitly to the
    constructor still take precedence over the environment.

    A variable that is not set yields ``None``; the fields are therefore
    typed ``Optional[str]`` and callers must handle ``None`` before using a
    value to open a connection.
    """

    mongo_uri: Optional[str] = Field(
        default_factory=lambda: os.getenv("mongo_uri")
    )
    mongo_db: Optional[str] = Field(
        default_factory=lambda: os.getenv("mongo_db")
    )
    neo4j_uri: Optional[str] = Field(
        default_factory=lambda: os.getenv("neo4j_uri")
    )
    neo4j_user: Optional[str] = Field(
        default_factory=lambda: os.getenv("neo4j_user")
    )
    neo4j_password: Optional[str] = Field(
        default_factory=lambda: os.getenv("neo4j_password")
    )
class ScrapingConfig(BaseModel):
    """Default knobs for the page-fetching stage.

    All fields have defaults, so ``ScrapingConfig()`` is always valid; pass
    keyword arguments to override individual values.
    """

    # NOTE(review): the magnitude suggests milliseconds -- confirm against
    # the code that consumes this value.
    timeout: int = 30_000

    # Selector string; presumably waited for before a page counts as loaded.
    wait_for_selector: str = "body"

    headless: bool = True

    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    max_retries: int = 3

    # NOTE(review): unit not visible here (likely seconds) -- confirm.
    delay_between_requests: float = 1.0
class ExtractionConfig(BaseModel):
    """Default rules for pulling content out of a fetched page.

    All fields have defaults, so ``ExtractionConfig()`` is always valid;
    pass keyword arguments to override individual values.
    """

    # Selector strings naming the elements considered as content.
    content_selectors: List[str] = [
        "article",
        "main",
        ".content",
        "#content",
        ".post",
        ".article-body",
        "p",
        "h1",
        "h2",
        "h3",
    ]

    # Selector strings naming the elements to leave out.
    ignore_selectors: List[str] = [
        "script",
        "style",
        "nav",
        "footer",
        "header",
        ".advertisement",
        ".ads",
        ".sidebar",
    ]

    # NOTE(review): presumably a character count below which text is
    # discarded -- confirm against the extractor.
    min_text_length: int = 50

    extract_images: bool = True
    extract_links: bool = True
class Settings:
    """Bundle of the database, scraping and extraction configuration objects."""

    def __init__(self):
        """Create each configuration section with its default values."""
        self.database = DatabaseConfig()
        self.scraping = ScrapingConfig()
        self.extraction = ExtractionConfig()

    def update_from_env(self):
        """Copy non-empty environment variables onto ``self.database``.

        Each name below is both the environment variable that is read and
        the attribute on ``self.database`` that is written. Variables that
        are unset or empty leave the current attribute value untouched.
        """
        env_backed_fields = (
            "mongo_uri",
            "mongo_db",
            "neo4j_uri",
            "neo4j_user",
            "neo4j_password",
        )
        for field_name in env_backed_fields:
            env_value = os.getenv(field_name)
            if env_value:
                setattr(self.database, field_name, env_value)
# Shared module-level instance, created as a side effect of importing this
# module.
settings = Settings()
# Apply any non-empty database environment variables on top of the defaults.
settings.update_from_env()