diff --git a/.env b/.env new file mode 100644 index 0000000000000000000000000000000000000000..a81d72e27918cec5c7181d8cab0dbdc6b258807a --- /dev/null +++ b/.env @@ -0,0 +1,9 @@ +FLASK_APP=app.py +FLASK_ENV=development +SECRET_KEY=your-secret-key-here +DATABASE_URL=postgresql://user:password@localhost:5432/facebook_ads +CELERY_BROKER_URL=redis://localhost:6379/0 +CELERY_RESULT_BACKEND=redis://localhost:6379/0 +OPENAI_API_KEY=your-openai-api-key-here +INSTANCE_PATH=/tmp/instance +SELENIUM_HUB_URL=http://selenium-hub:4444/wd/hub \ No newline at end of file diff --git a/.env.example b/.env.example index 3cd957030b41bcd49639cc5e36ea979fd0b3141e..a81d72e27918cec5c7181d8cab0dbdc6b258807a 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,9 @@ FLASK_APP=app.py FLASK_ENV=development SECRET_KEY=your-secret-key-here -DATABASE_URL=postgresql://user:password@localhost:5432/dbname -REDIS_URL=redis://localhost:6379/0 +DATABASE_URL=postgresql://user:password@localhost:5432/facebook_ads +CELERY_BROKER_URL=redis://localhost:6379/0 +CELERY_RESULT_BACKEND=redis://localhost:6379/0 +OPENAI_API_KEY=your-openai-api-key-here +INSTANCE_PATH=/tmp/instance SELENIUM_HUB_URL=http://selenium-hub:4444/wd/hub \ No newline at end of file diff --git a/app.py b/app.py index e6accf4b6c527073f00f88f1101236837c1ebc46..2c1a68d1a22cd6124c67854b6af9834972853b09 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,16 @@ from flask import Flask +from flask_migrate import Migrate +from app import db, create_app as create_flask_app +from config import get_config + +migrate = Migrate() def create_app(): - app = Flask(__name__) - app.config['INSTANCE_PATH'] = '/tmp/instance' # Ensure this path exists + app = create_flask_app() + app.config.from_object(get_config()) + migrate.init_app(app, db) return app + +if __name__ == "__main__": + app = create_app() + app.run(debug=True) diff --git a/app/__init__.py b/app/__init__.py index 6d7c1a77d5e2878591513affe2c1f95b2339f57d..d32fa60849b77a5cd729715a127da39dc6fd71a4 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -3,35 +3,53 @@ from flask_sqlalchemy import SQLAlchemy from flask_login import LoginManager from celery import Celery import redis +import os +from pathlib import Path +# Initialize extensions db = SQLAlchemy() login = LoginManager() +login.login_view = 'auth.login' celery = Celery(__name__) -cache = redis.Redis() +cache = None # Initialize later when app context is available -def create_app(): - # Create the Flask app first +def create_app(config_class=None): + # Create the Flask app app = Flask(__name__) - + # Load configuration - app.config.from_object('config.Config') - - # Set the instance path after loading the config + if config_class is None: + app.config.from_object('config.Config') + else: + app.config.from_object(config_class) + + # Ensure instance path exists + Path(app.config['INSTANCE_PATH']).mkdir(parents=True, exist_ok=True) app.instance_path = app.config['INSTANCE_PATH'] - + # Initialize extensions db.init_app(app) login.init_app(app) + + # Configure Celery celery.conf.update(app.config) + # Initialize Redis cache + global cache + cache = redis.Redis.from_url(app.config['CELERY_BROKER_URL']) + # Register Blueprints - from .routes.auth import auth_bp - from .routes.dashboard import dashboard_bp - from .routes.api import api_bp - from .routes.compliance import compliance_bp - app.register_blueprint(auth_bp) - app.register_blueprint(dashboard_bp) - app.register_blueprint(api_bp) - app.register_blueprint(compliance_bp) + with app.app_context(): + from .routes.auth import auth_bp + from .routes.dashboard import dashboard_bp + from .routes.api import api_bp + from .routes.compliance import compliance_bp + from .routes.google_ads import google_ads_bp + + app.register_blueprint(auth_bp) + app.register_blueprint(dashboard_bp) + app.register_blueprint(api_bp) + app.register_blueprint(compliance_bp) + app.register_blueprint(google_ads_bp) return app \ No newline at end of file diff --git a/app/__pycache__/__init__.cpython-312.pyc b/app/__pycache__/__init__.cpython-312.pyc index 2a04688b4ab604e9666e313d9e85f0acde975299..1e9598f6c060b6ed3e64e486f626fdaa6672afcd 100644 Binary files a/app/__pycache__/__init__.cpython-312.pyc and b/app/__pycache__/__init__.cpython-312.pyc differ diff --git a/app/models/google_ad.py b/app/models/google_ad.py new file mode 100644 index 0000000000000000000000000000000000000000..c1bca3ea60660c9afc22a5b5b304f57a609515bf --- /dev/null +++ b/app/models/google_ad.py @@ -0,0 +1,51 @@ +from app import db +from datetime import datetime +import uuid + +class GoogleAd(db.Model): + """Model for storing Google Ads data.""" + id = db.Column(db.String(36), primary_key=True, default=lambda: str(uuid.uuid4())) + ad_type = db.Column(db.String(20), nullable=False) # 'search' or 'display' + title = db.Column(db.String(255), nullable=True) + description = db.Column(db.Text, nullable=True) + display_url = db.Column(db.String(255), nullable=True) + target_url = db.Column(db.String(512), nullable=True) + image_url = db.Column(db.String(512), nullable=True) + position = db.Column(db.Integer, nullable=True) + search_query = db.Column(db.String(255), nullable=True) + page_url = db.Column(db.String(512), nullable=True) + raw_data = db.Column(db.JSON, nullable=True) + sentiment = db.Column(db.JSON, nullable=True) + created_at = db.Column(db.DateTime, default=datetime.utcnow) + updated_at = db.Column(db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=True) + + def __repr__(self): + return f'' + + @classmethod + def from_search_ad_data(cls, ad_data, search_query, user_id=None): + """Create a GoogleAd instance from scraped search ad data.""" + return cls( + ad_type='search', + title=ad_data.get('title'), + description=ad_data.get('description'), + display_url=ad_data.get('display_url'), + target_url=ad_data.get('target_url'), + position=ad_data.get('position'), + search_query=search_query, + raw_data=ad_data, + user_id=user_id + ) + + @classmethod + def from_display_ad_data(cls, ad_data, user_id=None): + """Create a GoogleAd instance from scraped display ad data.""" + return cls( + ad_type='display', + image_url=ad_data.get('image_url'), + target_url=ad_data.get('target_url'), + page_url=ad_data.get('page_url'), + raw_data=ad_data, + user_id=user_id + ) \ No newline at end of file diff --git a/app/routes/compliance.py b/app/routes/compliance.py index a25b83ea019a49d2824a267203c9c1953f07b049..a53db5faf4db2a1a7ad2597e8be9ecf1d6e03a54 100644 --- a/app/routes/compliance.py +++ b/app/routes/compliance.py @@ -3,15 +3,11 @@ from flask_login import login_required from ..models import Ad from ..utils.decorators import admin_required from .. import db +import logging -compliance_bp = Blueprint('compliance', __name__) +logger = logging.getLogger(__name__) -@compliance_bp.route('/report') -@login_required -@admin_required -def compliance_report(): - ads = Ad.query.all() - return render_template('compliance_report.html', ads=ads) +compliance_bp = Blueprint('compliance', __name__) @compliance_bp.route('/anonymize/', methods=['POST']) @login_required @@ -20,8 +16,12 @@ def anonymize_ad(ad_id): try: ad = Ad.query.get_or_404(ad_id) ad.content = "REDACTED" + db.session.add(ad) db.session.commit() return jsonify({'status': 'success'}) except Exception as e: db.session.rollback() - return jsonify({'status': 'error', 'message': str(e)}), 500 \ No newline at end of file + logger.error(f"Error anonymizing ad {ad_id}: {str(e)}") + return jsonify({'status': 'error', 'message': str(e)}), 500 + finally: + db.session.close() \ No newline at end of file diff --git a/app/routes/google_ads.py b/app/routes/google_ads.py new file mode 100644 index 0000000000000000000000000000000000000000..4d43d13808b0d6ab0e65b7ae6b8e13a0fd00b4c8 --- /dev/null +++ b/app/routes/google_ads.py @@ -0,0 +1,188 @@ +from flask import Blueprint, render_template, request, jsonify, current_app +from flask_login import login_required, current_user +from app.services.google_scraper import GoogleAdsScraper +from app.models.google_ad import GoogleAd +from app.services.ai_processor import AIPipeline +from app import db, celery +import logging + +logger = logging.getLogger(__name__) +google_ads_bp = Blueprint('google_ads', __name__, url_prefix='/google-ads') + +@google_ads_bp.route('/', methods=['GET']) +@login_required +def index(): + """Google Ads dashboard page.""" + return render_template('google_ads/index.html') + +@google_ads_bp.route('/search', methods=['GET', 'POST']) +@login_required +def search_ads(): + """Search for Google Ads.""" + if request.method == 'POST': + search_query = request.form.get('query') + num_pages = int(request.form.get('num_pages', 3)) + + # Start async task for scraping + task = scrape_google_search_ads.delay(search_query, num_pages, current_user.id) + + return jsonify({ + 'status': 'success', + 'message': 'Google Ads scraping started', + 'task_id': task.id + }) + + # GET request - show search form + return render_template('google_ads/search.html') + +@google_ads_bp.route('/display', methods=['GET', 'POST']) +@login_required +def display_ads(): + """Scrape display ads from a URL.""" + if request.method == 'POST': + target_url = request.form.get('url') + scroll_count = int(request.form.get('scroll_count', 5)) + + # Start async task for scraping + task = scrape_google_display_ads.delay(target_url, scroll_count, current_user.id) + + return jsonify({ + 'status': 'success', + 'message': 'Google Display Ads scraping started', + 'task_id': task.id + }) + + # GET request - show form + return render_template('google_ads/display.html') + +@google_ads_bp.route('/results', methods=['GET']) +@login_required +def view_results(): + """View Google Ads results.""" + ad_type = request.args.get('type', 'all') + query = request.args.get('query', '') + + # Build query + ads_query = GoogleAd.query + + if ad_type != 'all': + ads_query = ads_query.filter(GoogleAd.ad_type == ad_type) + + if query: + ads_query = ads_query.filter( + (GoogleAd.title.ilike(f'%{query}%')) | + (GoogleAd.description.ilike(f'%{query}%')) | + (GoogleAd.search_query.ilike(f'%{query}%')) + ) + + # Get results + ads = ads_query.order_by(GoogleAd.created_at.desc()).all() + + return render_template('google_ads/results.html', ads=ads, ad_type=ad_type, query=query) + +@google_ads_bp.route('/api/ads', methods=['GET']) +@login_required +def api_get_ads(): + """API endpoint to get Google Ads data.""" + ad_type = request.args.get('type', 'all') + query = request.args.get('query', '') + limit = int(request.args.get('limit', 50)) + + # Build query + ads_query = GoogleAd.query + + if ad_type != 'all': + ads_query = ads_query.filter(GoogleAd.ad_type == ad_type) + + if query: + ads_query = ads_query.filter( + (GoogleAd.title.ilike(f'%{query}%')) | + (GoogleAd.description.ilike(f'%{query}%')) | + (GoogleAd.search_query.ilike(f'%{query}%')) + ) + + # Get results + ads = ads_query.order_by(GoogleAd.created_at.desc()).limit(limit).all() + + # Convert to JSON + result = [] + for ad in ads: + ad_data = { + 'id': ad.id, + 'ad_type': ad.ad_type, + 'title': ad.title, + 'description': ad.description, + 'display_url': ad.display_url, + 'target_url': ad.target_url, + 'image_url': ad.image_url, + 'position': ad.position, + 'search_query': ad.search_query, + 'page_url': ad.page_url, + 'sentiment': ad.sentiment, + 'created_at': ad.created_at.isoformat() if ad.created_at else None + } + result.append(ad_data) + + return jsonify(result) + +@celery.task +def scrape_google_search_ads(search_query, num_pages, user_id): + """Celery task to scrape Google search ads.""" + try: + scraper = GoogleAdsScraper() + ads_data = scraper.scrape_search_ads(search_query, num_pages) + + # Process and store ads + ai_pipeline = AIPipeline() + + for ad_data in ads_data: + # Create GoogleAd instance + ad = GoogleAd.from_search_ad_data(ad_data, search_query, user_id) + + # Process with AI if there's content + if ad.title or ad.description: + try: + # Create a simple object with content for AI processing + ad_content = type('obj', (object,), { + 'content': f"{ad.title} {ad.description}" + }) + + # Process with AI + ai_results = ai_pipeline.process_ad(ad_content) + ad.sentiment = ai_results.get('sentiment') + except Exception as e: + logger.error(f"Error processing ad with AI: {e}") + + # Save to database + db.session.add(ad) + + db.session.commit() + return {'status': 'success', 'count': len(ads_data)} + + except Exception as e: + logger.error(f"Error in Google search ads scraping task: {e}") + db.session.rollback() + return {'status': 'error', 'message': str(e)} + +@celery.task +def scrape_google_display_ads(target_url, scroll_count, user_id): + """Celery task to scrape Google display ads.""" + try: + scraper = GoogleAdsScraper() + ads_data = scraper.scrape_display_ads(target_url, scroll_count) + + # Process and store ads + for ad_data in ads_data: + # Create GoogleAd instance + ad = GoogleAd.from_display_ad_data(ad_data, user_id) + + # Save to database + db.session.add(ad) + + db.session.commit() + return {'status': 'success', 'count': len(ads_data)} + + except Exception as e: + logger.error(f"Error in Google display ads scraping task: {e}") + db.session.rollback() + return {'status': 'error', 'message': str(e)} \ No newline at end of file diff --git a/app/services/ai_processor.py b/app/services/ai_processor.py index ac643d2e5e6f10b5358f190f5d1f30f7f965b5ca..e3a6898a2d32c812f8973ac7d8f79bc22efb8199 100644 --- a/app/services/ai_processor.py +++ b/app/services/ai_processor.py @@ -6,6 +6,10 @@ import logging logger = logging.getLogger(__name__) +class ProcessingError(Exception): + """Exception raised when ad processing fails.""" + pass + class AIPipeline: def __init__(self): try: @@ -26,6 +30,9 @@ class AIPipeline: raise def process_ad(self, ad): + if not ad: + raise ValueError("Ad content cannot be empty") + try: results = { "sentiment": self._analyze_sentiment(ad.content), @@ -34,8 +41,8 @@ class AIPipeline: } return results except Exception as e: - logger.error(f"Error processing ad: {e}") - return {"error": str(e)} + logger.error(f"Error processing ad: {str(e)}") + raise ProcessingError(f"Failed to process ad: {str(e)}") def _analyze_sentiment(self, text): try: diff --git a/app/services/google_scraper.py b/app/services/google_scraper.py new file mode 100644 index 0000000000000000000000000000000000000000..6318810fcde8bd97d002493847705486b432ce41 --- /dev/null +++ b/app/services/google_scraper.py @@ -0,0 +1,172 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from webdriver_manager.chrome import ChromeDriverManager +import time +from selenium.common.exceptions import TimeoutException, WebDriverException +from contextlib import contextmanager +import logging +import json +import os + +logger = logging.getLogger(__name__) + +class GoogleAdsScraper: + def __init__(self, selenium_hub_url=None): + self.driver = None + self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL') + + def _setup_driver(self): + options = webdriver.ChromeOptions() + options.add_argument("--headless") + options.add_argument("--no-sandbox") + options.add_argument("--disable-dev-shm-usage") + + if self.selenium_hub_url: + logger.info(f"Using Selenium Hub at {self.selenium_hub_url}") + return webdriver.Remote( + command_executor=self.selenium_hub_url, + options=options + ) + else: + return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) + + @contextmanager + def _get_driver(self): + try: + self.driver = self._setup_driver() + yield self.driver + finally: + if self.driver: + self.driver.quit() + + def scrape_search_ads(self, search_query, num_pages=3): + """Scrape Google search ads for a given query.""" + with self._get_driver() as driver: + try: + url = f"https://www.google.com/search?q={search_query}" + driver.get(url) + driver.implicitly_wait(5) + + ads = [] + + # Process first page + ads.extend(self._extract_search_ads(driver)) + + # Navigate through additional pages if requested + for page in range(2, num_pages + 1): + try: + next_button = driver.find_element(By.ID, "pnnext") + next_button.click() + time.sleep(2) + ads.extend(self._extract_search_ads(driver)) + except Exception as e: + logger.warning(f"Could not navigate to page {page}: {e}") + break + + return ads + + except (TimeoutException, WebDriverException) as e: + logger.error(f"Error during Google Ads scraping: {e}") + return [] + + def _extract_search_ads(self, driver): + """Extract ad data from the current search results page.""" + ads = [] + try: + # Look for ad containers + ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd") + + for ad in ad_elements: + try: + ad_data = {} + + # Extract ad title + title_element = ad.find_element(By.CSS_SELECTOR, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb") + ad_data["title"] = title_element.text if title_element else "" + + # Extract ad description + desc_element = ad.find_element(By.CSS_SELECTOR, "div.MUxGbd.yDYNvb.lyLwlc") + ad_data["description"] = desc_element.text if desc_element else "" + + # Extract ad URL + url_element = ad.find_element(By.CSS_SELECTOR, "a.sVXRqc") + ad_data["display_url"] = url_element.text if url_element else "" + ad_data["target_url"] = url_element.get_attribute("href") if url_element else "" + + # Extract ad position + ad_data["position"] = len(ads) + 1 + + # Add timestamp + ad_data["scrape_time"] = time.strftime("%Y-%m-%d %H:%M:%S") + + ads.append(ad_data) + except Exception as e: + logger.warning(f"Error extracting ad data: {e}") + continue + + return ads + except Exception as e: + logger.error(f"Error extracting search ads: {e}") + return [] + + def scrape_display_ads(self, target_url, scroll_count=5): + """Scrape Google display ads from a specific page.""" + with self._get_driver() as driver: + try: + driver.get(target_url) + driver.implicitly_wait(5) + + # Scroll to load dynamic content + for _ in range(scroll_count): + driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + time.sleep(2) + + # Extract iframe ads + iframes = driver.find_elements(By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']") + + ads = [] + for iframe in iframes: + try: + # Switch to iframe context + driver.switch_to.frame(iframe) + + # Extract ad data + ad_data = { + "iframe_id": iframe.get_attribute("id"), + "width": iframe.get_attribute("width"), + "height": iframe.get_attribute("height"), + "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"), + "page_url": target_url + } + + # Try to get the ad image + try: + img = driver.find_element(By.CSS_SELECTOR, "img") + ad_data["image_url"] = img.get_attribute("src") + except: + ad_data["image_url"] = None + + # Try to get the ad destination + try: + link = driver.find_element(By.CSS_SELECTOR, "a") + ad_data["target_url"] = link.get_attribute("href") + except: + ad_data["target_url"] = None + + ads.append(ad_data) + + # Switch back to main content + driver.switch_to.default_content() + except Exception as e: + logger.warning(f"Error processing iframe: {e}") + driver.switch_to.default_content() + continue + + return ads + + except (TimeoutException, WebDriverException) as e: + logger.error(f"Error during Google Display Ads scraping: {e}") + return [] \ No newline at end of file diff --git a/app/templates/base.html b/app/templates/base.html index 5a84523cf6ad4fc0044037bac8eed875ee1a5da3..6effcdd58cab05f1e9e6403b29a7b840b811967b 100644 --- a/app/templates/base.html +++ b/app/templates/base.html @@ -3,23 +3,55 @@ - Facebook Ad Analytics + {% block title %}Facebook Ad Analytics{% endblock %} + + + + {% block head_extra %}{% endblock %} -
-

Facebook Ad Analytics

- -
+ +
{% block content %}{% endblock %}
-