# scraper.py
"""Scrape the ForexFactory economic calendar into pandas DataFrames / a CSV file.

Uses undetected-chromedriver + Selenium to load one calendar day at a time and
extracts (time, currency, impact, event, actual, forecast, previous) per row.
"""

import time
import re
import logging
from datetime import datetime, timedelta

import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import undetected_chromedriver as uc

# --- FIX: Changed relative imports to direct imports ---
from csv_util import ensure_csv_header, read_existing_data, write_data_to_csv, merge_new_data
from detail_parser import parse_detail_table, detail_data_to_string

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)

# Calendar table cells extracted from each event row, in output-column order.
_CELL_NAMES = ["time", "currency", "impact", "event", "actual", "forecast", "previous"]

# Matches clock times like "8:30am" / "11:05pm". Rows showing "All Day",
# "Tentative", or an empty cell deliberately do not match.
# Compiled once at module level instead of inside the per-row loop.
_TIME_RE = re.compile(r'(\d{1,2}):(\d{2})(am|pm)')


def _parse_event_time(the_date: datetime, time_text: str) -> datetime:
    """Combine *the_date* with an '8:30am'-style time string.

    Returns *the_date* unchanged when *time_text* is not a clock time
    (e.g. 'All Day', 'Tentative', empty string).
    """
    m = _TIME_RE.match(time_text.lower())
    if not m:
        return the_date
    hh, mm, ampm = int(m.group(1)), int(m.group(2)), m.group(3)
    # 12-hour -> 24-hour conversion: 12am is midnight, 12pm stays 12.
    if ampm == 'pm' and hh < 12:
        hh += 12
    if ampm == 'am' and hh == 12:
        hh = 0
    return the_date.replace(hour=hh, minute=mm, second=0)


def parse_calendar_day(driver, the_date: datetime, scrape_details=False, existing_df=None) -> pd.DataFrame:
    """Scrape a single calendar day from ForexFactory.

    Args:
        driver: an initialized Selenium WebDriver.
        the_date: the day to scrape; clock times from the page are merged in.
        scrape_details: accepted for interface compatibility; the "Detail"
            column is currently always written as "".  # NOTE(review): unused here
        existing_df: accepted for interface compatibility; not read here.

    Returns:
        DataFrame with columns DateTime/Currency/Impact/Event/Actual/
        Forecast/Previous/Detail; empty DataFrame if the page never loads.
    """
    date_str = the_date.strftime('%b%d.%Y').lower()
    url = f"https://www.forexfactory.com/calendar?day={date_str}"
    logger.info("Scraping URL: %s", url)
    driver.get(url)

    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//table[contains(@class,"calendar__table")]')
            )
        )
    except TimeoutException:
        logger.warning("Page did not load for day=%s", the_date.date())
        return pd.DataFrame()

    rows = driver.find_elements(By.XPATH, '//tr[contains(@class,"calendar__row")]')
    data_list = []
    for row in rows:
        # FIX: fetch the class attribute once instead of two WebDriver calls.
        row_class = row.get_attribute("class") or ""
        if "day-breaker" in row_class or "no-event" in row_class:
            continue
        try:
            time_el, currency_el, impact_el, event_el, actual_el, forecast_el, previous_el = [
                row.find_element(By.XPATH, f'.//td[contains(@class,"calendar__{cell}")]')
                for cell in _CELL_NAMES
            ]
        except NoSuchElementException:
            # Row lacks one of the expected cells -> not an event row.
            continue

        time_text = time_el.text.strip()
        currency_text = currency_el.text.strip()
        try:
            # Impact is conveyed by an icon; its human-readable label is in the
            # span's "title" attribute.
            impact_text = impact_el.find_element(By.XPATH, './/span').get_attribute("title") or ""
        except Exception:
            impact_text = ""
        event_text = event_el.text.strip()

        event_dt = _parse_event_time(the_date, time_text)

        data_list.append({
            "DateTime": event_dt.isoformat(),
            "Currency": currency_text,
            "Impact": impact_text,
            "Event": event_text,
            "Actual": actual_el.text.strip(),
            "Forecast": forecast_el.text.strip(),
            "Previous": previous_el.text.strip(),
            "Detail": "",
        })
    return pd.DataFrame(data_list)


def scrape_range_pandas(from_date: datetime, to_date: datetime, output_csv: str, scrape_details=False):
    """Scrape every day in [from_date, to_date] inclusive into *output_csv*.

    Existing CSV contents are loaded first and merged with each new day's
    data via :func:`merge_new_data`, so re-runs are incremental. The CSV is
    rewritten after every day that produced rows, so progress survives a
    crash mid-range. The browser is always shut down via ``finally``.
    """
    ensure_csv_header(output_csv)
    existing_df = read_existing_data(output_csv)

    driver = uc.Chrome(headless=True, use_subprocess=False)
    driver.set_window_size(1400, 1000)
    try:
        current_day = from_date
        while current_day <= to_date:
            logger.info("Scraping day %s...", current_day.strftime('%Y-%m-%d'))
            df_new = parse_calendar_day(
                driver, current_day,
                scrape_details=scrape_details,
                existing_df=existing_df,
            )
            if not df_new.empty:
                existing_df = merge_new_data(existing_df, df_new)
                write_data_to_csv(existing_df, output_csv)
            current_day += timedelta(days=1)
    finally:
        if driver:
            driver.quit()
    # FIX: was an f-string with no placeholders.
    logger.info("Scraping done.")