Spaces:
Runtime error
Runtime error
| # scraper.py | |
| import time | |
| import re | |
| import logging | |
| import pandas as pd | |
| from datetime import datetime, timedelta | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.common.exceptions import TimeoutException, NoSuchElementException | |
| import undetected_chromedriver as uc | |
| # --- FIX: Changed relative imports to direct imports --- | |
| from csv_util import ensure_csv_header, read_existing_data, write_data_to_csv, merge_new_data | |
| from detail_parser import parse_detail_table, detail_data_to_string | |
# Configure the root logger once at import time: INFO level, timestamped
# console output shared by this module's logger below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
def parse_calendar_day(driver, the_date: datetime, scrape_details=False, existing_df=None) -> pd.DataFrame:
    """Scrape one day of the ForexFactory economic calendar.

    Args:
        driver: An already-started Selenium WebDriver.
        the_date: Day to scrape; also the base datetime for each event's
            timestamp (the event's hour/minute overwrite it when present).
        scrape_details: Reserved for detail-table scraping; not used in this
            function body (kept for interface compatibility).
        existing_df: Reserved for dedup against previously scraped rows; not
            used in this function body (kept for interface compatibility).

    Returns:
        DataFrame with one row per event (columns: DateTime, Currency, Impact,
        Event, Actual, Forecast, Previous, Detail); empty DataFrame when the
        calendar table never becomes visible.
    """
    # ForexFactory encodes the day as e.g. "jan05.2024" in the ?day= parameter.
    date_str = the_date.strftime('%b%d.%Y').lower()
    url = f"https://www.forexfactory.com/calendar?day={date_str}"
    logger.info("Scraping URL: %s", url)  # lazy %-args: no formatting unless emitted
    driver.get(url)
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.XPATH, '//table[contains(@class,"calendar__table")]'))
        )
    except TimeoutException:
        logger.warning("Page did not load for day=%s", the_date.date())
        return pd.DataFrame()
    rows = driver.find_elements(By.XPATH, '//tr[contains(@class,"calendar__row")]')
    data_list = []
    # Hoisted out of the loop; matches times like "8:30am" / "12:00pm".
    time_re = re.compile(r'(\d{1,2}):(\d{2})(am|pm)')
    for row in rows:
        # FIX: fetch the class attribute once (the original issued two WebDriver
        # round-trips per row) and guard against a None return.
        row_class = row.get_attribute("class") or ""
        if "day-breaker" in row_class or "no-event" in row_class:
            continue
        try:
            cells = [
                row.find_element(By.XPATH, f'.//td[contains(@class,"calendar__{cell}")]')
                for cell in ["time", "currency", "impact", "event", "actual", "forecast", "previous"]
            ]
        except NoSuchElementException:
            # Not a normal event row (missing one of the expected cells) — skip.
            continue
        time_el, currency_el, impact_el, event_el, actual_el, forecast_el, previous_el = cells
        time_text = time_el.text.strip()
        # Impact lives in the title attribute of a nested <span>; best-effort —
        # a missing/stale span just yields an empty impact, never a crash.
        try:
            impact_text = impact_el.find_element(By.XPATH, './/span').get_attribute("title") or ""
        except Exception:
            impact_text = ""
        event_dt = the_date
        m = time_re.match(time_text.lower())
        if m:
            hh, mm, ampm = int(m.group(1)), int(m.group(2)), m.group(3)
            if ampm == 'pm' and hh < 12:
                hh += 12
            if ampm == 'am' and hh == 12:
                hh = 0  # 12:xx am is 00:xx in 24-hour time
            # FIX: also zero microseconds so the ISO timestamp does not inherit
            # stray sub-second precision from the caller-supplied datetime.
            event_dt = event_dt.replace(hour=hh, minute=mm, second=0, microsecond=0)
        data_list.append({
            "DateTime": event_dt.isoformat(),
            "Currency": currency_el.text.strip(),
            "Impact": impact_text,
            "Event": event_el.text.strip(),
            "Actual": actual_el.text.strip(),
            "Forecast": forecast_el.text.strip(),
            "Previous": previous_el.text.strip(),
            "Detail": "",
        })
    return pd.DataFrame(data_list)
def scrape_range_pandas(from_date: datetime, to_date: datetime, output_csv: str, scrape_details=False):
    """Scrape every calendar day in [from_date, to_date] (inclusive) into a CSV.

    The CSV is rewritten after each day that yields new rows, so an
    interruption loses at most the day currently being scraped.

    Args:
        from_date: First day to scrape (inclusive).
        to_date: Last day to scrape (inclusive).
        output_csv: Path of the CSV file to create/extend.
        scrape_details: Forwarded to parse_calendar_day.
    """
    ensure_csv_header(output_csv)
    existing_df = read_existing_data(output_csv)
    driver = None  # keeps the finally-guard safe if Chrome fails to start
    try:
        # FIX: driver creation and window sizing moved inside the try block —
        # previously a failure in set_window_size() leaked the Chrome process
        # because the finally/quit() clause was not yet active.
        driver = uc.Chrome(headless=True, use_subprocess=False)
        driver.set_window_size(1400, 1000)
        current_day = from_date
        while current_day <= to_date:
            logger.info("Scraping day %s...", current_day.strftime('%Y-%m-%d'))
            df_new = parse_calendar_day(driver, current_day, scrape_details=scrape_details, existing_df=existing_df)
            if not df_new.empty:
                existing_df = merge_new_data(existing_df, df_new)
                # Persist incrementally so earlier days survive a later crash.
                write_data_to_csv(existing_df, output_csv)
            current_day += timedelta(days=1)
    finally:
        if driver:
            driver.quit()
    # FIX: plain string — the original used an f-string with no placeholders.
    logger.info("Scraping done.")