# Provenance (Hugging Face Space file-page residue, commented out so the
# module remains importable):
#   Agili / scraper.py — "Update scraper.py" by Badumetsibb, commit 173c8bb (verified)
# scraper.py
import time
import re
import logging
import pandas as pd
from datetime import datetime, timedelta
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import undetected_chromedriver as uc
# --- FIX: Changed relative imports to direct imports ---
from csv_util import ensure_csv_header, read_existing_data, write_data_to_csv, merge_new_data
from detail_parser import parse_detail_table, detail_data_to_string
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
# Matches clock times like "8:30am" / "12:00pm"; compiled once instead of per row.
_TIME_RE = re.compile(r'(\d{1,2}):(\d{2})(am|pm)')


def _event_datetime(base_date: datetime, time_text: str) -> datetime:
    """Combine the calendar date with a ForexFactory time-cell string.

    Cells that are not a clock time (empty, "All Day", "Tentative", ...)
    leave the base date unchanged, matching the original behavior.
    Seconds and microseconds are zeroed so the emitted ISO strings are
    uniform regardless of how the caller constructed `base_date`.
    """
    m = _TIME_RE.match(time_text.lower())
    if not m:
        return base_date
    hh, mm, ampm = int(m.group(1)), int(m.group(2)), m.group(3)
    if ampm == 'pm' and hh < 12:
        hh += 12
    if ampm == 'am' and hh == 12:
        hh = 0
    return base_date.replace(hour=hh, minute=mm, second=0, microsecond=0)


def parse_calendar_day(driver, the_date: datetime, scrape_details=False, existing_df=None) -> pd.DataFrame:
    """Scrape one day of the ForexFactory economic calendar.

    Args:
        driver: a live Selenium WebDriver (undetected_chromedriver instance).
        the_date: the calendar day to scrape; its date part builds the URL and
            its value is the base for each event's DateTime.
        scrape_details: accepted for interface compatibility; NOTE(review):
            currently unused — detail scraping is not implemented in this block.
        existing_df: accepted for interface compatibility; NOTE(review):
            currently unused in this block.

    Returns:
        A DataFrame with columns DateTime/Currency/Impact/Event/Actual/
        Forecast/Previous/Detail; empty if the page failed to load.
    """
    date_str = the_date.strftime('%b%d.%Y').lower()
    url = f"https://www.forexfactory.com/calendar?day={date_str}"
    logger.info("Scraping URL: %s", url)
    driver.get(url)
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.XPATH, '//table[contains(@class,"calendar__table")]'))
        )
    except TimeoutException:
        logger.warning("Page did not load for day=%s", the_date.date())
        return pd.DataFrame()
    rows = driver.find_elements(By.XPATH, '//tr[contains(@class,"calendar__row")]')
    data_list = []
    for row in rows:
        # Fetch the class attribute once instead of round-tripping twice.
        row_class = row.get_attribute("class")
        if "day-breaker" in row_class or "no-event" in row_class:
            continue
        try:
            time_el, currency_el, impact_el, event_el, actual_el, forecast_el, previous_el = [
                row.find_element(By.XPATH, f'.//td[contains(@class,"calendar__{cell}")]')
                for cell in ["time", "currency", "impact", "event", "actual", "forecast", "previous"]
            ]
        except NoSuchElementException:
            # Row doesn't have the full set of cells (e.g. a header row): skip.
            continue
        time_text = time_el.text.strip()
        # Impact is encoded as the title of an inner <span>; fall back to "".
        try:
            impact_text = impact_el.find_element(By.XPATH, './/span').get_attribute("title") or ""
        except Exception:
            impact_text = ""
        event_dt = _event_datetime(the_date, time_text)
        data_list.append({
            "DateTime": event_dt.isoformat(),
            "Currency": currency_el.text.strip(),
            "Impact": impact_text,
            "Event": event_el.text.strip(),
            "Actual": actual_el.text.strip(),
            "Forecast": forecast_el.text.strip(),
            "Previous": previous_el.text.strip(),
            "Detail": "",
        })
    return pd.DataFrame(data_list)
def scrape_range_pandas(from_date: datetime, to_date: datetime, output_csv: str, scrape_details=False):
    """Scrape every calendar day in [from_date, to_date] (inclusive) into a CSV.

    Results are merged with any data already present in `output_csv` and the
    file is rewritten after each successful day, so an interruption loses at
    most the day in progress.

    Args:
        from_date: first day to scrape (inclusive).
        to_date: last day to scrape (inclusive).
        output_csv: path of the CSV to create/update.
        scrape_details: forwarded to parse_calendar_day (currently unused there).
    """
    ensure_csv_header(output_csv)
    existing_df = read_existing_data(output_csv)
    driver = uc.Chrome(headless=True, use_subprocess=False)
    try:
        # Inside the try so a failure here still quits the browser process
        # (the original called this before the try and could leak the driver).
        driver.set_window_size(1400, 1000)
        current_day = from_date
        while current_day <= to_date:
            logger.info("Scraping day %s...", current_day.strftime('%Y-%m-%d'))
            df_new = parse_calendar_day(driver, current_day, scrape_details=scrape_details, existing_df=existing_df)
            if not df_new.empty:
                existing_df = merge_new_data(existing_df, df_new)
                # Persist incrementally: one write per scraped day.
                write_data_to_csv(existing_df, output_csv)
            current_day += timedelta(days=1)
    finally:
        driver.quit()
        logger.info("Scraping done.")