# ===========================================================================
#
# CommunityPowerEA - ForexFactory History Downloader script
# Links: https://forum.communitypowerea.com/communities/7/topics/593-script-news-download
# Last updated: 2025.08.01
#
# ChangeLog:
# ----------
#
# 2025-08-01:
#  - Updated the code to support both Windows and Linux directory paths using platform detection and dynamic path construction. Replaced hardcoded Windows paths with cross-platform logic based on Path.home() and Path handling.
#
# 2024-01-17:
#  - Changed the time value to 00:00 (All Day Event) for the events that are written as XXth-YYth, as it's a day range instead of a hour range
#
# 2024-01-06:
# - Fix bug for support for detecting time on hour-range events (i.e. 'HHth-HHth')
# - Update User-Agent string to a full up-to-date value of Google Chrome on Windows 10 x64
# - Automatically create the News\FF folder in the Common Data folder if it does not exist yet
# - Add input parameter to allow force re-downloading the last 2 months of data (to ensure they are up to date)
# - Add check to never re-download a file that has been last downloaded or created less than 1 hour ago (to avoid overloading the FF site)
#
# 2023-12-16:
# - Change default TimeZone to America/New_York to match CommunityPowerEA's default Timezone
#
# 2023-12-09:
# - Update script to match new HTML code of ForexFactory website
#
# 2023-08-03:
# - Initial release of this script
#
# ===========================================================================
#
# Requirements:
# sudo apt-get install python3-arrow
# pip install cloudscraper --upgrade
# pip install bs4 --upgrade
# pip install arrow --upgrade
# pip install pandas --upgrade
# mkdir -p ~/AppData/Roaming/MetaQuotes/Terminal/Common/

import os
from io import StringIO

import cloudscraper
from arrow import Arrow

import pandas as pd
from bs4 import BeautifulSoup
import time

from datetime import datetime, timedelta
from os.path import expanduser

import argparse
import re

from pathlib import Path


# Timestamp of the previous ForexFactory page download; used by load_history()
# to rate-limit successive requests. Arrow.min means "no download yet".
prev_history_load: Arrow = Arrow.min

def was_file_modified_last_hour(file_path):
    """Return True if *file_path* exists and was modified within the last hour.

    Used as a safeguard to avoid re-downloading a calendar file that was
    fetched very recently (see the force-refresh logic in load_history).
    """
    # A missing file can't have been modified recently.
    if not os.path.exists(file_path):
        return False

    # Compare the file's mtime against the current wall-clock time.
    last_modified_time = os.path.getmtime(file_path)
    return (time.time() - last_modified_time) <= 3600

def is_date_in_current_or_previous_month(date_str):
    """Return True if *date_str* ('YYYY-MM-DD HH:MM:SS') falls in the current
    or the previous calendar month; False otherwise or on a parse error."""
    try:
        given_date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # Malformed input: report it and treat it as "not in range".
        print("News: error, incorrect date format. Please use 'YYYY-MM-DD HH:MM:SS'.")
        return False

    # Today at midnight (the time component is irrelevant for month checks).
    current_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    # Step back one day from the first of this month to land in the previous month.
    first_day_current_month = datetime(current_date.year, current_date.month, 1)
    previous_month = first_day_current_month - timedelta(days=1)

    # Compare (year, month) pairs against both candidate months.
    is_current_month = (given_date.year, given_date.month) == (current_date.year, current_date.month)
    is_previous_month = (given_date.year, given_date.month) == (previous_month.year, previous_month.month)

    return is_current_month or is_previous_month

def load_all_history(scraper: cloudscraper.CloudScraper, start_date: Arrow, end_date: Arrow):
    """Download and convert the FF calendar for every month in [start_date, end_date].

    CSV files are written under the MetaQuotes Common "Files/News" folder;
    the raw HTML pages are cached in the local feed folder (see get_html_name).
    """
    # NOTE(review): the 2021 lower bound on end_date looks intentional (the
    # site's HTML changed around then per the changelog) but is undocumented
    # -- confirm before changing.
    if start_date < Arrow(2007, 1, 1) or end_date <= Arrow(2021, 1, 1) or start_date > end_date:
        print("News: can't load history, invalid start/end date!")
        return

    print(f"News: loading history from {start_date.strftime('%b %Y')} to {end_date.strftime('%b %Y')}...")

    # Local cache folder for the downloaded HTML pages (exist_ok avoids the
    # check-then-create race of the previous exists()/makedirs() pair).
    os.makedirs(get_folder_name(), exist_ok=True)

    # Path of the MetaQuotes Common Folder. On non-Windows hosts this
    # directory must be created manually (see the mkdir requirement at the
    # top of this file).
    metaquotes_common_folder_path = Path(expanduser("~"), "AppData", "Roaming", "MetaQuotes", "Terminal", "Common")

    # Make sure the MetaQuotes Common Folder exists before writing anything.
    if not os.path.exists(metaquotes_common_folder_path):
        print(f"News: error, unable to find the MetaQuotes Common Folder: {metaquotes_common_folder_path}")
        return

    # CSVs go into the News folder inside the MetaQuotes Common Folder.
    news_folder_path = Path(metaquotes_common_folder_path, "Files", "News")
    print(news_folder_path)

    # One file per month, iterated inclusively over the requested range.
    for date in Arrow.range("month", start_date, end_date):
        file_name = get_file_name(date)
        load_history(scraper, date, Path(news_folder_path, file_name))

    print("News: history loading finished!")


def load_history(scraper: cloudscraper.CloudScraper, date: Arrow, csv_name: Path):
    """Ensure the CSV for the month of *date* exists at *csv_name*.

    Reuses a cached HTML page when possible; otherwise downloads the calendar
    page (rate-limited to one request per 10 seconds) and converts it.
    Returns True when a download+conversion happened, False otherwise.

    NOTE: reads the module-level `force_refresh` flag set in __main__.
    """
    need_force_download = False
    if force_refresh:
        if is_date_in_current_or_previous_month(date.strftime('%Y-%m-%d %H:%M:%S')):
            # Skip the forced refresh when the file was fetched less than an
            # hour ago, to avoid overloading the FF site.
            if was_file_modified_last_hour(csv_name):
                print(f"News: skipping already recently downloaded file for month {date.strftime('%Y-%m')}...")
            else:
                print(f"News: force refresh the files for month {date.strftime('%Y-%m')}...")
                need_force_download = True

    # CSV already present and no forced refresh: nothing to do.
    if os.path.exists(csv_name) and not need_force_download:
        return False

    # Try converting an already-cached HTML page before hitting the network.
    html_name = get_html_name(date)
    if os.path.exists(html_name) and not need_force_download:
        if convert_html_to_csv(html_name, csv_name, date):
            return True
        print(f"News: can't convert existing \"{html_name}\" to \"{csv_name}\", reloading it...")

    # we don't want to overload server with requests, so we need to wait some time
    global prev_history_load
    news_history_loading_interval_sec: int = 10
    if prev_history_load != Arrow.min:
        time_delta = Arrow.now() - prev_history_load
        if time_delta.seconds < news_history_loading_interval_sec:
            time.sleep(news_history_loading_interval_sec - time_delta.seconds)

    # url should be like this: https://www.forexfactory.com/calendar?month=jan.2021
    url = f"https://www.forexfactory.com/calendar?month={date.strftime('%b.%Y')}"
    print(f"News: trying to load news for {date.strftime('%b %Y')} from {url}...")

    # to avoid "error 403: Forbidden", set the user-agent to a valid value
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
    try:
        response = scraper.get(url, headers=headers).content.decode('utf-8')
    except Exception as e:
        print(f"News: failed to get web-page, exception: {e}")
        return False

    prev_history_load = Arrow.now()

    # Persist the raw HTML, then convert it; report which step failed.
    if not save_html(html_name, response):
        print(f"News: can't save \"{html_name}\"...")
        return False
    if not convert_html_to_csv(html_name, csv_name, date):
        print(f"News: can't convert \"{html_name}\" to \"{csv_name}\"...")
        return False
    return True

def get_feed_name():
    """Short identifier of the news feed provider (ForexFactory)."""
    return "FF"

def get_folder_name():
    """Local folder name used to cache downloaded files for this feed."""
    # The folder is simply named after the feed; the f-string wrapper the
    # original used around this call added nothing.
    return get_feed_name()

def get_file_name(date: Arrow):
    """Relative path of the cached monthly CSV, e.g. FF/2022.11.01.csv."""
    csv_file = f"{date.strftime('%Y.%m')}.01.csv"
    return Path(get_folder_name(), csv_file)

def get_html_name(date: Arrow):
    """Relative path of the cached monthly HTML page, e.g. FF/2022.11.01.html."""
    html_file = f"{date.strftime('%Y.%m')}.01.html"
    return Path(get_folder_name(), html_file)

def save_html(html_name: str, html: str):
    """Write *html* to *html_name* as UTF-8; return True on success.

    The page content was decoded as UTF-8 by the caller, so it must be
    written back with an explicit UTF-8 encoding -- the platform default
    (e.g. cp1252 on Windows) can raise UnicodeEncodeError on non-ASCII text.
    """
    try:
        with open(html_name, "w", encoding="utf-8") as f:
            f.write(html)
    except OSError as e:
        # Callers treat a falsy return as "could not save" and abort cleanly.
        print(f"News: failed to write \"{html_name}\": {e}")
        return False
    return True

def time_2_digits(time: str):
    """Normalize an 'H:MM' / 'HH:MM' string to zero-padded 'HH:MM'.

    Returns the normalized string, or False when the input is not a string,
    does not match the pattern, or has an out-of-range hour/minute value.

    NOTE: the parameter keeps its original name for interface compatibility
    even though it shadows the stdlib `time` module (unused in this scope).
    """
    if not isinstance(time, str):
        return False

    # Match 'digits:digits' and capture hour/minute directly from the groups
    # (the original re-split the string even though the groups were captured).
    match_hh_mm = re.fullmatch(r'([0-9]+):([0-9]+)', time)
    if not match_hh_mm:
        # Unknown time format.
        print(f"Error - Unexpected time format {time}")
        return False

    time_hour = int(match_hh_mm.group(1))
    time_minutes = int(match_hh_mm.group(2))
    if time_hour > 24:
        # Hours above 24 are rejected (24 itself is tolerated, as before).
        print(f"Error - Unexpected hour value in time format: {time}")
        return False
    if time_minutes > 59:
        print(f"Error - Unexpected minutes value in time format: {time}")
        return False

    # Return the zero-padded canonical form.
    return f"{time_hour:02}:{time_minutes:02}"

def detect_timelabel(timelabel: str, all_day_mask: list):
    """Return True when *timelabel* matches any regex pattern in *all_day_mask*."""
    if type(timelabel) is not str:
        return False
    for pattern in all_day_mask:
        if re.search(pattern, timelabel):
            return True
    return False

# ('Tue <span>Nov 1</span>', 'Nov 1, 2022') -> 1
def date_text_to_day(date_text: pd.Series):
    """Extract the day-of-month from the second element ('Mon D, YYYY')."""
    label = date_text.iloc[1]
    space_pos = label.find(" ")
    comma_pos = label.find(",")
    # The day number sits between the first space and the comma.
    return int(label[space_pos + 1:comma_pos])


# 'icon--ff-impact-red', 'icon--ff-impact-yel', 'icon--ff-impact-gra', 'icon--ff-impact-ora'
def impact_text_to_int(impact_text: str):
    """Map a FF impact CSS class to an int: yellow=1, orange=2, red=3, other=0.

    The grey ("gra") class, unknown classes, and a missing/None class all map
    to 0 -- the original comment promised None support but would have crashed
    on `None.find`, so an explicit guard is added.
    """
    if not impact_text:
        return 0  # None or empty string: no known impact class
    if impact_text.find("yel") >= 0:
        return 1
    elif impact_text.find("ora") >= 0:
        return 2
    elif impact_text.find("red") >= 0:
        return 3
    return 0  # including 'gra' and any future class


# combines date and HH:MM time into a 'YYYY.MM.DD HH:MM:SS' string
def date_time_original_gmt(date: Arrow, time_24: str):
    """Combine *date* and *time_24* ('HH:MM') into 'YYYY.MM.DD HH:MM:SS'.

    The scraper session configures the FF site to report times in
    America/New_York (see set_time_zone), so the parsed naive datetime is
    tagged with that timezone directly.  The original additionally called
    `.to("America/New_York")` on a value already in that timezone -- a no-op
    (and its comments about "local timezone" were misleading), so it is
    removed; the produced string is unchanged.
    """
    naive = pd.to_datetime(f"{date.format('YYYY-MM-DD')} {time_24}")
    html_datetime: Arrow = Arrow.fromdatetime(naive, tzinfo="America/New_York")
    return html_datetime.strftime("%Y.%m.%d %H:%M:%S")


# <td class="calendar__cell calendar__previous"> <span class="revised better">31.9%<span class="icon icon--revised"></span></span> </td>
def previous_text_to_values(previous_cell) -> (str, str):
    """Return (previous_value, previous_revised) from a 'previous' table cell.

    Both strings are empty when the cell (or its outer <span>) is missing;
    the revised value is empty when there is no nested <span>.
    """
    if previous_cell is None:
        return "", ""

    outer_span = previous_cell.find("span")
    if outer_span is None:
        return "", ""

    inner_span = outer_span.find("span")
    previous_value = outer_span.text
    previous_revised = inner_span.text if inner_span is not None else ""
    return previous_value, previous_revised


def convert_html_to_csv(html_name: str, csv_name: str, date: Arrow):
    """Parse a cached FF calendar HTML page into the monthly news CSV.

    The event table is embedded as JSON in a <script> tag
    (window.calendarComponentStates); it is extracted, flattened with pandas,
    and written as a ';'-separated CSV in the column order
    {date_time};{currency};{impact};{event};{previous};{forecast};{actual};
    {unused};{revision};{unused}.

    Returns True on success; False when the HTML file is missing or the
    expected <script> tag cannot be found.
    """
    if not os.path.exists(html_name):
        return False

    print(f"News: converting \"{html_name}\" to \"{csv_name}\"...")
    with open(html_name, "r") as f:
        html = f.read()

    # Looking for a table:
    # <script type="text/javascript">if (typeof window.calendarComponentStates === 'undefined') { window.calendarComponentStates = {} }
    # window.calendarComponentStates[1] = {
    # days: [{"date":"Tue <span>Nov 1<\/span>","dateline":1667253600,"add":"","events":[{"id":126990,"ebaseId":274,"name":"Final Manufacturing PMI","dateline":1667262600,"country":"JN","currency":"JPY","hasLinkedThreads":true,"hasNotice":false,"hasGraph":true,"checkedIn":false,"isMasterList":false,"firstInDay":true,"showGridLine":true,"greyed":true,"upNext":false,"releaser":"VG","checker":"JS","impactClass":"icon--ff-impact-yel","impactTitle":"Low Impact Expected","timeLabel":"2:30am","actual":"50.7","previous":"50.7","revision":"","forecast":"50.7","leaked":false,"actualBetterWorse":0,"revisionBetterWorse":0,"isSubscribable":true,"isSubscribed":false,"showDetails":false,"showGraph":false,"enableDetailComponent":false,"enableExpandComponent":false,"enableActualComponent":false,"showExpanded":false,"siteId":1,"editUrl":"","date":"Nov 1, 2022","url":"\/calendar?day=nov1.2022#detail=126990"},{"id":123631,"ebaseId":448,"name":"Caixin Manufacturing PMI","dateline":1667267100,"country":"CH","currency":"CNY","hasLinkedThreads":true,"hasNotice":false,"hasGraph":true,"checkedIn":false,"isMasterList":false,"firstInDay":false,"showGridLine":true,"greyed":true,"upNext":false,"releaser":"VG","checker":"TR","impactClass":"icon--ff-impact-yel","impactTitle":"Low Impact Expected","timeLabel":"3:45am","actual":"49.2","previous":"48.1","revision":"","forecast":"48.5","leaked":false,"actualBetterWorse":1,"revisionBetterWorse":0,"isSubscribable":true,"isSubscribed":false,"showDetails":false,"showGraph":false,"enableDetailComponent":false,"enableExpandComponent":false,"enableActualComponent":false,"showExpanded":false,"siteId":1,"editUrl":"","date":"Nov 1, 2022","url":"\/calendar?day=nov1.2022#detail=123631"}]}],
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find("script", string=lambda t: t and "window.calendarComponentStates" in t)
    if script is None:
        return False

    # getting all days from the table
    # NOTE(review): this string-split assumes the "days:[...]" array contains
    # no "]," before its real end -- brittle if FF changes the JSON layout.
    days = script.text.split("days:")[1].split("],")[0] + "]"

    # converting days to dataframe (one row per day, events nested in a list)
    df = pd.read_json(StringIO(days))
    df = df.explode("events")
    df = df.reset_index(drop=True)

    # extracting all columns from events
    df = pd.concat([df.drop(['events'], axis=1), df['events'].apply(pd.Series)], axis=1)

    # extracting following data: {date_time_str};{currency};{impact};{event};{previous};{forecast};{actual};{actual_time};{prev_revised};{revised_time}
    df = df[['date', 'timeLabel', 'currency', 'impactClass', 'name', 'previous', 'forecast', 'actual', 'revision']]

    # replacing time "All Day", "Tentative", "Day 1", "Day 2", etc, and "Dec Data", "Jan Data", "Q2 Data", etc with the previous time
    # before doing this, adding this label to the event name
    all_day_mask = ['All Day', 'Tentative', r'Day \d+', r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) Data', r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)', r'Q\d Data', r'\d+(st|nd|rd|th)-\d+(st|nd|rd|th)']
    df['name'] = df.apply(lambda row: row['name'] + " (" + row['timeLabel'] + ")" if detect_timelabel(row['timeLabel'], all_day_mask) else row['name'], axis=1)
    # day-range / all-day events become 00:00; normal labels are zero-padded,
    # and unparsable labels become False (later dropped by dropna? -- note:
    # False is not NaN, so such rows survive; confirm this is intended)
    df['timeLabel'] = df.apply(lambda row: "00:00" if detect_timelabel(row['timeLabel'], all_day_mask) else time_2_digits(row['timeLabel']), axis=1)

    # filling the empty time with the previous time (forward-fill down rows)
    df['timeLabel'] = df['timeLabel'].ffill()

    # removing rows with nan values
    df = df.dropna()

    # converting date and time to datetime in correct GMT, returns 'YYYY.MM.DD HH:MM:SS'
    df['date_time_str'] = df.apply(lambda row: date_time_original_gmt(Arrow(date.year, date.month, date_text_to_day(row['date'])), row['timeLabel']), axis=1)

    # extracting impact from impactClass (yellow=1, orange=2, red=3, other=0)
    df['impactClass'] = df['impactClass'].apply(lambda impact_text: impact_text_to_int(impact_text))
    df.rename(columns={'impactClass': 'impact'}, inplace=True)

    # dropping date, timeLabel and impactClass columns
    df = df.drop(['date', 'timeLabel'], axis=1)

    # setting column in correct order {date_time_str};{currency};{impact};{event};{previous};{forecast};{actual};{actual_time};{prev_revised};{revised_time}
    df['unused1'] = ""
    df['unused2'] = ""
    df = df[['date_time_str', 'currency', 'impact', 'name', 'previous', 'forecast', 'actual', 'unused1', 'revision', 'unused2']]

    # making sure the export folder exist
    if not os.path.exists(os.path.dirname(csv_name)):
        print(f"News: creating the folder {os.path.dirname(csv_name)}")
        os.makedirs(os.path.dirname(csv_name), exist_ok=True)

    # writing a file (no header, ';' separator, as expected by the EA)
    df.to_csv(csv_name, index=False, header=False, sep=';')

    return True


def set_time_zone():
    """Create a cloudscraper session and configure the FF site's timezone.

    Posts the module-level `timezone_name` (set in __main__) together with
    the CSRF token scraped from the timezone page, and sets the site's time
    format to 24h.  Returns the configured scraper session, which carries the
    cookies needed so subsequent calendar pages report times in that timezone.
    """
    # Need cloudscraper to bypass cloudflare
    scraper = cloudscraper.create_scraper(browser={'browser': 'chrome', 'platform': 'windows'})

    # Fetch the timezone page to extract the CSRF token required by the POST.
    page_timezone = scraper.get('https://www.forexfactory.com/timezone').content.decode('utf-8')
    soup_timezone = BeautifulSoup(page_timezone, 'html.parser')
    meta_csrf = soup_timezone.find('meta', attrs={'name': 'csrf-token'} )
    csrf = meta_csrf.attrs['content']
    # Set the right Timezone
    # Set the Time Format to 24hs ('options[timeformat]': '1')

    scraper.post('https://www.forexfactory.com/timezone', data={'_csrf': csrf, 'timezone': timezone_name, 'options[timeformat]': '1', 'redirect_uri': ''})
    # page_timezone_check = scraper.get('https://www.forexfactory.com/timezone').text
    # print(page_timezone_check)
    return scraper


if __name__ == "__main__":

    # Command-line interface: a single optional --force flag.
    parser = argparse.ArgumentParser(description="Python script to download News History from ForexFactory.com")
    parser.add_argument("--force", action='store_true', help='Force refreshing the calendar of the current month and previous month.')
    args = parser.parse_args()

    # NOTE: force_refresh and timezone_name are read as module-level globals
    # by load_history() and set_time_zone() respectively.
    force_refresh = args.force

    # Set default timezone to America/New_York, as expected in CommunityPowerEA
    timezone_name = 'America/New_York'
    print(f"News: timezone: {timezone_name}")

    if force_refresh:
        print("News: force refresh is enabled. We will re-download the data for the current and previous month.")

    # Configure the scraper session (timezone + 24h time format on FF).
    scraper = set_time_zone()

    # Download everything from Jan 2007 up to the first day of the current
    # month (year/month are already ints; no conversion needed).
    to_date = datetime.now()
    load_all_history(scraper, Arrow(2007, 1, 1), Arrow(to_date.year, to_date.month, 1))

