import pandas as pd
import numpy as np
import yfinance as yf
import os
import finnhub
from twelvedata import TDClient
try:
    import talib as ta
except ImportError:
    ta = None
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from alpha_vantage.timeseries import TimeSeries
import time
import logging
import requests

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def print_log(message, level='INFO'):
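    """Route a message to the logging module at the requested severity level."""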
    if level == 'INFO':
        logging.info(message)
    elif level == 'WARNING':
        logging.warning(message)
    elif level == 'ERROR':
        logging.error(message)
    else:
        logging.debug(message)

analyzer = SentimentIntensityAnalyzer()

def load_data_finnhub(ticker, start, end, interval, api_key):
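    """Fetch OHLCV candles for `ticker` from Finnhub and return a DataFrame
    indexed by Date with columns Open, High, Low, value (close), Volume."""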
    print_log(f"Fetching data for {ticker} from Finnhub")
    try:
        if not api_key:
            raise ValueError("Finnhub API key is required.")
        finnhub_client = finnhub.Client(api_key=api_key)
        start_ts = int(time.mktime(pd.to_datetime(start).timetuple()))
        end_ts = int(time.mktime(pd.to_datetime(end).timetuple()))
        
        # Finnhub interval mapping
        finnhub_interval_map = {
            "1m": "1", "5m": "5", "15m": "15", "30m": "30", "60m": "60",
            "1h": "60", "1d": "D", "1wk": "W", "1mo": "M"
        }
        fh_interval = finnhub_interval_map.get(interval, "D")
        
        res = finnhub_client.stock_candles(ticker, fh_interval, start_ts, end_ts)
        if res['s'] == 'no_data':
            raise ValueError(f"No data for {ticker} from Finnhub")
        df = pd.DataFrame(res)
        df['Date'] = pd.to_datetime(df['t'], unit='s')
        df = df.rename(columns={'o': 'Open', 'h': 'High', 'l': 'Low', 'c': 'value', 'v': 'Volume'})
        df = df.set_index('Date')
        df = df[['Open', 'High', 'Low', 'value', 'Volume']]
        return df
    except Exception as e:
        print_log(f"Error in load_data_finnhub for {ticker}: {str(e)}", 'ERROR')
        raise ValueError(f"Failed to load data for {ticker} from Finnhub: {str(e)}")

def load_data_twelvedata(ticker, start, end, interval, api_key):
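    """Fetch OHLCV data for `ticker` from Twelve Data and return a DataFrame
    indexed by Date, with the close column renamed to 'value'."""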
    print_log(f"Fetching data for {ticker} from Twelve Data")
    try:
        if not api_key:
            raise ValueError("Twelve Data API key is required.")
        td = TDClient(apikey=api_key)
        
        # Twelve Data interval mapping
        twelvedata_interval_map = {
            "1m": "1min", "5m": "5min", "15m": "15min", "30m": "30min", "60m": "1h",
            "1h": "1h", "1d": "1day", "1wk": "1week", "1mo": "1month"
        }
        td_interval = twelvedata_interval_map.get(interval, "1day")

        ts = td.time_series(symbol=ticker, interval=td_interval, start_date=start, end_date=end, outputsize=5000)
        df = ts.as_pandas()
        if df is None or df.empty:
            raise ValueError(f"No data for {ticker} from Twelve Data")
        df = df.rename(columns={'close': 'value', 'open': 'Open', 'high': 'High', 'low': 'Low', 'volume': 'Volume'})
        df.index.name = 'Date'
        return df
    except Exception as e:
        print_log(f"Error in load_data_twelvedata for {ticker}: {str(e)}", 'ERROR')
        raise ValueError(f"Failed to load data for {ticker} from Twelve Data: {str(e)}")

def load_data(data_src='yahoo', ticker='AAPL', start='2020-01-01', end='2023-01-01', interval='1d', file_upload=None, alpha_api_key=None, finnhub_api_key=None, twelvedata_api_key=None):
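    """Load OHLCV data from the selected source (csv, yahoo, alpha_vantage,
    finnhub, or twelvedata), normalise the close column to 'value', and return
    a DataFrame with a sorted DatetimeIndex and the columns
    Open, High, Low, value, Volume (missing ones filled with NaN)."""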
    print_log(f"Loading data: source={data_src}, ticker={ticker}, start={start}, end={end}, interval={interval}, file_upload={'set' if file_upload else 'unset'}, alpha_api_key={'set' if alpha_api_key else 'unset'}, finnhub_api_key={'set' if finnhub_api_key else 'unset'}, twelvedata_api_key={'set' if twelvedata_api_key else 'unset'}")

    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)
    if start_date >= end_date:
        raise ValueError(f"Start date {start} must be before end date {end}")
    if end_date > datetime.now():
        print_log(f"End date {end} is in the future. Using current date as end date.", 'WARNING')
        end_date = datetime.now()

    df = pd.DataFrame()

    if data_src == 'csv' and file_upload:
        try:
            file_path = getattr(file_upload, 'name', file_upload)
            print_log(f"Loading CSV from {file_path}")
            df = pd.read_csv(file_path)
            if 'Date' not in df.columns:
                raise ValueError("CSV must contain a 'Date' column")
            df['Date'] = pd.to_datetime(df['Date']).dt.tz_localize(None)
            df = df.set_index('Date')
            if 'Close' not in df.columns and 'value' not in df.columns:
                raise ValueError("CSV must contain 'Close' or 'value' column")
            if 'Close' in df.columns:
                df = df.rename(columns={'Close': 'value'})
            if df.empty:
                raise ValueError(f"CSV data is empty for {ticker}")
            if df['value'].isna().all():
                raise ValueError(f"CSV 'value' column contains only NaNs for {ticker}")
        except Exception as e:
            print_log(f"Failed to load CSV {file_path}: {str(e)}", 'ERROR')
            raise ValueError(f"Failed to load CSV: {str(e)}")
    elif data_src == 'yahoo':
        print_log(f"Fetching data for {ticker} from Yahoo Finance")
        try:
            # Adjust start_date for yfinance intraday limitations
            if interval in ["1m"]:
                max_days = 7
            elif interval in ["2m", "5m", "15m", "30m", "60m", "90m", "1h"]:
                max_days = 60
            else:
                max_days = None

            if max_days:
                adjusted_end_date = end_date + timedelta(days=1) # yfinance end date is exclusive
                adjusted_start_date = adjusted_end_date - timedelta(days=max_days)
                if start_date < adjusted_start_date:
                    print_log(f"Adjusting start date for {interval} interval from {start_date.strftime('%Y-%m-%d')} to {adjusted_start_date.strftime('%Y-%m-%d')} due to yfinance limitations.", 'WARNING')
                    start_date = adjusted_start_date

            df = yf.download(ticker, start=start_date, end=end_date, interval=interval, progress=False, auto_adjust=False)

            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.droplevel(1)
            if df.empty:
                raise ValueError(f"No data returned from Yahoo Finance for {ticker}")
            if 'Close' not in df.columns:
                raise ValueError(f"Yahoo Finance data missing 'Close' column for {ticker}")
            df = df.rename(columns={'Close': 'value'})
            # The index is already datetime, no need to create a 'Date' column and then reset
            if df['value'].isna().all():
                raise ValueError(f"Yahoo Finance 'value' column contains only NaNs for {ticker}")
            if df['value'].empty:
                raise ValueError(f"Yahoo Finance 'value' column is empty for {ticker}")
        except Exception as e:
            print_log(f"Yahoo Finance failed for {ticker}: {str(e)}", 'ERROR')
            raise ValueError(f"Yahoo Finance failed for {ticker}: {str(e)}")
    elif data_src == 'alpha_vantage' and alpha_api_key:
        print_log(f"Attempting Alpha Vantage for {ticker}, interval {interval}")
        try:
            ts = TimeSeries(key=alpha_api_key, output_format='pandas')
            # Alpha Vantage interval mapping
            av_interval_map = {
                "1m": "1min", "5m": "5min", "15m": "15min", "30m": "30min", "60m": "60min",
                "1h": "60min"
            }
            av_interval = av_interval_map.get(interval)
            
            if av_interval:
                df_av, _ = ts.get_intraday(symbol=ticker, interval=av_interval, outputsize='full')
            elif interval == "1d":
                df_av, _ = ts.get_daily(symbol=ticker, outputsize='full')
            elif interval == "1wk":
                df_av, _ = ts.get_weekly(symbol=ticker)
            elif interval == "1mo":
                df_av, _ = ts.get_monthly(symbol=ticker)
            else:
                raise ValueError(f"Unsupported interval for Alpha Vantage: {interval}")

            if df_av.empty:
                raise ValueError(f"No data returned from Alpha Vantage for {ticker}")
            
            # Standardize column names
            df_av = df_av.rename(columns={
                '4. close': 'value', '1. open': 'Open', '2. high': 'High', 
                '3. low': 'Low', '5. volume': 'Volume'
            })
            # For daily/weekly/monthly, index is already datetime. For intraday, it's also datetime.
            # Ensure consistent column order and index type
            df = df_av[['Open', 'High', 'Low', 'value', 'Volume']]
            df.index = pd.to_datetime(df.index)
            df = df.sort_index()
            
            if df['value'].isna().all():
                raise ValueError(f"Alpha Vantage 'value' column contains only NaNs for {ticker}")
            if df['value'].empty:
                raise ValueError(f"Alpha Vantage 'value' column is empty for {ticker}")
            print_log(f"Data loaded for {ticker} from Alpha Vantage with date range: {df.index.min()} to {df.index.max()}, shape: {df.shape}")

        except Exception as e:
            print_log(f"Alpha Vantage failed for {ticker}: {str(e)}", 'ERROR')
            raise ValueError(f"Alpha Vantage failed for {ticker}: {str(e)}")
    elif data_src == 'finnhub' and finnhub_api_key:
        df = load_data_finnhub(ticker, start, end, interval, finnhub_api_key)
    elif data_src == 'twelvedata' and twelvedata_api_key:
        df = load_data_twelvedata(ticker, start, end, interval, twelvedata_api_key)

    if df.empty:
        raise ValueError(f"No data loaded for {ticker} from {data_src}")

    # Ensure index is DatetimeIndex and sorted
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    required_cols = ['Open', 'High', 'Low', 'value', 'Volume']
    for col in required_cols:
        if col not in df.columns:
            df[col] = np.nan

    if 'value' not in df.columns:
        raise ValueError(f"Target column 'value' is missing for {ticker}")
    if df['value'].isna().all():
        raise ValueError(f"Target column 'value' contains only NaNs for {ticker}")
    if df['value'].empty:
        raise ValueError(f"Target column 'value' is empty for {ticker}")

    print_log(f"Data loaded for {ticker} with date range: {df.index.min()} to {df.index.max()}, shape: {df.shape}")
    return df

def add_technical_indicators(df, selected_indicators):
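    """Compute the TA-Lib indicators named in `selected_indicators` and append
    them as columns. Returns the DataFrame (rows with NaNs dropped) and the
    list of added feature column names; returns the DataFrame unchanged with an
    empty list when TA-Lib is unavailable or the data is empty."""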
    try:
        print_log(f"Starting add_technical_indicators with indicators: {selected_indicators}")
        if df.empty:
            print_log("DataFrame is empty, skipping technical indicator calculation.", "WARNING")
            return df, []

        for col in ['Open', 'High', 'Low', 'value', 'Volume']:
            if col not in df.columns:
                df[col] = np.nan
            df[col] = pd.to_numeric(df[col], errors='coerce')

        df.dropna(subset=['Open', 'High', 'Low', 'value', 'Volume'], inplace=True)
        if df.empty:
            print_log("DataFrame is empty after dropping NaNs for technical indicators.", "WARNING")
            return df, []

        if ta is None:
            print_log("TA-Lib not available. Cannot compute indicators. Falling back to 'value'.", 'ERROR')
            return df, []

        close = df['value'].values
        high = df['High'].values
        low = df['Low'].values
        volume = df['Volume'].values
        open_ = df['Open'].values

        indicator_map = {
            'rsi': {'func': ta.RSI, 'inputs': ['close'], 'params': {'timeperiod': 14}, 'output': ['rsi_14']},
            'macd': {'func': ta.MACD, 'inputs': ['close'], 'params': {'fastperiod': 12, 'slowperiod': 26, 'signalperiod': 9}, 'output': ['macd_12_26_9', 'macds_12_26_9', 'macdhist_12_26_9']},
            'bbands': {'func': ta.BBANDS, 'inputs': ['close'], 'params': {'timeperiod': 20}, 'output': ['upperband_20', 'middleband_20', 'lowerband_20']},
            'stoch': {'func': ta.STOCH, 'inputs': ['high', 'low', 'close'], 'params': {'fastk_period': 14, 'slowk_period': 3, 'slowd_period': 3}, 'output': ['slowk_14_3_3', 'slowd_14_3_3']},
            'adx': {'func': ta.ADX, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['adx_14']},
            'atr': {'func': ta.ATR, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['atr_14']},
            'cci': {'func': ta.CCI, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['cci_14']},
            'ema': {'func': ta.EMA, 'inputs': ['close'], 'params': {'timeperiod': 14}, 'output': ['ema_14']},
            'sma': {'func': ta.SMA, 'inputs': ['close'], 'params': {'timeperiod': 14}, 'output': ['sma_14']},
            'mom': {'func': ta.MOM, 'inputs': ['close'], 'params': {'timeperiod': 10}, 'output': ['mom_10']},
            'roc': {'func': ta.ROC, 'inputs': ['close'], 'params': {'timeperiod': 10}, 'output': ['roc_10']},
            'willr': {'func': ta.WILLR, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['willr_14']},
            'ultosc': {'func': ta.ULTOSC, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod1': 7, 'timeperiod2': 14, 'timeperiod3': 28}, 'output': ['ultosc_7_14_28']},
            'dx': {'func': ta.DX, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['dx_14']},
            'minus_di': {'func': ta.MINUS_DI, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['minus_di_14']},
            'plus_di': {'func': ta.PLUS_DI, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod': 14}, 'output': ['plus_di_14']},
            'mfi': {'func': ta.MFI, 'inputs': ['high', 'low', 'close', 'volume'], 'params': {'timeperiod': 14}, 'output': ['mfi_14']},
            'obv': {'func': ta.OBV, 'inputs': ['close', 'volume'], 'params': {}, 'output': ['obv']},
            'ad': {'func': ta.AD, 'inputs': ['high', 'low', 'close', 'volume'], 'params': {}, 'output': ['ad']},
            'adosc': {'func': ta.ADOSC, 'inputs': ['high', 'low', 'close', 'volume'], 'params': {'fastperiod': 3, 'slowperiod': 10}, 'output': ['adosc_3_10']},
            'aroon': {'func': ta.AROON, 'inputs': ['high', 'low'], 'params': {'timeperiod': 14}, 'output': ['aroon_down_14', 'aroon_up_14']},
            'aroonosc': {'func': ta.AROONOSC, 'inputs': ['high', 'low'], 'params': {'timeperiod': 14}, 'output': ['aroonosc_14']},
            'bop': {'func': ta.BOP, 'inputs': ['open_', 'high', 'low', 'close'], 'params': {}, 'output': ['bop']},
            'cmo': {'func': ta.CMO, 'inputs': ['close'], 'params': {'timeperiod': 14}, 'output': ['cmo_14']},
            'dema': {'func': ta.DEMA, 'inputs': ['close'], 'params': {'timeperiod': 30}, 'output': ['dema_30']},
            'kama': {'func': ta.KAMA, 'inputs': ['close'], 'params': {'timeperiod': 30}, 'output': ['kama_30']},
            'ppo': {'func': ta.PPO, 'inputs': ['close'], 'params': {'fastperiod': 12, 'slowperiod': 26, 'matype': 0}, 'output': ['ppo_12_26_0']},
            'rocp': {'func': ta.ROCP, 'inputs': ['close'], 'params': {'timeperiod': 10}, 'output': ['rocp_10']},
            'rocr': {'func': ta.ROCR, 'inputs': ['close'], 'params': {'timeperiod': 10}, 'output': ['rocr_10']},
            'rocr100': {'func': ta.ROCR100, 'inputs': ['close'], 'params': {'timeperiod': 10}, 'output': ['rocr100_10']},
            'trix': {'func': ta.TRIX, 'inputs': ['close'], 'params': {'timeperiod': 14}, 'output': ['trix_14']},
            # 'tsi': {'func': ta.TSI, 'inputs': ['close'], 'params': {'fastperiod': 13, 'slowperiod': 25}, 'output': ['tsi_13_25']}, # Removed due to TA-Lib attribute error
            'uo': {'func': ta.ULTOSC, 'inputs': ['high', 'low', 'close'], 'params': {'timeperiod1': 7, 'timeperiod2': 14, 'timeperiod3': 28}, 'output': ['ultosc_7_14_28']},
            'wma': {'func': ta.WMA, 'inputs': ['close'], 'params': {'timeperiod': 30}, 'output': ['wma_30']},
        }

        added_features = []
        for indicator_name in selected_indicators:
            if indicator_name in indicator_map:
                indicator_info = indicator_map[indicator_name]
                func = indicator_info['func']
                inputs = []
                for input_name in indicator_info['inputs']:
                    if input_name == 'close':
                        inputs.append(close)
                    elif input_name == 'high':
                        inputs.append(high)
                    elif input_name == 'low':
                        inputs.append(low)
                    elif input_name == 'volume':
                        inputs.append(volume)
                    elif input_name == 'open_':
                        inputs.append(open_)

                # Ensure inputs are numpy arrays and not empty
                if not all(len(arr) > 0 for arr in inputs):
                    print_log(f"Skipping {indicator_name}: insufficient data for inputs.", "WARNING")
                    continue

                try:
                    output_values = func(*inputs, **indicator_info['params'])
                    if not isinstance(output_values, tuple):
                        output_values = (output_values,)

                    for i, col_name in enumerate(indicator_info['output']):
                        df[col_name] = np.nan
                        # Ensure the output array matches the DataFrame length
                        if len(output_values[i]) == len(df):
                            df[col_name] = output_values[i]
                        else:
                            # Align output to DataFrame by padding with NaNs at the beginning
                            nan_padding = np.full(len(df) - len(output_values[i]), np.nan)
                            df[col_name] = np.concatenate((nan_padding, output_values[i]))
                        added_features.append(col_name)
                except Exception as e:
                    print_log(f"Error calculating indicator {indicator_name}: {str(e)}", "ERROR")
            else:
                print_log(f"Unknown indicator: {indicator_name}", "WARNING")

        df.dropna(inplace=True)
        print_log(f"Finished add_technical_indicators. New features added: {added_features}")
        return df, added_features
    except Exception as e:
        print_log(f"Error in add_technical_indicators: {str(e)}", 'ERROR')
        return df, []

def add_sentiment(df, ticker, news_api_key, start_date, end_date):
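    """Attach an average news-sentiment score for `ticker` as a 'sentiment'
    column, defaulting to neutral (0.0) on any failure."""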
    try:
        sentiment_text, sentiment_score = sentiment_analysis(ticker, start_date, end_date, news_api_key)
        df['sentiment'] = sentiment_score if sentiment_score is not None else 0.0
        return df, sentiment_text
    except Exception as e:
        print_log(f"Error adding sentiment: {str(e)}", 'ERROR')
        df['sentiment'] = 0.0 # Default to neutral sentiment on error
        return df, f"Error adding sentiment: {str(e)}"


def sentiment_analysis(ticker, start_date, end_date, api_key):
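    """Query NewsAPI for headlines about `ticker` in the date range and return
    (summary text, average VADER compound score), or (message, None) on failure."""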
    try:
        if not api_key:
            print_log("News API key not provided for sentiment analysis.", 'WARNING')
            return "No API key provided", None
        newsapi = NewsApiClient(api_key=api_key)
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        articles = newsapi.get_everything(
            q=ticker, from_param=start.strftime("%Y-%m-%d"), to=end.strftime("%Y-%m-%d"),
            language='en', sort_by='relevancy'
        )
        sentiments = [analyzer.polarity_scores(article["title"])['compound'] for article in articles["articles"] if "title" in article and article["title"] is not None]
        avg_sentiment = np.mean(sentiments) if sentiments else 0.0
        sentiment_text = f"Average sentiment for {ticker}: {avg_sentiment:.2f}"
        return sentiment_text, avg_sentiment
    except Exception as e:
        print_log(f"Sentiment analysis failed: {str(e)}", 'ERROR')
        return f"Sentiment analysis failed: {str(e)}", None

def preprocess_data(df, features, target, window_size, horizon):
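    """Scale the selected feature columns to [0, 1] and build sliding windows of
    length `window_size` with targets `horizon` steps ahead. Returns X, y, the
    feature scaler, a separate target scaler for inverse transforms, the feature
    column order used, and the index of the target column within that order."""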
    try:
        print_log(f"Starting preprocessing: features={features}, target={target}, window={window_size}, horizon={horizon}")
        
        if not isinstance(df.index, pd.DatetimeIndex):
            raise ValueError("DataFrame index must be a DatetimeIndex for preprocessing.")

        # The target column must be included in the features for scaling
        all_features = list(set(features + [target]))
        updated_feature_cols = [f for f in all_features if f in df.columns]
        
        if not updated_feature_cols:
            raise ValueError("None of the selected features are available in the data.")

        data = df[updated_feature_cols].values
        
        # Scale all features
        feature_scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_data = feature_scaler.fit_transform(data)
        
        # Scale the target column separately for inverse transform
        target_scaler = MinMaxScaler(feature_range=(0, 1))
        target_scaler.fit(df[[target]].values)

        # Get the index of the target column in the scaled data
        target_idx = updated_feature_cols.index(target)
        
        X, y = [], []
        for i in range(len(scaled_data) - window_size - horizon + 1):
            X.append(scaled_data[i:(i + window_size), :])
            # The target is the 'value' at the end of the window + horizon
            y.append(scaled_data[i + window_size + horizon - 1, target_idx])
            
        X, y = np.array(X), np.array(y)

        if len(X) == 0:
            raise ValueError(f"Not enough rows ({len(scaled_data)}) for window_size={window_size} and horizon={horizon}")
        if X.shape[1] != window_size or X.shape[2] != len(updated_feature_cols):
            raise ValueError(f"Shape mismatch in X: expected ({len(scaled_data) - window_size - horizon + 1}, {window_size}, {len(updated_feature_cols)}), got {X.shape}")

        print_log(f"Preprocessing complete. X shape: {X.shape}, y shape: {y.shape}")
        return X, y, feature_scaler, target_scaler, updated_feature_cols, target_idx
    except Exception as e:
        print_log(f"Error in preprocess_data: {str(e)}", 'ERROR')
        raise ValueError(f"Failed to preprocess data: {str(e)}")