import pandas as pd
import numpy as np
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import joblib
class Processor:
    """Feature-engineering pipeline combining FinBERT news sentiment with market data.

    Produces a scaled 30-day window of 14 features (7 quant + 7 sentiment)
    for the downstream GRU model, plus per-day metadata for the Streamlit UI.
    """

    # The 14 feature columns, in the exact order the scaler and model expect.
    FEATURE_COLS = [
        'VWAP_Dist', 'RSI', 'MACD_Hist', 'VIX_Norm', 'VIX_Change', 'ATR_Dist', 'Realized_Vol',
        'Sent_Mean', 'Sent_Intensity', 'News_Volume', 'Net_Bull',
        'Sent_Mean_Delta', 'Sent_Mean_EMA', 'Sent_x_VIX',
    ]

    def __init__(self, scaler_path: str = "models/robust_scaler.pkl"):
        """Load the fitted scaler and initialize the FinBERT sentiment pipeline.

        Args:
            scaler_path: Path to the joblib-serialized scaler fitted on the
                14 training features.
        """
        print("⚙️ Initializing AlphaProcessor...")
        self.device = 0 if torch.cuda.is_available() else -1
        self.model_name = "ProsusAI/finbert"

        # Load Scaler (Required for normalization before GRU).
        # Best-effort as before, but record the failure explicitly so that
        # process() can raise a clear error instead of an AttributeError.
        self.scaler = None
        try:
            self.scaler = joblib.load(scaler_path)
            print(f"✅ Scaler loaded from {scaler_path}")
        except Exception:
            print("⚠️ Scaler not found. Ensure robust_scaler.pkl is in models/ folder.")

        # Initialize FinBERT Pipeline
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, use_safetensors=True
        )
        self.sentiment_pipe = pipeline(
            "sentiment-analysis",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device
        )

    def process(self, df_market, df_news):
        """Build the scaled model input and UI metadata from raw market/news frames.

        Args:
            df_market: OHLCV frame with 'High', 'Low', 'Close', 'Volume', 'VIX'
                columns and a date-like index.
            df_news: News frame with 'Title' and 'Date' columns.

        Returns:
            Tuple of (input_tensor, latest_metrics, df_features, df_news_scored):
            a (1, 30, 14) float32 array for the model, a dict of latest-day UI
            metrics, the full engineered-feature frame (for historical plots),
            and the news frame with per-headline sentiment scores.

        Raises:
            RuntimeError: If the scaler failed to load at construction time.
        """
        if self.scaler is None:
            raise RuntimeError(
                "Scaler unavailable: robust_scaler.pkl was not loaded; "
                "cannot normalize features for the model."
            )

        # 1. Process Sentiment features from headlines
        df_sent, df_news_scored = self._generate_sentiment_profile(df_news)

        # 2. Merge with market data and engineer all 14 features
        df_features = self._engineer_14_features(df_market, df_sent)

        # 3. Extract metadata for the UI (latest day's values)
        latest_metrics = {
            "Sent_Mean": df_features['Sent_Mean'].iloc[-1],
            # News_Volume is stored as log1p(count); reverse it for display.
            "News_Volume": np.exp(df_features['News_Volume'].iloc[-1]) - 1,
            "Panic_Interaction": df_features['Sent_x_VIX'].iloc[-1],
            # RSI is stored normalized to [0, 1]; rescale to the usual 0-100.
            "RSI": df_features['RSI'].iloc[-1] * 100
        }

        # 4. Get the last 30 days and scale for the Model
        final_window = df_features.tail(30).values
        scaled_window = self.scaler.transform(final_window)
        input_tensor = np.expand_dims(scaled_window, axis=0).astype('float32')

        # Return df_features too so app.py can plot the historical 30-day sentiment.
        return input_tensor, latest_metrics, df_features, df_news_scored

    def _generate_sentiment_profile(self, df_news):
        """Score headlines with FinBERT and aggregate to daily sentiment features.

        Works on a copy so the caller's frame is not mutated.

        Returns:
            Tuple of (daily, df_news_scored): a per-day frame with columns
            'Sent_Mean', 'Sent_Intensity', 'News_Volume' (log1p of count) and
            'Net_Bull', and the copied news frame with a per-headline 'Score'.
        """
        print("🧠 Running FinBERT Batch Analysis...")
        df_news = df_news.copy()  # avoid mutating the caller's frame
        titles = df_news['Title'].astype(str).tolist()
        results = self.sentiment_pipe(titles, batch_size=32, truncation=True)

        # Signed score: +p for positive, -p for negative, 0 for neutral.
        scores = []
        for res in results:
            label, score = res['label'].lower(), res['score']
            scores.append(score if label == 'positive' else -score if label == 'negative' else 0.0)
        df_news['Score'] = scores

        # Normalize dates so headlines group cleanly by calendar day.
        df_news['Date'] = pd.to_datetime(df_news['Date']).dt.date
        grouped = df_news.groupby('Date')['Score']
        daily = pd.DataFrame({
            'Sent_Mean': grouped.mean(),
            'Sent_Intensity': grouped.apply(lambda x: x.abs().mean()),
            'News_Volume': np.log1p(grouped.count()),
            # Laplace-style smoothing: +1 in the denominator damps tiny samples.
            'Net_Bull': grouped.apply(lambda x: x.sum() / (len(x) + 1))
        }).fillna(0.0)

        # Convert index back to datetime for merging with the market frame.
        daily.index = pd.to_datetime(daily.index)
        return daily, df_news

    def _engineer_14_features(self, df, df_sent):
        """Engineer the 7 quant + 7 sentiment features and drop warm-up rows.

        Args:
            df: Market frame with 'High', 'Low', 'Close', 'Volume', 'VIX'.
            df_sent: Daily sentiment frame from _generate_sentiment_profile.

        Returns:
            DataFrame with exactly the 14 FEATURE_COLS columns; rows where the
            rolling-window quant features are not yet defined are dropped.
        """
        data = df.copy()

        # --- QUANT BRANCH (7 Features) ---
        # Log-distance of close from the 20-day volume-weighted average price.
        tp = (data['High'] + data['Low'] + data['Close']) / 3
        vwap = (tp * data['Volume']).rolling(20).sum() / (data['Volume'].rolling(20).sum() + 1e-9)
        data['VWAP_Dist'] = np.log(data['Close'] / vwap)

        # 14-day RSI, normalized to [0, 1] (epsilon guards zero-loss streaks).
        delta = data['Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
        data['RSI'] = (100 - (100 / (1 + (gain / (loss + 1e-9))))) / 100.0

        # MACD histogram (12/26/9 EMAs), scaled by price for comparability.
        ema_12, ema_26 = data['Close'].ewm(span=12).mean(), data['Close'].ewm(span=26).mean()
        data['MACD_Hist'] = ((ema_12 - ema_26) - (ema_12 - ema_26).ewm(span=9).mean()) / data['Close']

        data['VIX_Norm'] = data['VIX'] / 100.0
        data['VIX_Change'] = data['VIX'].pct_change()

        # ATR-scaled distance from the 22-day mean, squashed with tanh.
        tr = pd.concat([data['High'] - data['Low'], abs(data['High'] - data['Close'].shift()),
                        abs(data['Low'] - data['Close'].shift())], axis=1).max(axis=1)
        data['ATR_Dist'] = np.tanh((data['Close'] - data['Close'].rolling(22).mean()) / (tr.rolling(14).mean() + 1e-9))
        data['Realized_Vol'] = data['Close'].pct_change().rolling(10).std() * 10

        # --- SENTIMENT BRANCH (7 Features) ---
        # Ensure indices match for joining
        data.index = pd.to_datetime(data.index)
        data = data.join(df_sent, how='left')
        # Fill ONLY the sentiment columns (days with no news => neutral 0.0).
        # Filling the whole frame would also zero the rolling-window warm-up
        # NaNs in the quant features, silently defeating the final dropna().
        data[df_sent.columns] = data[df_sent.columns].fillna(0.0)
        data['Sent_Mean_Delta'] = data['Sent_Mean'].diff().fillna(0.0)
        data['Sent_Mean_EMA'] = data['Sent_Mean'].ewm(span=3).mean()
        data['Sent_x_VIX'] = data['Sent_Mean'] * data['VIX_Norm']  # Panic Interaction

        # Drop warm-up rows where the rolling indicators are undefined.
        return data[self.FEATURE_COLS].dropna()