Spaces:
Sleeping
Sleeping
File size: 5,816 Bytes
605fc75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import ta
from datetime import datetime
import yfinance as yf
import matplotlib.pyplot as plt
# Function to calculate VADER sentiment scores
def calculate_vader_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    The analyzer is created on first use and cached on the function object,
    so applying this function row by row to a DataFrame column does not
    rebuild the analyzer for every row.

    Args:
        text: Value to score; it is converted with ``str()`` before scoring.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores, or ``0``
        if creating the analyzer or scoring the text raised an exception.
    """
    try:
        analyzer = getattr(calculate_vader_sentiment, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            # Cache only after successful construction so a failed setup is
            # retried (and reported) on the next call.
            calculate_vader_sentiment._analyzer = analyzer
        return analyzer.polarity_scores(str(text))['compound']
    except Exception as e:
        # Best-effort by design: a scoring failure yields a neutral score.
        print(f"Error calculating sentiment: {e}")
        return 0
# Function to download stock data
def get_stock_data(ticker, start_date, end_date):
    """Download stock data using yfinance.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start_date: Value passed as the ``start`` argument.
        end_date: Value passed as the ``end`` argument.

    Returns:
        The downloaded DataFrame with its index moved into a regular column,
        or ``None`` if the download raised or produced no rows.
    """
    try:
        stock = yf.download(ticker, start=start_date, end=end_date)
        # An empty result is a failed download as far as callers are
        # concerned; report it here instead of handing back a frame that
        # only fails later with an unrelated error.
        if stock is None or stock.empty:
            print(f"Error downloading stock data: no data returned for {ticker}")
            return None
        stock.reset_index(inplace=True)
        print(f"\nDownloaded {ticker} stock data:")
        print(f"Date range: {stock['Date'].min()} to {stock['Date'].max()}")
        print(f"Number of records: {len(stock)}")
        return stock
    except Exception as e:
        print(f"Error downloading stock data: {e}")
        return None
def preprocess_stock_data(stock_symbol, sentiment_file, start_date='2023-01-01'):
    """
    Preprocess stock and sentiment data with proper error checking.

    Loads the sentiment CSV, scores its 'cleaned_text' column with VADER when
    that column exists, downloads price history from ``start_date`` to today,
    adds 50- and 200-period simple moving averages of the close price, and
    attaches sentiment rows to price rows with ``pd.merge_asof`` on 'Date'.

    Args:
        stock_symbol: Ticker symbol to download.
        sentiment_file: Path of a CSV file that contains a 'Date' column.
        start_date: First date of the price history to request.

    Returns:
        The merged DataFrame: the price rows (with 'close', 'SMA_50' and
        'SMA_200') joined to the sentiment columns.

    Raises:
        ValueError: If the download failed, the close or 'Date' column cannot
            be found in the price data, or the close column has no numeric
            values.
    """
    print(f"\nProcessing data for {stock_symbol}:")
    # Load sentiment data
    sentiment_data = pd.read_csv(sentiment_file)
    print(f"\nLoaded sentiment data from {sentiment_file}")
    print(f"Columns available: {sentiment_data.columns.tolist()}")
    # Calculate VADER sentiment whenever raw text is available; this
    # overwrites any 'vader_avg_sentiment' column already in the file.
    if 'cleaned_text' in sentiment_data.columns:
        print("Calculating VADER sentiment scores...")
        sentiment_data['vader_avg_sentiment'] = sentiment_data['cleaned_text'].apply(calculate_vader_sentiment)
    # Convert sentiment data dates to datetime without timezone
    sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date']).dt.tz_localize(None)
    # merge_asof refuses null merge keys, so rows without a usable date are
    # removed up front instead of aborting the whole run at the merge.
    rows_before = len(sentiment_data)
    sentiment_data = sentiment_data.dropna(subset=['Date'])
    dropped = rows_before - len(sentiment_data)
    if dropped:
        print(f"Dropped {dropped} sentiment rows with missing dates")
    # Download stock data
    end_date = datetime.now().strftime('%Y-%m-%d')
    stock_data = get_stock_data(stock_symbol, start_date, end_date)
    if stock_data is None:
        raise ValueError(f"Failed to download stock data for {stock_symbol}")
    # Flatten column names if needed
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in stock_data.columns]
    # Debugging: Check stock data structure
    print("\nStock data columns:", stock_data.columns.tolist())
    print("First few rows of stock data:")
    print(stock_data.head())
    # Rename 'Date_' to 'Date' if present (flattening a ('Date', '') pair
    # produces the trailing underscore)
    if 'Date_' in stock_data.columns:
        stock_data.rename(columns={'Date_': 'Date'}, inplace=True)
    # Check and rename the 'Close' column
    close_column = f"Close_{stock_symbol}" if f"Close_{stock_symbol}" in stock_data.columns else "Close"
    if close_column not in stock_data.columns:
        raise ValueError(f"'{close_column}' column not found in stock data. Available columns: {stock_data.columns.tolist()}")
    stock_data.rename(columns={close_column: 'close'}, inplace=True)
    # Ensure 'Date' column exists
    if 'Date' not in stock_data.columns:
        raise ValueError("'Date' column not found in stock data after renaming.")
    # Convert stock data dates to datetime without timezone
    stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.tz_localize(None)
    # Ensure 'close' is a numeric Series
    print("\nConverting 'close' column to numeric...")
    stock_data['close'] = pd.to_numeric(stock_data['close'], errors='coerce')
    if stock_data['close'].isnull().all():
        raise ValueError("'close' column contains all NaN values after conversion.")
    # Calculate SMA
    stock_data['SMA_50'] = ta.trend.sma_indicator(stock_data['close'], window=50)
    stock_data['SMA_200'] = ta.trend.sma_indicator(stock_data['close'], window=200)
    # Merge stock and sentiment data; both sides must be sorted by the key
    merged_data = pd.merge_asof(
        stock_data.sort_values('Date'),
        sentiment_data.sort_values('Date'),
        on='Date'
    )
    return merged_data
# Function to plot stock and sentiment data
def plot_stock_with_sentiment(data, stock_name):
    """Plot stock price and sentiment data.

    Draws two stacked panels: the close price with its 50- and 200-day SMAs
    on top, and the VADER sentiment series below. The figure is displayed
    with ``plt.show()``.

    Args:
        data: Table with 'Date', 'close', 'SMA_50' and 'SMA_200' columns and,
            optionally, 'vader_avg_sentiment'.
        stock_name: Label used in the price panel title.
    """
    # Passing the ratios through gridspec_kw keeps this working on Matplotlib
    # releases that do not accept height_ratios as a direct keyword.
    _, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(15, 10), gridspec_kw={'height_ratios': [3, 1]}
    )
    # Stock price plot
    ax1.plot(data['Date'], data['close'], label='Close Price')
    ax1.plot(data['Date'], data['SMA_50'], label='50-day SMA')
    ax1.plot(data['Date'], data['SMA_200'], label='200-day SMA')
    ax1.set_title(f'{stock_name} Stock Price with Technical Indicators')
    ax1.set_ylabel('Price')
    ax1.legend()
    # Sentiment plot; the column is absent when the sentiment file had no
    # text to score, in which case the price panel is still shown.
    if 'vader_avg_sentiment' in data:
        ax2.plot(data['Date'], data['vader_avg_sentiment'], label='Sentiment', color='purple')
        ax2.fill_between(data['Date'], data['vader_avg_sentiment'], 0, alpha=0.2)
    else:
        print("No 'vader_avg_sentiment' column found; sentiment panel left empty.")
    ax2.set_title('Sentiment Score')
    ax2.set_ylabel('VADER Sentiment')
    plt.tight_layout()
    plt.show()
# Script entry point: preprocess both tickers, then show one figure per ticker
if __name__ == "__main__":
    import traceback

    try:
        print("Starting data preprocessing...")
        # AAPL is paired with the Twitter export, TSLA with the Reddit export
        aapl_data = preprocess_stock_data('AAPL', 'cleaned_twitter_data.csv')
        tsla_data = preprocess_stock_data('TSLA', 'cleaned_reddit_data.csv')
        # Plotting starts only after both datasets were built successfully
        print("\nGenerating plots...")
        for frame, symbol in ((aapl_data, 'AAPL'), (tsla_data, 'TSLA')):
            plot_stock_with_sentiment(frame, symbol)
    except Exception as err:
        # Top-level boundary: report the failure together with its traceback
        print(f"\nError during processing: {err}")
        traceback.print_exc()
|