Spaces:
Sleeping
Sleeping
Delete download_stock_data.py
Browse files- download_stock_data.py +0 -188
download_stock_data.py
DELETED
|
@@ -1,188 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Download Stock Market Data for BPE Tokenizer Training
|
| 3 |
-
Downloads historical stock data from multiple sources and formats it for tokenization
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import importlib
import importlib.util
import io
import subprocess
import sys

# Fix console encoding so the check-mark/cross characters printed by this
# script do not raise UnicodeEncodeError on consoles with a narrow code page.
# reconfigure() changes the existing stream in place; the TextIOWrapper
# fallback is only used when the stream has no reconfigure() but does expose
# a binary buffer. Streams with neither are left untouched instead of
# crashing on a missing ``.buffer`` attribute.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

# Install third-party requirements only when they are actually missing, so a
# normal run does not spawn pip (or need network access) every time.
_missing_packages = [
    name for name in ("yfinance", "pandas")
    if importlib.util.find_spec(name) is None
]
if _missing_packages:
    print("Installing required packages...")
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=True,
    )
    # Make sure the import system notices the freshly installed packages.
    importlib.invalidate_caches()

import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
|
| 19 |
-
|
| 20 |
-
def download_stock_data(tickers=None, years=5):
    """Download price history for several tickers and merge it into one frame.

    Each ticker is fetched independently through ``yf.Ticker(...).history``;
    a ticker that raises or returns an empty frame is reported on stdout and
    skipped, so one bad symbol does not abort the whole run.

    Args:
        tickers: Iterable of ticker symbols to fetch. ``None`` (the default)
            uses the built-in list of stocks and indices below.
        years: How many 365-day years of history to request, counted back
            from the current time.

    Returns:
        A DataFrame made by concatenating the per-ticker frames, with a
        ``Ticker`` column added and the original index moved into a regular
        column by ``reset_index()``.

    Raises:
        ValueError: If no ticker produced any data (previously this surfaced
            as pandas' generic "No objects to concatenate" error).
    """
    if tickers is None:
        # Major stocks from different sectors
        tickers = [
            # Tech
            'AAPL', 'MSFT', 'GOOGL', 'META', 'NVDA', 'TSLA', 'AMD', 'INTC',
            # Finance
            'JPM', 'BAC', 'WFC', 'GS', 'MS', 'C',
            # Healthcare
            'JNJ', 'UNH', 'PFE', 'ABBV', 'TMO', 'MRK',
            # Consumer
            'AMZN', 'WMT', 'HD', 'NKE', 'MCD', 'SBUX',
            # Energy
            'XOM', 'CVX', 'COP', 'SLB',
            # Industrial
            'BA', 'CAT', 'GE', 'MMM',
            # Indices
            '^GSPC', '^DJI', '^IXIC',  # S&P 500, Dow Jones, NASDAQ
        ]
    else:
        # Materialize once so len() and repeated iteration work for any iterable.
        tickers = list(tickers)

    print(f"\nDownloading data for {len(tickers)} stocks...")
    print(f"This will download {years} years of daily data\n")

    end_date = datetime.now()
    start_date = end_date - timedelta(days=years * 365)

    all_data = []

    for i, ticker in enumerate(tickers, 1):
        # flush=True so the progress text is visible while the download is
        # still running; without it the partial line may sit in the buffer.
        print(f"[{i}/{len(tickers)}] Downloading {ticker}...", end=' ', flush=True)
        try:
            df = yf.Ticker(ticker).history(start=start_date, end=end_date)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"✗ Error: {e}")
            continue

        if df.empty:
            print("✗ No data")
            continue

        df['Ticker'] = ticker
        all_data.append(df)
        print(f"✓ ({len(df)} days)")

    if not all_data:
        # pd.concat([]) would raise an unhelpful "No objects to concatenate".
        raise ValueError(
            "No stock data could be downloaded; check the ticker symbols "
            "and the network connection"
        )

    # Combine all data
    print(f"\nCombining data from {len(all_data)} stocks...")
    combined_df = pd.concat(all_data).reset_index()

    print(f"Total records: {len(combined_df):,}")

    return combined_df
|
| 73 |
-
|
| 74 |
-
# Sector label for each known ticker; unknown tickers are labelled 'OTHER'.
_SECTOR_MAP = {
    'AAPL': 'TECH', 'MSFT': 'TECH', 'GOOGL': 'TECH', 'META': 'TECH',
    'NVDA': 'TECH', 'TSLA': 'AUTO', 'AMD': 'TECH', 'INTC': 'TECH',
    'JPM': 'FIN', 'BAC': 'FIN', 'WFC': 'FIN', 'GS': 'FIN', 'MS': 'FIN', 'C': 'FIN',
    'JNJ': 'HEALTH', 'UNH': 'HEALTH', 'PFE': 'HEALTH', 'ABBV': 'HEALTH',
    'TMO': 'HEALTH', 'MRK': 'HEALTH',
    'AMZN': 'RETAIL', 'WMT': 'RETAIL', 'HD': 'RETAIL', 'NKE': 'RETAIL',
    'MCD': 'RETAIL', 'SBUX': 'RETAIL',
    'XOM': 'ENERGY', 'CVX': 'ENERGY', 'COP': 'ENERGY', 'SLB': 'ENERGY',
    'BA': 'INDUST', 'CAT': 'INDUST', 'GE': 'INDUST', 'MMM': 'INDUST',
    '^GSPC': 'INDEX', '^DJI': 'INDEX', '^IXIC': 'INDEX',
}

# Weekday labels indexed by ``weekday()`` (Monday == 0). Fixed strings are
# used instead of strftime('%a') so the corpus does not vary with the locale.
_DAY_LABELS = ('MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN')

# Columns read from every row, in the order they are unpacked below.
_ROW_COLUMNS = ['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']


def _volume_category(volume_millions):
    """Bucket a volume (in millions) into 'LOW' (<50), 'MED' (<150) or 'HIGH'."""
    if volume_millions < 50:
        return 'LOW'
    if volume_millions < 150:
        return 'MED'
    return 'HIGH'


def _price_range(price):
    """Bucket a price into UNDER50 / UNDER100 / UNDER200 / UNDER500 / OVER500."""
    for limit, label in ((50, 'UNDER50'), (100, 'UNDER100'),
                         (200, 'UNDER200'), (500, 'UNDER500')):
        if price < limit:
            return label
    return 'OVER500'


def format_for_tokenization(df):
    """Format stock rows as labelled text lines for BPE training.

    Every usable row becomes one line of the form
    ``SECTOR|TICKER|YEAR-MONTH|DAY|RANGE|OPEN:X|HIGH:X|LOW:X|CLOSE:X|VOL:CAT``
    where prices are rounded to one decimal, RANGE is derived from the
    rounded close and CAT from the volume in millions.

    Rows with a missing value in any required column are skipped: NaN fails
    every ``<`` comparison, so such rows used to be mislabelled as
    ``OVER500`` / ``HIGH`` and written out with ``nan`` prices.

    Args:
        df: DataFrame with ``Ticker``, ``Date``, ``Open``, ``High``, ``Low``,
            ``Close`` and ``Volume`` columns; ``Date`` values must provide
            ``weekday()`` and ``strftime()``.

    Returns:
        The lines joined with ``'\\n'`` (an empty string for an empty frame).
    """
    print("\nFormatting data for tokenization with labels...")

    if df.empty:
        # Nothing to format; also avoids a KeyError on a column-less frame.
        return ''

    usable = df.dropna(subset=_ROW_COLUMNS)
    skipped = len(df) - len(usable)
    if skipped:
        print(f"Skipping {skipped} rows with missing values")

    lines = []
    # itertuples over just the needed columns avoids building a Series per row.
    rows = usable[_ROW_COLUMNS].itertuples(index=False, name=None)
    for ticker, date, open_, high, low, close, volume in rows:
        close_price = round(close, 1)
        volume_millions = round(volume / 1_000_000, 1)

        # Format with labels for better compression
        # Pattern: SECTOR|TICKER|YEAR-MONTH|DAY|RANGE|OPEN:X|HIGH:X|LOW:X|CLOSE:X|VOL:CAT
        lines.append(
            f"{_SECTOR_MAP.get(ticker, 'OTHER')}|{ticker}|"
            f"{date.strftime('%Y-%m')}|"          # Month only
            f"{_DAY_LABELS[date.weekday()]}|"     # Day of week
            f"{_price_range(close_price)}|"
            f"OPEN:{round(open_, 1)}|"
            f"HIGH:{round(high, 1)}|"
            f"LOW:{round(low, 1)}|"
            f"CLOSE:{close_price}|"
            f"VOL:{_volume_category(volume_millions)}"
        )

    # Join with newlines
    return '\n'.join(lines)
|
| 155 |
-
|
| 156 |
-
def save_corpus(text, filename='stock_corpus.txt'):
    """Write the corpus text to *filename* as UTF-8 and report its size.

    The text is encoded once and written in binary mode, so the bytes on
    disk are the same on every platform (text mode would turn ``\\n`` into
    ``\\r\\n`` on Windows) and the reported size is the real byte count
    rather than the character count.

    Args:
        text: The corpus to store.
        filename: Destination path; an existing file is overwritten.

    Returns:
        The *filename* that was written.
    """
    print(f"\nSaving to {filename}...")

    payload = text.encode('utf-8')
    with open(filename, 'wb') as f:
        f.write(payload)

    size_mb = len(payload) / (1024 * 1024)
    print(f"✓ Saved {len(text):,} characters (~{size_mb:.2f} MB)")

    return filename
|
| 167 |
-
|
| 168 |
-
if __name__ == "__main__":
    # Script entry point: download, format, save, then print a summary.
    rule = "=" * 70

    print(rule)
    print("Stock Market Data Downloader for BPE Tokenizer")
    print(rule)

    # Pipeline: raw frame -> labelled text -> corpus file on disk.
    df = download_stock_data()
    text = format_for_tokenization(df)
    filename = save_corpus(text)

    # Closing summary for the user.
    print("\n" + rule)
    print("✓ Download complete!")
    print(f" Corpus saved to: {filename}")
    print(f" Total records: {len(df):,}")
    first_day = df['Date'].min()
    last_day = df['Date'].max()
    print(f" Date range: {first_day} to {last_day}")
    print("\nNext step: Run 'python train_tokenizer.py'")
    print(rule)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|