itzkarthickkannan committed on
Commit
e634197
·
verified ·
1 Parent(s): 28c5847

Delete download_stock_data.py

Browse files
Files changed (1) hide show
  1. download_stock_data.py +0 -188
download_stock_data.py DELETED
@@ -1,188 +0,0 @@
1
"""
Download Stock Market Data for BPE Tokenizer Training.

Downloads historical stock data from multiple sources and formats it for
tokenization.
"""

import importlib
import importlib.util
import io
import subprocess
import sys

# Fix console encoding: this script prints non-ASCII status glyphs, so force
# UTF-8 output and replace anything that still cannot be encoded.
# ``reconfigure`` changes the existing stream in place; the wrapper fallback
# is only used when it is unavailable, and streams exposing neither
# attribute (e.g. an in-memory replacement of stdout) are left untouched
# instead of crashing on a missing ``.buffer``.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

# Install third-party dependencies only when they are actually missing, so a
# normal run does not spawn pip (or need network access) every time.
_missing_packages = [
    name for name in ("yfinance", "pandas")
    if importlib.util.find_spec(name) is None
]
if _missing_packages:
    print("Installing required packages...")
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=True,  # abort with CalledProcessError if pip fails
    )
    # Make the import system notice the freshly installed packages.
    importlib.invalidate_caches()

import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
19
-
20
def download_stock_data(tickers=None, years=5):
    """Download historical daily stock data for multiple companies.

    Each ticker is fetched independently through ``yf.Ticker(...).history``;
    a ticker that fails or returns no rows is reported on stdout and skipped
    so that one bad symbol does not abort the whole run.

    Args:
        tickers: Iterable of ticker symbols to download. ``None`` (the
            default) uses the built-in list of major stocks and indices.
        years: Number of years of history to request, counted as
            ``years * 365`` days back from the current time.

    Returns:
        A single DataFrame made by concatenating the per-ticker frames, with
        a ``Ticker`` column added and the index reset into an ordinary
        column (the rest of this file reads that column as ``Date``).

    Raises:
        ValueError: If no ticker produced any data.
    """
    if tickers is None:
        # Major stocks from different sectors
        tickers = [
            # Tech
            'AAPL', 'MSFT', 'GOOGL', 'META', 'NVDA', 'TSLA', 'AMD', 'INTC',
            # Finance
            'JPM', 'BAC', 'WFC', 'GS', 'MS', 'C',
            # Healthcare
            'JNJ', 'UNH', 'PFE', 'ABBV', 'TMO', 'MRK',
            # Consumer
            'AMZN', 'WMT', 'HD', 'NKE', 'MCD', 'SBUX',
            # Energy
            'XOM', 'CVX', 'COP', 'SLB',
            # Industrial
            'BA', 'CAT', 'GE', 'MMM',
            # Indices
            '^GSPC', '^DJI', '^IXIC',  # S&P 500, Dow Jones, NASDAQ
        ]
    else:
        # Materialize once so len() and iteration work for any iterable.
        tickers = list(tickers)

    print(f"\nDownloading data for {len(tickers)} stocks...")
    print(f"This will download {years} years of daily data\n")

    end_date = datetime.now()
    start_date = end_date - timedelta(days=years * 365)

    all_data = []

    for i, ticker in enumerate(tickers, 1):
        try:
            print(f"[{i}/{len(tickers)}] Downloading {ticker}...", end=' ')
            stock = yf.Ticker(ticker)
            df = stock.history(start=start_date, end=end_date)

            if not df.empty:
                df['Ticker'] = ticker
                all_data.append(df)
                print(f"✓ ({len(df)} days)")
            else:
                print("✗ No data")
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next ticker instead of losing everything downloaded so far.
            print(f"✗ Error: {e}")

    # pd.concat raises an unhelpful "No objects to concatenate" on an empty
    # list; fail with a message that says what actually went wrong.
    if not all_data:
        raise ValueError(
            "No stock data could be downloaded for any of the requested tickers"
        )

    # Combine all data
    print(f"\nCombining data from {len(all_data)} stocks...")
    combined_df = pd.concat(all_data)
    combined_df = combined_df.reset_index()

    print(f"Total records: {len(combined_df):,}")

    return combined_df
73
-
74
# Sector mapping for major stocks; tickers not listed here are labelled 'OTHER'.
_SECTOR_MAP = {
    'AAPL': 'TECH', 'MSFT': 'TECH', 'GOOGL': 'TECH', 'META': 'TECH',
    'NVDA': 'TECH', 'TSLA': 'AUTO', 'AMD': 'TECH', 'INTC': 'TECH',
    'JPM': 'FIN', 'BAC': 'FIN', 'WFC': 'FIN', 'GS': 'FIN', 'MS': 'FIN', 'C': 'FIN',
    'JNJ': 'HEALTH', 'UNH': 'HEALTH', 'PFE': 'HEALTH', 'ABBV': 'HEALTH',
    'TMO': 'HEALTH', 'MRK': 'HEALTH',
    'AMZN': 'RETAIL', 'WMT': 'RETAIL', 'HD': 'RETAIL', 'NKE': 'RETAIL',
    'MCD': 'RETAIL', 'SBUX': 'RETAIL',
    'XOM': 'ENERGY', 'CVX': 'ENERGY', 'COP': 'ENERGY', 'SLB': 'ENERGY',
    'BA': 'INDUST', 'CAT': 'INDUST', 'GE': 'INDUST', 'MMM': 'INDUST',
    '^GSPC': 'INDEX', '^DJI': 'INDEX', '^IXIC': 'INDEX',
}


def _volume_category(volume_millions):
    """Bucket a volume (in millions of shares) into LOW / MED / HIGH."""
    if volume_millions < 50:
        return 'LOW'
    if volume_millions < 150:
        return 'MED'
    return 'HIGH'


def _price_range(price):
    """Bucket a price into a coarse UNDER50 ... OVER500 range label."""
    if price < 50:
        return 'UNDER50'
    if price < 100:
        return 'UNDER100'
    if price < 200:
        return 'UNDER200'
    if price < 500:
        return 'UNDER500'
    return 'OVER500'


def format_for_tokenization(df):
    """Format stock data as text for BPE training with labels for better compression.

    Every complete row becomes one line of the form::

        SECTOR|TICKER|YYYY-MM|DAY|RANGE|OPEN:x|HIGH:x|LOW:x|CLOSE:x|VOL:CAT

    Prices are rounded to one decimal; volume is converted to millions and
    reduced to a LOW/MED/HIGH label; ``RANGE`` is derived from the rounded
    close price; ``DAY`` is the upper-cased abbreviated weekday name.

    Rows with a missing value in ``Date``, ``Open``, ``High``, ``Low``,
    ``Close`` or ``Volume`` are skipped. A NaN fails every ``<`` comparison,
    so such rows would otherwise be mislabelled ``OVER500`` / ``HIGH`` and
    put ``nan`` prices into the corpus, and a missing date cannot be
    formatted at all.

    Args:
        df: DataFrame with the columns ``Ticker``, ``Date``, ``Open``,
            ``High``, ``Low``, ``Close`` and ``Volume``. ``Date`` values
            must support ``strftime``.

    Returns:
        The formatted lines joined with newlines (an empty string when there
        are no usable rows).
    """
    print("\nFormatting data for tokenization with labels...")

    value_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
    complete = df.dropna(subset=value_columns)
    skipped = len(df) - len(complete)
    if skipped:
        print(f"Skipping {skipped:,} rows with missing values")

    lines = []
    # Plain tuples avoid building a Series object per row as iterrows does.
    rows = complete[['Ticker'] + value_columns].itertuples(index=False, name=None)
    for ticker, date, open_raw, high_raw, low_raw, close_raw, volume in rows:
        sector = _SECTOR_MAP.get(ticker, 'OTHER')

        # Round prices to 1 decimal
        open_price = round(open_raw, 1)
        high_price = round(high_raw, 1)
        low_price = round(low_raw, 1)
        close_price = round(close_raw, 1)

        # Volume in millions
        volume_millions = round(volume / 1_000_000, 1)
        vol_category = _volume_category(volume_millions)

        # Price range
        price_range = _price_range(close_price)

        # Day of week for more repetition
        day_of_week = date.strftime('%a').upper()  # MON, TUE, WED, etc.

        # Format with labels for better compression
        lines.append(
            f"{sector}|{ticker}|"
            f"{date.strftime('%Y-%m')}|"  # Month only
            f"{day_of_week}|"
            f"{price_range}|"
            f"OPEN:{open_price}|"
            f"HIGH:{high_price}|"
            f"LOW:{low_price}|"
            f"CLOSE:{close_price}|"
            f"VOL:{vol_category}"
        )

    # Join with newlines
    return '\n'.join(lines)
155
-
156
def save_corpus(text, filename='stock_corpus.txt'):
    """Save the formatted text corpus as a UTF-8 file.

    Args:
        text: The corpus text to write.
        filename: Destination path; an existing file is overwritten.

    Returns:
        The ``filename`` that was written, unchanged.
    """
    print(f"\nSaving to {filename}...")
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)

    # Report the size from the UTF-8 byte length rather than the character
    # count: the two differ as soon as the text contains non-ASCII characters.
    size_mb = len(text.encode('utf-8')) / (1024 * 1024)
    print(f"✓ Saved {len(text):,} characters (~{size_mb:.2f} MB)")

    return filename
167
-
168
def main():
    """Run the whole pipeline: download, format, save, then print a summary."""
    banner = "=" * 70

    print(banner)
    print("Stock Market Data Downloader for BPE Tokenizer")
    print(banner)

    # Download -> format -> save; each stage feeds the next.
    df = download_stock_data()
    text = format_for_tokenization(df)
    filename = save_corpus(text)

    # Closing summary of what was produced.
    earliest = df['Date'].min()
    latest = df['Date'].max()
    print("\n" + banner)
    print("✓ Download complete!")
    print(f" Corpus saved to: {filename}")
    print(f" Total records: {len(df):,}")
    print(f" Date range: {earliest} to {latest}")
    print("\nNext step: Run 'python train_tokenizer.py'")
    print(banner)


if __name__ == "__main__":
    main()