Data scraping without a training function

#1
by khhamid - opened
data_collection/__pycache__/exception.cpython-36.pyc ADDED
Binary file (588 Bytes). View file
 
data_collection/__pycache__/logger.cpython-36.pyc ADDED
Binary file (842 Bytes). View file
 
data_collection/components/__pycache__/data_fetcher.cpython-36.pyc ADDED
Binary file (2.14 kB). View file
 
data_collection/components/__pycache__/data_preprocessor.cpython-36.pyc ADDED
Binary file (1.52 kB). View file
 
data_collection/components/data_fetcher.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ import time
4
+ from logger import get_logger
5
+ from exception import CustomException
6
+
7
+ logger = get_logger()
8
+
9
+
# Maps human-readable coin names to Binance trading-pair symbols (all quoted in USDT).
symbol_map = {
    "bitcoin": "BTCUSDT",
    "ethereum": "ETHUSDT",
    "binance-coin": "BNBUSDT",
    "ripple": "XRPUSDT",
    "cardano": "ADAUSDT",
    "solana": "SOLUSDT",
    "polkadot": "DOTUSDT",
    "dogecoin": "DOGEUSDT",
    "shiba-inu": "SHIBUSDT",
    "litecoin": "LTCUSDT",
    "chainlink": "LINKUSDT",
    "polygon": "MATICUSDT",
    "avalanche": "AVAXUSDT",
    "uniswap": "UNIUSDT",
    "cosmos": "ATOMUSDT",
    "stellar": "XLMUSDT",
    "vechain": "VETUSDT",
    "filecoin": "FILUSDT",
    "algorand": "ALGOUSDT",
    "monero": "XMRUSDT",
    "bitcoin-cash": "BCHUSDT",
    "eos": "EOSUSDT",
    "tezos": "XTZUSDT",
    "aave": "AAVEUSDT",
    "compound": "COMPUSDT",
    "maker": "MKRUSDT",
}
39
+
class DataFetcher:
    """Fetches historical kline (candlestick) data for one coin from the Binance REST API."""

    def __init__(self, coin_name="ethereum", interval="1d", limit=365):
        # coin_name must be a key of `symbol_map` (e.g. "bitcoin", "ethereum").
        self.coin = coin_name
        self.interval = interval  # Binance interval string, e.g. "1m", "1h", "1d"
        self.limit = limit        # total number of rows to fetch across pages
        self.url = "https://api.binance.com/api/v3/klines"

    def fetch_klines(self):
        """Fetch data from Binance API with pagination to get more than 1000 rows.

        Returns up to ``self.limit`` klines in chronological order (oldest first).
        Raises CustomException on an unknown coin name or an HTTP/network failure.
        """
        # Fail fast with the project exception instead of a bare KeyError.
        try:
            symbol = symbol_map[self.coin]
        except KeyError:
            raise CustomException(f"Unknown coin name: {self.coin!r}")

        all_data = []
        end_time = None  # None => start from the most recent candles

        while len(all_data) < self.limit:
            params = {
                "symbol": symbol,
                "interval": self.interval,
                # Binance caps a single request at 1000 rows.
                "limit": min(self.limit - len(all_data), 1000),
            }
            if end_time:
                params["endTime"] = end_time  # page backwards in time

            try:
                response = requests.get(self.url, params=params)
                response.raise_for_status()
                data = response.json()

                if not data:
                    break  # no more history available

                # Each response is ordered oldest->newest. Prepend older pages
                # so the combined list stays chronological (bug fix: appending
                # produced out-of-order data whenever limit > 1000).
                all_data = data + all_data
                end_time = data[0][0] - 1  # next page: candles older than this one

                time.sleep(1)  # stay under Binance API rate limits

            except requests.exceptions.RequestException as e:
                logger.error(f"Error fetching Binance data: {e}")
                raise CustomException(f"API Request Failed: {e}")

        # Keep the most recent `limit` rows in case the last page overshot.
        return all_data[-self.limit:]
+
data_collection/components/data_preprocessor.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ from logger import get_logger
4
+ from datetime import datetime
5
+ logger = get_logger()
6
+
class DataPreprocessor:
    """Turns raw Binance kline rows into a typed pandas DataFrame and persists it to CSV."""

    @staticmethod
    def process_klines(data):
        """Convert Binance API data to a DataFrame and clean it.

        ``data`` is a list of 12-element kline rows as returned by the
        /api/v3/klines endpoint.
        """
        columns = ["Open_Time", "Open", "High", "Low", "Close", "Volume",
                   "Close_Time", "Quote_Asset_Volume", "Number_of_Trades",
                   "Taker_Buy_Base_Volume", "Taker_Buy_Quote_Volume", "Ignore"]

        df = pd.DataFrame(data, columns=columns)

        # Binance timestamps are epoch milliseconds.
        df["Open_Time"] = pd.to_datetime(df["Open_Time"], unit="ms")
        df["Close_Time"] = pd.to_datetime(df["Close_Time"], unit="ms")

        # Prices and volumes arrive as strings; cast to float for analysis.
        num_cols = ["Open", "High", "Low", "Close", "Volume",
                    "Quote_Asset_Volume", "Taker_Buy_Base_Volume", "Taker_Buy_Quote_Volume"]
        df[num_cols] = df[num_cols].astype(float)

        logger.info("Data successfully processed and cleaned.")
        return df

    @staticmethod
    def save_to_csv(df, file_path=None):
        """Save the DataFrame to a CSV file.

        When ``file_path`` is None a timestamped default is generated at call
        time. Fixes vs. the original default:
        * the timestamp was evaluated once at import time, not per call;
        * str(datetime.now()) contains ':' which is an invalid filename
          character on Windows (the pipeline log recorded Errno 22 for it);
        * the directory name was misspelled 'raw_datasetes'.
        """
        if file_path is None:
            stamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
            file_path = f"data/raw_datasets/crypto_data_{stamp}.csv"
        target_dir = os.path.dirname(file_path)
        if target_dir:  # guard: os.makedirs("") raises for bare filenames
            os.makedirs(target_dir, exist_ok=True)
        df.to_csv(file_path, index=False)
        logger.info(f"Data successfully saved to {file_path}")
data_collection/exception.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import sys
2
+
class CustomException(Exception):
    """Project-wide exception wrapper used by the data-collection pipeline.

    Args:
        message: human-readable description of the failure.
        error_details: source of traceback information; defaults to the
            ``sys`` module so callers can pull ``exc_info()`` from it
            (kept as-is for backward compatibility).
    """

    def __init__(self, message, error_details=sys):
        super().__init__(message)
        self.error_details = error_details
data_collection/logger.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+
5
+ def get_logger():
6
+ # Ensure logs directory exists
7
+ log_dir = "logs"
8
+ os.makedirs(log_dir, exist_ok=True)
9
+
10
+ # Generate log file name based on the current date
11
+ log_file = os.path.join(log_dir, f"pipeline_{datetime.now().strftime('%Y-%m-%d')}.log")
12
+
13
+ # Create a logger
14
+ logger = logging.getLogger("CryptoPipeline")
15
+ logger.setLevel(logging.INFO)
16
+
17
+ # Formatter for logs
18
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
19
+
20
+ # File handler (logs will be saved in 'logs/pipeline_YYYY-MM-DD.log')
21
+ file_handler = logging.FileHandler(log_file)
22
+ file_handler.setFormatter(formatter)
23
+
24
+ # Stream handler (logs will also appear in the console)
25
+ stream_handler = logging.StreamHandler()
26
+ stream_handler.setFormatter(formatter)
27
+
28
+ # Avoid duplicate handlers
29
+ if not logger.hasHandlers():
30
+ logger.addHandler(file_handler)
31
+ logger.addHandler(stream_handler)
32
+
33
+ return logger
data_collection/logs/pipeline_2025-04-01.log ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-01 00:15:25,314 - INFO - Starting the cryptocurrency data pipeline...
2
+ 2025-04-01 00:15:27,128 - INFO - Data successfully processed and cleaned.
3
+ 2025-04-01 00:15:27,162 - INFO - Data successfully saved to artifacts/crypto_data.csv
4
+ 2025-04-01 00:15:27,220 - INFO - Pipeline executed successfully!
5
+ 2025-04-01 00:22:05,701 - INFO - Starting the cryptocurrency data pipeline...
6
+ 2025-04-01 00:22:07,406 - INFO - Data successfully processed and cleaned.
7
+ 2025-04-01 00:22:07,406 - ERROR - An error occurred in the pipeline: name 'datetime' is not defined
8
+ 2025-04-01 00:22:53,568 - INFO - Starting the cryptocurrency data pipeline...
9
+ 2025-04-01 00:22:55,350 - INFO - Data successfully processed and cleaned.
10
+ 2025-04-01 00:22:55,355 - ERROR - An error occurred in the pipeline: [Errno 22] Invalid argument: 'data/raw_datasetes/crypto_data_2025-04-01 00:22:55.352917.csv'
11
+ 2025-04-01 00:28:53,891 - INFO - Starting the cryptocurrency data pipeline...
12
+ 2025-04-01 00:28:55,583 - INFO - Data successfully processed and cleaned.
13
+ 2025-04-01 00:28:55,592 - INFO - Data successfully saved to data/raw_datasetes/crypto_data_2025_04_01_00.csv
14
+ 2025-04-01 00:28:55,626 - INFO - Pipeline executed successfully!
15
+ 2025-04-01 00:30:44,830 - INFO - Starting the cryptocurrency data pipeline...
16
+ 2025-04-01 00:30:46,542 - INFO - Data successfully processed and cleaned.
17
+ 2025-04-01 00:30:46,552 - INFO - Data successfully saved to ../../data/raw_datasetes/crypto_data_2025_04_01_00.csv
18
+ 2025-04-01 00:30:46,601 - INFO - Pipeline executed successfully!
19
+ 2025-04-01 00:32:20,299 - INFO - Starting the cryptocurrency data pipeline...
20
+ 2025-04-01 00:32:22,076 - INFO - Data successfully processed and cleaned.
21
+ 2025-04-01 00:32:22,089 - INFO - Data successfully saved to ../../data/raw_datasetes/crypto_data_2025_04_01_00.csv
22
+ 2025-04-01 00:32:22,133 - INFO - Pipeline executed successfully!
23
+ 2025-04-01 00:33:02,058 - INFO - Starting the cryptocurrency data pipeline...
24
+ 2025-04-01 00:33:03,808 - INFO - Data successfully processed and cleaned.
25
+ 2025-04-01 00:33:03,820 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
26
+ 2025-04-01 00:33:03,876 - INFO - Pipeline executed successfully!
27
+ 2025-04-01 00:35:19,340 - INFO - Starting the cryptocurrency data pipeline...
28
+ 2025-04-01 00:35:21,049 - INFO - Data successfully processed and cleaned.
29
+ 2025-04-01 00:35:21,056 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
30
+ 2025-04-01 00:35:21,088 - INFO - Pipeline executed successfully!
31
+ 2025-04-01 00:35:31,100 - INFO - Starting the cryptocurrency data pipeline...
32
+ 2025-04-01 00:35:32,404 - INFO - Data successfully processed and cleaned.
33
+ 2025-04-01 00:35:32,420 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
34
+ 2025-04-01 00:35:32,464 - INFO - Pipeline executed successfully!
35
+ 2025-04-01 00:35:42,482 - INFO - Starting the cryptocurrency data pipeline...
36
+ 2025-04-01 00:35:43,899 - INFO - Data successfully processed and cleaned.
37
+ 2025-04-01 00:35:43,906 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
38
+ 2025-04-01 00:35:43,946 - INFO - Pipeline executed successfully!
data_collection/pipeline ADDED
@@ -0,0 +1,2 @@
 
 
 
# Helper script: print the hour-resolution timestamp format used in the
# pipeline's dataset file names (crypto_data_YYYY_MM_DD_HH.csv).
from datetime import datetime

print(datetime.now().strftime("%Y_%m_%d_%H"))
data_collection/scraper.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from logger import get_logger
3
+ from exception import CustomException
4
+ from components.data_fetcher import DataFetcher
5
+ from components.data_preprocessor import DataPreprocessor
6
+ import argparse
7
+ from datetime import datetime
8
+ import time
9
+
10
+ logger = get_logger()
11
+
def scrape():
    """Run one end-to-end collection pass: fetch, preprocess, save, preview.

    Command-line arguments (parsed on every call, so this must be invoked
    from a CLI that supplies them):
        --coin_name  required; key of ``symbol_map`` (e.g. "ethereum")
        --interval   Binance interval string, default "1d"
        --limit      number of rows to fetch, default 365

    Raises:
        CustomException: wrapping any failure in the pipeline.
    """
    try:
        logger.info("Starting the cryptocurrency data pipeline...")
        print("Starting the cryptocurrency data pipeline...")

        # Step 1: fetch raw klines from Binance.
        print("Step 1: Fetching data from Binance...")
        parser = argparse.ArgumentParser(description="Fetch cryptocurrency data from Binance")
        parser.add_argument("--coin_name", type=str, help="Name of the cryptocurrency (e.g., ethereum, bitcoin)", required=True)
        parser.add_argument("--interval", type=str, default="1d", help="Time interval (e.g., 1m, 1h, 1d)")
        parser.add_argument("--limit", type=int, default=365, help="Number of data points to fetch")
        args = parser.parse_args()

        fetcher = DataFetcher(coin_name=args.coin_name, interval=args.interval, limit=args.limit)
        raw_data = fetcher.fetch_klines()
        print("Step 1 completed.")

        # Step 2: clean and type the raw rows.
        print("Step 2: Processing the data...")
        preprocessor = DataPreprocessor()
        df = preprocessor.process_klines(raw_data)
        print("Step 2 completed.")

        # Step 3: persist to an hour-stamped CSV (path is relative to this
        # module's working directory — presumably run from data_collection/;
        # NOTE(review): confirm the intended CWD).
        print("Step 3: Saving data to CSV...")
        d = datetime.now().strftime("%Y_%m_%d_%H")
        preprocessor.save_to_csv(df, file_path=f"../../data/raw_datasets/crypto_data_{d}.csv")
        print("Step 3 completed.")

        # Step 4: quick sanity preview of the processed frame.
        print("Step 4: Displaying processed data sample...")
        print(df.head())

        logger.info("Pipeline executed successfully!")
        print("Pipeline executed successfully!")

    except Exception as e:
        logger.error(f"An error occurred in the pipeline: {e}")
        print(f"An error occurred in the pipeline: {e}")
        raise CustomException(e, sys)
+
57
+ def train():
58
+ print("Training the model...")
59
+ if __name__ == "__main__":
60
+ i=1
61
+ while True:
62
+ scrape()
63
+ i+=1
64
+ if i==164:
65
+ train()
66
+ i=1
67
+ time.sleep(10)
68
+
69
+