Spaces:
Sleeping
Sleeping
data scraping without training function
#1
by
khhamid - opened
- data_collection/__pycache__/exception.cpython-36.pyc +0 -0
- data_collection/__pycache__/logger.cpython-36.pyc +0 -0
- data_collection/components/__pycache__/data_fetcher.cpython-36.pyc +0 -0
- data_collection/components/__pycache__/data_preprocessor.cpython-36.pyc +0 -0
- data_collection/components/data_fetcher.py +79 -0
- data_collection/components/data_preprocessor.py +35 -0
- data_collection/exception.py +6 -0
- data_collection/logger.py +33 -0
- data_collection/logs/pipeline_2025-04-01.log +38 -0
- data_collection/pipeline +2 -0
- data_collection/scraper.py +69 -0
data_collection/__pycache__/exception.cpython-36.pyc
ADDED
|
Binary file (588 Bytes). View file
|
|
|
data_collection/__pycache__/logger.cpython-36.pyc
ADDED
|
Binary file (842 Bytes). View file
|
|
|
data_collection/components/__pycache__/data_fetcher.cpython-36.pyc
ADDED
|
Binary file (2.14 kB). View file
|
|
|
data_collection/components/__pycache__/data_preprocessor.cpython-36.pyc
ADDED
|
Binary file (1.52 kB). View file
|
|
|
data_collection/components/data_fetcher.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import time
|
| 4 |
+
from logger import get_logger
|
| 5 |
+
from exception import CustomException
|
| 6 |
+
|
| 7 |
+
logger = get_logger()
|
| 8 |
+
|
| 9 |
+
|
# Map a human-readable coin name (as accepted on the CLI) to its Binance
# USDT trading-pair ticker symbol.
_COIN_PAIRS = [
    ("bitcoin", "BTCUSDT"),
    ("ethereum", "ETHUSDT"),
    ("binance-coin", "BNBUSDT"),
    ("ripple", "XRPUSDT"),
    ("cardano", "ADAUSDT"),
    ("solana", "SOLUSDT"),
    ("polkadot", "DOTUSDT"),
    ("dogecoin", "DOGEUSDT"),
    ("shiba-inu", "SHIBUSDT"),
    ("litecoin", "LTCUSDT"),
    ("chainlink", "LINKUSDT"),
    ("polygon", "MATICUSDT"),
    ("avalanche", "AVAXUSDT"),
    ("uniswap", "UNIUSDT"),
    ("cosmos", "ATOMUSDT"),
    ("stellar", "XLMUSDT"),
    ("vechain", "VETUSDT"),
    ("filecoin", "FILUSDT"),
    ("algorand", "ALGOUSDT"),
    ("monero", "XMRUSDT"),
    ("bitcoin-cash", "BCHUSDT"),
    ("eos", "EOSUSDT"),
    ("tezos", "XTZUSDT"),
    ("aave", "AAVEUSDT"),
    ("compound", "COMPUSDT"),
    ("maker", "MKRUSDT"),
]
symbol_map = dict(_COIN_PAIRS)
class DataFetcher:
    """Fetches historical candlestick (kline) data from the Binance public API."""

    def __init__(self, coin_name="ethereum", interval="1d", limit=365):
        # coin_name: human-readable key into ``symbol_map`` (e.g. "ethereum").
        # interval: Binance kline interval string (e.g. "1m", "1h", "1d").
        # limit: total number of rows to fetch (may exceed the 1000-row API cap).
        self.coin = coin_name
        self.interval = interval
        self.limit = limit
        self.url = "https://api.binance.com/api/v3/klines"

    def fetch_klines(self):
        """Fetch up to ``self.limit`` klines, paginating past the 1000-row API cap.

        Returns:
            A list of raw kline rows in chronological (oldest-first) order,
            matching the ordering of a single un-paginated Binance response.

        Raises:
            CustomException: if the coin name is unknown or an HTTP request fails.
        """
        # Fail early with a clear message instead of a bare KeyError inside
        # the request loop when the coin name is not in the mapping.
        if self.coin not in symbol_map:
            raise CustomException(
                f"Unknown coin '{self.coin}'. Supported: {', '.join(sorted(symbol_map))}"
            )

        all_data = []
        end_time = None  # None = start from the most recent candle

        while len(all_data) < self.limit:
            params = {
                "symbol": symbol_map[self.coin],
                "interval": self.interval,
                # Binance caps a single request at 1000 rows.
                "limit": min(self.limit - len(all_data), 1000),
            }
            if end_time:
                params["endTime"] = end_time  # page backwards into older data

            try:
                response = requests.get(self.url, params=params)
                response.raise_for_status()
                data = response.json()

                if not data:
                    break  # no older data available

                # Each batch arrives oldest-first; PREPEND it so the combined
                # list stays chronological across batches. (Appending, as the
                # original did, interleaved newer batches before older ones
                # whenever limit > 1000.)
                all_data = data + all_data
                end_time = data[0][0] - 1  # open time of oldest row, minus 1 ms

                time.sleep(1)  # Binance API rate limiting

            except requests.exceptions.RequestException as e:
                logger.error(f"Error fetching Binance data: {e}")
                raise CustomException(f"API Request Failed: {e}")

        # Keep exactly the most recent `limit` rows (drop excess oldest rows).
        return all_data[-self.limit:]
data_collection/components/data_preprocessor.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
from logger import get_logger
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
logger = get_logger()
|
| 6 |
+
|
class DataPreprocessor:
    """Converts raw Binance kline rows into a typed DataFrame and saves it as CSV."""

    @staticmethod
    def process_klines(data):
        """Convert Binance API kline rows to a cleaned DataFrame.

        Args:
            data: list of 12-element kline rows as returned by the Binance
                /api/v3/klines endpoint.

        Returns:
            pandas.DataFrame with datetime timestamps and float price/volume columns.
        """
        columns = ["Open_Time", "Open", "High", "Low", "Close", "Volume",
                   "Close_Time", "Quote_Asset_Volume", "Number_of_Trades",
                   "Taker_Buy_Base_Volume", "Taker_Buy_Quote_Volume", "Ignore"]

        df = pd.DataFrame(data, columns=columns)

        # Convert millisecond-epoch timestamps to datetime
        df["Open_Time"] = pd.to_datetime(df["Open_Time"], unit="ms")
        df["Close_Time"] = pd.to_datetime(df["Close_Time"], unit="ms")

        # Ensure the numeric columns are floats
        num_cols = ["Open", "High", "Low", "Close", "Volume",
                    "Quote_Asset_Volume", "Taker_Buy_Base_Volume", "Taker_Buy_Quote_Volume"]
        df[num_cols] = df[num_cols].astype(float)

        logger.info("Data successfully processed and cleaned.")
        return df

    @staticmethod
    def save_to_csv(df, file_path=None):
        """Save the DataFrame to a CSV file, creating the directory if needed.

        Fixes vs. the original:
        - The default path is computed at *call* time; a default-argument
          expression is evaluated once at class definition, so every call
          reused the same stale timestamp.
        - The timestamp uses strftime("%Y_%m_%d_%H"); str(datetime.now())
          embeds ':' characters, which are invalid in Windows filenames
          (this failed with [Errno 22] in earlier pipeline runs).
        - "raw_datasetes" typo corrected to "raw_datasets" to match scraper.py.
        """
        if file_path is None:
            stamp = datetime.now().strftime("%Y_%m_%d_%H")
            file_path = f"data/raw_datasets/crypto_data_{stamp}.csv"
        # os.makedirs("") raises; only create a directory when the path has one.
        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        df.to_csv(file_path, index=False)
        logger.info(f"Data successfully saved to {file_path}")
data_collection/exception.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
|
class CustomException(Exception):
    """Application-wide error type for the data-collection pipeline.

    Wraps an error message and keeps a reference to ``error_details``
    (the ``sys`` module by default).
    """

    def __init__(self, message, error_details=sys):
        # Delegate message storage/formatting to the base Exception.
        super().__init__(message)
        # NOTE(review): the default is the sys module itself — presumably so
        # handlers can reach sys.exc_info(); confirm intended usage.
        self.error_details = error_details
data_collection/logger.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
def get_logger():
    """Return the shared "CryptoPipeline" logger, configuring it on first use.

    Logs at INFO level to both the console and a per-day file
    ``logs/pipeline_YYYY-MM-DD.log``. Subsequent calls return the same logger
    without attaching duplicate handlers.

    Fix vs. the original: the FileHandler (which opens the log file on
    construction) was built on *every* call, even when ``hasHandlers()``
    prevented it from being attached — leaking one open file descriptor per
    call. All handler setup now happens only inside the first-use guard.
    """
    logger = logging.getLogger("CryptoPipeline")
    logger.setLevel(logging.INFO)

    # Configure handlers (and touch the filesystem) only on first use.
    if not logger.hasHandlers():
        # Ensure logs directory exists
        log_dir = "logs"
        os.makedirs(log_dir, exist_ok=True)

        # Log file name is based on the current date
        log_file = os.path.join(log_dir, f"pipeline_{datetime.now().strftime('%Y-%m-%d')}.log")

        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

        # File handler: logs/pipeline_YYYY-MM-DD.log
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)

        # Stream handler: mirror logs to the console
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)

        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)

    return logger
data_collection/logs/pipeline_2025-04-01.log
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-04-01 00:15:25,314 - INFO - Starting the cryptocurrency data pipeline...
|
| 2 |
+
2025-04-01 00:15:27,128 - INFO - Data successfully processed and cleaned.
|
| 3 |
+
2025-04-01 00:15:27,162 - INFO - Data successfully saved to artifacts/crypto_data.csv
|
| 4 |
+
2025-04-01 00:15:27,220 - INFO - Pipeline executed successfully!
|
| 5 |
+
2025-04-01 00:22:05,701 - INFO - Starting the cryptocurrency data pipeline...
|
| 6 |
+
2025-04-01 00:22:07,406 - INFO - Data successfully processed and cleaned.
|
| 7 |
+
2025-04-01 00:22:07,406 - ERROR - An error occurred in the pipeline: name 'datetime' is not defined
|
| 8 |
+
2025-04-01 00:22:53,568 - INFO - Starting the cryptocurrency data pipeline...
|
| 9 |
+
2025-04-01 00:22:55,350 - INFO - Data successfully processed and cleaned.
|
| 10 |
+
2025-04-01 00:22:55,355 - ERROR - An error occurred in the pipeline: [Errno 22] Invalid argument: 'data/raw_datasetes/crypto_data_2025-04-01 00:22:55.352917.csv'
|
| 11 |
+
2025-04-01 00:28:53,891 - INFO - Starting the cryptocurrency data pipeline...
|
| 12 |
+
2025-04-01 00:28:55,583 - INFO - Data successfully processed and cleaned.
|
| 13 |
+
2025-04-01 00:28:55,592 - INFO - Data successfully saved to data/raw_datasetes/crypto_data_2025_04_01_00.csv
|
| 14 |
+
2025-04-01 00:28:55,626 - INFO - Pipeline executed successfully!
|
| 15 |
+
2025-04-01 00:30:44,830 - INFO - Starting the cryptocurrency data pipeline...
|
| 16 |
+
2025-04-01 00:30:46,542 - INFO - Data successfully processed and cleaned.
|
| 17 |
+
2025-04-01 00:30:46,552 - INFO - Data successfully saved to ../../data/raw_datasetes/crypto_data_2025_04_01_00.csv
|
| 18 |
+
2025-04-01 00:30:46,601 - INFO - Pipeline executed successfully!
|
| 19 |
+
2025-04-01 00:32:20,299 - INFO - Starting the cryptocurrency data pipeline...
|
| 20 |
+
2025-04-01 00:32:22,076 - INFO - Data successfully processed and cleaned.
|
| 21 |
+
2025-04-01 00:32:22,089 - INFO - Data successfully saved to ../../data/raw_datasetes/crypto_data_2025_04_01_00.csv
|
| 22 |
+
2025-04-01 00:32:22,133 - INFO - Pipeline executed successfully!
|
| 23 |
+
2025-04-01 00:33:02,058 - INFO - Starting the cryptocurrency data pipeline...
|
| 24 |
+
2025-04-01 00:33:03,808 - INFO - Data successfully processed and cleaned.
|
| 25 |
+
2025-04-01 00:33:03,820 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
|
| 26 |
+
2025-04-01 00:33:03,876 - INFO - Pipeline executed successfully!
|
| 27 |
+
2025-04-01 00:35:19,340 - INFO - Starting the cryptocurrency data pipeline...
|
| 28 |
+
2025-04-01 00:35:21,049 - INFO - Data successfully processed and cleaned.
|
| 29 |
+
2025-04-01 00:35:21,056 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
|
| 30 |
+
2025-04-01 00:35:21,088 - INFO - Pipeline executed successfully!
|
| 31 |
+
2025-04-01 00:35:31,100 - INFO - Starting the cryptocurrency data pipeline...
|
| 32 |
+
2025-04-01 00:35:32,404 - INFO - Data successfully processed and cleaned.
|
| 33 |
+
2025-04-01 00:35:32,420 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
|
| 34 |
+
2025-04-01 00:35:32,464 - INFO - Pipeline executed successfully!
|
| 35 |
+
2025-04-01 00:35:42,482 - INFO - Starting the cryptocurrency data pipeline...
|
| 36 |
+
2025-04-01 00:35:43,899 - INFO - Data successfully processed and cleaned.
|
| 37 |
+
2025-04-01 00:35:43,906 - INFO - Data successfully saved to ../../data/raw_datasets/crypto_data_2025_04_01_00.csv
|
| 38 |
+
2025-04-01 00:35:43,946 - INFO - Pipeline executed successfully!
|
data_collection/pipeline
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
# Scratch/debug script: prints the timestamp format used to name dataset
# CSV files elsewhere in the pipeline (e.g. crypto_data_YYYY_MM_DD_HH.csv).
from datetime import datetime
print(datetime.now().strftime("%Y_%m_%d_%H"))
data_collection/scraper.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from logger import get_logger
|
| 3 |
+
from exception import CustomException
|
| 4 |
+
from components.data_fetcher import DataFetcher
|
| 5 |
+
from components.data_preprocessor import DataPreprocessor
|
| 6 |
+
import argparse
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
logger = get_logger()
|
| 11 |
+
|
def scrape():
    """Run one scrape cycle: parse CLI args, fetch, preprocess, and save data.

    Reads ``--coin_name`` (required), ``--interval`` and ``--limit`` from
    sys.argv on every call. Logs progress, and re-raises any failure wrapped
    in CustomException (now chained with ``from e`` so the original traceback
    is preserved — the original raise discarded the causal link).
    """
    try:
        logger.info("Starting the cryptocurrency data pipeline...")
        print("Starting the cryptocurrency data pipeline...")

        # Step 1: Fetch data from Binance
        print("Step 1: Fetching data from Binance...")
        # Argument parser setup
        parser = argparse.ArgumentParser(description="Fetch cryptocurrency data from Binance")
        parser.add_argument("--coin_name", type=str, help="Name of the cryptocurrency (e.g., ethereum, bitcoin)", required=True)
        parser.add_argument("--interval", type=str, default="1d", help="Time interval (e.g., 1m, 1h, 1d)")
        parser.add_argument("--limit", type=int, default=365, help="Number of data points to fetch")

        args = parser.parse_args()
        # Use arguments to fetch data
        fetcher = DataFetcher(coin_name=args.coin_name, interval=args.interval, limit=args.limit)
        raw_data = fetcher.fetch_klines()

        print("Step 1 completed.")

        # Step 2: Preprocess the data
        print("Step 2: Processing the data...")
        preprocessor = DataPreprocessor()
        df = preprocessor.process_klines(raw_data)
        print("Step 2 completed.")

        # Step 3: Save to CSV, stamped to the hour so hourly runs get distinct files
        print("Step 3: Saving data to CSV...")
        stamp = datetime.now().strftime("%Y_%m_%d_%H")
        preprocessor.save_to_csv(df, file_path=f"../../data/raw_datasets/crypto_data_{stamp}.csv")
        print("Step 3 completed.")

        # Step 4: Display first few rows
        print("Step 4: Displaying processed data sample...")
        print(df.head())

        logger.info("Pipeline executed successfully!")
        print("Pipeline executed successfully!")

    except Exception as e:
        logger.error(f"An error occurred in the pipeline: {e}")
        print(f"An error occurred in the pipeline: {e}")
        # Chain the original exception so the full traceback survives the wrap.
        raise CustomException(e, sys) from e
def train():
    """Placeholder for the model-training step.

    NOTE(review): training is not implemented — this pipeline currently only
    scrapes data (the gap raised in the accompanying discussion thread).
    """
    print("Training the model...")
if __name__ == "__main__":
    # Run scrape cycles forever, pausing 10 s between cycles; after every
    # 163 scrapes (counter runs 1 -> 164) run a training pass and restart
    # the count.
    cycle = 1
    while True:
        scrape()
        cycle += 1
        if cycle == 164:
            train()
            cycle = 1
        time.sleep(10)