|
|
import os |
|
|
import pandas as pd |
|
|
from huggingface_hub import HfApi, hf_hub_download |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
# Load settings from a local .env file (if one exists) at import time, so the
# os.getenv("HF_TOKEN") / os.getenv("TARGET_FILE") lookups below can see them.
load_dotenv()
|
|
|
|
|
|
|
|
def download_from_hf_dataset(file_path, dataset_name, token=None, repo_type="dataset"):
    """Fetch one file from a Hugging Face Hub repository into the working directory.

    Args:
        file_path: Name of the file inside the repository. The same value is
            checked as a local path when the download fails.
        dataset_name: Repository id, forwarded to ``hf_hub_download`` as ``repo_id``.
        token: Access token. When ``None``, the ``HF_TOKEN`` environment
            variable is used instead.
        repo_type: Repository type forwarded to ``hf_hub_download``.

    Returns:
        The path returned by ``hf_hub_download`` on success; ``file_path`` when
        the download failed but that path exists locally; ``None`` otherwise.
    """
    auth_token = os.getenv("HF_TOKEN") if token is None else token

    try:
        downloaded = hf_hub_download(
            repo_id=dataset_name,
            filename=file_path,
            repo_type=repo_type,
            local_dir=".",
            token=auth_token,
        )
        print(
            f"Successfully downloaded {file_path} from {dataset_name} to {downloaded}"
        )
        return downloaded
    except Exception as err:
        # Deliberately broad: any failure is reported and then we fall through
        # to the local-copy check instead of propagating.
        print(f"Error downloading file: {str(err)}")

    # Only reached when the download failed.
    if not os.path.exists(file_path):
        return None

    print(f"Found local copy of {file_path}, using that.")
    return file_path
|
|
|
|
|
|
|
|
# Default row filter, written as a pandas ``DataFrame.query`` expression.
# Every clause must hold. Backticks quote column names that contain spaces.
# NOTE(review): "perviousday" presumably mirrors the dataset's actual column
# spelling -- confirm against the data before "correcting" it.
DEFAULT_FILTER_QUERY = " and ".join(
    (
        "premarket_change_from_perviousday_perc > 8",
        "premarket_close > 2",
        "`Shares Float`>1e6",
        "`Market Capitalization`<100e6",
    )
)
|
|
|
|
|
|
|
|
def load_data(filter_query=DEFAULT_FILTER_QUERY):
    """Download the penny-stock parquet file, filter it, and add a ``date`` column.

    The file is fetched from the ``AmirTrader/PennyStocks`` repository via
    ``download_from_hf_dataset``. The file name comes from the ``TARGET_FILE``
    environment variable, falling back to a built-in default. The access
    token is read from ``HF_TOKEN``.

    Args:
        filter_query: Expression passed to ``DataFrame.query``. A falsy value
            (``None`` or ``""``) skips filtering.

    Returns:
        A copy of the loaded (and optionally filtered) DataFrame. When a
        ``datetime`` column is present it is converted with ``pd.to_datetime``
        and a ``date`` column is derived from it.

    Raises:
        FileNotFoundError: If the file could be neither downloaded nor found locally.
        Exception: Whatever ``DataFrame.query`` raises for a bad expression,
            re-raised after the error is printed.
    """
    hf_token = os.getenv("HF_TOKEN")
    repo_id = "AmirTrader/PennyStocks"

    fallback_name = "marketsession_post_polygon_2020-01-01_2025-12-01.parquet_with_premarketvolume900K_marketcap1B.parquet"
    target_file = os.getenv("TARGET_FILE", fallback_name)

    parquet_path = download_from_hf_dataset(
        file_path=target_file, dataset_name=repo_id, token=hf_token
    )

    # The helper returns None on failure, so check both the value and the path on disk.
    if not (parquet_path and os.path.exists(parquet_path)):
        raise FileNotFoundError(f"Could not find or download dataset: {target_file}")

    frame = pd.read_parquet(parquet_path)

    if not filter_query:
        frame = frame.copy()
    else:
        try:
            frame = frame.query(filter_query).copy()
        except Exception as err:
            print(f"Error applying query '{filter_query}': {err}")
            raise err

    if "datetime" not in frame.columns:
        return frame

    frame["datetime"] = pd.to_datetime(frame["datetime"])
    frame["date"] = frame["datetime"].dt.date
    return frame
|
|
|