Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- scripts/cleanup_old_filings.py +44 -0
- scripts/run_pipeline.py +20 -0
scripts/cleanup_old_filings.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import datetime
|
| 4 |
+
from advisor.db import SessionLocal
|
| 5 |
+
from advisor.models import Filing
|
| 6 |
+
from config.settings import settings
|
| 7 |
+
|
| 8 |
+
def cleanup_old_filings(days: int = 365):
    """Delete filings older than ``days`` days, plus their files on disk.

    Every ``Filing`` row whose ``filing_date`` is earlier than
    ``today - days`` is deleted from the database. Only after that
    deletion has been committed are the associated files removed: the
    file at ``filing.local_path`` (when set) and
    ``<settings.SEC_PROCESSED_DIR>/<accession_no>.json``.

    Args:
        days: Retention window in days.

    Returns:
        A list with the ``accession_no`` of every removed filing, in the
        order the query returned them.

    Raises:
        Any exception raised by the database layer is propagated after
        the pending changes are rolled back and the session is closed.
        ``OSError`` subclasses other than ``FileNotFoundError`` raised
        while removing files are propagated as well.
    """
    cutoff = datetime.date.today() - datetime.timedelta(days=days)
    db = SessionLocal()

    removed = []
    stale_paths = []

    try:
        old_filings = (
            db.query(Filing)
            .filter(Filing.filing_date < cutoff)
            .all()
        )

        for filing in old_filings:
            # Record the paths now; the files are only removed once the
            # row deletions are safely committed.
            if filing.local_path:
                stale_paths.append(filing.local_path)
            stale_paths.append(
                os.path.join(
                    settings.SEC_PROCESSED_DIR,
                    f"{filing.accession_no}.json",
                )
            )

            removed.append(filing.accession_no)
            db.delete(filing)

        db.commit()
    except Exception:
        # Leave the database untouched if anything failed part-way.
        db.rollback()
        raise
    finally:
        # Always release the session, even on failure.
        db.close()

    # Files go last: if the commit had failed, nothing on disk would have
    # been lost while the rows still referenced it.
    for path in stale_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone -- same outcome as the former exists() check,
            # without the check-then-remove race.
            pass

    return removed
|
| 41 |
+
|
| 42 |
+
if __name__ == "__main__":
    # Script entry point: purge with the default retention window and
    # report how many filings were dropped.
    purged_ids = cleanup_old_filings()
    purged_count = len(purged_ids)
    print(f"Removed {purged_count} old filings.")
|
scripts/run_pipeline.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 4 |
+
from ingestion.sec_downloader import ingest_ticker
|
| 5 |
+
from ingestion.text_extractor import extract_all
|
| 6 |
+
from indexes.build_sec_index import build_index
|
| 7 |
+
import argparse
|
| 8 |
+
|
| 9 |
+
def main(argv=None):
    """Parse command-line arguments and run the pipeline for each ticker.

    For every ticker, calls ``ingest_ticker``, ``extract_all`` and
    ``build_index`` in that order, printing a banner before and a
    completion line after.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments (for example
            when no ticker is given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tickers", nargs="+")
    parser.add_argument("--forms", nargs="+", default=["10-K","10-Q"])
    parser.add_argument("--max", type=int, default=5)
    args = parser.parse_args(argv)

    for ticker in args.tickers:
        print(f"\n{'#'*50}\nRunning pipeline for {ticker.upper()}\n{'#'*50}")
        ingest_ticker(ticker, form_types=args.forms, max_filings=args.max)
        extract_all(ticker)
        build_index(ticker)
        print(f"\n✓ {ticker.upper()} complete")


# Guard the entry point so importing this module does not parse
# sys.argv or start the pipeline.
if __name__ == "__main__":
    main()
|