investmentllm committed on
Commit
e41fa0b
·
verified ·
1 Parent(s): 6540f6a

Upload folder using huggingface_hub

Browse files
scripts/cleanup_old_filings.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datetime
4
+ from advisor.db import SessionLocal
5
+ from advisor.models import Filing
6
+ from config.settings import settings
7
+
8
def cleanup_old_filings(days: int = 365):
    """Delete filings older than ``days`` days and their files on disk.

    A filing is selected when its ``filing_date`` is strictly earlier than
    today's date minus ``days``. For each selected filing the database row is
    deleted, and, once the deletion has been committed, the raw file at
    ``filing.local_path`` and the processed
    ``<settings.SEC_PROCESSED_DIR>/<accession_no>.json`` file are removed if
    they exist.

    Args:
        days: Age threshold in days; filings dated before the resulting
            cutoff are removed.

    Returns:
        A list with the ``accession_no`` of every filing that was deleted.

    Raises:
        Whatever the database layer raises on query/delete/commit, and
        ``OSError`` if a file exists but cannot be removed. The session is
        closed in every case.
    """
    cutoff = datetime.date.today() - datetime.timedelta(days=days)
    db = SessionLocal()

    removed = []
    stale_paths = []

    try:
        old_filings = (
            db.query(Filing)
            .filter(Filing.filing_date < cutoff)
            .all()
        )

        for filing in old_filings:
            # Raw HTML (may be unset on the record).
            if filing.local_path:
                stale_paths.append(filing.local_path)

            # Processed JSON, named after the accession number.
            stale_paths.append(
                os.path.join(
                    settings.SEC_PROCESSED_DIR,
                    f"{filing.accession_no}.json",
                )
            )

            removed.append(filing.accession_no)
            db.delete(filing)

        db.commit()
    finally:
        # Always release the session, even when the query or commit fails.
        db.close()

    # Touch the filesystem only after the commit succeeded, so a failed
    # commit never leaves rows that point at files already deleted.
    for path in stale_paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone (never written, or removed concurrently).
            pass

    return removed
41
+
42
if __name__ == "__main__":
    # Script entry point: purge with the default age threshold and report
    # how many filings were removed.
    print(f"Removed {len(cleanup_old_filings())} old filings.")
scripts/run_pipeline.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
4
+ from ingestion.sec_downloader import ingest_ticker
5
+ from ingestion.text_extractor import extract_all
6
+ from indexes.build_sec_index import build_index
7
+ import argparse
8
+
9
# Command-line interface: one or more tickers, plus optional form types and
# a per-ticker filing limit.
p = argparse.ArgumentParser()
for flag, options in (
    ("tickers", {"nargs": "+"}),
    ("--forms", {"nargs": "+", "default": ["10-K", "10-Q"]}),
    ("--max", {"type": int, "default": 5}),
):
    p.add_argument(flag, **options)
args = p.parse_args()

# Bind the shared options once; they are the same for every ticker.
forms, limit = args.forms, args.max

# Run the three stages in order for each ticker: ingest, extract, index.
for ticker in args.tickers:
    print(f"\n{'#'*50}\nRunning pipeline for {ticker.upper()}\n{'#'*50}")
    ingest_ticker(ticker, form_types=forms, max_filings=limit)
    extract_all(ticker)
    build_index(ticker)
    print(f"\n✓ {ticker.upper()} complete")