Upload Phase 2 Hierarchical Models (93.5% F1)
Browse files- README.md +47 -0
- inference_wrapper.py +63 -0
- models/embeddings_cache.npy +3 -0
- models/router_le.pkl +3 -0
- models/router_xgb.pkl +3 -0
- models/specialist_AGM_Info_le.pkl +3 -0
- models/specialist_AGM_Info_xgb.pkl +3 -0
- models/specialist_Annual_General_Meeting_le.pkl +3 -0
- models/specialist_Annual_General_Meeting_xgb.pkl +3 -0
- models/specialist_Debt_Info_le.pkl +3 -0
- models/specialist_Debt_Info_xgb.pkl +3 -0
- models/specialist_ESG_Info_le.pkl +3 -0
- models/specialist_ESG_Info_xgb.pkl +3 -0
- models/specialist_Equity_Info_le.pkl +3 -0
- models/specialist_Equity_Info_xgb.pkl +3 -0
- models/specialist_Equity_Information_le.pkl +3 -0
- models/specialist_Equity_Information_xgb.pkl +3 -0
- models/specialist_Financial_Reporting_le.pkl +3 -0
- models/specialist_Financial_Reporting_xgb.pkl +3 -0
- models/specialist_Investment_Vehicle_le.pkl +3 -0
- models/specialist_Investment_Vehicle_xgb.pkl +3 -0
- models/specialist_Investor_Comm_le.pkl +3 -0
- models/specialist_Investor_Comm_xgb.pkl +3 -0
- models/specialist_Investor_Communication_le.pkl +3 -0
- models/specialist_Investor_Communication_xgb.pkl +3 -0
- models/specialist_Listing_and_Regulatory_le.pkl +3 -0
- models/specialist_Listing_and_Regulatory_xgb.pkl +3 -0
- models/specialist_Management_le.pkl +3 -0
- models/specialist_Management_xgb.pkl +3 -0
- models/specialist_MandA_and_Legal_le.pkl +3 -0
- models/specialist_MandA_and_Legal_xgb.pkl +3 -0
- requirements.txt +4 -0
README.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- financial-filings
|
| 4 |
+
- classification
|
| 5 |
+
- xgboost
|
| 6 |
+
- jina-embeddings-v3
|
| 7 |
+
library_name: xgboost
|
| 8 |
+
metrics:
|
| 9 |
+
- f1: 0.935
|
| 10 |
+
- accuracy: 0.95
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Financial Reports Hierarchical Classifier
|
| 14 |
+
|
| 15 |
+
This is a production-grade Hierarchical Cascade Classifier designed to categorize European financial filings into **29 distinct classes**.
|
| 16 |
+
|
| 17 |
+
## Architecture
|
| 18 |
+
- **Level 1 (Router):** A Jina-V3 + XGBoost model routing to 8 main categories.
|
| 19 |
+
- **Level 2 (Specialists):** Specialized XGBoost models for fine-grained classification.
|
| 20 |
+
|
| 21 |
+
## Performance
|
| 22 |
+
- **Global Weighted F1-Score:** 93.5%
|
| 23 |
+
- **Top-2 Router Accuracy:** 97.3%
|
| 24 |
+
|
| 25 |
+
## Usage
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
from huggingface_hub import snapshot_download
|
| 29 |
+
import sys
|
| 30 |
+
import os
|
| 31 |
+
|
| 32 |
+
# 1. Download Models
|
| 33 |
+
model_path = snapshot_download(repo_id="FinancialReports/hierarchical-filing-classifier")
|
| 34 |
+
|
| 35 |
+
# 2. Add path and import wrapper
|
| 36 |
+
sys.path.append(model_path)
|
| 37 |
+
from inference_wrapper import FinancialFilingClassifier
|
| 38 |
+
|
| 39 |
+
# 3. Predict
|
| 40 |
+
classifier = FinancialFilingClassifier(os.path.join(model_path, "models"))
|
| 41 |
+
|
| 42 |
+
text = "The Board declares a dividend of 5 cents per share..."
|
| 43 |
+
result = classifier.predict(text)
|
| 44 |
+
|
| 45 |
+
print(result)
|
| 46 |
+
# Expect: {'category': 'Equity Info', 'label': 'Notice of Dividend Amount', 'score': 0.98}
|
| 47 |
+
```
|
inference_wrapper.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import AutoModel
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
class FinancialFilingClassifier:
    """Two-level cascade classifier: a router model followed by specialists.

    A text is embedded with the ``jinaai/jina-embeddings-v3`` encoder, the
    router scores the coarse categories, and for each of the best-scoring
    categories a per-category specialist (loaded lazily from disk and cached
    in ``self.specialists``) picks a fine-grained label.

    Args:
        model_dir: Directory holding ``router_xgb.pkl``, ``router_le.pkl`` and
            the ``specialist_<name>_{xgb,le}.pkl`` files. A directory that
            keeps those files in a ``models`` sub-directory is accepted too.
    """

    def __init__(self, model_dir):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Loading Jina Encoder on {self.device}...")
        # NOTE(security): trust_remote_code=True runs Python code shipped in
        # the encoder repository; only use it with a repository you trust.
        self.encoder = AutoModel.from_pretrained(
            "jinaai/jina-embeddings-v3",
            trust_remote_code=True,
            # Half precision is only selected when running on CUDA.
            torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32,
        ).to(self.device)

        print("Loading XGBoost Cascade...")
        # The artefacts may sit directly in model_dir or in model_dir/models;
        # resolve once so that every later load uses the same directory.
        self.model_dir = self._resolve_model_dir(model_dir)
        # NOTE(security): joblib.load unpickles its input and can therefore
        # execute arbitrary code; only load artefacts from a trusted source.
        self.router = joblib.load(os.path.join(self.model_dir, "router_xgb.pkl"))
        self.router_le = joblib.load(os.path.join(self.model_dir, "router_le.pkl"))
        # Cache of loaded specialists: safe file-name stem -> (classifier, encoder).
        self.specialists = {}

    @staticmethod
    def _resolve_model_dir(model_dir):
        """Return the directory that actually contains ``router_xgb.pkl``.

        ``model_dir`` is returned unchanged when it holds the router file, or
        when no router file is found at all (so the subsequent load raises
        against the path the caller supplied). ``<model_dir>/models`` is
        returned only when the router file exists there and not in
        ``model_dir`` itself.
        """
        nested_dir = os.path.join(model_dir, "models")
        in_root = os.path.isfile(os.path.join(model_dir, "router_xgb.pkl"))
        in_nested = os.path.isfile(os.path.join(nested_dir, "router_xgb.pkl"))
        if in_nested and not in_root:
            return nested_dir
        return model_dir

    def _get_vector(self, text):
        """Build the feature row for ``text``.

        Returns:
            The encoder output for ``[text]`` with one extra trailing column
            holding ``log1p`` of the character length of ``str(text)``.
        """
        log_len = np.log1p(len(str(text)))
        with torch.no_grad():
            vec = self.encoder.encode([text], task="classification", max_length=8192)
        # NOTE(review): assumes encode() returns a 2-D array-like with one row
        # per input text, so hstack yields a single row — confirm against the
        # encoder's documentation.
        return np.hstack([vec, [[log_len]]])

    def _load_specialist(self, category):
        """Return the ``(classifier, label_encoder)`` pair for ``category``.

        The category name is turned into a file-name stem by replacing spaces
        and ``/`` with ``_`` and ``&`` with ``and``. Loaded pairs are cached
        under that stem.

        Returns:
            The cached or freshly loaded pair, or ``None`` when either pickle
            file does not exist.
        """
        safe_name = category.replace(" ", "_").replace("&", "and").replace("/", "_")
        if safe_name not in self.specialists:
            clf_path = os.path.join(self.model_dir, f"specialist_{safe_name}_xgb.pkl")
            le_path = os.path.join(self.model_dir, f"specialist_{safe_name}_le.pkl")
            try:
                # NOTE(security): unpickling; see the note in __init__.
                clf = joblib.load(clf_path)
                le = joblib.load(le_path)
            except FileNotFoundError:
                # No specialist was shipped for this category.
                return None
            self.specialists[safe_name] = (clf, le)
        return self.specialists[safe_name]

    def predict(self, text, *, top_k=2):
        """Classify ``text`` and return the best-scoring candidate.

        The ``top_k`` router categories with the highest probability are each
        refined by their specialist. A refined candidate is scored with the
        geometric mean of the router and specialist probabilities; a category
        without a specialist is scored with the router probability alone and
        uses the category name as its label.

        Args:
            text: The document text to classify.
            top_k: How many router categories to refine (default 2).

        Returns:
            A dict with the keys ``"category"``, ``"label"`` and ``"score"``
            (a Python float).

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError("top_k must be >= 1")

        vector = self._get_vector(text)
        router_probs = self.router.predict_proba(vector)[0]
        # Indices of the most probable router categories, best first.
        top_indices = np.argsort(router_probs)[::-1][:top_k]

        candidates = []
        for idx in top_indices:
            category = self.router_le.classes_[idx]
            router_conf = router_probs[idx]
            specialist = self._load_specialist(category)

            if specialist is None:
                candidates.append(
                    {"category": category, "label": category, "score": float(router_conf)}
                )
                continue

            clf, le = specialist
            spec_probs = clf.predict_proba(vector)[0]
            best_idx = np.argmax(spec_probs)
            combined_score = np.sqrt(router_conf * spec_probs[best_idx])
            candidates.append(
                {
                    "category": category,
                    "label": le.classes_[best_idx],
                    "score": float(combined_score),
                }
            )

        return max(candidates, key=lambda x: x['score'])
|
models/embeddings_cache.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4688ffc541223e3e9ec569c95eb068b6868a14cea6a137c629e178871365fd7a
|
| 3 |
+
size 113340544
|
models/router_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15a1d996fd2c0da4a615c9e597c908896674e27dad31e2ba7ab5102140e4320d
|
| 3 |
+
size 632
|
models/router_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d5620b5760644ba8ae76c97fc6aca687c5ce50b90b2100c1637b07e94fad427
|
| 3 |
+
size 73162563
|
models/specialist_AGM_Info_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:477abc9b577e160165346f6cbdaff0f8efc6c472a00a6cfae850500d99eed90d
|
| 3 |
+
size 598
|
models/specialist_AGM_Info_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94130f1f5053739d54116d3c999705fc87cd4eeaa68872fa9815914a49d7b073
|
| 3 |
+
size 496534
|
models/specialist_Annual_General_Meeting_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:477abc9b577e160165346f6cbdaff0f8efc6c472a00a6cfae850500d99eed90d
|
| 3 |
+
size 598
|
models/specialist_Annual_General_Meeting_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f74cc376e804cd27a1b1f2a79dc26b51259a8da911a2b198eca5e87ab1e28454
|
| 3 |
+
size 496534
|
models/specialist_Debt_Info_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6868fe3e56a10d06a704c2357ce1812f20a0710d28333081a2630ddc39b41eb3
|
| 3 |
+
size 530
|
models/specialist_Debt_Info_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13ce8e1806221950b6d16f3515ecf36b7282c22c61f2904e685903b9b2482b7c
|
| 3 |
+
size 138075
|
models/specialist_ESG_Info_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ef2b24df4fd6c9ec6f401db704153fa435a029f0a7a4414186aadc82e264ac9
|
| 3 |
+
size 535
|
models/specialist_ESG_Info_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50edb2b9d105feaeb23706701b2e3ad893b1cb2dc08c6ec9fd91ec3a13a5da19
|
| 3 |
+
size 129758
|
models/specialist_Equity_Info_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a5feef1ff16ac95bd52b4bfa6e34d709d5c2bf17066341fb1c9a0d1450098e7
|
| 3 |
+
size 592
|
models/specialist_Equity_Info_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:530f21f5820d8de857f06b32622d55a6863d7878ab197b4f393f448703e53e25
|
| 3 |
+
size 677055
|
models/specialist_Equity_Information_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:548ab85c86c92c57b6701ad0db7a0f801299ea1ac9e588e25a3cb92e922e7729
|
| 3 |
+
size 607
|
models/specialist_Equity_Information_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca5f93c20020a37341ef493fc368def5c02221aa889c752cf3c117536cb02d27
|
| 3 |
+
size 835053
|
models/specialist_Financial_Reporting_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d81c24a1796d222f1cfcf561af8efcde464fb2f90b5049bf7578c68bfd5f9573
|
| 3 |
+
size 566
|
models/specialist_Financial_Reporting_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2eb465b9104ff331bbf6cb8906df68747995ec155109b548a6e66f647f8ff0c1
|
| 3 |
+
size 666991
|
models/specialist_Investment_Vehicle_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d0f6618f6e2e99ecaa2e4a133479504f0bb478fe7a93466327c56f6fc349dca
|
| 3 |
+
size 522
|
models/specialist_Investment_Vehicle_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6141f6e1eda21e0e1245c32e10f962a91fc88196b380f94b527eb28c5c351158
|
| 3 |
+
size 158342
|
models/specialist_Investor_Comm_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5fae7ca10a1807a1ccb21bfa698afbee9e4535899fe83eede1b7a66f68d69a6
|
| 3 |
+
size 549
|
models/specialist_Investor_Comm_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ddf0256e5c5e2b0c334ec9562cdecceea17400b50b897f92ebda3887a1818cc
|
| 3 |
+
size 416531
|
models/specialist_Investor_Communication_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5fae7ca10a1807a1ccb21bfa698afbee9e4535899fe83eede1b7a66f68d69a6
|
| 3 |
+
size 549
|
models/specialist_Investor_Communication_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:285265c6f1a893ac68ccfbc5e8ab238aeb03fe88e54edd048e2d4fa64cede4fe
|
| 3 |
+
size 416531
|
models/specialist_Listing_and_Regulatory_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31366cfe84445656c65d04dac807007a8a54ddb1fed494b7c5cc451c624d8939
|
| 3 |
+
size 519
|
models/specialist_Listing_and_Regulatory_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8162901718ab78a33438bcc335348395c80d749d39746a2d5bd1dd2783a6353f
|
| 3 |
+
size 141206
|
models/specialist_Management_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed859c1e51c69a0cf1c1fd4dc9359bd69769b8fcff1961df3a9b6ede766924bb
|
| 3 |
+
size 573
|
models/specialist_Management_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57c28884a3c727a6b269b5edd496a6ce98bb17dedad9d6ce36315e966e14ed33
|
| 3 |
+
size 672975
|
models/specialist_MandA_and_Legal_le.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da4d59bc1c406c82dbc76aeec01d2b811cea97b79c1066efbb9193d5ee810a52
|
| 3 |
+
size 515
|
models/specialist_MandA_and_Legal_xgb.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f05a37c7cbffe8e5c998051959455654be81357afc3195fa65a8612f65b1e97
|
| 3 |
+
size 160790
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
transformers
|
| 3 |
+
xgboost
|
| 4 |
+
scikit-learn
|