silashundhausen committed on
Commit
51454d1
·
verified ·
1 Parent(s): 95eaf1f

Upload Phase 2 Hierarchical Models (93.5% F1)

Browse files
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - financial-filings
4
+ - classification
5
+ - xgboost
6
+ - jina-embeddings-v3
7
+ library_name: xgboost
8
+ metrics:
9
+ - f1: 0.935
10
+ - accuracy: 0.95
11
+ ---
12
+
13
+ # Financial Reports Hierarchical Classifier
14
+
15
+ This is a production-grade Hierarchical Cascade Classifier designed to categorize European financial filings into **29 distinct classes**.
16
+
17
+ ## Architecture
18
+ - **Level 1 (Router):** A Jina-V3 + XGBoost model routing to 8 main categories.
19
+ - **Level 2 (Specialists):** Specialized XGBoost models for fine-grained classification.
20
+
21
+ ## Performance
22
+ - **Global Weighted F1-Score:** 93.5%
23
+ - **Top-2 Router Accuracy:** 97.3%
24
+
25
+ ## Usage
26
+
27
+ ```python
28
+ from huggingface_hub import snapshot_download
29
+ import sys
30
+ import os
31
+
32
+ # 1. Download Models
33
+ model_path = snapshot_download(repo_id="FinancialReports/hierarchical-filing-classifier")
34
+
35
+ # 2. Add path and import wrapper
36
+ sys.path.append(model_path)
37
+ from inference_wrapper import FinancialFilingClassifier
38
+
39
+ # 3. Predict
40
+ classifier = FinancialFilingClassifier(os.path.join(model_path, "models"))
41
+
42
+ text = "The Board declares a dividend of 5 cents per share..."
43
+ result = classifier.predict(text)
44
+
45
+ print(result)
46
+ # Expect: {'category': 'Equity Info', 'label': 'Notice of Dividend Amount', 'score': 0.98}
47
+ ```
inference_wrapper.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ import joblib
4
+ import torch
5
+ from transformers import AutoModel
6
+ import os
7
+
8
class FinancialFilingClassifier:
    """Two-stage cascade classifier for financial filing text.

    A "router" model picks the most likely main categories from an embedding
    of the text; for each of the two best categories a per-category
    "specialist" model (loaded lazily, cached in ``self.specialists``) picks a
    fine-grained label.  The candidate with the highest score is returned.

    Model artifacts are looked up in ``model_dir`` first and, if absent there,
    in ``model_dir/models`` so that both a flat artifact directory and a
    repository snapshot whose artifacts live in a ``models/`` sub-directory
    can be passed in.

    NOTE(security): artifacts are read with ``joblib.load``, which unpickles
    arbitrary Python objects.  Only point ``model_dir`` at trusted files.
    """

    # Sub-directory consulted when an artifact is not directly in ``model_dir``.
    _MODEL_SUBDIR = "models"

    def __init__(self, model_dir):
        """Load the text encoder and the router model.

        Args:
            model_dir: Directory holding ``router_xgb.pkl``, ``router_le.pkl``
                and the ``specialist_*`` files, either directly or inside a
                ``models`` sub-directory.

        Raises:
            FileNotFoundError: If a router artifact is found in neither
                location (raised by ``joblib.load``).
        """
        # Assigned first because _artifact_path() reads it.
        self.model_dir = model_dir
        # Cache: sanitized category name -> (classifier, label encoder).
        self.specialists = {}

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Loading Jina Encoder on {self.device}...")
        self.encoder = AutoModel.from_pretrained(
            "jinaai/jina-embeddings-v3",
            trust_remote_code=True,
            # Half precision only on GPU; full precision on CPU.
            torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32
        ).to(self.device)

        print("Loading XGBoost Cascade...")
        self.router = joblib.load(self._artifact_path("router_xgb.pkl"))
        self.router_le = joblib.load(self._artifact_path("router_le.pkl"))

    def _artifact_path(self, filename):
        """Return the path of ``filename`` under ``model_dir`` or its ``models`` sub-directory.

        When the file exists in neither place the direct ``model_dir`` path is
        returned, so a subsequent load fails with an error naming that path.
        """
        direct = os.path.join(self.model_dir, filename)
        if os.path.isfile(direct):
            return direct
        nested = os.path.join(self.model_dir, self._MODEL_SUBDIR, filename)
        if os.path.isfile(nested):
            return nested
        return direct

    def _get_vector(self, text):
        """Build the feature row for ``text``: embedding plus a length feature.

        The last column is ``log1p(len(str(text)))``; the preceding columns
        are whatever ``self.encoder.encode`` returns for the single text
        (assumed to be a 2-D array-like with one row - TODO confirm against
        the encoder's documentation).
        """
        log_len = np.log1p(len(str(text)))
        with torch.no_grad():
            vec = self.encoder.encode([text], task="classification", max_length=8192)
        return np.hstack([vec, [[log_len]]])

    def _load_specialist(self, category):
        """Return the ``(classifier, label_encoder)`` pair for ``category``.

        The category name is sanitized into a file-name stem (spaces and
        ``/`` become ``_``, ``&`` becomes ``and``).  Loaded pairs are cached.

        Returns:
            The cached or freshly loaded pair, or ``None`` when either
            specialist file is missing.
        """
        safe_name = category.replace(" ", "_").replace("&", "and").replace("/", "_")
        if safe_name in self.specialists:
            return self.specialists[safe_name]

        clf_path = self._artifact_path(f"specialist_{safe_name}_xgb.pkl")
        le_path = self._artifact_path(f"specialist_{safe_name}_le.pkl")
        if not (os.path.isfile(clf_path) and os.path.isfile(le_path)):
            return None
        try:
            clf = joblib.load(clf_path)
            le = joblib.load(le_path)
        except FileNotFoundError:
            # File vanished between the existence check and the load.
            return None
        self.specialists[safe_name] = (clf, le)
        return self.specialists[safe_name]

    def predict(self, text):
        """Classify ``text`` and return the best-scoring candidate.

        The two categories with the highest router probability are each
        refined by their specialist, when one is available.

        Returns:
            A dict with keys ``"category"`` (router class), ``"label"``
            (specialist class, or the category itself when no specialist
            exists) and ``"score"`` (float).  With a specialist the score is
            ``sqrt(router_prob * specialist_prob)``; without one it is the
            router probability alone.
        """
        vector = self._get_vector(text)
        router_probs = self.router.predict_proba(vector)[0]
        # Indices of the two most probable router classes, best first.
        top_indices = np.argsort(router_probs)[::-1][:2]

        candidates = []
        for idx in top_indices:
            category = self.router_le.classes_[idx]
            router_conf = router_probs[idx]
            specialist = self._load_specialist(category)

            if specialist is not None:
                clf, le = specialist
                spec_probs = clf.predict_proba(vector)[0]
                best_idx = np.argmax(spec_probs)
                label = le.classes_[best_idx]
                spec_conf = spec_probs[best_idx]
                combined_score = np.sqrt(router_conf * spec_conf)
                candidates.append({"category": category, "label": label, "score": float(combined_score)})
            else:
                candidates.append({"category": category, "label": category, "score": float(router_conf)})

        return max(candidates, key=lambda x: x['score'])
models/embeddings_cache.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4688ffc541223e3e9ec569c95eb068b6868a14cea6a137c629e178871365fd7a
3
+ size 113340544
models/router_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15a1d996fd2c0da4a615c9e597c908896674e27dad31e2ba7ab5102140e4320d
3
+ size 632
models/router_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d5620b5760644ba8ae76c97fc6aca687c5ce50b90b2100c1637b07e94fad427
3
+ size 73162563
models/specialist_AGM_Info_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477abc9b577e160165346f6cbdaff0f8efc6c472a00a6cfae850500d99eed90d
3
+ size 598
models/specialist_AGM_Info_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94130f1f5053739d54116d3c999705fc87cd4eeaa68872fa9815914a49d7b073
3
+ size 496534
models/specialist_Annual_General_Meeting_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477abc9b577e160165346f6cbdaff0f8efc6c472a00a6cfae850500d99eed90d
3
+ size 598
models/specialist_Annual_General_Meeting_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f74cc376e804cd27a1b1f2a79dc26b51259a8da911a2b198eca5e87ab1e28454
3
+ size 496534
models/specialist_Debt_Info_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6868fe3e56a10d06a704c2357ce1812f20a0710d28333081a2630ddc39b41eb3
3
+ size 530
models/specialist_Debt_Info_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13ce8e1806221950b6d16f3515ecf36b7282c22c61f2904e685903b9b2482b7c
3
+ size 138075
models/specialist_ESG_Info_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef2b24df4fd6c9ec6f401db704153fa435a029f0a7a4414186aadc82e264ac9
3
+ size 535
models/specialist_ESG_Info_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50edb2b9d105feaeb23706701b2e3ad893b1cb2dc08c6ec9fd91ec3a13a5da19
3
+ size 129758
models/specialist_Equity_Info_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a5feef1ff16ac95bd52b4bfa6e34d709d5c2bf17066341fb1c9a0d1450098e7
3
+ size 592
models/specialist_Equity_Info_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:530f21f5820d8de857f06b32622d55a6863d7878ab197b4f393f448703e53e25
3
+ size 677055
models/specialist_Equity_Information_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:548ab85c86c92c57b6701ad0db7a0f801299ea1ac9e588e25a3cb92e922e7729
3
+ size 607
models/specialist_Equity_Information_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca5f93c20020a37341ef493fc368def5c02221aa889c752cf3c117536cb02d27
3
+ size 835053
models/specialist_Financial_Reporting_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d81c24a1796d222f1cfcf561af8efcde464fb2f90b5049bf7578c68bfd5f9573
3
+ size 566
models/specialist_Financial_Reporting_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eb465b9104ff331bbf6cb8906df68747995ec155109b548a6e66f647f8ff0c1
3
+ size 666991
models/specialist_Investment_Vehicle_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d0f6618f6e2e99ecaa2e4a133479504f0bb478fe7a93466327c56f6fc349dca
3
+ size 522
models/specialist_Investment_Vehicle_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6141f6e1eda21e0e1245c32e10f962a91fc88196b380f94b527eb28c5c351158
3
+ size 158342
models/specialist_Investor_Comm_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5fae7ca10a1807a1ccb21bfa698afbee9e4535899fe83eede1b7a66f68d69a6
3
+ size 549
models/specialist_Investor_Comm_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddf0256e5c5e2b0c334ec9562cdecceea17400b50b897f92ebda3887a1818cc
3
+ size 416531
models/specialist_Investor_Communication_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5fae7ca10a1807a1ccb21bfa698afbee9e4535899fe83eede1b7a66f68d69a6
3
+ size 549
models/specialist_Investor_Communication_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:285265c6f1a893ac68ccfbc5e8ab238aeb03fe88e54edd048e2d4fa64cede4fe
3
+ size 416531
models/specialist_Listing_and_Regulatory_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31366cfe84445656c65d04dac807007a8a54ddb1fed494b7c5cc451c624d8939
3
+ size 519
models/specialist_Listing_and_Regulatory_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8162901718ab78a33438bcc335348395c80d749d39746a2d5bd1dd2783a6353f
3
+ size 141206
models/specialist_Management_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed859c1e51c69a0cf1c1fd4dc9359bd69769b8fcff1961df3a9b6ede766924bb
3
+ size 573
models/specialist_Management_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57c28884a3c727a6b269b5edd496a6ce98bb17dedad9d6ce36315e966e14ed33
3
+ size 672975
models/specialist_MandA_and_Legal_le.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da4d59bc1c406c82dbc76aeec01d2b811cea97b79c1066efbb9193d5ee810a52
3
+ size 515
models/specialist_MandA_and_Legal_xgb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f05a37c7cbffe8e5c998051959455654be81357afc3195fa65a8612f65b1e97
3
+ size 160790
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ xgboost
4
+ scikit-learn