teofizzy commited on
Commit
7011b92
·
1 Parent(s): f778fd6

prototype stage

Browse files
.gitignore ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -------------------------------------------------------------------------
2
+ # 1. Python & Virtual Environments (Standard)
3
+ # -------------------------------------------------------------------------
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+ *.so
8
+ .venv/
9
+ venv/
10
+ env/
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # -------------------------------------------------------------------------
29
+ # 2. Security & Secrets (CRITICAL)
30
+ # Never commit your API keys (HuggingFace, OpenAI, etc.)
31
+ # -------------------------------------------------------------------------
32
+ .env
33
+ .env.local
34
+ .env.*
35
+ secrets.json
36
+ credentials.json
37
+
38
+ # -------------------------------------------------------------------------
39
+ # 3. Large AI Models & Weights (Hugging Face / PyTorch)
40
+ # These files are GBs in size; use DVC to track them instead.
41
+ # -------------------------------------------------------------------------
42
+ *.bin
43
+ *.pt
44
+ *.pth
45
+ *.ckpt
46
+ *.safetensors
47
+ *.onnx
48
+ models/
49
+ weights/
50
+ checkpoints/
51
+ lora-adapters/
52
+
53
+ # -------------------------------------------------------------------------
54
+ # 4. Data & RAG Stores (MshauriFedha Specific)
55
+ # Ignore raw PDFs and local vector databases (Chroma/Faiss).
56
+ # -------------------------------------------------------------------------
57
+ data/
58
+ datasets/
59
+ corpus/
60
+ # Ignore ChromaDB and Faiss local persistence folders
61
+ chroma_db/
62
+ chroma_storage/
63
+ faiss_indexes/
64
+ storage/
65
+
66
+ # -------------------------------------------------------------------------
67
+ # 5. CSCS & HPC Specifics
68
+ # Ignore the huge container images and Slurm log files.
69
+ # -------------------------------------------------------------------------
70
+ *.sif
71
+ *.sif.*
72
+ *.tar.gz
73
+ slurm-*.out
74
+ slurm-*.err
75
+ core.*
76
+
77
+ # -------------------------------------------------------------------------
78
+ # 6. Jupyter Notebooks
79
+ # -------------------------------------------------------------------------
80
+ .ipynb_checkpoints/
81
+ *-checkpoint.ipynb
82
+ # Optional: if you don't want to commit notebook outputs (just code)
83
+ # *.ipynb (Uncomment this if you only want to commit .py scripts)
84
+
85
+ # -------------------------------------------------------------------------
86
+ # 7. DVC (Data Version Control)
87
+ # We ignore the local cache/config but KEEP the .dvc files.
88
+ # -------------------------------------------------------------------------
89
+ /dvc_storage
90
+ .dvc/config.local
91
+ .dvc/tmp
92
+ .dvc/cache
93
+
94
+ # -------------------------------------------------------------------------
95
+ # 8. IDE & Editors
96
+ # -------------------------------------------------------------------------
97
+ .vscode/
98
+ .idea/
99
+ .DS_Store
100
+
101
+ # -------------------------------------------------------------------------
102
+ # 9. Docker / Deployment
103
+ # -------------------------------------------------------------------------
104
+ docker-compose.override.yml
105
+ .coverage
106
+ htmlcov/
107
+ *.err
108
+ *.out
notebooks/inspect_md.ipynb ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "62db404a-4930-4279-afa2-35ae4d11d857",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Inspect markdown files - KNBS and CBK\n",
9
+ "In this notebook, the core objective is to inspect and ingest the text from the already processed markdown files for CBK and KNBS."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 8,
15
+ "id": "e8696cf9-6995-4af3-937f-9154ee6d0a99",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "%load_ext autoreload\n",
20
+ "%autoreload 2"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 4,
26
+ "id": "44ff42b3-2377-4e1e-accb-02706eaae797",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import sys\n",
31
+ "import logging\n",
32
+ "import warnings\n",
33
+ "import os\n",
34
+ "import pandas as pd\n",
35
+ "warnings.filterwarnings(\"ignore\")\n",
36
+ "from pathlib import Path\n",
37
+ "\n",
38
+ "# Configure logging to see output in the notebook\n",
39
+ "logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 5,
45
+ "id": "0d114027-fee3-4186-9d02-f6535c728553",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "# Fix paths for src files\n",
50
+ "project_root = Path(os.getcwd()).parent\n",
51
+ "script_dir = project_root / \"src\"\n",
52
+ "if str(script_dir) not in sys.path:\n",
53
+ " sys.path.append(str(script_dir))"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 6,
59
+ "id": "64a8246f-58d2-4aa7-bd2c-c6a2741f8c19",
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "from load.start_ollama import start_ollama_server\n",
64
+ "from load.ingest_md import ingest_markdown_reports"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 7,
70
+ "id": "e6fa2ada-beac-448a-b1d2-535fd2b5d0b1",
71
+ "metadata": {},
72
+ "outputs": [
73
+ {
74
+ "name": "stdout",
75
+ "output_type": "stream",
76
+ "text": [
77
+ "✅ Ollama is already running.\n"
78
+ ]
79
+ },
80
+ {
81
+ "data": {
82
+ "text/plain": [
83
+ "True"
84
+ ]
85
+ },
86
+ "execution_count": 7,
87
+ "metadata": {},
88
+ "output_type": "execute_result"
89
+ }
90
+ ],
91
+ "source": [
92
+ "start_ollama_server()"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 4,
98
+ "id": "894e6598-3c57-4484-bc3c-be043f06b5ca",
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "# Define your paths\n",
103
+ "SCRATCH_DIR = os.environ.get(\"SCRATCH\")\n",
104
+ "KNBS_MARKDOWN_DIR = os.path.join(SCRATCH_DIR, \"mshauri-fedha/data/knbs/marker-output\")\n",
105
+ "VECTOR_DB_PATH = \"mshauri_fedha_chroma_db\"\n",
106
+ "EMBEDDING_MODEL = \"nomic-embed-text\""
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 5,
112
+ "id": "d8b23cd1-ea3f-4f7b-b553-dbeca35ee61a",
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "📄 Scanning for Markdown Reports in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/knbs/marker-output...\n"
120
+ ]
121
+ },
122
+ {
123
+ "name": "stderr",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "100%|██████████| 574/574 [00:00<00:00, 4626.46it/s]\n"
127
+ ]
128
+ },
129
+ {
130
+ "name": "stdout",
131
+ "output_type": "stream",
132
+ "text": [
133
+ " Loaded 574 report files.\n",
134
+ " ✂️ Split into 32717 chunks.\n",
135
+ "🧠 Appending to Vector Store...\n",
136
+ "INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
137
+ ]
138
+ },
139
+ {
140
+ "name": "stderr",
141
+ "output_type": "stream",
142
+ "text": [
143
+ "Ingesting Reports: 100%|██████████| 32717/32717 [1:09:38<00:00, 7.83chunk/s]"
144
+ ]
145
+ },
146
+ {
147
+ "name": "stdout",
148
+ "output_type": "stream",
149
+ "text": [
150
+ "\n",
151
+ "✅ Reports Added. Hybrid Knowledge Base is ready.\n"
152
+ ]
153
+ },
154
+ {
155
+ "name": "stderr",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "\n"
159
+ ]
160
+ }
161
+ ],
162
+ "source": [
163
+ "# Run the ingestion on KNBS text\n",
164
+ "ingest_markdown_reports(\n",
165
+ " markdown_dir=KNBS_MARKDOWN_DIR,\n",
166
+ " vector_db_path=VECTOR_DB_PATH,\n",
167
+ " model=EMBEDDING_MODEL\n",
168
+ ")"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 6,
174
+ "id": "95eae0fe-f71f-475d-9944-2f202a30174c",
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "from langchain_community.vectorstores import Chroma\n",
179
+ "from langchain_community.embeddings import OllamaEmbeddings"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 10,
185
+ "id": "72488cf7-3778-481a-8673-b7b08dd28e5a",
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "CBK_MARKDOWN_DIR = os.path.join(SCRATCH_DIR, \"mshauri-fedha/data/cbk/marker-output\")"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 11,
195
+ "id": "8b6ef094-487a-4148-9435-1e9546d6f5a3",
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "name": "stdout",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "📄 Scanning for Markdown Reports in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/marker-output...\n"
203
+ ]
204
+ },
205
+ {
206
+ "name": "stderr",
207
+ "output_type": "stream",
208
+ "text": [
209
+ "100%|██████████| 958/958 [00:11<00:00, 79.89it/s] \n"
210
+ ]
211
+ },
212
+ {
213
+ "name": "stdout",
214
+ "output_type": "stream",
215
+ "text": [
216
+ " Loaded 958 report files.\n",
217
+ " ✂️ Split into 4582 chunks.\n",
218
+ "🧠 Appending to Vector Store...\n"
219
+ ]
220
+ },
221
+ {
222
+ "name": "stderr",
223
+ "output_type": "stream",
224
+ "text": [
225
+ "Ingesting Reports: 100%|██████████| 4582/4582 [10:21<00:00, 7.37chunk/s]"
226
+ ]
227
+ },
228
+ {
229
+ "name": "stdout",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "\n",
233
+ "✅ Reports Added. Hybrid Knowledge Base is ready.\n"
234
+ ]
235
+ },
236
+ {
237
+ "name": "stderr",
238
+ "output_type": "stream",
239
+ "text": [
240
+ "\n"
241
+ ]
242
+ }
243
+ ],
244
+ "source": [
245
+ "# Run the ingestion on CBK text\n",
246
+ "ingest_markdown_reports(\n",
247
+ " markdown_dir=CBK_MARKDOWN_DIR,\n",
248
+ " vector_db_path=VECTOR_DB_PATH,\n",
249
+ " model=EMBEDDING_MODEL\n",
250
+ ")"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "markdown",
255
+ "id": "d85e55c2-1f09-4550-b2d2-4ae5ccae5ae3",
256
+ "metadata": {},
257
+ "source": [
258
+ "## Test the performance\n",
259
+ "Using similarity search, we test the performance of the embedding model used here."
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 15,
265
+ "id": "6aece829-bceb-498b-a549-cf29287b1bea",
266
+ "metadata": {},
267
+ "outputs": [
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "🔎 Checking for tables in: 'Interest rates commercial banks 2020'\n",
273
+ "\n",
274
+ "---------------------------------------------------\n",
275
+ "📰 Source: knbs_batch_42_193220_2020kenyafactsfigures\n",
276
+ "| Value of shares traded (KSh Bn) | 209 | 147 | 172 | 176 | 154 |\n",
277
+ "| Equities market capitalization (KSh Bn) | 2,054 | 1,932 | 2,522 | 2,102 | 2,540 |\n",
278
+ "| NSE 20 Share Index (Base Jan 1966=100) | 4,040 | 3,186 | 3,712 | 2,834 | 2,654 |\n",
279
+ "\n",
280
+ "<sup>1</sup> W eig hted av erag e commercial bank interes t rates\n",
281
+ "\n",
282
+ "# **Monetary and Financial Statistics**\n",
283
+ "\n",
284
+ "**Table 21: Commercial Banks' Deposits, Loans and Advances, 2015 - 2019**\n",
285
+ "\n",
286
+ "| 2015 | 2016 | 2017 | 2018 | 2019* | |\n",
287
+ "|------------------------------------------------|-------------|-------------|-------------|-------------|-------------|\n",
288
+ "| Commercial bank<br>s (KS<br>h million) | | | | | |\n",
289
+ "| Deposits<br>liabilities | 2,661,140.0 | 2,771,710.6 | 3,068,723.8 | 3,414,705.5 | 3,634,995.9 |\n",
290
+ "| Total loans<br>and advances | 2,873,799.6 | 3,127,888.0 | 3,318,907.0 | 3,543,932.0 | 3,838,796.6 |\n",
291
+ "| Public sector | 630,049.0 | 814,585.0 | 930,174.5 | 1,057,217.2 | 1,177,091.3 |\n",
292
+ "| Private sector | 2,243,750.7 | 2,317,025.0 | 2,415,922.9 | 2,486,714.9 | 2,664,382.5 |\n",
293
+ "| Number of authorised institutions in operation | | | | | |\n",
294
+ "| Licensed banks | 43 | 43 | 42 | 42 ...\n",
295
+ "\n",
296
+ "---------------------------------------------------\n",
297
+ "📰 Source: knbs_batch_42_193220_thekenyapovertyreport2020\n",
298
+ "| Interest rate on commercial bank loans and advances $(\\%)$ | 17.44 | 2.607 | 1 | 9 | | 12.02 |\n",
299
+ "| Formal Employment sector (000's) | 2,601 | | ,,,, ...\n",
300
+ "\n",
301
+ "---------------------------------------------------\n",
302
+ "📰 Source: knbs_batch_35_193128_2021kenyafactsfigures\n",
303
+ "| 100 Japanese Yen | 93.55 | 92.22 | 91.74 | 93.59 | 99.80 |\n",
304
+ "| 1 SA Rand | 6.93 | | | | |\n",
305
+ "| KSh /TSh | 21.54 | 21.63 | 22.48 | 22.63 | 21.76 |\n",
306
+ "| KSh/ Ush | 33.68 | 34.92 | 36.81 | 36.32 | 34.93 |\n",
307
+ "| Overall Weighted Index 2009=100 | 114.30 | 114.83 | 116.52 | 115.66 | 113.04 |\n",
308
+ "\n",
309
+ "<sup>\\*</sup>Provisional\n",
310
+ "\n",
311
+ "<sup>2</sup> Countries in the Euro area included in the computation of Trade Weighted Fisher's Ideal Index are: Germany, France, Switzerland, Netherlands, Belgium and Italy.\n",
312
+ "\n",
313
+ "**Table 19: Nominal Interest Rates, 2016 – 2020**\n",
314
+ "\n",
315
+ "Percentage **2016 2017 2018 2019 2020** 91-day Treasury bill rate................................... 8.44 8.01 7.34 7.17 6.90 Inter-Bank Offered Rate.................................... 5.92 7.27 8.15 6.03 5.29 Overdraft Rates.................................................. 13.49 13.54 12.17 11.67 11.51 Commercial Banks Loans and Advances...... 13.69 13.64 12.51 12.24 12.02 Savings deposits rate....................................... 6.37 6.91 5.13 4.02 2.70\n",
316
+ "\n",
317
+ "<sup>1</sup>Weighted average commercial bank interest rates\n",
318
+ "\n",
319
+ "Table 20: Securities Exchange, 2016 - 2020\n",
320
+ "\n",
321
+ "| 2016 | 2017 | 2018 | 2019 | 2020* | |\n",
322
+ "|-----------------------------------------|-------|-------|-------|-------|-------|\n",
323
+ "| Value of shares traded (KSh Bn) | 147 | 172 | 176 | 154 | 149 |\n",
324
+ "| Equities Market c...\n",
325
+ "\n",
326
+ "---------------------------------------------------\n",
327
+ "📰 Source: knbs_batch_32_193040_2025_facts_and_figures\n",
328
+ "| Kenya Development Corporation2 | - | 10 | 9 | 4 | 7 | - | 521.4 | 613.8 | 510.0 | 599.9 |\n",
329
+ "| Industrial and Commercial Development Corporation | 3 | - | - | - | - | 100.9 | - | - | - | - |\n",
330
+ "| Sub - total | 320 | 328 | 303 | 362 | 343 | 1,096.3 | 1,394.5 | 1,690.8 | 1,869.9 | 2,304.9 |\n",
331
+ "| All other commercial banks1 | - | - | - | - | - | 410,640 | 463,981 | 527,235 | 637,513 | 560,643 |\n",
332
+ "| TOTAL | 320 | 328 | 303 | 362 | 343 | 411,736 | 465,376 | 528,926 | 639,383 | 562,948 |\n",
333
+ "\n",
334
+ "*<sup>\\*</sup> Provisional* \n",
335
+ "\n",
336
+ "*<sup>1</sup> Source: Central Bank of Kenya (excludes DBK).* \n",
337
+ "\n",
338
+ "*<sup>2</sup> IDB Capital, Tourism Finance Corporation and ICDC merged to form KDC in 2020* \n",
339
+ "\n",
340
+ "![](_page_92_Picture_0.jpeg)\n",
341
+ "\n",
342
+ "**Table 43: Selected EPZ Performance Indicators, 2020 - 2024**\n",
343
+ "\n",
344
+ "| | Unit | 2020 | 2021 | 2022 | 2023 | 2024* |\n",
345
+ "|------------------------------------------|----------------|---------|---------|---------|---------|---------|\n",
346
+ "| Gazetted Zones | Number | 76 | 82 | 89 | 102 | 105 |\n",
347
+ "| ...\n",
348
+ "\n",
349
+ "---------------------------------------------------\n",
350
+ "📰 Source: knbs_batch_34_193116_2022economicsurvey\n",
351
+ "| Savings deposits. | - | - | - | - | - | - | |\n",
352
+ "| Loan and Advances (maximum) | 13.25 | 12.10 | 12.19 | 12.04 | 12.04 | 12.16 | |\n",
353
+ "| Overdraft. | - | - | - | - | - | - | |\n",
354
+ "| Loans-Deposits Spread | - | - | - | - | 5.69 | - | |\n",
355
+ "\n",
356
+ "*Source: Central Bank of Kenya.*\n",
357
+ "\n",
358
+ "*Selected financial aggregates values are deflated using December Consumer Price Indices*\n",
359
+ "\n",
360
+ "*Weighted average commercial bank interest rates*\n",
361
+ "\n",
362
+ "*<sup>\\*</sup>Provisional*\n",
363
+ "\n",
364
+ "**4.12.** Table 4.7 shows the selected real principal interest rates from 2017 to 2021. Real interest rates reflect the real cost of borrowing, savings and return on investment. The weighted average real interest rate for commercial bank deposits increased to 0.77 per cent in 2021 from 0.68 per cent in 2020. Commercial banks loans and advances rate increased from 6.40 per cent in December 2020 to 6.43 per cent in December 2021. The real average interest rate for the 91-day Treasury Bills increased from 1.28 per cent in December 2020 to 1.53 per cent in December 2021 while the inter-bank rate declined further from negative 0.33 per cent in December 2020 to negative 0.63 per cent in December 2021.\n",
365
+ "\n",
366
+ "**Table 4.7: Selected Real Principal Interest Rates, 2017 – 2021**\n",
367
+ "\n",
368
+ "*Per cent*\n",
369
+ "\n",
370
+ "| Average Interest Rate for 91-day Treasury Bills | Year | Nominal Interest | Inflation Rate | Real Interest1 |\n",
371
+ "|---------------------------...\n",
372
+ "\n"
373
+ ]
374
+ }
375
+ ],
376
+ "source": [
377
+ "# Connect\n",
378
+ "embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=\"http://127.0.0.1:25000\")\n",
379
+ "vectorstore = Chroma(persist_directory=VECTOR_DB_PATH, embedding_function=embeddings)\n",
380
+ "\n",
381
+ "# Query for a known table\n",
382
+ "query = \"Interest rates commercial banks 2020\" \n",
383
+ "\n",
384
+ "results = vectorstore.similarity_search(query, k=5)\n",
385
+ "\n",
386
+ "print(f\"🔎 Checking for tables in: '{query}'\\n\")\n",
387
+ "\n",
388
+ "for i, doc in enumerate(results):\n",
389
+ " content = doc.page_content\n",
390
+ " print(\"---------------------------------------------------\")\n",
391
+ " print(f\"Source: {doc.metadata.get('source', 'N/A').split('/')[-1].split('.')[0].replace('-', '')}\")\n",
392
+ "    # Print first 1500 chars to see if headers align\n",
393
+ " print(content[:1500] + \"...\\n\")"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": null,
399
+ "id": "dc9072f0-fcd2-4b24-9050-795e64b26ff6",
400
+ "metadata": {},
401
+ "outputs": [],
402
+ "source": []
403
+ }
404
+ ],
405
+ "metadata": {
406
+ "kernelspec": {
407
+ "display_name": "Python 3 (ipykernel)",
408
+ "language": "python",
409
+ "name": "python3"
410
+ },
411
+ "language_info": {
412
+ "codemirror_mode": {
413
+ "name": "ipython",
414
+ "version": 3
415
+ },
416
+ "file_extension": ".py",
417
+ "mimetype": "text/x-python",
418
+ "name": "python",
419
+ "nbconvert_exporter": "python",
420
+ "pygments_lexer": "ipython3",
421
+ "version": "3.12.3"
422
+ }
423
+ },
424
+ "nbformat": 4,
425
+ "nbformat_minor": 5
426
+ }
notebooks/inspect_news.ipynb ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4b0cd97a-60f0-4582-a8c3-b4d9dbf7ab03",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Inspect Business News\n",
9
+ "In this notebook, the study will seek to inspect the news files (stored as CSV files) and ingest their content into the vector DB"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "id": "ed41b128-7baa-43e2-9b1a-2cfdc955440a",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "%load_ext autoreload\n",
20
+ "%autoreload 2"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 6,
26
+ "id": "c80c78f8-8118-46b5-b954-e0b15d935ced",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "import pandas as pd\n",
31
+ "import glob\n",
32
+ "import os\n",
33
+ "import sys\n",
34
+ "import warnings\n",
35
+ "from pathlib import Path\n",
36
+ "warnings.filterwarnings(\"ignore\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 7,
42
+ "id": "7801b40d-c6a9-44cd-8afb-e544731aeb52",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "# Fix paths for src files\n",
47
+ "project_root = Path(os.getcwd()).parent\n",
48
+ "script_dir = project_root / \"src\"\n",
49
+ "if str(script_dir) not in sys.path:\n",
50
+ " sys.path.append(str(script_dir))"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 8,
56
+ "id": "2f940902-b38d-4e6e-9566-55f511dc0bc9",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "from load.explore_news_schema import analyze_schemas"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 4,
66
+ "id": "788c9e1d-5c17-4387-9153-77a31cd26eec",
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "name": "stdout",
71
+ "output_type": "stream",
72
+ "text": [
73
+ "🔍 Scanning 22 files in '/capstor/scratch/cscs/tligawa/mshauri-fedha/data/news'...\n",
74
+ "\n",
75
+ "--- Schema Report ---\n",
76
+ "\n",
77
+ "TYPE 1: Found in 5 files\n",
78
+ "Columns: ['description', 'published date', 'publisher', 'title', 'url']\n",
79
+ "Examples: ['google_news_10-11-2025.csv', 'google_news_10-11-2025-19-27.csv', 'google_news_19-11-2025-19-49.csv'] ... (+2 others)\n",
80
+ "\n",
81
+ "TYPE 2: Found in 7 files\n",
82
+ "Columns: ['authors', 'date', 'full_content', 'image', 'source', 'summary', 'title', 'url', 'word_count']\n",
83
+ "Examples: ['kenya_news_full_27-10-2025.csv', 'kenya_news_full_17-11-2025-17-52.csv', 'newsdata_10-11-2025.csv'] ... (+4 others)\n",
84
+ "\n",
85
+ "TYPE 3: Found in 10 files\n",
86
+ "Columns: ['content', 'date', 'source', 'title', 'url']\n",
87
+ "Examples: ['gnews_19-11-2025-19-49.csv', 'the_news_10-11-2025.csv', 'the_news_19-11-2025-19-49.csv'] ... (+7 others)\n",
88
+ "\n",
89
+ "--- Date Format Sample ---\n",
90
+ "Sample from column 'published date' in google_news_10-11-2025.csv:\n",
91
+ "['Sun, 09 Nov 2025 03:15:00 GMT', 'Sun, 09 Nov 2025 18:45:00 GMT', 'Tue, 04 Nov 2025 06:00:00 GMT', 'Tue, 04 Nov 2025 17:33:08 GMT', 'Wed, 05 Nov 2025 09:38:38 GMT']\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "# Path to where you just downloaded the files\n",
97
+ "SCRATCH_DIR = os.environ.get(\"SCRATCH\")\n",
98
+ "NEWS_DIR = os.path.join(SCRATCH_DIR, \"mshauri-fedha/data/news\")\n",
99
+ "\n",
100
+ "# Run the exploration\n",
101
+ "analyze_schemas(NEWS_DIR)"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 9,
107
+ "id": "f65ee0d4-2bac-46f4-8cd4-44ac9c903ee3",
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "from load.ingest_news import ingest_news_data"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 10,
117
+ "id": "c7864bbb-e731-45b4-bd9e-4dbf9a063653",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "from load.start_ollama import start_ollama_server, pull_embedding_model"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 7,
127
+ "id": "279c443b-4502-40d9-9b29-06e1929ba842",
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "name": "stdout",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "🚀 Starting Ollama Server...\n",
135
+ "⏳ Waiting for server to boot...\n",
136
+ "✅ Server started successfully.\n"
137
+ ]
138
+ },
139
+ {
140
+ "data": {
141
+ "text/plain": [
142
+ "True"
143
+ ]
144
+ },
145
+ "execution_count": 7,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "start_ollama_server()"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 8,
157
+ "id": "03b5c2c0-4d6a-4e06-ab88-806ca5eaa2d6",
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "⬇️ Requesting pull for 'nomic-embed-text'...\n",
165
+ " success manifest digest00%\n",
166
+ "✅ Model 'nomic-embed-text' installed successfully!\n"
167
+ ]
168
+ }
169
+ ],
170
+ "source": [
171
+ "# pull embedding model\n",
172
+ "pull_embedding_model(\"nomic-embed-text\")"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 9,
178
+ "id": "3a4a7e63-934c-4b0d-824f-f525278139dc",
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "🚀 Found 22 news files. Processing...\n"
186
+ ]
187
+ },
188
+ {
189
+ "name": "stderr",
190
+ "output_type": "stream",
191
+ "text": [
192
+ "Reading CSVs: 100%|██████████| 22/22 [00:00<00:00, 179.19file/s]"
193
+ ]
194
+ },
195
+ {
196
+ "name": "stdout",
197
+ "output_type": "stream",
198
+ "text": [
199
+ " 📉 Condensed into 198 unique articles.\n",
200
+ "🧠 Embedding 455 chunks into Vector DB...\n"
201
+ ]
202
+ },
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "\n",
208
+ "/users/tligawa/mshauri-fedha/notebooks/ingest_news.py:149: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.\n",
209
+ " embeddings = OllamaEmbeddings(model=model, base_url=\"http://127.0.0.1:25000\")\n",
210
+ "/users/tligawa/mshauri-fedha/notebooks/ingest_news.py:150: LangChainDeprecationWarning: The class `Chroma` was deprecated in LangChain 0.2.9 and will be removed in 1.0. An updated version of the class exists in the `langchain-chroma package and should be used instead. To use it run `pip install -U `langchain-chroma` and import as `from `langchain_chroma import Chroma``.\n",
211
+ " vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)\n"
212
+ ]
213
+ },
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
219
+ ]
220
+ },
221
+ {
222
+ "name": "stderr",
223
+ "output_type": "stream",
224
+ "text": [
225
+ "Embedding News: 100%|██████████| 455/455 [00:21<00:00, 21.06chunk/s]"
226
+ ]
227
+ },
228
+ {
229
+ "name": "stdout",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "\n",
233
+ "✅ News Ingestion Complete.\n"
234
+ ]
235
+ },
236
+ {
237
+ "name": "stderr",
238
+ "output_type": "stream",
239
+ "text": [
240
+ "\n"
241
+ ]
242
+ }
243
+ ],
244
+ "source": [
245
+ "VECTOR_DB = \"mshauri_fedha_chroma_db\"\n",
246
+ "EMBEDDING_MODEL = \"nomic-embed-text\" # Make sure this matches your existing DB model\n",
247
+ "\n",
248
+ "# Run\n",
249
+ "ingest_news_data(NEWS_DIR, VECTOR_DB, EMBEDDING_MODEL)"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": null,
255
+ "id": "d993f352-44a4-4c73-ae2f-cc3b2e85a882",
256
+ "metadata": {},
257
+ "outputs": [],
258
+ "source": [
259
+ "from langchain_community.vectorstores import Chroma\n",
260
+ "from langchain_community.embeddings import OllamaEmbeddings"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "ee969dc7-6ecb-45e3-ab89-875536177543",
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "# --- CONFIG ---\n",
271
+ "VECTOR_DB_PATH = \"mshauri_fedha_chroma_db\"\n",
272
+ "EMBEDDING_MODEL = \"nomic-embed-text\"\n",
273
+ "OLLAMA_URL = \"http://127.0.0.1:25000\""
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "id": "e681dd43-f0f3-47ec-8445-4d188eb7886a",
280
+ "metadata": {},
281
+ "outputs": [],
282
+ "source": [
283
+ "# Connect to DB\n",
284
+ "print(\"Connecting to Vector Store...\")\n",
285
+ "embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=OLLAMA_URL)\n",
286
+ "vectorstore = Chroma(persist_directory=VECTOR_DB_PATH, embedding_function=embeddings)"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": null,
292
+ "id": "5decfff3-26d7-4784-b6b0-7fd98a34f3ca",
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": [
296
+ "# Get Stats\n",
297
+ "count = vectorstore._collection.count()\n",
298
+ "print(f\"Total Documents stored: {count}\")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 10,
304
+ "id": "d55aab85-10c7-44a0-83cf-69c125416b61",
305
+ "metadata": {},
306
+ "outputs": [
307
+ {
308
+ "name": "stdout",
309
+ "output_type": "stream",
310
+ "text": [
311
+ "🔌 Connecting to Vector Store...\n",
312
+ "✅ Total Documents stored: 455\n",
313
+ "\n",
314
+ "👀 Random Sample Document:\n",
315
+ "--- Metadata ---\n",
316
+ "Date: 2025-11-03\n",
317
+ "Source: african markets\n",
318
+ "Type: news\n",
319
+ "\n",
320
+ "--- Content (First 300 chars) ---\n",
321
+ "Title: BGFI Holding finally gets regulatory approval for its BVMAC IPO: inside a tumultuous IPO journey - african markets\n",
322
+ "Date: 2025-11-03\n",
323
+ "Source: african markets\n",
324
+ "\n",
325
+ "BGFI Holding finally gets regulatory approval for its BVMAC IPO: inside a tumultuous IPO journey african markets...\n"
326
+ ]
327
+ }
328
+ ],
329
+ "source": [
330
+ "# Peek at a Sample\n",
331
+ "print(\"\\n Random Sample Document:\")\n",
332
+ "# We fetch 1 random ID just to peek\n",
333
+ "result = vectorstore.get(limit=1)\n",
334
+ "\n",
335
+ "if result['ids']:\n",
336
+ " meta = result['metadatas'][0]\n",
337
+ " content = result['documents'][0]\n",
338
+ " \n",
339
+ " print(f\"--- Metadata ---\")\n",
340
+ " print(f\"Date: {meta.get('date')}\")\n",
341
+ " print(f\"Source: {meta.get('source')}\")\n",
342
+ " print(f\"Type: {meta.get('type')}\")\n",
343
+ " \n",
344
+ " print(f\"\\n--- Content (First 300 chars) ---\")\n",
345
+ " print(content[:300] + \"...\")\n",
346
+ "else:\n",
347
+ " print(\"Database is empty!\")"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 11,
353
+ "id": "f1192ee8-f09a-4e11-84df-aacea487485e",
354
+ "metadata": {},
355
+ "outputs": [
356
+ {
357
+ "name": "stdout",
358
+ "output_type": "stream",
359
+ "text": [
360
+ "\n",
361
+ "🔎 Searching for: 'How have the protests impacted the Kenyan economy?'...\n",
362
+ "Found 10 relevant articles:\n",
363
+ "\n",
364
+ "Result #1 -------------------------\n",
365
+ "📅 Date: 2025-09-29\n",
366
+ "📰 Source: Devdiscourse\n",
367
+ "📝 Excerpt: Title: Madagascar's Government Dissolution Amidst Gen Z-Inspired Protests: A Call for Dialogue and Reform Date: 2025-09-29 Source: Devdiscourse In response to youth-led protests over worsening water and power shortages, Malagasy President Andry Rajoelina announced the dissolution of the government on Monday. The unrest, largely influenced by Gen Z movements in Kenya and Nepal, marks the largest such demonstrations in Madagascar in years. These rallies significantly challenge Rajoelina's leadership since his recent 2023 re-election. The president offered an apology for governmental shortcomings and vowed to engage in dialogue with the youth while ensuring support for affected businesses. The protests have seen significant casualties, with both protestors and bystanders affected, although official figures remain contested. (With inputs from agencies.)...\n",
368
+ "------------------------------------\n",
369
+ "\n",
370
+ "Result #2 -------------------------\n",
371
+ "📅 Date: 2025-10-27\n",
372
+ "📰 Source: capitalfm\n",
373
+ "📝 Excerpt: Title: Motorists protest ‘hidden’ concession in Rironi-Mau Summit Expressway Date: 2025-10-27 Source: capitalfm...\n",
374
+ "------------------------------------\n",
375
+ "\n",
376
+ "Result #3 -------------------------\n",
377
+ "📅 Date: 2024-06-21\n",
378
+ "📰 Source: riotimesonline.com\n",
379
+ "📝 Excerpt: In Kenya, a significant backlash has emerged against the 2024 Finance Bill, which proposes various tax increases. The bill was approved by the Kenyan parliament. This happened despite modifications by Parliament’s Finance Committee aimed at mitigating public dissatisfaction by dropping several contentious tax proposals. This legislative move coincided with a wave of protests primarily driven by Gen Z and millennials, marking a pivotal moment in the country’s political landscape. Protests erupted across multiple Kenyan cities, including Nairobi and key areas within President William Ruto’s Rift Valley strongholds like Eldoret and Kericho. These demonstrations were notable not only for their scope but also for their organization. They were orchestrated online without the backing of established political parties. The phrase “Tuko wengi,” meaning “We are many,” echoed through the streets of Eldoret, symbolizing the protesters’ unity and significant numbers. The demographic profile of the protesters is particularly noteworthy. Approximately 75% of Kenya’s population is under 35, with a median age of 19. Young Kenyans Mobilize This young populace could be a transformative force in future electoral processes, potentially starting with the 2027 general elections. Their active engagement in these protests reflects a broader trend of increasing political mobilization through digital platforms. This trend facilitates widespread participation and amplifies their collective voice. This series of protests underscores a deep-seated discontent among the youth, who perceive the tax hikes as detrimental to their economic prospects. Their willingness to publicly express their dissatisfaction highlights a shift towards more grassroots political involvement. This suggests that these young, digitally-savvy generations could significantly shape the future of Kenyan politics...\n",
380
+ "------------------------------------\n",
381
+ "\n",
382
+ "Result #4 -------------------------\n",
383
+ "📅 Date: 2023-07-19\n",
384
+ "📰 Source: abcnews.go.com\n",
385
+ "📝 Excerpt: . Luis Tato/AFP via Getty Images At least five protesters were injured on Wednesday as demonstrators clashed with police. Amnesty International Kenya said, said that \"para-military police officers and armored water cannon trucks [are] already patrolling and engaging protestors across several towns and neighborhoods.\" In Kibera -- a stronghold of the opposition -- protests turned violent, with demonstrators setting fire to tires and furniture, stones being pelted, and tear gas being deployed by police. In the most recent round of anti-government protests at least 23 people are reported to have been killed according to the U.N., with over 300 arrested. Protests have also been reported in Kenya's Kisumu, Kisii and Migori counties. A Kenya Police Officer shoots a tear gas canister to disperse some protesters as they gather to demonstrate in Nairobi, Kenya, on July 12, 2023. Luis Tato/AFP via Getty Images Kenya's Ministry of Education also announced that all primary and secondary schools in Nairobi and the coastal city Mombasa are to close on Wednesday as a \"precautionary measure\" following \"credible security intelligence.\" Several businesses also remain closed. The protests come after Ruto last month signed into law a contentious finance bill at Nairobi's State House that proposed doubling the tax levied on fuel from 8% to 16%. The bill aimed to aid in offsetting Kenya's external debt, officials said. However, the bill will have a ripple effect on the price of basic commodities, compounding on the economic strain of Kenyans already struggling with the rising cost of living. Riot police detain a supporter of Kenya's opposition leader Raila Odinga as he participates in an anti-government protest against the imposition of tax hikes by the government in Nairobi, Kenya, July 19, 2023. 
Thomas Mukoya/Reuters Implementation of the Bill -- which was due to come into effect on July 1 -- was halted by Kenya's High Court following a case brought by opposition Sen...\n",
386
+ "------------------------------------\n",
387
+ "\n",
388
+ "Result #5 -------------------------\n",
389
+ "📅 Date: 2023-07-19\n",
390
+ "📰 Source: abcnews.go.com\n",
391
+ "📝 Excerpt: An opposition leader called for three days of protests against a finance bill. 6 dead as Kenya rocked by nationwide anti-government protests over gas tax, Amnesty says A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. LONDON -- Kenya was bracing for days of anti-government protests led by the government's political opposition over a contentious new finance bill and the rising cost of living At least six people were shot and killed and at least a dozen others were injured on Wednesday, the first day of a planned three-day protest against higher taxes, Mathias Kinyoda, of Amnesty International Kenya, told ABC News. At least 87 demonstrators were arrested nationwide, he said. The protests were called by opposition leader Raila Odinga. The unrest was set to take place despite Kenya's President William Ruto vowing no protests would take place in the East African Nation. \"We are here, first and foremost, to confirm that the peaceful protests planned for Wednesday, Thursday and Friday this week are on as earlier declared by our leadership,\" read a statement by Odinga's party, Azimio La Umoja, sent to ABC News. A Kenya Police Officer runs away from a group of opposition supporters chasing him and throwing stones during anti-government protests in Nairobi on July 19, 2023. Luis Tato/AFP via Getty Images At least five protesters were injured on Wednesday as demonstrators clashed with police...\n",
392
+ "------------------------------------\n",
393
+ "\n",
394
+ "Result #6 -------------------------\n",
395
+ "📅 Date: 2022-07-20\n",
396
+ "📰 Source: thesouthafrican.com\n",
397
+ "📝 Excerpt: Inflation in Kenya: why and how to fix it; Image: Adobe stock Inflation has hit many countries recently, from the United States to Sri Lanka. In Kenya, too, the rising prices of basic commodities have left most citizens wondering what’s going on. The price of a 2kg packet of maize and wheat flour hit 200 shillings (US$2) from a low of 120 shillings in about three months. That is a 67% increase. The 12-month overall inflation rate reached 7.91% in June 2022. Politicians eyeing Kenya’s 9 August polls have been offering solutions in exchange for votes. ADVERTISEMENT Kenya’s average annual per capita income is US$5 270. With inflation, citizens lose even this limited purchasing power. The same money buys less. Wages and salaries do not go up fast enough. Citizens’ discontent can change the way they vote in democratic countries or lead to violence in undemocratic ones. The Kenya African National Union, which ruled the country from independence in 1963, was voted out in 2002 partly because of citizens’ discontent over the state of the economy. And in the US, economic discontent has been a big factor in voting; it led to Donald Trump’s win in the 2016 presidential polls. That’s why politicians are so quick to promise relief. But can they provide it? The two key drivers of inflation in Kenya’s consumer price index are food and energy. Russia’s war on Ukraine has raised the price of oil to the highest level in history, which spills over to the rest of global economy. And about 30% to 50% of Kenya’s imported wheat comes from Russia and Ukraine. Ukraine is exporting 60% less wheat this year compared with 2021, leading to a rise in price of wheat and its derivatives like bread. Unreliable rains have cut domestic production of maize and other food crops in Kenya. Production is about 15% to 20% below the five-year average. ADVERTISEMENT Some of the drivers of Kenya’s inflation are local while others are external and beyond its control...\n",
398
+ "------------------------------------\n",
399
+ "\n",
400
+ "Result #7 -------------------------\n",
401
+ "📅 Date: 2025-10-21\n",
402
+ "📰 Source: Crypto News\n",
403
+ "📝 Excerpt: . While Tether hasn’t broken down that figure by region, its gaze is fixed on Africa, where it sees its next chapter of growth unfolding. The firm points to a Chainalysis report revealing a 52% explosion in on-chain transaction volume across Sub-Saharan Africa, which rocketed past $205 billion in a single year. Behind that surge are small business owners and individuals turning to digital assets as a lifeline. They’re navigating the same harsh realities the data confirms: soaring inflation, unpredictable local currencies, and banking systems that have left many behind. To put faces to these numbers, Tether released a short documentary from Kenya. The film highlights local merchants using USDT to pay international suppliers and families relying on it to receive remittances from abroad. It’s a grassroots look at how a global digital dollar is providing a tangible anchor in economies often defined by their volatility....\n",
404
+ "------------------------------------\n",
405
+ "\n",
406
+ "Result #8 -------------------------\n",
407
+ "📅 Date: 2022-07-20\n",
408
+ "📰 Source: thesouthafrican.com\n",
409
+ "📝 Excerpt: . Production is about 15% to 20% below the five-year average. ADVERTISEMENT Some of the drivers of Kenya’s inflation are local while others are external and beyond its control. My view, based on my analysis of the Kenyan economy and other countries, is that inflation can be managed but there are no quick fixes. ALSO READ: Throwback: Family turns ‘white’ after freak accident (pics) Drivers of inflation In Kenya, a confluence of many factors has inflated prices, particularly after the Ukraine war and the pandemic. One is elections. Lots of money is spent during electioneering. Some of it is just given out with no commensurate productivity. Kenya saw this in 1990, when money in circulation rose before the 1992 elections and so did the rate of inflation. The second factor is corruption and mismanagement. Whether it’s in procurement where prices are inflated, or when goods are not supplied or substandard ones are supplied, consumers pay the price. The cost of corruption to the economy has been put by President Uhuru Kenyatta at 2 billion shillings, translating to about 7% of GDP annually. If people make illegal water or power connections, honest people pay for that. If a tender for building a road is inflated, someone pays for it. If government and its agencies over-employ, someone pays for that. If it takes longer to get services like port clearance and building approvals, someone pays for it. If bribes are exchanged, someone pays for it. The 50-shilling note given to police at a roadblock is paid by someone else. A third driver of inflation is a weak currency. Kenya’s currency has declined by 3.5% since the start of the year, partly because of decisions taken in other countries that affect the value of their currencies. A fourth driver of inflation is tax. It raises the price of goods and services. The Finance Act 2022 brought in new taxes and raised the rates of other existing ones. 
It seems the government did this to raise money without incurring more debts...\n",
410
+ "------------------------------------\n",
411
+ "\n",
412
+ "Result #9 -------------------------\n",
413
+ "📅 Date: 2023-06-23\n",
414
+ "📰 Source: france24.com\n",
415
+ "📝 Excerpt: Title: Kenya: Tax on all salaries to finance housing Date: 2023-06-23 Source: france24.com Kenya's parliament's voted throuh a raft of tax hikes in the first budget of president William Ruto. The contentious changes will double the tax on fuel to 16% to generate another 61 billion shillings or a little under 400,000 euros, for the government. The bill still has to be signed by Ruto and opposition members have threatened to call fresh protests if he does. The bill also proposes a new housing levy which Ruto says will help build more affordable houses but many Kenyans hit hard by the rising cost of living say they can't shoulder the extra cost. FRANCE 24's correspondent Bastien Renouil reports. Video by: Bastien RENOUIL...\n",
416
+ "------------------------------------\n",
417
+ "\n",
418
+ "Result #10 -------------------------\n",
419
+ "📅 Date: 2025-11-10\n",
420
+ "📰 Source: capitalfm\n",
421
+ "📝 Excerpt: NAIROBI, Kenya, Nov 10 — President William Ruto says he has “no regrets” over his directive to police to shoot protesters “in the feet,” defending the use of force during violent anti-government demonstrations last year. Speaking in an interview with Al Jazeera aired on Sunday, Ruto maintained that police acted lawfully when responding to riots that left dozens dead and businesses destroyed. “I don’t regret those comments at all because the law allows the police to use force when other people’s lives are in danger,” he said. When challenged on whether shooting protesters was excessive, he replied, “That is according to you. I think the police know what they need to do.” The President said his administration had to “balance” between allowing peaceful demonstrations and curbing criminal activity during the unrest. “We have had to balance between dealing with violent criminals on one end and managing protests on the other,” he told James Bays. Ruto says police to immobilize vandals, take them to court » Capital News July 9 directive Ruto’s comments referred to a July 9 address in Nairobi’s Kilimani area, where he warned vandals and looters that they would be immobilized and taken to court. “Anybody torching and destroying another person’s business should be shot in the feet and taken to the hospital pending court appearance,” Ruto declared at the time. “We want people to do business. Enough is enough.” The President spoke amid escalating anti-government protests that saw supermarkets and small businesses looted and torched in several towns, including Meru, Kitengela, and Kahawa Sukari. Magunas Supermarket in Meru was among the worst hit — ransacked and later set ablaze. According to the Kenya National Commission on Human Rights (KNCHR), thirty-one people were killed in the early wave of protests, with the toll later rising to sixty-five as demonstrations continued through June and July...\n",
422
+ "------------------------------------\n",
423
+ "\n"
424
+ ]
425
+ }
426
+ ],
427
+ "source": [
428
+ "# --- TEST QUERY ---\n",
429
+ "query = \"How have the protests impacted the Kenyan economy?\"\n",
430
+ "\n",
431
+ "print(f\"\\n🔎 Searching for: '{query}'...\")\n",
432
+ "\n",
433
+ "# Perform Similarity Search\n",
434
+ "results = vectorstore.similarity_search(query, k=10)\n",
435
+ "\n",
436
+ "print(f\"Found {len(results)} relevant articles:\\n\")\n",
437
+ "\n",
438
+ "for i, doc in enumerate(results):\n",
439
+ " print(f\"Result #{i+1} -------------------------\")\n",
440
+ " print(f\"Date: {doc.metadata.get('date', 'N/A')}\")\n",
441
+ " print(f\"Source: {doc.metadata.get('source', 'N/A')}\")\n",
442
+ " print(f\"Excerpt: {doc.page_content[:2500].replace(chr(10), ' ')}...\") # Remove newlines for clean print\n",
443
+ " print(\"------------------------------------\\n\")"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": null,
449
+ "id": "44fc797c-1462-48a6-9eba-60e06afbd6c9",
450
+ "metadata": {},
451
+ "outputs": [],
452
+ "source": []
453
+ }
454
+ ],
455
+ "metadata": {
456
+ "kernelspec": {
457
+ "display_name": "Python 3 (ipykernel)",
458
+ "language": "python",
459
+ "name": "python3"
460
+ },
461
+ "language_info": {
462
+ "codemirror_mode": {
463
+ "name": "ipython",
464
+ "version": 3
465
+ },
466
+ "file_extension": ".py",
467
+ "mimetype": "text/x-python",
468
+ "name": "python",
469
+ "nbconvert_exporter": "python",
470
+ "pygments_lexer": "ipython3",
471
+ "version": "3.12.3"
472
+ }
473
+ },
474
+ "nbformat": 4,
475
+ "nbformat_minor": 5
476
+ }
notebooks/structure_data.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/test_demo.ipynb ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d5c236df-3b17-4889-bfa0-62875afabb70",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Demo notebook"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 9,
14
+ "id": "ae890231-b53d-451b-912c-ad84bd1f3360",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "!pip install langchain-ollama --quiet"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 10,
24
+ "id": "ed3dafc1-5641-42ee-94e4-299295939a8f",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import os\n",
29
+ "import sys\n",
30
+ "from pathlib import Path"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 11,
36
+ "id": "25b50f89-d265-42b1-a97b-8e7790856595",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "# Fix paths for src files\n",
41
+ "project_root = Path(os.getcwd()).parent\n",
42
+ "script_dir = project_root / \"src\"\n",
43
+ "if str(script_dir) not in sys.path:\n",
44
+ " sys.path.append(str(script_dir))"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 12,
50
+ "id": "dea362be-192d-4929-b140-3778cba1df25",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "import warnings\n",
55
+ "warnings.filterwarnings(\"ignore\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 13,
61
+ "id": "333e0906-23f5-4836-8beb-188d2155e879",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "from load.mshauri_demo import create_mshauri_agent, ask_mshauri"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 14,
71
+ "id": "d8f69073-a21a-4b9d-89c5-d7554d7ac605",
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "name": "stdout",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "⚙️ Initializing Mshauri Fedha (Model: qwen3:32b)...\n",
79
+ "✅ Mshauri Agent Ready (Zero-Dependency Mode).\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "# Initialize agent\n",
85
+ "agent = create_mshauri_agent()"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 4,
91
+ "id": "7a607e94-d378-4127-9c97-1d0a5a3e629f",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "query1 = \"What was the total value of commodity tea exports in 1998?\"\n",
96
+ "query2 = \"Why has the cost of living increased according to the reports?\"\n",
97
+ "query3 = \"What is the latest inflation rate?\"\n",
98
+ "query4 = \"What is the annual GDP for 2020?\"\n",
99
+ "query5 = \"How is the Kenyan economy performing compared to other African countries and countries like USA and Australia?\"\n",
100
+ "query6 = \"What sector of the Kenyan economy has been constantly improving? Show the numbers\"\n",
101
+ "query7 = \"summarize recent loan default trends in microfinance institutions.\"\n",
102
+ "query8 = \"What was the total public debt for 1999?\"\n",
103
+ "query9 = \"Is the Kenyan economy improving? Considering the quality of life of its citizens\"\n",
104
+ "query10 = \"Why did the shilling depreciate?\""
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 15,
110
+ "id": "5d5a7ff3-aa69-4083-81cf-d6e34c41be6f",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "from load.start_ollama import start_ollama_server"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 7,
120
+ "id": "dc20cc1d-da64-4394-85d5-7b77ad0e88c5",
121
+ "metadata": {},
122
+ "outputs": [
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "🚀 Starting Ollama Server...\n",
128
+ "⏳ Waiting for server to boot...\n",
129
+ "✅ Server started successfully.\n"
130
+ ]
131
+ },
132
+ {
133
+ "data": {
134
+ "text/plain": [
135
+ "True"
136
+ ]
137
+ },
138
+ "execution_count": 7,
139
+ "metadata": {},
140
+ "output_type": "execute_result"
141
+ }
142
+ ],
143
+ "source": [
144
+ "# Start ollama server\n",
145
+ "start_ollama_server()"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 8,
151
+ "id": "e3942375-e589-4469-94ef-b0bae5e0684f",
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "\n",
159
+ "❓ User: Why did the shilling depreciate?\n",
160
+ "----------------------------------------\n",
161
+ "🚀 Starting Agent Loop for: 'Why did the shilling depreciate?'\n",
162
+ "\n",
163
+ "🧠 Step 1: Thought: I need to determine the reasons behind the Kenyan shilling's depreciation. This requires analyzing economic factors like inflation, trade deficits, political stability, or global market trends. I'll first check the database for relevant quantitative data, then use financial reports/news if needed.\n",
164
+ "\n",
165
+ "Action: sql_db_list_tables \n",
166
+ "Action Input:\n",
167
+ "🛠️ Calling 'sql_db_list_tables' with: \n",
168
+ "\n",
169
+ "🧠 Step 2: Thought: I need to analyze economic indicators like trade deficits, inflation, or central bank policies. First, check the foreign trade summary for import/export trends and exchange rate history.\n",
170
+ "\n",
171
+ "Action: sql_db_schema \n",
172
+ "Action Input: foreign_trade_summary, cbk_indicative_rates\n",
173
+ "🛠️ Calling 'sql_db_schema' with: foreign_trade_summary, cbk_indicative_rates\n",
174
+ "\n",
175
+ "🧠 Step 3: Thought: I need to analyze trade deficits and exchange rate trends. First, check recent trade balances and exchange rates.\n",
176
+ "\n",
177
+ "Action: sql_db_query_checker \n",
178
+ "Action Input: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
179
+ "🛠️ Calling 'sql_db_query_checker' with: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
180
+ "\n",
181
+ "🧠 Step 4: Action: sql_db_query \n",
182
+ "Action Input: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
183
+ "🛠️ Calling 'sql_db_query' with: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
184
+ "\n",
185
+ "🧠 Step 5: Thought: The trade balance data shows a consistent deficit, indicating higher imports than exports, which increases demand for foreign currency. This likely contributes to shilling depreciation. To confirm, check exchange rate trends over time.\n",
186
+ "\n",
187
+ "Action: sql_db_query_checker \n",
188
+ "Action Input: SELECT date, mean_rate FROM cbk_indicative_rates WHERE currency = 'US DOLLAR' ORDER BY date DESC LIMIT 12\n",
189
+ "🛠️ Calling 'sql_db_query_checker' with: SELECT date, mean_rate FROM cbk_indicative_rates WHERE currency = 'US DOLLAR' ORDER BY date DESC LIMIT 12\n",
190
+ "\n",
191
+ "🧠 Step 6: Thought: The trade balance data shows persistent deficits, increasing demand for foreign currency. Exchange rate trends for the US dollar (mean rate rising) confirm shilling depreciation. To fully answer \"why,\" I should also check for inflation data or policy changes, but since the database lacks an inflation table, I'll rely on available data and external sources.\n",
192
+ "\n",
193
+ "Action: search_financial_reports_and_news \n",
194
+ "Action Input: Kenyan shilling depreciation causes 2024-2025\n",
195
+ "🛠️ Calling 'search_financial_reports_and_news' with: Kenyan shilling depreciation causes 2024-2025\n"
196
+ ]
197
+ },
198
+ {
199
+ "name": "stderr",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "/users/tligawa/mshauri-fedha/notebooks/mshauri_demo.py:159: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.\n",
203
+ " embeddings = OllamaEmbeddings(model=DEFAULT_EMBED_MODEL, base_url=ollama_url)\n",
204
+ "/users/tligawa/mshauri-fedha/notebooks/mshauri_demo.py:160: LangChainDeprecationWarning: The class `Chroma` was deprecated in LangChain 0.2.9 and will be removed in 1.0. An updated version of the class exists in the `langchain-chroma package and should be used instead. To use it run `pip install -U `langchain-chroma` and import as `from `langchain_chroma import Chroma``.\n",
205
+ " vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)\n"
206
+ ]
207
+ },
208
+ {
209
+ "name": "stdout",
210
+ "output_type": "stream",
211
+ "text": [
212
+ "\n",
213
+ "🧠 Step 7: The depreciation of the Kenyan shilling (KES) can be attributed to a combination of economic factors, supported by data from trade, monetary policy, and sectoral performance:\n",
214
+ "\n",
215
+ "### 1. **Persistent Trade Deficits** \n",
216
+ " - **Foreign Trade Data**: The *foreign_trade_summary* table reveals consistent negative trade balances (e.g., large deficits in 2022–2023). Kenya imports significantly more goods (e.g., machinery, fuel, and manufactured products) than it exports, increasing demand for foreign currency and weakening the shilling. \n",
217
+ " - **Impact**: A trade deficit reduces foreign exchange reserves, making the currency more vulnerable to depreciation.\n",
218
+ "\n",
219
+ "### 2. **Low Central Bank Intervention** \n",
220
+ " - **CBK Policy**: News articles highlight the Central Bank of Kenya (CBK) has limited interventions in the foreign exchange market. Without active measures to stabilize the shilling (e.g., selling foreign reserves), market forces drive the currency lower. \n",
221
+ " - **Market Sentiment**: Analysts predict the shilling could fall below **KES 90 to the dollar** by year-end, reflecting weak confidence in sustained CBK support.\n",
222
+ "\n",
223
+ "### 3. **Low Interest Rates** \n",
224
+ " - **Monetary Policy (Table 16)**: Savings deposit rates (e.g., 3.56% in 2022) and inter-bank rates (5.39% in 2022) are historically low, reducing incentives for foreign investors to hold shillings. This drives capital outflows, further pressuring the currency. \n",
225
+ " - **Comparison**: Overdraft and loan rates (12–12.67%) remain high for borrowers but fail to attract external investment, as returns in shillings are eroded by depreciation.\n",
226
+ "\n",
227
+ "### 4. **Weak Export Sector Growth** \n",
228
+ " - **Agriculture Data (Table 33)**: While tea and pyrethrum production has grown, key exports like maize and wheat show volatility. Agriculture contributes ~21% of GDP but lacks diversification or value addition to boost export earnings. \n",
229
+ " - **Non-Agricultural Exports**: Limited growth in manufacturing and services (e.g., tourism) exacerbates the trade deficit, reducing foreign exchange inflows.\n",
230
+ "\n",
231
+ "### 5. **Global and Regional Factors** \n",
232
+ " - **Commodity Prices**: Higher global fuel and food prices increase import costs, worsening the trade deficit. \n",
233
+ " - **Regional Currency Dynamics**: The shilling’s depreciation relative to neighbors (e.g., Uganda, Tanzania) reflects broader East African economic challenges, including inflation and weak regional trade integration.\n",
234
+ "\n",
235
+ "### Conclusion \n",
236
+ "The shilling’s depreciation stems from structural issues: **trade imbalances**, **limited CBK intervention**, **low interest rates**, and **stagnant export growth**. Without addressing these fundamentals (e.g., boosting exports, attracting foreign investment, or stabilizing the exchange rate), the shilling is likely to remain under pressure. Policymakers must balance short-term market interventions with long-term structural reforms to restore currency stability.\n",
237
+ "----------------------------------------\n",
238
+ "💡 Mshauri: The depreciation of the Kenyan shilling (KES) can be attributed to a combination of economic factors, supported by data from trade, monetary policy, and sectoral performance:\n",
239
+ "\n",
240
+ "### 1. **Persistent Trade Deficits** \n",
241
+ " - **Foreign Trade Data**: The *foreign_trade_summary* table reveals consistent negative trade balances (e.g., large deficits in 2022–2023). Kenya imports significantly more goods (e.g., machinery, fuel, and manufactured products) than it exports, increasing demand for foreign currency and weakening the shilling. \n",
242
+ " - **Impact**: A trade deficit reduces foreign exchange reserves, making the currency more vulnerable to depreciation.\n",
243
+ "\n",
244
+ "### 2. **Low Central Bank Intervention** \n",
245
+ " - **CBK Policy**: News articles highlight the Central Bank of Kenya (CBK) has limited interventions in the foreign exchange market. Without active measures to stabilize the shilling (e.g., selling foreign reserves), market forces drive the currency lower. \n",
246
+ " - **Market Sentiment**: Analysts predict the shilling could fall below **KES 90 to the dollar** by year-end, reflecting weak confidence in sustained CBK support.\n",
247
+ "\n",
248
+ "### 3. **Low Interest Rates** \n",
249
+ " - **Monetary Policy (Table 16)**: Savings deposit rates (e.g., 3.56% in 2022) and inter-bank rates (5.39% in 2022) are historically low, reducing incentives for foreign investors to hold shillings. This drives capital outflows, further pressuring the currency. \n",
250
+ " - **Comparison**: Overdraft and loan rates (12–12.67%) remain high for borrowers but fail to attract external investment, as returns in shillings are eroded by depreciation.\n",
251
+ "\n",
252
+ "### 4. **Weak Export Sector Growth** \n",
253
+ " - **Agriculture Data (Table 33)**: While tea and pyrethrum production has grown, key exports like maize and wheat show volatility. Agriculture contributes ~21% of GDP but lacks diversification or value addition to boost export earnings. \n",
254
+ " - **Non-Agricultural Exports**: Limited growth in manufacturing and services (e.g., tourism) exacerbates the trade deficit, reducing foreign exchange inflows.\n",
255
+ "\n",
256
+ "### 5. **Global and Regional Factors** \n",
257
+ " - **Commodity Prices**: Higher global fuel and food prices increase import costs, worsening the trade deficit. \n",
258
+ " - **Regional Currency Dynamics**: The shilling’s depreciation relative to neighbors (e.g., Uganda, Tanzania) reflects broader East African economic challenges, including inflation and weak regional trade integration.\n",
259
+ "\n",
260
+ "### Conclusion \n",
261
+ "The shilling’s depreciation stems from structural issues: **trade imbalances**, **limited CBK intervention**, **low interest rates**, and **stagnant export growth**. Without addressing these fundamentals (e.g., boosting exports, attracting foreign investment, or stabilizing the exchange rate), the shilling is likely to remain under pressure. Policymakers must balance short-term market interventions with long-term structural reforms to restore currency stability.\n"
262
+ ]
263
+ },
264
+ {
265
+ "data": {
266
+ "text/plain": [
267
+ "'The depreciation of the Kenyan shilling (KES) can be attributed to a combination of economic factors, supported by data from trade, monetary policy, and sectoral performance:\\n\\n### 1. **Persistent Trade Deficits** \\n - **Foreign Trade Data**: The *foreign_trade_summary* table reveals consistent negative trade balances (e.g., large deficits in 2022–2023). Kenya imports significantly more goods (e.g., machinery, fuel, and manufactured products) than it exports, increasing demand for foreign currency and weakening the shilling. \\n - **Impact**: A trade deficit reduces foreign exchange reserves, making the currency more vulnerable to depreciation.\\n\\n### 2. **Low Central Bank Intervention** \\n - **CBK Policy**: News articles highlight the Central Bank of Kenya (CBK) has limited interventions in the foreign exchange market. Without active measures to stabilize the shilling (e.g., selling foreign reserves), market forces drive the currency lower. \\n - **Market Sentiment**: Analysts predict the shilling could fall below **KES 90 to the dollar** by year-end, reflecting weak confidence in sustained CBK support.\\n\\n### 3. **Low Interest Rates** \\n - **Monetary Policy (Table 16)**: Savings deposit rates (e.g., 3.56% in 2022) and inter-bank rates (5.39% in 2022) are historically low, reducing incentives for foreign investors to hold shillings. This drives capital outflows, further pressuring the currency. \\n - **Comparison**: Overdraft and loan rates (12–12.67%) remain high for borrowers but fail to attract external investment, as returns in shillings are eroded by depreciation.\\n\\n### 4. **Weak Export Sector Growth** \\n - **Agriculture Data (Table 33)**: While tea and pyrethrum production has grown, key exports like maize and wheat show volatility. Agriculture contributes ~21% of GDP but lacks diversification or value addition to boost export earnings. 
\\n - **Non-Agricultural Exports**: Limited growth in manufacturing and services (e.g., tourism) exacerbates the trade deficit, reducing foreign exchange inflows.\\n\\n### 5. **Global and Regional Factors** \\n - **Commodity Prices**: Higher global fuel and food prices increase import costs, worsening the trade deficit. \\n - **Regional Currency Dynamics**: The shilling’s depreciation relative to neighbors (e.g., Uganda, Tanzania) reflects broader East African economic challenges, including inflation and weak regional trade integration.\\n\\n### Conclusion \\nThe shilling’s depreciation stems from structural issues: **trade imbalances**, **limited CBK intervention**, **low interest rates**, and **stagnant export growth**. Without addressing these fundamentals (e.g., boosting exports, attracting foreign investment, or stabilizing the exchange rate), the shilling is likely to remain under pressure. Policymakers must balance short-term market interventions with long-term structural reforms to restore currency stability.'"
268
+ ]
269
+ },
270
+ "execution_count": 8,
271
+ "metadata": {},
272
+ "output_type": "execute_result"
273
+ }
274
+ ],
275
+ "source": [
276
+ "ask_mshauri(agent, query10)"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "id": "79f0ba2b-7f35-4a2c-80fe-4d9c3731c558",
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": []
286
+ }
287
+ ],
288
+ "metadata": {
289
+ "kernelspec": {
290
+ "display_name": "Python 3 (ipykernel)",
291
+ "language": "python",
292
+ "name": "python3"
293
+ },
294
+ "language_info": {
295
+ "codemirror_mode": {
296
+ "name": "ipython",
297
+ "version": 3
298
+ },
299
+ "file_extension": ".py",
300
+ "mimetype": "text/x-python",
301
+ "name": "python",
302
+ "nbconvert_exporter": "python",
303
+ "pygments_lexer": "ipython3",
304
+ "version": "3.12.3"
305
+ }
306
+ },
307
+ "nbformat": 4,
308
+ "nbformat_minor": 5
309
+ }
notebooks/transform.ipynb ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "84e9f72e-84ff-49e5-b8ba-faa6ee9bc4df",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%load_ext autoreload\n",
11
+ "%autoreload 2"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "2b495825-0d1a-4a46-9297-6ceae1ccd2a2",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import sys\n",
22
+ "import os\n",
23
+ "import shutil\n",
24
+ "import subprocess\n",
25
+ "import time\n",
26
+ "import requests\n",
27
+ "import torch\n",
28
+ "from pathlib import Path\n",
29
+ "\n",
30
+ "# Fix paths so we can import 'extract.py'\n",
31
+ "project_root = Path(os.getcwd()).parent\n",
32
+ "script_dir = project_root / \"src/transform\"\n",
33
+ "if str(script_dir) not in sys.path:\n",
34
+ " sys.path.append(str(script_dir))\n",
35
+ "\n",
36
+ "# Import your optimized processor\n",
37
+ "from extract import MarkerFolderProcessor, configure_parallelism"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 3,
43
+ "id": "8d04e7ad-abf2-40e4-b308-fc0863464935",
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "✅ Setup complete.\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "# Paths\n",
56
+ "SCRATCH = Path(os.environ.get(\"SCRATCH\"))\n",
57
+ "INPUT_PDFS = SCRATCH / \"mshauri-fedha/data/cbk/pdfs\"\n",
58
+ "OUTPUT_DIR = SCRATCH / \"mshauri-fedha/data/cbk/marker-output\"\n",
59
+ "\n",
60
+ "# Ollama Setup\n",
61
+ "OLLAMA_HOME = SCRATCH / \"ollama_core\"\n",
62
+ "OLLAMA_BIN = OLLAMA_HOME / \"bin/ollama\"\n",
63
+ "OLLAMA_MODELS_DIR = OLLAMA_HOME / \"models\" \n",
64
+ "OLLAMA_HOST = \"http://localhost:11434\"\n",
65
+ "\n",
66
+ "print(\"✅ Setup complete.\")"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 4,
72
+ "id": "2a7846b4-2041-4b4f-9210-16a891d6c9f4",
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "🔍 GH200/A100 Detected: 4 GPUs | 94.5 GB VRAM\n",
80
+ "⚙️ Stability Config: 5 workers/GPU | 20 Total Slots\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "total_slots, workers_per_gpu, num_gpus = configure_parallelism()"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 5,
91
+ "id": "039a0a95-91e2-495c-a0ab-d2185f98461c",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "# Kill any old server first\n",
96
+ "subprocess.run([\"pkill\", \"-f\", \"ollama serve\"], stderr=subprocess.DEVNULL)\n",
97
+ "time.sleep(2)\n",
98
+ "\n",
99
+ "server_env = os.environ.copy()\n",
100
+ "server_env[\"OLLAMA_NUM_PARALLEL\"] = str(32) # Matches your total slots\n",
101
+ "server_env[\"OLLAMA_MAX_LOADED_MODELS\"] = \"1\"\n",
102
+ "server_env[\"OLLAMA_MAX_QUEUE\"] = \"2048\"\n",
103
+ "\n",
104
+ "# Start new server\n",
105
+ "process = subprocess.Popen(\n",
106
+ " [str(OLLAMA_BIN), \"serve\"], \n",
107
+ " stdout=subprocess.DEVNULL, \n",
108
+ " stderr=subprocess.DEVNULL,\n",
109
+ " env=server_env\n",
110
+ ")"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 6,
116
+ "id": "accd6a19-e216-450e-9cca-beaeaa7749a9",
117
+ "metadata": {},
118
+ "outputs": [
119
+ {
120
+ "name": "stdout",
121
+ "output_type": "stream",
122
+ "text": [
123
+ "⏳ Waiting for server heartbeat...\n",
124
+ "✅ Server is UP and listening!\n"
125
+ ]
126
+ }
127
+ ],
128
+ "source": [
129
+ "# Robust Wait Loop\n",
130
+ "print(\"⏳ Waiting for server heartbeat...\")\n",
131
+ "server_ready = False\n",
132
+ "for _ in range(60): # Wait 60 seconds max\n",
133
+ " try:\n",
134
+ " if requests.get(OLLAMA_HOST).status_code == 200:\n",
135
+ " server_ready = True\n",
136
+ " break\n",
137
+ " except:\n",
138
+ " time.sleep(1)\n",
139
+ "\n",
140
+ "if server_ready:\n",
141
+ " print(\"✅ Server is UP and listening!\")\n",
142
+ "else:\n",
143
+ " raise RuntimeError(\"❌ Server failed to start. Check logs.\")"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": 7,
149
+ "id": "454617a5-c1ef-489f-b9c0-8e6b4fe39b47",
150
+ "metadata": {},
151
+ "outputs": [
152
+ {
153
+ "name": "stdout",
154
+ "output_type": "stream",
155
+ "text": [
156
+ "⬇️ Checking/Pulling qwen2.5:7b...\n",
157
+ "📝 Creating 'qwen2.5-7b-16k' (16k Context)...\n"
158
+ ]
159
+ }
160
+ ],
161
+ "source": [
162
+ "# pull model\n",
163
+ "BASE_MODEL = \"qwen2.5:7b\" \n",
164
+ "CUSTOM_MODEL_NAME = \"qwen2.5-7b-16k\"\n",
165
+ "\n",
166
+ "print(f\"⬇️ Checking/Pulling {BASE_MODEL}...\")\n",
167
+ "subprocess.run(\n",
168
+ " [str(OLLAMA_BIN), \"pull\", BASE_MODEL], \n",
169
+ " check=True, \n",
170
+ " stdout=subprocess.DEVNULL,\n",
171
+ " stderr=subprocess.DEVNULL,\n",
172
+ " env=os.environ.copy()\n",
173
+ ")\n",
174
+ "\n",
175
+ "print(f\"📝 Creating '{CUSTOM_MODEL_NAME}' (16k Context)...\")\n",
176
+ "modelfile_content = f\"FROM {BASE_MODEL}\\nPARAMETER num_ctx 16384\"\n",
177
+ "with open(\"Modelfile_qwen_16k\", \"w\") as f:\n",
178
+ " f.write(modelfile_content)"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 8,
184
+ "id": "0fe26ed7-f31f-43eb-acca-1795b5528219",
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "name": "stdout",
189
+ "output_type": "stream",
190
+ "text": [
191
+ "✅ Model Ready.\n"
192
+ ]
193
+ },
194
+ {
195
+ "name": "stderr",
196
+ "output_type": "stream",
197
+ "text": [
198
+ "\u001b[?2026h\u001b[?25l\u001b[1Ggathering model components \u001b[K\n",
199
+ "using existing layer sha256:2bada8a7450677000f678be90653b85d364de7db25eb5ea54136ada5f3933730 \u001b[K\n",
200
+ "using existing layer sha256:66b9ea09bd5b7099cbb4fc820f31b575c0366fa439b08245566692c6784e281e \u001b[K\n",
201
+ "using existing layer sha256:eb4402837c7829a690fa845de4d7f3fd842c2adee476d5341da8a46ea9255175 \u001b[K\n",
202
+ "using existing layer sha256:832dd9e00a68dd83b3c3fb9f5588dad7dcf337a0db50f7d9483f310cd292e92e \u001b[K\n",
203
+ "using existing layer sha256:db8fbfd0cb288a053f83ac9014ca9bac2558b1bbcd80b5c408a548e7acba8a24 \u001b[K\n",
204
+ "writing manifest ⠋ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[1Ggathering model components \u001b[K\n",
205
+ "using existing layer sha256:2bada8a7450677000f678be90653b85d364de7db25eb5ea54136ada5f3933730 \u001b[K\n",
206
+ "using existing layer sha256:66b9ea09bd5b7099cbb4fc820f31b575c0366fa439b08245566692c6784e281e \u001b[K\n",
207
+ "using existing layer sha256:eb4402837c7829a690fa845de4d7f3fd842c2adee476d5341da8a46ea9255175 \u001b[K\n",
208
+ "using existing layer sha256:832dd9e00a68dd83b3c3fb9f5588dad7dcf337a0db50f7d9483f310cd292e92e \u001b[K\n",
209
+ "using existing layer sha256:db8fbfd0cb288a053f83ac9014ca9bac2558b1bbcd80b5c408a548e7acba8a24 \u001b[K\n",
210
+ "writing manifest \u001b[K\n",
211
+ "success \u001b[K\u001b[?25h\u001b[?2026l\n"
212
+ ]
213
+ }
214
+ ],
215
+ "source": [
216
+ "# run model\n",
217
+ "subprocess.run(\n",
218
+ " [str(OLLAMA_BIN), \"create\", CUSTOM_MODEL_NAME, \"-f\", \"Modelfile_qwen\"], \n",
219
+ " check=True, \n",
220
+ " stdout=subprocess.DEVNULL, \n",
221
+ " env=os.environ.copy()\n",
222
+ ")\n",
223
+ "print(\"✅ Model Ready.\")"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 9,
229
+ "id": "4750bd0f-3cd2-4d62-a6c4-75c2f19e45f1",
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "os.chdir(SCRATCH)"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "id": "9581ee47-f690-46c8-b331-084411fb8535",
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "name": "stdout",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "✅ Detected 4 GPUs (Dynamic Mode)\n",
247
+ "🚀 Processing PDFs from: /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/pdfs\n",
248
+ "📦 Created 1089 batches of 5 files each.\n",
249
+ "🚀 Launching 20 workers on 4 GPUs...\n"
250
+ ]
251
+ },
252
+ {
253
+ "name": "stderr",
254
+ "output_type": "stream",
255
+ "text": [
256
+ "20:26:57 - [GPU-3:Dev3] - Initializing Worker 3...\n",
257
+ "20:26:58 - [GPU-0:Dev0] - Initializing Worker 0...\n",
258
+ "20:27:05 - [GPU-1:Dev1] - Initializing Worker 1...\n",
259
+ "20:27:06 - [GPU-2:Dev2] - Initializing Worker 2...\n",
260
+ "20:27:09 - [GPU-4:Dev0] - Initializing Worker 4...\n",
261
+ "20:27:11 - [GPU-5:Dev1] - Initializing Worker 5...\n",
262
+ "20:27:12 - [GPU-6:Dev2] - Initializing Worker 6...\n",
263
+ "20:27:12 - [GPU-9:Dev1] - Initializing Worker 9...\n",
264
+ "20:27:14 - [GPU-7:Dev3] - Initializing Worker 7...\n",
265
+ "20:27:15 - [GPU-8:Dev0] - Initializing Worker 8...\n"
266
+ ]
267
+ }
268
+ ],
269
+ "source": [
270
+ "# Initialize the Processor\n",
271
+ "processor = MarkerFolderProcessor(\n",
272
+ " output_dir=OUTPUT_DIR,\n",
273
+ " ollama_url=OLLAMA_HOST,\n",
274
+ " ollama_model=CUSTOM_MODEL_NAME,\n",
275
+ " batch_multiplier=4, \n",
276
+ " workers_per_gpu=workers_per_gpu,\n",
277
+ " num_gpus=num_gpus \n",
278
+ ")\n",
279
+ "\n",
280
+ "# 3. Run the extraction\n",
281
+ "print(f\"🚀 Processing PDFs from: {INPUT_PDFS}\")\n",
282
+ "processor.process_folder(INPUT_PDFS, batch_size=5)"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "id": "931650d0-50f1-48c1-a1a1-a561392e004b",
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": []
292
+ }
293
+ ],
294
+ "metadata": {
295
+ "kernelspec": {
296
+ "display_name": "Python 3 (ipykernel)",
297
+ "language": "python",
298
+ "name": "python3"
299
+ },
300
+ "language_info": {
301
+ "codemirror_mode": {
302
+ "name": "ipython",
303
+ "version": 3
304
+ },
305
+ "file_extension": ".py",
306
+ "mimetype": "text/x-python",
307
+ "name": "python",
308
+ "nbconvert_exporter": "python",
309
+ "pygments_lexer": "ipython3",
310
+ "version": "3.12.3"
311
+ }
312
+ },
313
+ "nbformat": 4,
314
+ "nbformat_minor": 5
315
+ }
notebooks/unzip_stores.ipynb ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "4a41dc7b-f751-4818-912d-21241047c485",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "%load_ext autoreload\n",
11
+ "%autoreload 2"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "729a7545-088d-4521-8948-60162d80b1e7",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import os\n",
22
+ "import shutil\n",
23
+ "import zipfile\n",
24
+ "from pathlib import Path\n",
25
+ "from tqdm import tqdm"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 3,
31
+ "id": "96eb6a92-9786-48b6-88e4-0441d1a531c5",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# --- CONFIGURATION ---\n",
36
+ "# 1. Source (Permanent Storage)\n",
37
+ "PROJECT_DIR = Path(os.environ.get(\"PROJECT\")) # Auto-detects $PROJECT\n",
38
+ "SOURCE_ZIPS = PROJECT_DIR / \"tligawa/mshauri-fedha-store/cbk/zipped-store\"\n",
39
+ "\n",
40
+ "# 2. Destination (Fast Scratch Storage)\n",
41
+ "SCRATCH_DIR = Path(os.environ.get(\"SCRATCH\")) # Auto-detects $SCRATCH\n",
42
+ "WORK_DIR = SCRATCH_DIR / \"mshauri-fedha/data/cbk\"\n",
43
+ "FINAL_PDF_DIR = WORK_DIR / \"text\"\n",
44
+ "TEMP_EXTRACT_DIR = WORK_DIR / \"temp-unzip-cbk\""
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 4,
50
+ "id": "3148a17b-8e99-448e-95de-bb2c60828049",
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "data": {
55
+ "text/plain": [
56
+ "True"
57
+ ]
58
+ },
59
+ "execution_count": 4,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ }
63
+ ],
64
+ "source": [
65
+ "os.path.exists(WORK_DIR)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 5,
71
+ "id": "cc97d707-f266-4f6f-9346-19e630101923",
72
+ "metadata": {},
73
+ "outputs": [
74
+ {
75
+ "name": "stdout",
76
+ "output_type": "stream",
77
+ "text": [
78
+ "🚀 Found 111 batches in /capstor/store/cscs/director2/g164/tligawa/mshauri-fedha-store/cbk/zipped-store\n",
79
+ "📂 Flattening to: /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/text ...\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "# Setup directories\n",
85
+ "if FINAL_PDF_DIR.exists():\n",
86
+ " print(f\"⚠️ Warning: Target folder {FINAL_PDF_DIR} already exists.\")\n",
87
+ "else:\n",
88
+ " FINAL_PDF_DIR.mkdir(parents=True, exist_ok=True)\n",
89
+ " \n",
90
+ "if TEMP_EXTRACT_DIR.exists(): shutil.rmtree(TEMP_EXTRACT_DIR)\n",
91
+ "TEMP_EXTRACT_DIR.mkdir(parents=True, exist_ok=True)\n",
92
+ "\n",
93
+ "# --- EXECUTION ---\n",
94
+ "zips = sorted(list(SOURCE_ZIPS.glob(\"*.zip\")))\n",
95
+ "print(f\"🚀 Found {len(zips)} batches in {SOURCE_ZIPS}\")\n",
96
+ "print(f\"📂 Flattening to: {FINAL_PDF_DIR} ...\")"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 6,
102
+ "id": "0c2a9b27-133a-4960-8920-45ee57eb3d8a",
103
+ "metadata": {},
104
+ "outputs": [
105
+ {
106
+ "name": "stderr",
107
+ "output_type": "stream",
108
+ "text": [
109
+ "Unzipping & Flattening: 100%|██████████| 111/111 [00:22<00:00, 4.84it/s]"
110
+ ]
111
+ },
112
+ {
113
+ "name": "stdout",
114
+ "output_type": "stream",
115
+ "text": [
116
+ "\n",
117
+ "✨ Done! 58 files are ready in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/text\n",
118
+ "🧹 Cleaning up temp dirs...\n"
119
+ ]
120
+ },
121
+ {
122
+ "name": "stderr",
123
+ "output_type": "stream",
124
+ "text": [
125
+ "\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "# Unzip and flatten\n",
131
+ "count = 0\n",
132
+ "for zip_path in tqdm(zips, desc=\"Unzipping & Flattening\"):\n",
133
+ " batch_name = zip_path.stem # e.g., \"knbs_batch_1\"\n",
134
+ " \n",
135
+ " try:\n",
136
+ " # 1. Unzip to a temp folder\n",
137
+ " with zipfile.ZipFile(zip_path, 'r') as z:\n",
138
+ " z.extractall(TEMP_EXTRACT_DIR)\n",
139
+ " \n",
140
+ " # 2. Find the 'pdfs' subfolder inside that batch\n",
141
+ " # We look recursively because structure might vary slightly\n",
142
+ " pdf_files = list(TEMP_EXTRACT_DIR.rglob(\"*.txt\"))\n",
143
+ " \n",
144
+ " # 3. Move and Rename\n",
145
+ " for pdf in pdf_files:\n",
146
+ " # Create unique name: batch_name + original_name\n",
147
+ " # Example: knbs_batch_1_annual_report_2020.pdf\n",
148
+ " new_name = f\"{batch_name}_{pdf.name}\"\n",
149
+ " dest_path = FINAL_PDF_DIR / new_name\n",
150
+ " \n",
151
+ " shutil.move(str(pdf), str(dest_path))\n",
152
+ " count += 1\n",
153
+ " \n",
154
+ " except Exception as e:\n",
155
+ " print(f\"❌ Error processing {zip_path.name}: {e}\")\n",
156
+ " finally:\n",
157
+ " # Clean temp folder for next batch\n",
158
+ " for item in TEMP_EXTRACT_DIR.iterdir():\n",
159
+ " if item.is_dir(): shutil.rmtree(item)\n",
160
+ " else: item.unlink()\n",
161
+ "\n",
162
+ "print(f\"\\n✨ Done! {count} files are ready in {FINAL_PDF_DIR}\")\n",
163
+ "print(f\"🧹 Cleaning up temp dirs...\")\n",
164
+ "if TEMP_EXTRACT_DIR.exists(): shutil.rmtree(TEMP_EXTRACT_DIR)"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": null,
170
+ "id": "a04d70da-e9e9-4b1b-a393-93b5b76fcd8b",
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": []
174
+ }
175
+ ],
176
+ "metadata": {
177
+ "kernelspec": {
178
+ "display_name": "Python 3 (ipykernel)",
179
+ "language": "python",
180
+ "name": "python3"
181
+ },
182
+ "language_info": {
183
+ "codemirror_mode": {
184
+ "name": "ipython",
185
+ "version": 3
186
+ },
187
+ "file_extension": ".py",
188
+ "mimetype": "text/x-python",
189
+ "name": "python",
190
+ "nbconvert_exporter": "python",
191
+ "pygments_lexer": "ipython3",
192
+ "version": "3.12.3"
193
+ }
194
+ },
195
+ "nbformat": 4,
196
+ "nbformat_minor": 5
197
+ }
src/extract/download_file_links.py ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import dependencies
2
+ from typing import Any, Union, List, Dict
3
+ import time
4
+ import pandas as pd
5
+ import logging
6
+ import requests
7
+ from gnews import GNews
8
+ import feedparser
9
+ from io import BytesIO
10
+ import time, re
11
+ from bs4 import BeautifulSoup
12
+ import urllib3
13
+ import certifi
14
+ from urllib.parse import urljoin, urlparse
15
+ from urllib.robotparser import RobotFileParser
16
+ from collections import Counter
17
+ from tqdm.auto import tqdm
18
+ from newspaper import Article
19
+ from concurrent.futures import ThreadPoolExecutor, as_completed
20
+
21
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
22
+
23
+ # Set up basic logging
24
+ logging.basicConfig(
25
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class CBKExplorer:
31
+ def __init__(self, github_username):
32
+ self.user_agent = f"MshauriFedhaBot/0.1 (+https://github.com/{github_username}/mshaurifedha)"
33
+ self.session = requests.Session()
34
+ self.session.headers.update({"User-Agent": self.user_agent})
35
+
36
+ def is_allowed_by_robots(self, base_url, target_url):
37
+ """Check robots.txt for permission."""
38
+ parsed = urlparse(base_url)
39
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
40
+ rp = RobotFileParser()
41
+ try:
42
+ rp.set_url(robots_url)
43
+ rp.read()
44
+ return rp.can_fetch(self.user_agent, target_url)
45
+ except Exception as e:
46
+ print(f"[robots] Could not read robots.txt ({e}). Proceed cautiously.")
47
+ return False
48
+
49
    def fetch(self, url, timeout=25, allow_proxy_fallback=True):
        """
        Robust fetch that tries, in order:
          1) requests with the certifi CA bundle (secure TLS verification)
          2) plain-http fallback (only when the original URL was https)
          3) requests with verify=False (insecure; certificate not checked)
          4) optional external proxy fetch via r.jina.ai as a last resort

        Args:
            url: page to retrieve.
            timeout: per-request timeout in seconds for the direct attempts.
            allow_proxy_fallback: when True, permit step 4 (sends the URL to an
                external third-party service — only acceptable for public pages).

        Returns:
            (response, soup) on success, or (None, None) when every strategy fails.
        """
        # helper to parse response->soup; any HTTP error status or parse failure
        # collapses to (None, None) so the caller can fall through to the next strategy
        def resp_to_soup(r):
            try:
                r.raise_for_status()
                return r, BeautifulSoup(r.text, "lxml")
            except Exception:
                return None, None

        # 1) Try with certifi (preferred): full TLS verification.
        try:
            r = self.session.get(url, timeout=timeout, verify=certifi.where())
            ok_resp, soup = resp_to_soup(r)
            if ok_resp:
                return ok_resp, soup
        except requests.exceptions.SSLError as ssl_err:
            # SSL errors reported separately: they suggest a cert problem, not a network one
            print(f"[fetch] SSL error with certifi for {url}: {ssl_err}")
        except Exception as e:
            print(f"[fetch] Primary attempt failed for {url}: {e}")

        # 2) Try http fallback if URL is https (some .go.ke hosts misconfigure TLS)
        try:
            parsed = urlparse(url)
            if parsed.scheme == "https":
                http_url = url.replace("https://", "http://", 1)
                try:
                    r = self.session.get(http_url, timeout=timeout)
                    ok_resp, soup = resp_to_soup(r)
                    if ok_resp:
                        print(f"[fetch] HTTP fallback succeeded for {http_url}")
                        return ok_resp, soup
                except Exception as e:
                    print(f"[fetch] HTTP fallback failed for {http_url}: {e}")
        except Exception as e:
            print(f"[fetch] HTTP fallback: error preparing URL: {e}")

        # 3) Try insecure (verify=False) as last direct option — no cert validation
        try:
            print(f"[fetch] Trying insecure fetch (verify=False) for {url} — not recommended for sensitive data.")
            r = self.session.get(url, timeout=timeout, verify=False)
            ok_resp, soup = resp_to_soup(r)
            if ok_resp:
                return ok_resp, soup
        except Exception as e:
            print(f"[fetch] Insecure fetch also failed for {url}: {e}")

        # 4) Optional: external proxy/relay (last resort)
        if allow_proxy_fallback:
            try:
                # Jina.ai simple fetch service: returns rendered HTML as text.
                # NOTE: this is an external service — use only for public/cached pages.
                # The scheme is stripped and re-prefixed as http:// per the service's URL format.
                proxy_url = "https://r.jina.ai/http://" + url.replace("https://", "").replace("http://", "")
                print(f"[fetch] Trying proxy fetch via {proxy_url}")
                r = requests.get(proxy_url, timeout=30)  # using plain requests (no verify issues; it's https to jina)
                if r.status_code == 200 and r.text:
                    return r, BeautifulSoup(r.text, "lxml")
                else:
                    print(f"[fetch] Proxy fetch returned status {r.status_code}")
            except Exception as e:
                print(f"[fetch] Proxy fetch failed: {e}")

        # give up — every strategy exhausted
        print(f"[fetch] All fetch strategies failed for {url}")
        return None, None
122
+
123
+ def abs_link(self, base, href):
124
+ """Make absolute link from relative href."""
125
+ if not href:
126
+ return None
127
+ return urljoin(base, href)
128
+
129
+ def explore_url(self, url, print_anchors=40):
130
+ """Explore a CBK URL: meta, headings, nav links, anchor samples, file-like links."""
131
+ print("URL:", url)
132
+ print("Allowed by robots.py? ->", self.is_allowed_by_robots(url, url))
133
+ resp, soup = self.fetch(url)
134
+ if not resp:
135
+ return None
136
+
137
+ # Basic meta
138
+ print("Status code:", resp.status_code)
139
+ title = soup.title.string.strip() if soup.title else ""
140
+ print("Title:", title)
141
+ desc = ""
142
+ meta_desc = soup.find("meta", attrs={"name":"description"}) or soup.find("meta", attrs={"property":"og:description"})
143
+ if meta_desc and meta_desc.get("content"):
144
+ desc = meta_desc["content"].strip()
145
+ print("Meta description:", desc[:300])
146
+
147
+ # Headings
148
+ h1s = [h.get_text(strip=True) for h in soup.find_all("h1")]
149
+ h2s = [h.get_text(strip=True) for h in soup.find_all("h2")]
150
+ print("H1s:", h1s[:5])
151
+ print("H2s:", h2s[:8])
152
+
153
+ # Nav / header anchors
154
+ navs = soup.find_all("nav")
155
+ if navs:
156
+ print(f"Found {len(navs)} <nav> block(s). Sample nav links:")
157
+ nav_links = []
158
+ for nav in navs:
159
+ for a in nav.find_all("a", href=True):
160
+ nav_links.append((a.get_text(strip=True), self.abs_link(url, a["href"])))
161
+ for t, link in nav_links[:20]:
162
+ print(" -", t or "<no-text>", "->", link)
163
+ else:
164
+ print("No <nav> block found (or it's rendered by JS).")
165
+
166
+ # Sample anchors across page
167
+ anchors = []
168
+ for a in soup.find_all("a", href=True):
169
+ text = a.get_text(strip=True)
170
+ href = a["href"].strip()
171
+ anchors.append((text, self.abs_link(url, href)))
172
+ anchors = [a for a in anchors if a[1] is not None]
173
+ print(f"Total anchors on page: {len(anchors)}. Showing first {min(print_anchors,len(anchors))}:")
174
+ for t, link in anchors[:print_anchors]:
175
+ print(" *", (t[:60] or "<no-text>"), "->", link)
176
+
177
+ # Class name frequencies
178
+ classes = []
179
+ for tag in soup.find_all(True):
180
+ cls = tag.get("class")
181
+ if cls:
182
+ classes.extend(cls if isinstance(cls, list) else [cls])
183
+ class_counts = Counter(classes)
184
+ print("Top 15 classes used on page (class_name:count):")
185
+ for k,v in class_counts.most_common(15):
186
+ print(" ", k, ":", v)
187
+
188
+ # Links that look like files
189
+ file_like = []
190
+ for text, link in anchors:
191
+ if re.search(r"\.pdf$|\.xls$|\.xlsx$|\.csv$", link, re.IGNORECASE):
192
+ file_like.append((text, link))
193
+ print("File-like links found on page:", len(file_like))
194
+ for t, l in file_like[:20]:
195
+ print(" FILE:", (t[:80] or "<no-text>"), "->", l)
196
+
197
+ return {"title": title, "anchors": anchors, "file_links": file_like, "class_counts": class_counts}
198
+
199
+ def inspect_pages(self, urls):
200
+ results = {}
201
+ for u in urls:
202
+ print("\n" + "="*80)
203
+ print("Inspecting:", u)
204
+ out = self.explore_url(u, print_anchors=80)
205
+ results[u] = out
206
+ time.sleep(1.0) # polite pause
207
+ return results
208
+
209
+ def collect_file_links(self, url, allowed_exts=(".pdf", ".xls", ".xlsx", ".csv")):
210
+ _, soup = self.fetch(url)
211
+ if not soup:
212
+ return pd.DataFrame() # instead of returning []
213
+
214
+ found = []
215
+ for a in soup.find_all("a", href=True):
216
+ href = a["href"].strip()
217
+ ab = self.abs_link(url, href)
218
+ if not ab:
219
+ continue
220
+ # only same domain (safety)
221
+ if urlparse(ab).netloc.endswith("centralbank.go.ke") or urlparse(ab).netloc == "":
222
+ if any(ab.lower().endswith(ext) for ext in allowed_exts):
223
+ found.append({"page":url, "text": a.get_text(strip=True), "file_url":ab})
224
+
225
+ # dedupe
226
+ seen = set()
227
+ dedup = []
228
+ for row in found:
229
+ if row["file_url"] not in seen:
230
+ dedup.append(row)
231
+ seen.add(row["file_url"])
232
+
233
+ df = pd.DataFrame(dedup)
234
+ print(f"Found {len(df)} file links on {url}")
235
+ return df
236
+
237
+
238
+ def crawl_links_for_files(self, start_url, allowed_exts=(".pdf", ".xls", ".xlsx", ".csv"), max_pages=50):
239
+ _, soup = self.fetch(start_url)
240
+ if not soup:
241
+ return []
242
+ pages = []
243
+ for a in soup.find_all("a", href=True):
244
+ href = a["href"].strip()
245
+ ab = self.abs_link(start_url, href)
246
+ if not ab:
247
+ continue
248
+ # only same domain
249
+ if urlparse(ab).netloc.endswith("centralbank.go.ke"):
250
+ pages.append(ab)
251
+ pages = list(dict.fromkeys(pages))[:max_pages]
252
+ print(f"Will inspect {len(pages)} linked pages from {start_url}")
253
+ results = []
254
+ for p in tqdm(pages):
255
+ df = self.collect_file_links(p, allowed_exts=allowed_exts)
256
+ if not df.empty:
257
+ results.append(df)
258
+ time.sleep(0.8)
259
+ if results:
260
+ return pd.concat(results, ignore_index=True)
261
+ return pd.DataFrame()
262
+
263
def download_files(file_links, root_dir, save_dir,
                   allowed_exts=(".pdf", ".xls", ".xlsx", ".csv"),
                   overwrite=False):
    """
    Download multiple files from a list of (title, url) pairs.

    Args:
        file_links: list of (title, url) tuples, or list of dicts {"text":..., "file_url":...}
        root_dir: base folder to save under
        save_dir: subdirectory under root_dir
        allowed_exts: file extensions to allow
        overwrite: if True, re-download even if file exists

    Returns:
        metadata: list of dicts (title, url, local_path, size, status)
    """
    save_dir_path = os.path.join(root_dir, save_dir)
    os.makedirs(save_dir_path, exist_ok=True)
    metadata = []

    # Normalize file_links into [(title, url), ...]
    norm_links = []
    for item in file_links:
        if isinstance(item, tuple):
            title, url = item
        elif isinstance(item, dict):
            title, url = item.get("text", "file"), item.get("file_url")
        else:
            continue
        # Fix: dicts may carry a missing/None URL; previously None.strip()
        # raised AttributeError. Skip entries without a usable URL.
        if not isinstance(url, str) or not url.strip():
            continue
        title = title.strip() if isinstance(title, str) else str(title)
        norm_links.append((title, url.strip()))

    for title, url in norm_links:
        # filter by extension
        if not any(url.lower().endswith(ext) for ext in allowed_exts):
            continue

        # guess extension from URL
        ext = os.path.splitext(urlparse(url).path)[1] or ".bin"
        # clean filename: keep it filesystem-safe and reasonably short
        safe_title = re.sub(r"[^A-Za-z0-9._-]+", "_", title)[:100]
        fname = f"{safe_title}{ext}"
        path = os.path.join(save_dir_path, fname)

        if os.path.exists(path) and not overwrite:
            print(f"[skip] {fname} already exists.")
            status = "skipped"
        else:
            try:
                print(f"[download] {title} -> {fname}")
                r = requests.get(url, stream=True, timeout=60)
                r.raise_for_status()
                with open(path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            f.write(chunk)
                status = "ok"
            except Exception as e:
                print(f"[error] Failed: {url} ({e})")
                status = "error"

        # NOTE: on a mid-download error a partial file may remain; its size is
        # reported as-is so callers can spot truncated downloads.
        size = os.path.getsize(path) if os.path.exists(path) else 0
        metadata.append({
            "title": title,
            "url": url,
            "local_path": path,
            "size": size,
            "status": status
        })

    return metadata
333
+
334
+ import os, subprocess, importlib, sys
335
+
336
def load_repo(repo):
    """Clone (or update) a GitHub repository and import it as a Python module.

    `repo` is "owner/name"; the repository directory name must be a valid
    importable module name. Returns the freshly (re)loaded module object.
    """
    local = repo.split("/")[-1]
    if os.path.exists(local):
        # Already cloned: bring it up to date.
        subprocess.run(["git", "-C", local, "pull"], check=True)
    else:
        subprocess.run(["git", "clone", f"https://github.com/{repo}.git"], check=True)
    if local not in sys.path:
        sys.path.insert(0, local)
    module = importlib.import_module(local)
    importlib.reload(module)  # pick up freshly pulled changes
    return module
347
+
348
def fetch_kenya_gnews(api_key):
    """Fetch Kenyan business headlines from the GNews API.

    Free tier allows 100 requests/day. Returns a DataFrame with
    title/content/url/date/source columns (empty if no articles).
    """
    # Free tier: 100 requests/day
    endpoint = f"https://gnews.io/api/v4/top-headlines?category=business&country=ke&token={api_key}"

    payload = requests.get(endpoint).json()

    records = [
        {
            'title': item.get('title'),
            'content': item.get('description'),
            'url': item.get('url'),
            'date': item.get('publishedAt'),
            'source': item.get('source', {}).get('name'),
        }
        for item in payload.get('articles', [])
    ]
    return pd.DataFrame(records)
367
+
368
def is_valid_url(url):
    """Return True if `url` parses with both a scheme and a network location.

    Accepts any object; non-string or unparsable input yields False instead
    of raising. (The previous bare `except:` also swallowed SystemExit and
    KeyboardInterrupt — narrowed to the exceptions urlparse actually raises.)
    """
    try:
        parts = urlparse(url)
    except (TypeError, ValueError, AttributeError):
        return False
    return bool(parts.scheme and parts.netloc)
374
+
375
def fetch_kenya_thenewsapi(api_key):
    """Fetch Kenya-economy articles from TheNewsAPI.

    Returns a DataFrame with title/content/url/date/source columns.
    """
    endpoint = f"https://api.thenewsapi.com/v1/news/all?api_token={api_key}&search=kenya+economy&language=en"

    payload = requests.get(endpoint).json()

    records = [
        {
            'title': item.get('title'),
            'content': item.get('description'),
            'url': item.get('url'),
            'date': item.get('published_at'),
            'source': item.get('source'),
        }
        for item in payload.get('data', [])
    ]
    return pd.DataFrame(records)
394
+
395
def scrape_google_news_kenya():
    """Pull the last week of Kenya business/economy stories via the GNews scraper.

    Returns a DataFrame of up to 50 article records.
    """
    client = GNews(language='en', country='KE', period='7d', max_results=50)

    # Search for Kenya business news
    hits = client.get_news('Kenya economy OR inflation OR central bank')

    return pd.DataFrame(hits)
408
+
409
+ # Install: pip install gnews
410
+
411
def scrape_african_business_rss():
    """Collect recent entries from several African business RSS feeds.

    Takes up to 20 entries per feed and returns them as a DataFrame with
    title/url/date/summary/source columns.
    """
    feeds = [
        'https://african.business/feed/',  # African Business Magazine
        'https://www.cnbcafrica.com/feed/',  # CNBC Africa
        'https://allafrica.com/tools/headlines/rdf/economy/headlines.rdf',  # AllAfrica Economy
    ]

    def _rows(feed_url):
        # One feed -> list of normalised entry dicts.
        parsed = feedparser.parse(feed_url)
        source_name = parsed.feed.get('title', '')
        return [
            {
                'title': entry.get('title', ''),
                'url': entry.get('link', ''),
                'date': entry.get('published', ''),
                'summary': entry.get('summary', ''),
                'source': source_name,
            }
            for entry in parsed.entries[:20]
        ]

    articles = []
    for feed_url in feeds:
        articles.extend(_rows(feed_url))

    return pd.DataFrame(articles)
434
+
435
def scrape_article(url: str, metadata: dict) -> dict:
    """Scrape single article"""
    # Returns: a 'success' dict with full text, None when the body is too
    # short to be a real article, or a 'failed' dict carrying the error.
    try:
        piece = Article(url)
        piece.download()
        piece.parse()

        body = piece.text
        if len(body) <= 200:
            return None

        return {
            'title': piece.title,
            'full_content': body,
            'summary': metadata.get('summary', ''),
            'url': url,
            'date': metadata.get('date'),
            'source': metadata.get('source'),
            'authors': ', '.join(piece.authors) if piece.authors else '',
            'image': piece.top_image,
            'word_count': len(body.split()),
            'status': 'success'
        }
    except Exception as e:
        return {'url': url, 'status': 'failed', 'error': str(e)}
458
+
459
def fetch_newsdata_multi(api_key: str) -> List[Dict]:
    """Multiple NewsData.io requests with pagination"""
    # Collects lightweight article stubs (url/summary/date/source); the full
    # text is fetched later by scrape_article().
    all_articles = []

    # Different queries to maximize coverage
    queries = [
        'kenya economy',
        'kenya inflation',
        'kenya central bank',
        'kenya business',
        'kenya finance'
    ]

    for query in queries:
        try:
            page = None  # NewsData uses a cursor-style `nextPage` token
            for _ in range(3):  # Up to 3 pages per query
                params = {
                    'apikey': api_key,
                    'q': query,
                    'country': 'ke',
                    'language': 'en'
                }
                if page:
                    params['page'] = page

                response = requests.get('https://newsdata.io/api/1/latest', params=params, timeout=10)
                data = response.json()

                if data.get('status') != 'success':
                    break  # API error or quota exhausted: stop paging this query

                for item in data.get('results', []):
                    # Keep only stubs with a resolvable URL.
                    if is_valid_url(item.get('link')):
                        all_articles.append({
                            'url': item.get('link'),
                            'summary': item.get('description', ''),
                            'date': item.get('pubDate'),
                            'source': item.get('source_id')
                        })

                page = data.get('nextPage')
                if not page:
                    break  # no further pages for this query

                time.sleep(1)  # rate-limit courtesy between page requests
        except Exception as e:
            # Best-effort: one failing query should not abort the rest.
            print(f"NewsData query '{query}': {e}")
            continue

    return all_articles
510
+
511
def fetch_gnews_multi(api_key: str) -> List[Dict]:
    """Multiple GNews requests"""
    # One request per search term; free tier caps each response at 10 items.
    collected: List[Dict] = []

    # Different search terms
    search_terms = (
        'kenya economy',
        'kenya inflation',
        'kenya business',
        'nairobi stock exchange',
    )

    for term in search_terms:
        try:
            response = requests.get(
                'https://gnews.io/api/v4/search',
                params={
                    'apikey': api_key,
                    'q': term,
                    'country': 'ke',
                    'lang': 'en',
                    'max': 10,  # Free tier max
                },
                timeout=10,
            )
            payload = response.json()

            for item in payload.get('articles', []):
                link = item.get('url')
                if not is_valid_url(link):
                    continue
                collected.append({
                    'url': link,
                    'summary': item.get('description', ''),
                    'date': item.get('publishedAt'),
                    'source': item.get('source', {}).get('name')
                })

            time.sleep(1)  # stay well under rate limits
        except Exception as e:
            # Best-effort: log and continue with the next term.
            print(f"GNews search '{term}': {e}")
            continue

    return collected
551
+
552
def fetch_thenewsapi_multi(api_key: str) -> List[Dict]:
    """Multiple TheNewsAPI requests (only 3 articles per request!)"""
    # Returns lightweight stubs (url/summary/date/source); full text is
    # scraped later by scrape_article().
    all_articles = []

    # Multiple searches to compensate for 3-article limit
    searches = [
        'kenya economy',
        'kenya business',
        'kenya inflation',
        'kenya central bank',
        'kenya finance',
        'nairobi economy',
        'kenya investment',
        'kenya banking'
    ]

    for search in searches:
        try:
            params = {
                'api_token': api_key,
                'search': search,
                'language': 'en',
                'limit': 3  # Free tier limit
            }

            response = requests.get('https://api.thenewsapi.com/v1/news/all', params=params, timeout=10)
            data = response.json()

            for item in data.get('data', []):
                # Keep only stubs with a resolvable URL.
                if is_valid_url(item.get('url')):
                    all_articles.append({
                        'url': item.get('url'),
                        'summary': item.get('description', ''),
                        'date': item.get('published_at'),
                        'source': item.get('source')
                    })

            time.sleep(1)  # polite delay between API calls
        except Exception as e:
            # Best-effort: log and move on to the next search term.
            print(f"TheNewsAPI search '{search}': {e}")
            continue

    return all_articles
595
+
596
def scrape_kenya_news_maximum(
    newsdata_key: str = None,
    gnews_key: str = None,
    thenewsapi_key: str = None,
    max_workers: int = 8
) -> pd.DataFrame:
    """Get MAXIMUM articles from all sources"""
    # Pipeline: (1) gather article stubs from every API whose key was
    # supplied, (2) dedupe by URL, (3) scrape full text in a thread pool,
    # (4) return the successful results as a DataFrame.

    print("🔍 Fetching maximum articles from all APIs...\n")

    all_articles = []

    # Fetch from all sources
    if newsdata_key:
        print("📰 NewsData.io: ", end="", flush=True)
        articles = fetch_newsdata_multi(newsdata_key)
        all_articles.extend(articles)
        print(f"{len(articles)} URLs")

    if gnews_key:
        print("📰 GNews.io: ", end="", flush=True)
        articles = fetch_gnews_multi(gnews_key)
        all_articles.extend(articles)
        print(f"{len(articles)} URLs")

    if thenewsapi_key:
        print("📰 TheNewsAPI: ", end="", flush=True)
        articles = fetch_thenewsapi_multi(thenewsapi_key)
        all_articles.extend(articles)
        print(f"{len(articles)} URLs (limited to 3/request on free)")

    if not all_articles:
        print("\n No articles found")
        return pd.DataFrame()

    # Deduplicate by URL (first occurrence wins)
    seen = set()
    unique = []
    for a in all_articles:
        if a['url'] not in seen:
            seen.add(a['url'])
            unique.append(a)

    print(f"\n Total unique URLs: {len(unique)}\n")

    # Parallel scraping
    results = []
    failed = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its stub so the metadata survives the trip.
        futures = {executor.submit(scrape_article, a['url'], a): a for a in unique}

        with tqdm(total=len(futures), desc="📄 Scraping", unit="article") as pbar:
            for future in as_completed(futures):
                result = future.result()

                # scrape_article returns a 'success' dict, None for too-short
                # bodies (silently dropped), or a 'failed' dict on exceptions.
                if result and result.get('status') == 'success':
                    results.append(result)
                elif result:
                    failed += 1
                    if failed <= 3:  # Only show first 3 errors
                        print(f"\n {result['url'][:50]}... | {result['error']}")

                pbar.update(1)
                time.sleep(0.2)  # gentle throttle on the consumer side

    # Save
    if results:
        df = pd.DataFrame(results)
        df = df.drop('status', axis=1, errors='ignore')

        print(f"\n {len(results)} articles scraped | {failed} failed | {len(results)/(len(results)+failed)*100:.1f}% success")
        print(f" Avg: {df['word_count'].mean():.0f} words | {df['source'].nunique()} sources")
        return df

    return pd.DataFrame()
src/load/clean_db.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # clean_database.py
2
+ import pandas as pd
3
+ from sqlalchemy import create_engine, text
4
+ import logging
5
+
6
+ # Set up a logger that can be configured by the importer
7
+ logger = logging.getLogger("DBCleaner")
8
+
9
def drop_blacklisted_tables(engine):
    """Drops tables matching the blacklist patterns.

    Patterns are matched as substrings, so each one removes every table whose
    name contains it. Uses DROP TABLE IF EXISTS so a re-run is idempotent
    (consistent with drop_tables()).
    """
    drop_patterns = [
        "bop_annual",
        "commercial_banks_average_lending_rates",
        "depository_corporation",
        "exchange_rates_end_period",
        "exchange_rates_period_average",
        "forex_bureau_rates_sheet",
        "lr_return_template",
        "nsfr_return_template"
    ]

    with engine.connect() as conn:
        all_tables = [t[0] for t in conn.execute(text("SELECT name FROM sqlite_master WHERE type='table'")).fetchall()]
        # Substring match against every blacklist pattern.
        tables_to_drop = [t for t in all_tables if any(p in t for p in drop_patterns)]

        if not tables_to_drop:
            logger.info("No tables found matching blacklist patterns.")
            return

        logger.info(f"🗑️ Dropping {len(tables_to_drop)} tables...")
        for t in tables_to_drop:
            # IF EXISTS keeps this safe even if a prior pass already ran.
            conn.execute(text(f'DROP TABLE IF EXISTS "{t}"'))
            logger.info(f" - Dropped: {t}")
        conn.commit()
39
+
40
def clean_table(engine, table_name, drop_top_rows=0, rename_map=None, rename_by_index=None, static_date=None):
    """
    Generic cleaner for specific table fixes.

    Args:
        engine: SQLAlchemy engine bound to the SQLite database.
        table_name: table to clean; skipped with a warning if absent.
        drop_top_rows: number of leading (junk header) rows to discard.
        rename_map: {old_name: new_name} column renames.
        rename_by_index: {position: new_name} renames for positional columns.
        static_date: if set, force a 'date' column filled with this value.

    The cleaned frame is written back with if_exists='replace'; all errors
    are caught and logged so one broken table does not stop a batch run.
    """
    try:
        # Check if table exists first
        with engine.connect() as conn:
            exists = conn.execute(text(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")).scalar()
            if not exists:
                logger.warning(f" Table '{table_name}' not found. Skipping.")
                return

        df = pd.read_sql(f'SELECT * FROM "{table_name}"', engine)
        if df.empty: return

        # Drop columns that are completely empty
        df = df.dropna(axis=1, how='all')

        # Drop top rows if requested
        if drop_top_rows > 0:
            df = df.iloc[drop_top_rows:].reset_index(drop=True)

        # Rename by Index (useful for 'col_1', 'col_2')
        if rename_by_index:
            curr_cols = list(df.columns)
            new_cols = curr_cols.copy()
            for idx, new_name in rename_by_index.items():
                if idx < len(curr_cols):  # silently ignore out-of-range positions
                    new_cols[idx] = new_name
            df.columns = new_cols

        # Rename by Map
        if rename_map:
            df.rename(columns=rename_map, inplace=True)

        # Inject Static Date if missing (or overwrite an existing date column)
        if static_date:
            if 'date' not in df.columns:
                df.insert(0, 'date', static_date)
            else:
                df['date'] = static_date

        # Save back to DB (Replace mode)
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        logger.info(f" Fixed '{table_name}': {len(df)} rows")

    except Exception as e:
        logger.error(f" Error cleaning '{table_name}': {e}")
88
+
89
def run_specific_fixes(engine):
    """Orchestrates the specific cleaning rules."""
    logger.info("🔧 Running specific table fixes...")

    # (table_name, clean_table keyword arguments), in execution order.
    fixes = [
        # 1. Historical Rates
        ("download_all_historical_rates",
         {"rename_by_index": {2: "mean_rate", 3: "buy_rate", 4: "sell_rate"}}),
        # 2. Foreign Trade Summary
        ("foreign_trade_summary", {"drop_top_rows": 1}),
        # 3. Forex Bureau Rates
        ("forex_bureau_rates", {"rename_map": {"bureau_name": "currency"}}),
        # 4. Indicative Rates (Indicative Sheet)
        ("indicative_rates_sheet_indicative",
         {"static_date": "2017-11-16",
          "rename_by_index": {0: "currency", 1: "mean_rate", 2: "buy_rate", 3: "sell_rate"}}),
        # 5. Indicative Rates (Press Sheet)
        ("indicative_rates_sheet_press",
         {"static_date": "2017-11-16",
          "rename_by_index": {
              0: "bank_name",
              1: "usd_buy", 2: "usd_sell", 3: "usd_margin",
              4: "gbp_buy", 5: "gbp_sell", 6: "gbp_margin"
          }}),
        # 6. Selected Domestic Exports
        ("value_of_selected_domestic_exports", {"drop_top_rows": 2}),
        # 7. Imports by Commodity
        ("value_of_direct_imports_by_commodities", {"drop_top_rows": 1}),
    ]

    for table_name, kwargs in fixes:
        clean_table(engine, table_name, **kwargs)
123
+
124
def clean_database_pipeline(db_name):
    """Main entry point for external calls."""
    # Build a SQLite engine for the given file and run both cleanup phases.
    engine = create_engine(f"sqlite:///{db_name}")

    logger.info(f" Starting cleanup on {db_name}...")
    drop_blacklisted_tables(engine)
    run_specific_fixes(engine)
    logger.info(" Cleanup Complete.")
133
+
134
def drop_tables(engine):
    """Drops the specific list of tables requested."""
    # Exact table names (no pattern matching, unlike drop_blacklisted_tables).
    doomed = (
        'forex_bureau_rates',
        'forex_bureaus_rates_sheet_chief_dealers',
        'forex_bureaus_rates_sheet_director',
        'forex_bureaus_rates_sheet_directors',
        'forex_bureaus_rates_sheet_fbx',
        'forex_bureaus_rates_sheet_fbx1',
        'forex_bureaus_rates_sheet_fbx2',
        'forex_bureaus_rates_sheet_fxb1',
        'forex_bureaus_rates_sheet_fxb2',
        'forex_bureaus_rates_sheet_fxb22',
        'forex_bureaus_rates_sheet_market_intelligence',
        'forex_bureaus_rates_sheet_sheet1',
        'forex_bureaus_rates_sheet_sheet2',
        'forex_bureaus_rates_sheet_sheet3',
        'forex_bureaus_rates_sheet_sheet4',
        'issues_of_treasury_bills',
        'issues_of_treasury_bonds',
    )

    print("🗑️ Dropping Tables...")
    with engine.connect() as conn:
        for table in doomed:
            try:
                conn.execute(text(f'DROP TABLE IF EXISTS "{table}"'))
            except Exception as e:
                print(f" Could not drop {table}: {e}")
            else:
                print(f" - Dropped: {table}")
        conn.commit()
165
+
166
def fix_foreign_trade(engine):
    """Renames first column to 'year'."""
    table_name = "foreign_trade_summary"
    try:
        frame = pd.read_sql(f'SELECT * FROM "{table_name}"', engine)
        if 'kenyan_shillings_million_year' not in frame.columns:
            print(f" '{table_name}': Target column not found.")
            return
        frame = frame.rename(columns={'kenyan_shillings_million_year': 'year'})
        frame.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f" Fixed '{table_name}': Renamed 'year' column.")
    except Exception as e:
        print(f" Error fixing {table_name}: {e}")
179
+
180
def fix_indicative_rates_shift(engine):
    """
    Applies the 'Shift Right + Fixed Date' logic.
    Inserts 2017-11-16 at position 0, shifting existing data to the right.

    Affects the two indicative-rates tables; each one gets a constant 'date'
    column prepended and its remaining columns renamed to a known header set
    (extra columns beyond the expected set are named col_0, col_1, ...).
    Errors per table are caught and printed so the loop continues.
    """
    targets = [
        "indicative_rates_sheet_indicative",
        "indicative_rates_sheet_press"
    ]

    fixed_date = "2017-11-16"

    for table in targets:
        try:
            df = pd.read_sql(f'SELECT * FROM "{table}"', engine)
            if df.empty: continue

            # Logic: Insert new date column at index 0
            # This effectively "shifts" the old col 0 to col 1
            df.insert(0, 'fixed_date', fixed_date)

            # Rename columns to reflect the shift clearly
            # We assume the user wants standard names for the shifted data
            # Adjust names based on the table type
            new_columns = list(df.columns)
            new_columns[0] = "date"  # The new fixed column

            # Assigning generic or specific headers for the shifted data
            if "press" in table:
                # Based on previous prompt instructions for Press sheet:
                # Bank, USD_Buy, USD_Sell, USD_Margin, GBP_Buy...
                expected_headers = ["date", "bank_name", "usd_buy", "usd_sell", "usd_margin", "gbp_buy", "gbp_sell", "gbp_margin", "euro_buy", "euro_sell", "euro_margin"]
            else:
                # Indicative sheet: Currency, Mean, Buy, Sell
                expected_headers = ["date", "currency", "mean_rate", "buy_rate", "sell_rate"]

            # Map headers safely (truncate if df has fewer cols, pad if more)
            final_cols = expected_headers + [f"col_{i}" for i in range(len(df.columns) - len(expected_headers))]
            df.columns = final_cols[:len(df.columns)]

            # Clean up: Drop any old 'date' column if it was pushed to the right and is duplicate/garbage
            # (Optional, but safer to keep strictly what we shifted)

            df.to_sql(table, engine, if_exists='replace', index=False)
            print(f" Fixed '{table}': Applied Date Shift & Header Rename.")

        except Exception as e:
            print(f" Error fixing {table}: {e}")
228
+
229
def fix_cbk_indicative_swap(engine):
    """Swaps 'date' and 'currency' column names."""
    table_name = "cbk_indicative_rates"
    try:
        frame = pd.read_sql(f'SELECT * FROM "{table_name}"', engine)

        # Build the two-way rename; pandas applies both in one pass,
        # so the names are swapped rather than chained.
        swap = {}
        if 'date' in frame.columns:
            swap['date'] = 'currency'
        if 'currency' in frame.columns:
            swap['currency'] = 'date'

        if swap:
            frame.rename(columns=swap, inplace=True)
            frame.to_sql(table_name, engine, if_exists='replace', index=False)
            print(f" Fixed '{table_name}': Swapped 'date' <-> 'currency'.")
    except Exception as e:
        print(f" Error fixing {table_name}: {e}")
src/load/explore_news_schema.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import glob
3
+ import os
4
+ import logging
5
+ import sys
6
+
7
+ # Configure logging to show up in the notebook
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(message)s',
11
+ stream=sys.stdout,
12
+ force=True
13
+ )
14
+ logger = logging.getLogger("SchemaExplorer")
15
+
16
def analyze_schemas(news_dir: str):
    """
    Scans all CSV files in the given directory and groups them by their column structure.

    Prints one "TYPE" entry per distinct schema (order-insensitive column
    tuple) with example filenames, then samples the first file's date-like
    column so the date format can be eyeballed. Returns None; output goes
    through the module logger.
    """
    if not os.path.exists(news_dir):
        logger.error(f" Directory not found: {news_dir}")
        return

    csv_files = glob.glob(os.path.join(news_dir, "*.csv"))
    logger.info(f"🔍 Scanning {len(csv_files)} files in '{news_dir}'...\n")

    if not csv_files:
        logger.warning(" No CSV files found.")
        return

    # Dictionary to store unique schemas: { (col1, col2): [file1, file2] }
    schemas = {}

    for f in csv_files:
        try:
            # Read only the header (fast)
            df = pd.read_csv(f, nrows=0)

            # Sort columns to ensure order doesn't matter for grouping
            cols = tuple(sorted(df.columns.tolist()))

            if cols not in schemas:
                schemas[cols] = []
            schemas[cols].append(os.path.basename(f))

        except Exception as e:
            # Unreadable file: report it and keep scanning the rest.
            logger.error(f" Error reading {os.path.basename(f)}: {e}")

    # Report Findings
    logger.info("--- Schema Report ---")
    for i, (cols, files) in enumerate(schemas.items()):
        logger.info(f"\nTYPE {i+1}: Found in {len(files)} files")
        logger.info(f"Columns: {list(cols)}")
        if len(files) < 5:
            logger.info(f"Examples: {files}")
        else:
            logger.info(f"Examples: {files[:3]} ... (+{len(files)-3} others)")

    # Date Format Check (Random Sample from the first valid file)
    # NOTE(review): despite the comment this always samples the FIRST file,
    # which may not carry a date column at all.
    logger.info("\n--- Date Format Sample ---")
    try:
        sample_file = csv_files[0]
        sample = pd.read_csv(sample_file, nrows=5)

        # Look for a column containing 'date' or 'time'
        date_col = next((c for c in sample.columns if 'date' in c.lower() or 'time' in c.lower() or 'published' in c.lower()), None)

        if date_col:
            logger.info(f"Sample from column '{date_col}' in {os.path.basename(sample_file)}:")
            logger.info(sample[date_col].head().tolist())
        else:
            logger.warning("No obvious 'date' column found in sample.")
    except Exception as e:
        logger.error(f"Could not read sample for date check: {e}")
src/load/ingest_md.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import sys
4
+ import time
5
+ import requests
6
+ import subprocess
7
+ from pathlib import Path
8
+ from tqdm import tqdm
9
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader # <--- SWITCHED
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+ from langchain_community.vectorstores import Chroma
12
+ from langchain_community.embeddings import OllamaEmbeddings
13
+
14
+ # --- LOGGING ---
15
+ logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(message)s', force=True)
16
+ logger = logging.getLogger("ReportIngest")
17
+
18
def _ensure_ollama_running(port="25000"):
    """Make sure a local Ollama server answers on `port`, spawning one if not.

    The health probe is best-effort: any request failure just falls through
    to starting a fresh server from the $SCRATCH install. Always returns True.
    """
    host = f"http://127.0.0.1:{port}"
    try:
        # Fix: short timeout so a wedged port can't hang the pipeline;
        # narrowed the bare `except:` to requests failures only.
        if requests.get(host, timeout=5).status_code == 200:
            return True
    except requests.RequestException:
        pass  # not running yet -- start it below

    print(" Starting Ollama Server...")
    scratch = os.environ.get("SCRATCH", "/tmp")
    base = Path(scratch)
    bin_path = base / "ollama_core/bin/ollama"

    env = os.environ.copy()
    env["OLLAMA_HOST"] = f"127.0.0.1:{port}"
    env["OLLAMA_MODELS"] = str(base / "ollama_core/models")

    subprocess.Popen([str(bin_path), "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env)
    time.sleep(5)  # give the server a moment to bind before callers connect
    return True
37
+
38
def ingest_markdown_reports(
    markdown_dir="mshauri-fedha/data/knbs/marker-output",
    vector_db_path="mshauri_fedha_chroma_db",
    model="nomic-embed-text",
    ollama_port="25000"
):
    """Load Markdown reports, chunk them, and append embeddings to Chroma.

    Args:
        markdown_dir: directory tree scanned recursively for `.md` files.
        vector_db_path: Chroma persistence directory (appended to, not rebuilt).
        model: Ollama embedding model name.
        ollama_port: local port the Ollama server listens on.
    """
    _ensure_ollama_running(ollama_port)

    if not os.path.exists(markdown_dir):
        logger.error(f" Directory not found: {markdown_dir}")
        return

    print(f"📄 Scanning for Markdown Reports in {markdown_dir}...")

    # --- 1. LOAD FILES (Improved) ---
    # We use TextLoader which is faster and doesn't trigger 'unstructured' warnings
    loader = DirectoryLoader(
        markdown_dir,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},  # Safe for varying file encodings
        show_progress=True,
        use_multithreading=True
    )

    # Catch errors during loading (e.g., empty files)
    try:
        raw_docs = loader.load()
    except Exception as e:
        print(f" Warning during loading: {e}")
        # Fallback: simple load if directory loader fails
        raw_docs = []

    if not raw_docs:
        print(" No valid markdown files found.")
        return

    print(f" Loaded {len(raw_docs)} report files.")

    # --- 2. CHUNKING ---
    # Split preferentially at Markdown section headers so chunks follow
    # the report structure.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n## ", "\n### ", "\n", " ", ""]
    )
    docs = text_splitter.split_documents(raw_docs)

    # --- 3. METADATA ---
    for d in docs:
        d.metadata["type"] = "report"
        # NOTE(review): this branch only runs when "source" is absent, so the
        # get() always falls back to "Official Report" -- confirm intent.
        if "source" not in d.metadata:
            d.metadata["source"] = os.path.basename(d.metadata.get("source", "Official Report"))

    print(f" ✂️ Split into {len(docs)} chunks.")

    # --- 4. EMBEDDING ---
    print(" Appending to Vector Store...")
    embeddings = OllamaEmbeddings(
        model=model,
        base_url=f"http://127.0.0.1:{ollama_port}"
    )

    vectorstore = Chroma(
        persist_directory=vector_db_path,
        embedding_function=embeddings
    )

    # Batch Add (keeps memory bounded and gives usable progress updates)
    batch_size = 100

    with tqdm(total=len(docs), desc="Ingesting Reports", unit="chunk") as pbar:
        for i in range(0, len(docs), batch_size):
            batch = docs[i:i+batch_size]
            vectorstore.add_documents(batch)
            pbar.update(len(batch))

    print("\n Reports Added. Hybrid Knowledge Base is ready.")
115
+
116
+ if __name__ == "__main__":
117
+ ingest_markdown_reports()
src/load/ingest_news.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import re
4
+ import ast
5
+ import logging
6
+ import glob
7
+ import time
8
+ import requests
9
+ import subprocess
10
+ import sys
11
+ from pathlib import Path
12
+ from tqdm import tqdm # <--- New Import
13
+ from dateutil import parser
14
+ from langchain_core.documents import Document
15
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
16
+ from langchain_community.vectorstores import Chroma
17
+ from langchain_community.embeddings import OllamaEmbeddings
18
+
19
+ # --- CONFIG ---
20
+ MIN_CONTENT_LENGTH = 100
21
+
22
+ # --- LOGGING ---
23
+ logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(message)s', force=True)
24
+ logger = logging.getLogger("NewsIngest")
25
+
26
def _ensure_ollama_running(port="25000"):
    """Make sure a local Ollama server answers on `port`, spawning one if not.

    The health probe is best-effort: any request failure just falls through
    to starting a fresh server from the $SCRATCH install. Always returns True.
    """
    host = f"http://127.0.0.1:{port}"
    try:
        # Fix: short timeout so a wedged port can't hang the pipeline;
        # narrowed the bare `except:` to requests failures only.
        if requests.get(host, timeout=5).status_code == 200:
            return True
    except requests.RequestException:
        pass  # not running yet -- start it below

    print(" Starting Ollama Server...")
    scratch = os.environ.get("SCRATCH", "/tmp")
    base = Path(scratch)
    bin_path = base / "ollama_core/bin/ollama"

    env = os.environ.copy()
    env["OLLAMA_HOST"] = f"127.0.0.1:{port}"
    env["OLLAMA_MODELS"] = str(base / "ollama_core/models")

    subprocess.Popen([str(bin_path), "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env)
    time.sleep(5)  # give the server a moment to bind before callers connect
    return True
45
+
46
+ # --- CLEANING HELPERS ---
47
+ def clean_text(text):
48
+ if not isinstance(text, str): return ""
49
+ text = re.sub(r'(?:https?|ftp)://\S+|www\.\S+', '', text)
50
+ text = re.sub(r'<[^>]+>', '', text)
51
+ text = re.sub(r'^[\W_]+', '', text)
52
+ text = re.sub(r'\s+', ' ', text).strip()
53
+ return text
54
+
55
+ def parse_standard_date(date_str):
56
+ try:
57
+ if pd.isna(date_str): return "Unknown Date"
58
+ dt = parser.parse(str(date_str))
59
+ return dt.strftime("%Y-%m-%d")
60
+ except: return "Unknown Date"
61
+
62
+ def extract_publisher_from_dict(pub_str):
63
+ try:
64
+ if isinstance(pub_str, str) and "{" in pub_str:
65
+ data = ast.literal_eval(pub_str)
66
+ return data.get('title', 'Google News')
67
+ return str(pub_str)
68
+ except: return "Google News"
69
+
70
+ def normalize_news_df(df, filename):
71
+ cols = df.columns.tolist()
72
+ normalized = []
73
+
74
+ def create_entry(row, title_col, content_col, date_col, source_val):
75
+ title = clean_text(row.get(title_col, ''))
76
+ content = clean_text(row.get(content_col, ''))
77
+ if len(content) < MIN_CONTENT_LENGTH: return None
78
+ return {
79
+ 'title': title,
80
+ 'content': content,
81
+ 'date': parse_standard_date(row.get(date_col, '')),
82
+ 'source': source_val,
83
+ 'url': row.get('url', ''),
84
+ 'file_origin': filename
85
+ }
86
+
87
+ if 'publisher' in cols and 'description' in cols:
88
+ for _, row in df.iterrows():
89
+ source = extract_publisher_from_dict(row.get('publisher', ''))
90
+ entry = create_entry(row, 'title', 'description', 'published date', source)
91
+ if entry: normalized.append(entry)
92
+ elif 'full_content' in cols:
93
+ for _, row in df.iterrows():
94
+ c_col = 'full_content' if isinstance(row.get('full_content'), str) and len(str(row.get('full_content'))) > 50 else 'summary'
95
+ entry = create_entry(row, 'title', c_col, 'date', str(row.get('source', 'Unknown')))
96
+ if entry: normalized.append(entry)
97
+ elif 'content' in cols and 'source' in cols:
98
+ for _, row in df.iterrows():
99
+ entry = create_entry(row, 'title', 'content', 'date', str(row.get('source', 'Unknown')))
100
+ if entry: normalized.append(entry)
101
+ return pd.DataFrame(normalized)
102
+
103
+ def ingest_news_data(news_dir, vector_db_path="mshauri_fedha_chroma_db", model="nomic-embed-text"):
104
+ _ensure_ollama_running()
105
+
106
+ csv_files = glob.glob(os.path.join(news_dir, "*.csv"))
107
+ if not csv_files:
108
+ print("No files found.")
109
+ return
110
+
111
+ print(f" Found {len(csv_files)} news files. Processing...")
112
+
113
+ all_articles = []
114
+
115
+ # Progress bar for loading files
116
+ for f in tqdm(csv_files, desc="Reading CSVs", unit="file"):
117
+ try:
118
+ df = pd.read_csv(f)
119
+ clean_df = normalize_news_df(df, os.path.basename(f))
120
+ if not clean_df.empty:
121
+ all_articles.extend(clean_df.to_dict('records'))
122
+ except Exception as e:
123
+ pass
124
+
125
+ # Deduplication
126
+ unique_docs = {}
127
+ for art in all_articles:
128
+ key = f"{art['title']}_{art['date']}"
129
+ if art['title'] in art['content']:
130
+ page_content = f"Date: {art['date']}\nSource: {art['source']}\n\n{art['content']}"
131
+ else:
132
+ page_content = f"Title: {art['title']}\nDate: {art['date']}\nSource: {art['source']}\n\n{art['content']}"
133
+
134
+ if key in unique_docs:
135
+ if len(page_content) > len(unique_docs[key].page_content):
136
+ unique_docs[key] = Document(page_content=page_content, metadata={"source": art['source'], "date": art['date'], "type": "news"})
137
+ else:
138
+ unique_docs[key] = Document(page_content=page_content, metadata={"source": art['source'], "date": art['date'], "type": "news"})
139
+
140
+ raw_docs = list(unique_docs.values())
141
+ print(f" Condensed into {len(raw_docs)} unique articles.")
142
+
143
+ # Chunking
144
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200, separators=["\n\n", "\n", ". ", " "])
145
+ final_docs = text_splitter.split_documents(raw_docs)
146
+
147
+ if final_docs:
148
+ print(f" Embedding {len(final_docs)} chunks into Vector DB...")
149
+ embeddings = OllamaEmbeddings(model=model, base_url="http://127.0.0.1:25000")
150
+ vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
151
+
152
+ batch_size = 100
153
+ # Progress bar for embedding
154
+ with tqdm(total=len(final_docs), desc="Embedding News", unit="chunk") as pbar:
155
+ for i in range(0, len(final_docs), batch_size):
156
+ batch = final_docs[i:i+batch_size]
157
+ vectorstore.add_documents(batch)
158
+ pbar.update(len(batch))
159
+
160
+ print("\n News Ingestion Complete.")
161
+ else:
162
+ print("No valid articles extracted.")
src/load/inspect_db.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from sqlalchemy import create_engine, text
4
+
5
+ # --- CONFIGURATION ---
6
+ DB_NAME = "mshauri_fedha.db"
7
+ DB_CONNECTION = f"sqlite:///{DB_NAME}"
8
+
9
def list_all_tables(engine):
    """Print a summary (row count per table) of every table in the database.

    Args:
        engine: SQLAlchemy engine bound to the target SQLite database.

    Returns:
        list[str]: Table names in alphabetical order; empty list on failure.
    """
    print(f"\n --- DATABASE SUMMARY: {DB_NAME} ---")
    try:
        with engine.connect() as conn:
            # Query the master table for all table names
            query = text("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
            tables = conn.execute(query).fetchall()

            if not tables:
                print(" Database is empty.")
                return []

            table_list = [t[0] for t in tables]

            print(f"{'ID':<4} | {'Rows':<8} | {'Table Name'}")
            print("-" * 60)

            for i, t_name in enumerate(table_list):
                # Count rows for verification
                try:
                    count = conn.execute(text(f'SELECT COUNT(*) FROM "{t_name}"')).scalar()
                    print(f"{i:<4} | {count:<8} | {t_name}")
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                    # still propagate instead of being logged as a table error.
                    print(f"{i:<4} | {'ERROR':<8} | {t_name}")

            return table_list
    except Exception as e:
        print(f" Connection failed: {e}")
        return []
38
+
39
def inspect_table(engine, table_name):
    """Print the column names and a five-row preview of `table_name`."""
    print(f"\n🔎 Inspecting Table: '{table_name}'")
    try:
        # Pull only a tiny sample; enough to show schema + representative rows.
        sample = pd.read_sql(f'SELECT * FROM "{table_name}" LIMIT 5', engine)
        if sample.empty:
            print(" Table is empty.")
            return
        print(f"Columns: {list(sample.columns)}")
        print("\n--- First 5 Rows ---")
        # to_string() makes it readable in terminal without truncation
        print(sample.to_string(index=False))
        print("-" * 50)
    except Exception as e:
        print(f" Could not read table: {e}")
+
57
def main():
    """Entry point: list all tables, then interactively inspect selections."""
    if not os.path.exists(DB_NAME):
        print(f" Error: Database file '{DB_NAME}' not found in current directory.")
        print(f"Current Directory: {os.getcwd()}")
        return

    engine = create_engine(DB_CONNECTION)
    tables = list_all_tables(engine)

    if not tables:
        return

    while True:
        try:
            user_input = input("\nEnter Table ID (or Name) to inspect, or 'q' to quit: ").strip()
        except (KeyboardInterrupt, EOFError):
            # EOFError added: input() raises it when stdin is closed or piped,
            # which previously crashed the loop with a traceback.
            break

        if user_input.lower() == 'q':
            break

        target_table = None

        # Handle numeric ID input (index shown by list_all_tables)
        if user_input.isdigit():
            idx = int(user_input)
            if 0 <= idx < len(tables):
                target_table = tables[idx]
        # Handle name input
        elif user_input in tables:
            target_table = user_input

        if target_table:
            inspect_table(engine, target_table)
        else:
            print(" Invalid selection.")


if __name__ == "__main__":
    main()
src/load/mshauri_demo.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ # These imports are stable and have worked in your previous logs
5
+ from langchain_ollama import ChatOllama
6
+ from langchain_community.utilities import SQLDatabase
7
+ from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit
8
+ from langchain_community.vectorstores import Chroma
9
+ from langchain_community.embeddings import OllamaEmbeddings
10
+
11
+ # --- CONFIGURATION ---
12
+ DEFAULT_SQL_DB = "sqlite:///mshauri_fedha_v6.db"
13
+ DEFAULT_VECTOR_DB = "mshauri_fedha_chroma_db"
14
+ DEFAULT_EMBED_MODEL = "nomic-embed-text"
15
+ DEFAULT_LLM_MODEL = "qwen3:32b"
16
+ DEFAULT_OLLAMA_URL = "http://127.0.0.1:25000"
17
+
18
+ # --- 1. REPLACEMENT CLASS FOR 'Tool' ---
19
class SimpleTool:
    """A simple wrapper to replace langchain.tools.Tool"""

    def __init__(self, name, func, description):
        # Mirror the minimal Tool interface: a name, a callable, and a
        # description injected into the agent prompt.
        self.name = name
        self.func = func
        self.description = description

    def run(self, input_data):
        # Delegate straight to the wrapped callable.
        return self.func(input_data)
28
+
29
+ # --- 2. REPLACEMENT CLASS FOR THE AGENT ---
30
class SimpleReActAgent:
    """A manual ReAct loop that doesn't rely on langchain.agents"""

    def __init__(self, llm, tools, verbose=True):
        self.llm = llm
        self.tools = {t.name: t for t in tools}
        self.verbose = verbose
        # Render the tool list once; injected into every prompt.
        self.tool_desc = "\n".join(f"{t.name}: {t.description}" for t in tools)
        self.tool_names = ", ".join(t.name for t in tools)

        # Hardcoded ReAct Prompt
        self.prompt_template = """Answer the following questions as best you can. You have access to the following tools:

{tool_desc}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}"""

    def invoke(self, inputs):
        """Run the Thought/Action/Observation loop; returns {"output": str}."""
        question = inputs["input"]
        history = ""

        print(f" Starting Agent Loop for: '{question}'")

        for turn in range(10):  # Max 10 steps
            filled = self.prompt_template.format(
                tool_desc=self.tool_desc,
                tool_names=self.tool_names,
                input=question,
                agent_scratchpad=history,
            )

            # stop=["\nObservation:"] prevents the LLM from hallucinating the tool output
            reply = self.llm.invoke(filled, stop=["\nObservation:"])
            reply_text = reply.content

            if self.verbose:
                print(f"\n🧠 Step {turn+1}: {reply_text.strip()}")

            history += reply_text

            # Completion check
            if "Final Answer:" in reply_text:
                return {"output": reply_text.split("Final Answer:")[-1].strip()}

            # Parse the Action / Action Input pair from the reply
            act = re.search(r"Action:\s*(.*?)\n", reply_text)
            arg = re.search(r"Action Input:\s*(.*)", reply_text)

            if not (act and arg):
                # Fallback: if no action found but also no Final Answer
                if "Action:" in reply_text:
                    history += "\nObservation: You provided an Action but no Action Input. Please provide the input.\n"
                    continue
                return {"output": reply_text.strip()}

            tool_name = act.group(1).strip()
            tool_arg = arg.group(1).strip()

            if tool_name not in self.tools:
                history += f"\nObservation: Error: Tool '{tool_name}' not found. Available: {self.tool_names}\n"
                continue

            if self.verbose:
                print(f"🛠️ Calling '{tool_name}' with: {tool_arg}")

            try:
                # Handle both SimpleTool (.run) and LangChain Tools (.invoke or .run)
                chosen = self.tools[tool_name]
                if hasattr(chosen, 'invoke'):
                    result = chosen.invoke(tool_arg)
                else:
                    result = chosen.run(tool_arg)
            except Exception as e:
                result = f"Error executing tool: {e}"

            history += f"\nObservation: {result}\n"

        return {"output": "Agent timed out."}
127
+
128
+ # --- 3. MAIN SETUP FUNCTION ---
129
+
130
def create_mshauri_agent(
    sql_db_path=DEFAULT_SQL_DB,
    vector_db_path=DEFAULT_VECTOR_DB,
    llm_model=DEFAULT_LLM_MODEL,
    ollama_url=DEFAULT_OLLAMA_URL
):
    """Build the hybrid SQL + vector-search ReAct agent.

    Returns the agent, or None if the Ollama LLM cannot be initialised.
    """
    print(f" Initializing Mshauri Fedha (Model: {llm_model})...")

    # LLM backend
    try:
        llm = ChatOllama(model=llm_model, base_url=ollama_url, temperature=0.1)
    except Exception as e:
        print(f" Error connecting to Ollama: {e}")
        return None

    # Structured side: SQL toolkit over the SQLite database.
    if "sqlite" in sql_db_path:
        real_path = sql_db_path.replace("sqlite:///", "")
        if not os.path.exists(real_path):
            print(f" Warning: SQL Database not found at {real_path}")

    db = SQLDatabase.from_uri(sql_db_path)
    # The Toolkit returns standard LangChain tools, which our SimpleReActAgent can handle
    sql_tools = SQLDatabaseToolkit(db=db, llm=llm).get_tools()

    # Unstructured side: similarity search over the persisted Chroma store.
    def search_docs(query):
        embeddings = OllamaEmbeddings(model=DEFAULT_EMBED_MODEL, base_url=ollama_url)
        vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)
        hits = vectorstore.similarity_search(query, k=4)
        return "\n\n".join(d.page_content for d in hits)

    # Use our SimpleTool wrapper instead of importing from langchain
    retriever_tool = SimpleTool(
        name="search_financial_reports_and_news",
        func=search_docs,
        description="Searches CBK/KNBS reports and business news. Use this for qualitative questions (why, how, trends) or when SQL data is missing."
    )

    agent = SimpleReActAgent(llm, sql_tools + [retriever_tool])

    print(" Mshauri Agent Ready (Zero-Dependency Mode).")
    return agent
177
+
178
def ask_mshauri(agent, query):
    """Send one question through the agent; print and return the answer text.

    Returns None when the agent is missing or execution fails.
    """
    if not agent:
        print(" Agent not initialized.")
        return

    print(f"\n User: {query}")
    print("-" * 40)

    try:
        result = agent.invoke({"input": query})
        print("-" * 40)
        print(f" Mshauri: {result['output']}")
        return result['output']
    except Exception as e:
        print(f" Error during execution: {e}")
        return None
194
+
195
# Smoke test: build the agent against the local defaults and ask one question.
# Requires a running Ollama server and the configured SQLite/Chroma stores.
if __name__ == "__main__":
    # Quick Test
    agent = create_mshauri_agent()
    ask_mshauri(agent, "What is the inflation rate?")
src/load/start_ollama.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import time
4
+ import requests
5
+ from pathlib import Path
6
+
7
def start_ollama_server():
    """Checks if Ollama is running on port 25000, if not, starts it.

    Returns:
        bool: True when the server is (already) reachable, False otherwise.
    """
    OLLAMA_PORT = "25000"
    OLLAMA_HOST = f"http://127.0.0.1:{OLLAMA_PORT}"

    # 1. Check if already running
    try:
        if requests.get(OLLAMA_HOST).status_code == 200:
            print(" Ollama is already running.")
            return True
    except requests.RequestException:
        # Narrowed from a bare `except:`; a refused connection just means the
        # server is not up yet and we should start it below.
        pass

    print(" Starting Ollama Server...")

    # 2. Define Paths (CSCS Environment)
    SCRATCH = os.environ.get("SCRATCH", "/tmp")
    BASE_DIR = Path(SCRATCH)
    OLLAMA_BIN = BASE_DIR / "ollama_core/bin/ollama"
    MODELS_DIR = BASE_DIR / "ollama_core/models"

    # 3. Setup Environment
    server_env = os.environ.copy()
    server_env["OLLAMA_HOST"] = f"127.0.0.1:{OLLAMA_PORT}"
    server_env["OLLAMA_MODELS"] = str(MODELS_DIR)

    # 4. Start Background Process
    try:
        subprocess.Popen(
            [str(OLLAMA_BIN), "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            env=server_env
        )
        print(" Waiting for server to boot...")
        time.sleep(10)  # Give it time to initialize

        # 5. Verify
        if requests.get(OLLAMA_HOST).status_code == 200:
            print(" Server started successfully.")
            return True
    except Exception as e:
        print(f" Failed to start server: {e}")

    # Bug fix: previously the function fell through and returned None when the
    # health check came back with a non-200 status; now every failure path
    # returns an explicit bool (still falsy, so callers are unaffected).
    return False
+
52
+ import requests
53
+ import json
54
+ import sys
55
+
56
def pull_embedding_model(model_name="nomic-embed-text"):
    """Ask the local Ollama server to download `model_name`, streaming progress.

    Prints a live status/percentage line to stdout. On any failure — HTTP error
    or a server-reported pull error — prints a failure message instead of the
    success message.
    """
    url = "http://127.0.0.1:25000/api/pull"
    print(f" Requesting pull for '{model_name}'...")

    try:
        # Send the pull request to the running server
        response = requests.post(url, json={"name": model_name}, stream=True)
        response.raise_for_status()

        # Stream the progress so you know it's working
        for line in response.iter_lines():
            if not line:
                continue
            data = json.loads(line)

            # Bug fix: Ollama streams pull failures as {"error": ...} objects
            # within an HTTP 200 response, so raise_for_status() alone never
            # catches them and the success message printed anyway.
            if 'error' in data:
                raise RuntimeError(data['error'])

            status = data.get('status', '')
            completed = data.get('completed', 0)
            total = data.get('total', 1)

            # Print progress bar or status
            if total > 1 and completed > 0:
                percent = int((completed / total) * 100)
                sys.stdout.write(f"\r {status}: {percent}%")
            else:
                sys.stdout.write(f"\r {status}")
            sys.stdout.flush()

        print(f"\n Model '{model_name}' installed successfully!")

    except Exception as e:
        print(f"\n Failed to pull model: {e}")
src/transform/config.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+ from dataclasses import dataclass
5
+
6
@dataclass
class ProcessingConfig:
    """Central configuration for Data Sources."""
    root_dir: str
    source_name: str  # e.g., 'knbs' or 'cbk'

    # Settings
    batch_size: int = 20
    max_workers: int = 4
    min_image_bytes: int = 3000
    min_image_dim: int = 100
    max_page_objects: int = 500

    def __post_init__(self):
        # Paths setup: everything persistent lives under <root>/processed/<source>.
        self.base_processed_dir = os.path.join(self.root_dir, 'processed')
        self.source_dir = os.path.join(self.base_processed_dir, self.source_name)
        self.drive_zip_dir = os.path.join(self.source_dir, "zipped_batches")
        self.meta_dir = os.path.join(self.source_dir, f"{self.source_name}_index_metadata")

        # Log Files (append-only JSONL, one per artifact type)
        self.logs = {
            'docs': os.path.join(self.meta_dir, f'{self.source_name}_docs_metadata.jsonl'),
            'images': os.path.join(self.meta_dir, f'{self.source_name}_images_index.jsonl'),
            'tables': os.path.join(self.meta_dir, f'{self.source_name}_tables_index.jsonl')
        }

        # Local Temp Paths (scratch tree wiped and recreated by setup())
        self.local_work_dir = Path(f"/tmp/temp_work_{self.source_name}")
        self.local_dirs = {
            'texts': self.local_work_dir / "texts",
            'images': self.local_work_dir / "images",
            'tables': self.local_work_dir / "tables",
            'pdfs': self.local_work_dir / "pdfs"
        }

    def setup(self):
        """Create persistent dirs and (re)create a clean local scratch tree."""
        os.makedirs(self.drive_zip_dir, exist_ok=True)
        os.makedirs(self.meta_dir, exist_ok=True)
        if self.local_work_dir.exists():
            shutil.rmtree(self.local_work_dir)
        for d in self.local_dirs.values():
            d.mkdir(parents=True, exist_ok=True)
        self.create_canary()

    def create_canary(self):
        """Write pdf_canary.py: a subprocess probe that exits 0 iff a PDF parses safely."""
        import textwrap

        script_content = """
        import sys, pymupdf, pdfplumber
        if len(sys.argv) < 2: sys.exit(1)
        try:
            doc = pymupdf.open(sys.argv[1])
            for p in doc: _, _ = p.get_text(), [doc.extract_image(i[0]) for i in p.get_images(full=True)]
            with pdfplumber.open(sys.argv[1]) as p: _ = [page.objects for page in p.pages]
            print("SAFE")
            sys.exit(0)
        except: sys.exit(1)
        """
        # Bug fix: dedent before stripping. With an indented triple-quoted
        # literal, `.strip()` alone removes only the outer whitespace and
        # leaves inner lines indented, producing a canary script that dies
        # with an IndentationError (dedent is a no-op if the literal was
        # already at column 0, so this is safe either way).
        with open("pdf_canary.py", "w") as f:
            f.write(textwrap.dedent(script_content).strip())
src/transform/download_files.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import zipfile
4
+ import time
5
+ import gc
6
+ import threading
7
+ import shutil
8
+ import requests
9
+ import subprocess
10
+ import pymupdf
11
+ import pdfplumber
12
+ import pandas as pd
13
+ import urllib3
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ from concurrent.futures import ThreadPoolExecutor, as_completed
17
+ from tqdm import tqdm
18
+ from requests.adapters import HTTPAdapter
19
+ from urllib3.util.retry import Retry
20
+
21
+ # Import shared config
22
+ from config import ProcessingConfig
23
+
24
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
25
+
26
class UniversalProcessor:
    """Downloads PDFs with retry-enabled HTTP and runs a quick extraction pass."""

    def __init__(self, config: ProcessingConfig):
        self.config = config
        # Retry transient failures (rate-limit and 5xx responses) with backoff.
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 504])
        adapter = HTTPAdapter(max_retries=retry)
        self.session = requests.Session()
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

    def download(self, url, safe_title) -> Path:
        """Stream `url` into the local pdfs dir; returns the path or None on failure."""
        try:
            response = self.session.get(url, timeout=30, stream=True, verify=False)
            response.raise_for_status()
            ext = Path(url).suffix.lower() or '.pdf'
            safe_name = safe_title[:50].replace(' ', '_').replace('/', '_')
            filepath = self.config.local_dirs['pdfs'] / f"{safe_name}{ext}"
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return filepath
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt still aborts the run.
            return None

    def process(self, filepath: Path, url: str, safe_title: str):
        """Validate the PDF via the canary subprocess, then extract raw text.

        Returns:
            dict with 'text', 'tables', 'images', 'metadata' keys, or None if
            the file is unsafe or extraction fails.
        """
        # Only does basic validation/extraction for the initial pass
        try:
            res = subprocess.run(["python", "pdf_canary.py", str(filepath)], capture_output=True, timeout=15)
            if res.returncode != 0: return None

            # Basic PyMuPDF extraction for quick preview
            doc = pymupdf.open(filepath)
            text = "".join(page.get_text() for page in doc)
            # Bug fix: the page count was previously read via len(doc) AFTER
            # doc.close(), which raises on a closed document in PyMuPDF — the
            # error was swallowed by the except and every result became None.
            page_count = len(doc)
            doc.close()

            return {
                'text': text,
                'tables': [],
                'images': [],
                'metadata': {'pages': page_count}
            }
        except Exception:
            return None
66
+
67
class BatchPipeline:
    """Downloads documents in batches, logs progress, and ships zipped results."""

    def __init__(self, config: ProcessingConfig, processor: UniversalProcessor):
        self.config = config
        self.processor = processor
        self.lock = threading.Lock()  # serializes JSONL appends across worker threads
        self.config.setup()

    def _append_log(self, log_key, record):
        """Append one JSON object per line to the configured log file."""
        with self.lock:
            with open(self.config.logs[log_key], 'a', encoding='utf-8') as f:
                f.write(json.dumps(record) + '\n')

    def _worker(self, item):
        """Download + validate one row; returns True on success, None otherwise."""
        row = item['row']
        title = str(row.get('text', 'untitled'))
        url = row['file_url']

        path = self.processor.download(url, title)
        # Reject missing downloads and sub-500-byte stubs (error pages etc.).
        if not path or path.stat().st_size < 500: return None

        data = self.processor.process(path, url, title)
        if not data: return None

        # Save Preview Text.
        # Bug fix: encoding='utf-8' added — the file was written with the
        # platform-default encoding, which breaks on non-ASCII report text
        # (and was inconsistent with _append_log's explicit utf-8).
        with open(self.config.local_dirs['texts'] / f"{path.stem}.txt", 'w', encoding='utf-8') as f:
            f.write(data['text'])

        self._append_log('docs', {'url': url, 'file': path.name, 'status': 'downloaded'})
        return True

    def _zip_and_ship(self, batch_id):
        """Zip the local work dir, copy the zip to drive storage, wipe local state."""
        ts = datetime.now().strftime("%H%M%S")
        zname = f"{self.config.source_name}_{batch_id}_{ts}.zip"
        local_z = Path(f"/tmp/{zname}")
        drive_z = Path(self.config.drive_zip_dir) / zname

        with zipfile.ZipFile(local_z, 'w', zipfile.ZIP_DEFLATED) as z:
            for root, _, files in os.walk(self.config.local_work_dir):
                for f in files:
                    fp = os.path.join(root, f)
                    z.write(fp, os.path.relpath(fp, self.config.local_work_dir))

        shutil.copy(local_z, drive_z)
        self.config.setup()  # Wipe local
        os.remove(local_z)

    def run(self, df, ignore_history=False):
        """Process every not-yet-downloaded row of `df` in batches.

        Args:
            df: DataFrame with at least 'file_url' and 'text' columns.
            ignore_history: when True, re-download URLs already in the docs log.
        """
        done = set()
        if not ignore_history and os.path.exists(self.config.logs['docs']):
            with open(self.config.logs['docs'], encoding='utf-8') as f:
                for l in f:
                    # Narrowed from bare except: malformed lines (ValueError)
                    # and records without 'url' (KeyError) are skipped.
                    try: done.add(json.loads(l)['url'])
                    except (ValueError, KeyError): continue

        queue = [r for _, r in df.iterrows() if r['file_url'] not in done]
        print(f" Queued: {len(queue)} files.")

        bs = self.config.batch_size
        for i in range(0, len(queue), bs):
            batch = queue[i:i+bs]
            bid = f"batch_{i//bs + 1}"
            print(f"Processing {bid}...")

            with ThreadPoolExecutor(max_workers=self.config.max_workers) as ex:
                futures = [ex.submit(self._worker, {'row': item}) for item in batch]
                for _ in tqdm(as_completed(futures), total=len(batch)): pass

            self._zip_and_ship(bid)
            gc.collect()
136
+
137
# NOTE(review): this only constructs the pipeline; no DataFrame is passed to
# pipe.run(), so running this module directly performs setup side effects
# (directory creation, canary script) but downloads nothing — confirm intended.
if __name__ == "__main__":
    # Example Usage
    ROOT_DIR = "/scratch/user/mshauri_data"
    conf = ProcessingConfig(root_dir=ROOT_DIR, source_name='cbk')
    pipe = BatchPipeline(conf, UniversalProcessor(conf))
src/transform/extract.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import os
4
+ import sys
5
+ import time
6
+ import queue
7
+ import logging
8
+ import gc
9
+ import multiprocessing as mp
10
+ import argparse
11
+ from pathlib import Path
12
+ import torch
13
+
14
def configure_parallelism(workers_per_gpu=5):
    """
    Tuned specifically for 96GB VRAM GPUs.
    We cap workers to prevent 'Thundering Herd' and use VRAM for 'Batch Power'.

    Returns:
        tuple: (total_slots, workers_per_gpu, num_gpus); CPU fallback is
        (half the cores, 1, 0) when CUDA is unavailable.
    """
    # CPU-only fallback: no GPUs, one worker group, half the cores as slots.
    if not torch.cuda.is_available():
        return max(1, mp.cpu_count() // 2), 1, 0

    num_gpus = torch.cuda.device_count()
    total_vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)

    # --- THE STABILITY STRATEGY ---
    # On 96GB, 8-10 workers is the "Sweet Spot".
    # More workers than this creates too much 'context switching' overhead on the GPU.
    total_slots = num_gpus * workers_per_gpu

    print(f"🔍 GH200/A100 Detected: {num_gpus} GPUs | {total_vram_gb:.1f} GB VRAM")
    print(f"⚙️ Stability Config: {workers_per_gpu} workers/GPU | {total_slots} Total Slots")

    # --- SYSTEM TUNING ---
    os.environ["OLLAMA_NUM_PARALLEL"] = str(total_slots)
    os.environ["OLLAMA_MAX_QUEUE"] = "2048"  # Large buffer for requests
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    return total_slots, workers_per_gpu, num_gpus
41
+
42
+
43
+
44
+ # --- CRITICAL: MULTIPROCESSING SETUP ---
45
+ try:
46
+ mp.set_start_method('spawn', force=True)
47
+ except RuntimeError:
48
+ pass
49
+
50
+ from marker.converters.pdf import PdfConverter
51
+ from marker.models import create_model_dict
52
+ from marker.output import text_from_rendered
53
+ from marker.config.parser import ConfigParser
54
+
55
+ # Configure Logger
56
+ logging.basicConfig(
57
+ level=logging.INFO,
58
+ format='%(asctime)s - [GPU-%(processName)s] - %(message)s',
59
+ datefmt='%H:%M:%S'
60
+ )
61
+ logger = logging.getLogger(__name__)
62
+
63
def worker_routine(worker_id, gpu_id, batch_queue, output_dir, ollama_config, marker_config):
    """
    Optimized Worker for GH200:
    1. Receives a BATCH of files (List) to reduce queue overhead.
    2. Uses torch.compile for architectural optimization.
    3. Skips image extraction for speed.
    4. Fails fast on tables (Triage) to prevent LLM stalls.
    """

    # Stagger start-up so workers don't all hit GPU/model loading simultaneously.
    time.sleep(worker_id * 1.5)

    mp.current_process().name = f"{worker_id}:Dev{gpu_id}"
    logger.info(f"Initializing Worker {worker_id}...")

    # NOTE(review): CUDA_VISIBLE_DEVICES masks devices, renumbering the visible
    # GPU to index 0 — with that mask in place, f"cuda:{gpu_id}" for gpu_id > 0
    # may address a non-existent device. Confirm on a multi-GPU node.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    os.environ["TORCH_DEVICE"] = f"cuda:{gpu_id}"

    # 2. Model Initialization
    try:
        # Load Surya/OCR weights
        artifact_dict = create_model_dict(
            device=f"cuda:{gpu_id}",
            dtype=torch.bfloat16,
            attention_implementation="flash_attention_2"
        )

        # --- OPTIMIZATION 1: torch.compile ---
        logger.info("Compiling models with torch.compile... (One-time setup)")
        for key, model in artifact_dict.items():
            if hasattr(model, 'forward'):
                artifact_dict[key] = torch.compile(model, mode="max-autotune")

        # --- OPTIMIZATION 2: Config Tuning ---
        full_config = {
            "output_format": "markdown",
            "disable_multiprocessing": True,
            "extract_images": False,  # Speed up: Skip image extraction
            "ocr_all_pages": False,
            "use_llm": True,
            "llm_service": "marker.services.ollama.OllamaService",

            # --- TRIAGE STRATEGY ---
            "max_table_retries": 0,  # Fail fast if table extraction stalls
            "llm_service_timeout": 150,  # Don't let a table hold a worker for more than x minutes

            # Caller-supplied settings override the defaults above.
            **ollama_config,
            **marker_config
        }

        config_parser = ConfigParser(full_config)

        converter = PdfConverter(
            config=config_parser.generate_config_dict(),
            artifact_dict=artifact_dict,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service()
        )
        logger.info(f"Worker Ready. Waiting for batches...")

    except Exception as e:
        # Any init failure (model load, compile, config) kills this worker only.
        logger.error(f"Initialization Failed: {e}")
        return

    # 3. Batch Work Loop
    batches_processed = 0
    while True:
        try:
            # Get a list of files (Batch)
            batch_files = batch_queue.get(timeout=5)
        except queue.Empty:
            # Empty queue after 5s means no more work: exit cleanly.
            logger.info(f"Queue empty. Worker shutting down. Processed {batches_processed} batches.")
            break

        # Process the batch locally
        for pdf_path_str in batch_files:
            try:
                pdf_path = Path(pdf_path_str)
                doc_out_dir = Path(output_dir) / pdf_path.stem
                md_file = doc_out_dir / f"{pdf_path.stem}.md"

                # Idempotency: skip files already converted on a previous run.
                if md_file.exists():
                    continue

                # Heavy Compute (OCR + LLM)
                rendered = converter(str(pdf_path))

                if rendered is None:
                    logger.warning(f"Skipping {pdf_path.name}: Converter returned None")
                    continue

                text, meta, images = text_from_rendered(rendered)

                # Write Output
                doc_out_dir.mkdir(parents=True, exist_ok=True)
                with open(md_file, "w", encoding="utf-8") as f:
                    f.write(text)

            except Exception as e:
                # NOTE(review): if Path(pdf_path_str) itself raised, pdf_path is
                # unbound here and this log line would raise NameError.
                logger.error(f"Failed on {pdf_path.name}: {e}")

        # Cleanup after batch to keep VRAM healthy
        batches_processed += 1

        # Aggressive GC after every batch prevents "memory creep" on long runs
        gc.collect()
        torch.cuda.empty_cache()
171
class MarkerFolderProcessor:
    """Fans a folder of PDFs out to per-GPU worker processes running Marker."""

    def __init__(self, output_dir, ollama_url, ollama_model, batch_multiplier, workers_per_gpu, num_gpus):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.num_gpus = num_gpus

        # We now accept the dynamic num_gpus passed from __main__
        if self.num_gpus > 0:
            print(f" Detected {self.num_gpus} GPUs (Dynamic Mode)")
        else:
            print(" No GPUs detected. Running in CPU mode.")

        self.workers_per_gpu = workers_per_gpu

        # Configs passed to workers
        self.ollama_config = {
            "ollama_base_url": ollama_url,
            "ollama_model": ollama_model,
            "ollama_timeout": 600,  # 10 mins max per request
            "ollama_options": {
                "num_ctx": 32768,
                "num_predict": 2048,
                "temperature": 0.0
            }
        }
        self.marker_config = {"batch_multiplier": batch_multiplier}

    def process_folder(self, source_folder, batch_size=10, subset=None):
        """Queue PDFs in batches of `batch_size` and run workers to completion."""
        if subset is not None:
            # Use the partitioned list provided by run_transform.py
            pdf_list = [Path(p) for p in subset]
        else:
            pdf_list = sorted(Path(source_folder).glob("*.pdf"))

        if not pdf_list:
            print("No PDFs to process.")
            return

        # Keep a reference to the Manager so its server process stays alive
        # for the lifetime of the queue.
        manager = mp.Manager()
        work_queue = manager.Queue()

        # --- BATCHING STRATEGY ---
        # Chunk the list of PDFs into batches
        batches = [pdf_list[start:start + batch_size] for start in range(0, len(pdf_list), batch_size)]
        print(f" Created {len(batches)} batches of {batch_size} files each.")

        for group in batches:
            work_queue.put([str(p) for p in group])

        worker_count = (self.num_gpus * self.workers_per_gpu) if self.num_gpus > 0 else 1
        print(f" Launching {worker_count} workers on {self.num_gpus} GPUs...")

        workers = []
        for idx in range(worker_count):
            # Round-robin worker -> GPU assignment.
            device_id = idx % self.num_gpus if self.num_gpus > 0 else 0
            proc = mp.Process(
                target=worker_routine,
                args=(idx, device_id, work_queue, self.output_dir, self.ollama_config, self.marker_config)
            )
            proc.start()
            workers.append(proc)

        for proc in workers:
            proc.join()

        print(" Extraction Complete.")
239
+ # This block only runs if you execute 'python extract.py' directly.
240
+ if __name__ == "__main__":
241
+ parser = argparse.ArgumentParser()
242
+ parser.add_argument("--input", required=True, help="Input folder of PDFs")
243
+ parser.add_argument("--output", required=True, help="Output folder")
244
+ parser.add_argument("--url", default="http://localhost:11434", help="Ollama URL")
245
+ parser.add_argument("--model", default="llama3", help="Ollama Model Name")
246
+ args = parser.parse_args()
247
+
248
+ # --- DYNAMIC HARDWARE DETECTION ---
249
+ if torch.cuda.is_available():
250
+ num_gpus = torch.cuda.device_count()
251
+ gpu_properties = torch.cuda.get_device_properties(0)
252
+ total_vram_gb = gpu_properties.total_memory / (1024**3)
253
+
254
+ # Calculate optimal workers: (VRAM - 2GB overhead) / 5GB per worker
255
+ workers_per_gpu = int((total_vram_gb - 2) // 5)
256
+ workers_per_gpu = max(1, workers_per_gpu) # Minimum 1
257
+
258
+ total_slots = num_gpus * workers_per_gpu
259
+ print(f"⚙️ Dynamic Config: {num_gpus} GPUs | {workers_per_gpu} workers/GPU | {total_slots} Total Slots")
260
+
261
+ # Set Env vars for external tools (optional, but good practice)
262
+ os.environ["OLLAMA_NUM_PARALLEL"] = str(total_slots)
263
+
264
+ else:
265
+ num_gpus = 0
266
+ workers_per_gpu = 1
267
+ print(" No GPU detected. Defaulting to 1 worker.")
268
+
269
+ # --- PROCESSOR INIT ---
270
+ processor = MarkerFolderProcessor(
271
+ output_dir=args.output,
272
+ ollama_url=args.url,
273
+ ollama_model=args.model,
274
+ batch_multiplier=2,
275
+ workers_per_gpu=workers_per_gpu, # Passed dynamically
276
+ num_gpus=num_gpus # Passed dynamically
277
+ )
278
+
279
+ processor.process_folder(args.input, batch_size=10)
src/transform/get_csv_from_md.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import io
5
+ import time
6
+ import logging
7
+ import requests
8
+ import subprocess
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ from sqlalchemy import create_engine
12
+ from ollama import Client
13
+
14
+ # --- LOGGING ---
15
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
16
+ logger = logging.getLogger("KNBS_Ingest")
17
+
18
+ # --- 1. INFRASTRUCTURE ---
19
+
20
def _manage_ollama_server(ollama_host, ollama_port, ollama_bin, model):
    """Ensure an Ollama server is reachable at *ollama_host*, starting one if needed.

    Args:
        ollama_host: Base URL probed for liveness (e.g. "http://127.0.0.1:25000").
        ollama_port: Port the spawned server should bind to.
        ollama_bin: Path to the ollama executable.
        model: Model tag to pull once the server is up.

    Returns:
        True if a server is (or becomes) available and the model pull succeeds,
        False on any startup/pull failure.
    """
    # Fast path: reuse an already-running server.
    try:
        # BUGFIX: add a timeout — a bare requests.get() can block indefinitely
        # if the port is filtered; catch only network errors, not everything.
        if requests.get(ollama_host, timeout=5).status_code == 200:
            logger.info(" Ollama connected.")
            return True
    except requests.RequestException:
        pass  # not running yet — fall through and launch our own

    logger.info(f"🚀 Starting Ollama ({model})...")
    scratch_env = os.environ.get("SCRATCH", "/tmp")
    models_dir = Path(scratch_env) / "ollama_core/models"

    server_env = os.environ.copy()
    server_env["OLLAMA_HOST"] = f"127.0.0.1:{ollama_port}"
    server_env["OLLAMA_MODELS"] = str(models_dir)

    try:
        subprocess.Popen([str(ollama_bin), "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=server_env)
        time.sleep(5)  # give the server a moment to bind before pulling
        subprocess.run([str(ollama_bin), "pull", model], env=server_env, check=True)
        return True
    except Exception as e:
        logger.error(f" Server Error: {e}")
        return False
44
+
45
+ # --- 2. MARKDOWN PARSING ENGINE ---
46
+
47
def extract_tables_from_markdown(md_content: str) -> list[pd.DataFrame]:
    """
    Scans markdown text for pipe-delimited tables (| col | col |)
    and converts them to Pandas DataFrames.

    Args:
        md_content: Raw markdown text (possibly containing several tables).

    Returns:
        A list of DataFrames, one per table that parsed to >1 column.
        Unparseable table blocks are logged and skipped.
    """
    tables: list[pd.DataFrame] = []
    buffer: list[str] = []

    def _flush() -> None:
        """Parse the buffered table lines (if any) and reset the buffer."""
        if not buffer:
            return
        try:
            table_str = '\n'.join(buffer)
            # Read using pandas, handling markdown separators
            df = pd.read_csv(
                io.StringIO(table_str),
                sep="|",
                skipinitialspace=True,
                engine='python'
            )

            # CLEANUP PANDAS ARTIFACTS
            # 1. Drop empty columns (pandas creates empty cols for leading/trailing pipes)
            df = df.dropna(axis=1, how='all')

            # 2. Filter out the markdown divider row (e.g. ---|---|---)
            if not df.empty:
                df = df[~df.iloc[:, 0].astype(str).str.contains('---', regex=False)]

            if not df.empty and len(df.columns) > 1:
                tables.append(df)
        except Exception as e:
            logging.getLogger("KNBS_Ingest").warning(f"Failed to parse a table block: {e}")
        buffer.clear()

    for line in md_content.split('\n'):
        stripped = line.strip()
        # Detect table lines (must start and end with |)
        if stripped.startswith('|') and stripped.endswith('|'):
            buffer.append(stripped)
        else:
            # Non-table line terminates any in-progress table block.
            _flush()

    # BUGFIX: the original only flushed when a non-table line followed a table,
    # so a table at the very end of the document was silently dropped.
    _flush()

    return tables
95
+
96
+ # --- 3. LLM HEADER CLEANER (KNBS SPECIFIC) ---
97
+
98
def clean_knbs_headers(df: pd.DataFrame, filename: str, table_index: int, client: Client, model: str) -> pd.DataFrame:
    """
    Uses LLM to sanitize headers, handling split headers common in PDF-to-Markdown.

    Args:
        df: Table whose columns should be renamed (mutated in place via df.columns).
        filename: Source file name, for context.
            NOTE(review): `filename` is not interpolated into the prompt below —
            confirm whether "Source File" was meant to use it.
        table_index: Position of this table within the source document.
        client: Ollama client used for the chat call.
        model: Ollama model tag to query.

    Returns:
        The same DataFrame, with columns renamed when the LLM response validates;
        on any failure the original columns are kept (or snake_cased as fallback).
    """
    raw_headers = [str(c).strip() for c in df.columns]

    # Context: Provide first 2 rows to help identify if headers are split across rows
    data_preview = df.head(2).astype(str).values.tolist()

    prompt = f"""
    You are a Data Engineer cleaning Kenya National Bureau of Statistics (KNBS) data.

    Source File: "(unknown)"
    Table Index: {table_index}

    Current Headers: {raw_headers}
    Data Preview (First 2 Rows): {data_preview}

    Task: Return a list of {len(raw_headers)} clean, snake_case SQL column names.

    RULES:
    1. INFER MEANING: If header is "Gross" and Row 1 is "Domestic Product", the column name is "gdp".
    2. HANDLE YEARS: If headers are "2019", "2020", keep as "year_2019".
    3. HANDLE GARBAGE: If header is "Unnamed: 1" look at Data Preview. If it contains items like "Agriculture", name it "sector".
    4. KNBS reports often have a "Total" column. Ensure it is named "total".

    Respond ONLY with a JSON list of strings.
    """

    try:
        # format='json' instructs Ollama to emit parseable JSON only.
        res = client.chat(model=model, messages=[{'role': 'user', 'content': prompt}], format='json')
        new_headers = json.loads(res['message']['content'])

        # Handle dictionary wrapper if LLM returns {"headers": [...]}
        if isinstance(new_headers, dict):
            for val in new_headers.values():
                if isinstance(val, list):
                    new_headers = val
                    break

        # Validation: Length must match — otherwise the rename would misalign columns.
        if isinstance(new_headers, list) and len(new_headers) == len(df.columns):
            df.columns = new_headers
        else:
            # Fallback: keep originals but snake_case them
            df.columns = [re.sub(r'[^a-zA-Z0-9]', '_', str(c).strip()).lower() for c in df.columns]

    except Exception as e:
        # Any LLM/JSON failure leaves the original headers untouched.
        logger.warning(f"LLM Header clean failed (Table {table_index}): {e}")

    return df
149
+
150
+ # --- 4. MAIN PIPELINE EXPORT ---
151
+
152
def ingest_knbs_data(input_dir: str, db_name: str, model: str = "qwen2.5:14b"):
    """
    Main entry point to run the KNBS ingestion pipeline.
    Recursively scans input_dir for all .md files.

    Steps per file: extract pipe-delimited tables, clean headers via the LLM,
    coerce numeric-looking columns, then write each table to SQLite.

    Args:
        input_dir: Directory of markdown files; tried as-is, then under $SCRATCH.
        db_name: SQLite database file path/name for the output tables.
        model: Ollama model tag used for header cleaning.
    """
    # Paths
    SCRATCH = os.environ.get("SCRATCH", "/tmp")
    BASE_DIR = Path(SCRATCH)

    # Resolve input_dir: absolute/relative first, else relative to $SCRATCH.
    INPUT_PATH = Path(input_dir)
    if not INPUT_PATH.exists():
        INPUT_PATH = BASE_DIR / input_dir

    if not INPUT_PATH.exists():
        logger.error(f" Input directory not found: {INPUT_PATH}")
        return

    OLLAMA_BIN = BASE_DIR / "ollama_core/bin/ollama"
    CUSTOM_PORT = "25000"
    OLLAMA_HOST = f"http://127.0.0.1:{CUSTOM_PORT}"

    # Infrastructure — bail out entirely if no Ollama server can be reached/started.
    if not _manage_ollama_server(OLLAMA_HOST, CUSTOM_PORT, OLLAMA_BIN, model): return

    engine = create_engine(f"sqlite:///{db_name}")
    client = Client(host=OLLAMA_HOST)

    # Process Files (RECURSIVE SEARCH using rglob)
    files = sorted(list(INPUT_PATH.rglob("*.md")))
    logger.info(f"🚀 Found {len(files)} KNBS markdown files (Recursive Scan). Starting ingestion...")

    for f in files:
        logger.info(f"📄 Processing {f.name}...")
        try:
            # errors='ignore' tolerates stray bytes in PDF-derived markdown.
            with open(f, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()

            # A. Extract Tables
            dfs = extract_tables_from_markdown(content)

            if not dfs:
                continue

            logger.info(f" found {len(dfs)} tables.")

            # B. Clean & Load Tables
            for i, df in enumerate(dfs):
                # Basic cleanup
                df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
                if df.empty or len(df) < 2: continue  # Skip empty/tiny tables

                # LLM Semantic Cleaning
                df = clean_knbs_headers(df, f.name, i, client, model)

                # Sanitize numeric data: strip thousands separators / percent signs
                # on columns whose names suggest numbers.
                # NOTE(review): pd.to_numeric(errors='ignore') is deprecated in
                # recent pandas — confirm target pandas version.
                for c in df.columns:
                    if any(x in str(c).lower() for x in ['rate', 'value', 'amount', 'total', 'year', 'price']):
                        df[c] = df[c].apply(lambda x: pd.to_numeric(str(x).replace(',', '').replace('%',''), errors='ignore'))

                # Naming: knbs_{filename_slug}_tab{index}
                slug = re.sub(r'[^a-zA-Z0-9]', '_', f.stem).lower()[:40].lstrip('_')
                table_name = f"{slug}_tab{i}"

                # Provenance column so rows can be traced back to their file.
                df['source_file'] = f.name

                # 'replace' makes re-runs idempotent per table.
                df.to_sql(table_name, engine, if_exists='replace', index=False)
                logger.info(f" -> Saved table: {table_name} ({len(df)} rows)")

        except Exception as e:
            # Per-file isolation: one bad file must not abort the whole run.
            logger.error(f" Failed {f.name}: {e}")

    logger.info(" KNBS Ingestion Complete.")
src/transform/run_transform.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import subprocess
4
+ import time
5
+ import logging
6
+ import requests
7
+ import torch
8
+ from pathlib import Path
9
+ from datetime import timedelta
10
+
11
+ # --- 1. LOGGING SETUP ---
12
+ # Identify Node Rank for logging clarity
13
+ NODE_ID = os.environ.get("SLURM_PROCID", "0")
14
+
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format=f'%(asctime)s - [Node {NODE_ID}] - %(levelname)s - %(message)s',
18
+ handlers=[
19
+ logging.StreamHandler(sys.stdout),
20
+ logging.FileHandler(f"logs/node_{NODE_ID}_transform.log")
21
+ ]
22
+ )
23
+ logger = logging.getLogger(__name__)
24
+
25
def main():
    """Run one node's share of the PDF→markdown transformation pipeline.

    Launches a local Ollama server, builds a 16k-context model, partitions the
    PDF corpus across SLURM tasks, and drives MarkerFolderProcessor over this
    node's slice. Intended to be launched once per SLURM task.
    """
    t_start = time.perf_counter()
    logger.info(f"🚀 Starting Transformation Pipeline on Node {NODE_ID}")

    # --- 2. ENVIRONMENT & PATHS ---
    SCRATCH = Path(os.environ.get("SCRATCH", "/tmp"))
    INPUT_PDFS_DIR = SCRATCH / "mshauri-fedha/data/knbs/pdfs"
    OUTPUT_DIR = SCRATCH / "mshauri-fedha/data/knbs/marker-output"

    OLLAMA_HOME = SCRATCH / "ollama_core"
    OLLAMA_BIN = OLLAMA_HOME / "bin/ollama"
    OLLAMA_HOST = "http://localhost:11434"

    # Important: Ensure the current directory is in sys.path for 'extract' import
    if os.getcwd() not in sys.path:
        sys.path.append(os.getcwd())

    try:
        from extract import MarkerFolderProcessor, configure_parallelism
    except ImportError as e:
        logger.error(f"Could not import extract.py from {os.getcwd()}")
        raise e

    # --- 3. DYNAMIC PARALLELISM & OLLAMA CONFIG ---
    # Calculates workers based on node hardware (GH200 96GB)
    total_slots, workers_per_gpu, num_gpus = configure_parallelism()

    # Clean up any zombie servers on this node
    subprocess.run(["pkill", "-f", "ollama serve"], stderr=subprocess.DEVNULL)
    time.sleep(5)

    # Set server environment variables (inherited by the spawned server).
    server_env = os.environ.copy()
    server_env["OLLAMA_NUM_PARALLEL"] = str(total_slots)
    server_env["OLLAMA_MAX_LOADED_MODELS"] = "1"
    server_env["OLLAMA_MAX_QUEUE"] = "2048"
    server_env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    logger.info(f"⏳ Launching Ollama Server with {total_slots} slots...")
    subprocess.Popen(
        [str(OLLAMA_BIN), "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        env=server_env
    )

    # Heartbeat check: poll for up to ~60s; the for/else raises if never online.
    # NOTE(review): the 1s sleep only happens on connection error — a non-200
    # response would spin this loop without delay. Confirm intended.
    for i in range(60):
        try:
            if requests.get(OLLAMA_HOST).status_code == 200:
                logger.info(" Ollama Server is online.")
                break
        except:
            time.sleep(1)
    else:
        raise RuntimeError(" Ollama server heartbeat failed.")

    # --- 4. MODEL SETUP ---
    BASE_MODEL = "qwen2.5:7b"
    CUSTOM_MODEL_NAME = "qwen2.5-7b-16k"

    logger.info(f" Pulling {BASE_MODEL}...")
    subprocess.run([str(OLLAMA_BIN), "pull", BASE_MODEL], check=True, capture_output=True)

    # Build a derived model with a 16k context window via a per-node Modelfile
    # (per-node name avoids clashes on a shared filesystem).
    logger.info(f" Creating custom 16k context model...")
    modelfile_path = Path(f"Modelfile_node_{NODE_ID}")
    modelfile_path.write_text(f"FROM {BASE_MODEL}\nPARAMETER num_ctx 16384")
    subprocess.run([str(OLLAMA_BIN), "create", CUSTOM_MODEL_NAME, "-f", str(modelfile_path)], check=True, capture_output=True)

    # --- 5. AUTOMATED DATA PARTITIONING ---
    # Get all PDFs and sort them for deterministic behavior
    all_pdfs = sorted(list(INPUT_PDFS_DIR.glob("*.pdf")))
    total_nodes = int(os.environ.get("SLURM_NTASKS", 1))
    node_rank = int(NODE_ID)

    # Each node takes every Nth file (Node 0 takes index 0, 2, 4... Node 1 takes 1, 3, 5...)
    my_pdfs = all_pdfs[node_rank::total_nodes]
    my_pdf_strs = [str(p) for p in my_pdfs]

    logger.info(f" Data Partitioning: Node {node_rank}/{total_nodes} handling {len(my_pdfs)} files.")

    # --- 6. EXECUTION ---
    os.chdir(SCRATCH)

    processor = MarkerFolderProcessor(
        output_dir=OUTPUT_DIR,
        ollama_url=OLLAMA_HOST,
        ollama_model=CUSTOM_MODEL_NAME,
        batch_multiplier=4,
        workers_per_gpu=workers_per_gpu,
        num_gpus=num_gpus
    )

    logger.info(f"🚀 Processing PDFs...")
    # Using the 'subset' parameter in process_folder (ensure extract.py supports this)
    processor.process_folder(INPUT_PDFS_DIR, batch_size=5, subset=my_pdf_strs)

    # --- 7. CLEANUP & TIMING ---
    t_end = time.perf_counter()
    duration = timedelta(seconds=t_end - t_start)
    logger.info(" Transformation process finished.")
    logger.info(f"⏱️ Total Duration for Node {NODE_ID}: {duration}")

    # Shutdown server and remove the temporary Modelfile.
    subprocess.run(["pkill", "-f", "ollama serve"], stderr=subprocess.DEVNULL)
    if modelfile_path.exists(): modelfile_path.unlink()
131
+
132
# Script entry point: run this node's transformation pipeline.
if __name__ == "__main__":
    main()
src/transform/structure_data.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import time
5
+ import logging
6
+ import requests
7
+ import subprocess
8
+ import pandas as pd
9
+ import numpy as np
10
+ from typing import List, Tuple, Dict
11
+ from pathlib import Path
12
+ from sqlalchemy import create_engine, text
13
+ from ollama import Client
14
+
15
+ # --- LOGGING ---
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
17
+ logger = logging.getLogger("SmartIngestV6")
18
+
19
# --- 0. KNOWLEDGE BASE (THE RULES) ---

# Files to ignore completely.
# Matched as substrings against the lowercased file name in process_file_v6.
SKIP_PATTERNS = [
    "december__2019_tap",
    "lcr_return",
    "lcr_sheet",
    "quarterly_gdp",
    "remittances",
    "depository_corporation_survey_(expanded)"
]

# Exact column expectations based on prior analysis [cite: 1, 2, 5, 6, 8, 14, 15, 16, 17]
# Keys are matched as substrings against lowercased file names (see
# get_clean_headers / process_file_v6); first matching key wins, so order matters.
SCHEMA_DEFINITIONS = {
    "annual_gdp": ["year", "month", "nominal_gdp_prices", "real_gdp_growth", "real_gdp_prices"],
    "bop_annual": ["bpm6_concept", "year_2019", "year_2020", "year_2021", "year_2022", "year_2023", "year_2024"],
    "indicative_rates": ["date", "currency", "mean_rate", "buy_rate", "sell_rate"],
    "exchange_rates": ["date", "currency", "mean_rate", "buy_rate", "sell_rate"],  # Catch-all for historical/indicative
    "central_bank_rates": ["year", "month", "reverse_repo", "interbank_rate", "tbill_91_day", "tbill_182_day", "tbill_364_day", "reserve_requirement", "cbr"],
    "commercial_bank_rates": ["year", "month", "deposit_rate", "savings_rate", "lending_rate", "overdraft_rate"],
    "domestic_debt": ["fiscal_year", "treasury_bills", "treasury_bonds", "govt_stocks", "overdraft_cbk", "advances_commercial", "other_debt", "total_debt"],
    "forex_bureau": ["bureau_name", "usd_buy", "usd_sell", "usd_margin", "gbp_buy", "gbp_sell", "gbp_margin", "euro_buy", "euro_sell", "euro_margin"],
    "treasury_bills": ["issue_date", "amount_offered", "tenure", "amount_received", "amount_accepted", "yield_rate", "alloted", "rejected", "redeemed", "outstanding"],
    "treasury_bonds": ["issue_date", "bond_code", "amount_offered", "amount_received", "amount_accepted", "coupon_rate", "alloted", "rejected", "redeemed", "outstanding"],
    "exports": ["year", "month", "commodity", "value_millions", "total"],
    "imports": ["year", "month", "commodity", "value_millions", "total"],
    "revenue": ["year", "month", "tax_revenue", "non_tax_revenue", "total_revenue", "recurrent_expenditure", "development_expenditure"],
    "depository_corporation_survey": ["category", "data_values"]  # Wide table handling triggered later
}
48
+
49
+ # --- 1. INFRASTRUCTURE ---
50
+
51
def _manage_ollama_server(ollama_host, ollama_port, ollama_bin, model):
    """Ensure an Ollama server is reachable at *ollama_host*, starting one if needed.

    Args:
        ollama_host: Base URL probed for liveness.
        ollama_port: Port the spawned server should bind to.
        ollama_bin: Path to the ollama executable.
        model: Model tag to pull once the server is up.

    Returns:
        True if a server is (or becomes) available, False on failure.
    """
    # Fast path: reuse an already-running server.
    try:
        # BUGFIX: add a timeout — a bare requests.get() can block indefinitely
        # on a filtered port; catch only network errors, not everything.
        if requests.get(ollama_host, timeout=5).status_code == 200:
            logger.info(" Ollama connected.")
            return True
    except requests.RequestException:
        pass  # not running yet — fall through and launch our own

    logger.info(f" Starting Ollama ({model})...")
    scratch_env = os.environ.get("SCRATCH", "/tmp")
    models_dir = Path(scratch_env) / "ollama_core/models"

    server_env = os.environ.copy()
    server_env["OLLAMA_HOST"] = f"127.0.0.1:{ollama_port}"
    server_env["OLLAMA_MODELS"] = str(models_dir)
    models_dir.mkdir(parents=True, exist_ok=True)

    try:
        subprocess.Popen([str(ollama_bin), "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=server_env)
        time.sleep(5)  # give the server a moment to bind before pulling
        subprocess.run([str(ollama_bin), "pull", model], env=server_env, check=True)
        return True
    except Exception as e:
        logger.error(f" Server Error: {e}")
        return False
75
+
76
+ # --- 2. HEADER HUNTER (Geometric Scanner) ---
77
+
78
def read_csv_robust(file_path: Path) -> pd.DataFrame:
    """Read *file_path* as a raw, untyped CSV grid, trying several encodings.

    The grid is read with no header row, every cell as a string, and NaNs
    replaced by empty strings. Returns an empty DataFrame if every encoding
    fails to decode the file.
    """
    for encoding in ('utf-8', 'latin1', 'cp1252', 'ISO-8859-1'):
        try:
            grid = pd.read_csv(file_path, header=None, dtype=str, encoding=encoding)
        except UnicodeDecodeError:
            continue  # try the next candidate encoding
        return grid.fillna("")
    return pd.DataFrame()
86
+
87
def find_best_header_row(df_raw: pd.DataFrame, expected_keywords: List[str]) -> Tuple[int, int]:
    """Scores rows based on expected keywords for this specific file type.

    Scans the first 30 rows, rewarding rows that contain schema keywords and
    penalising rows that look like data (mostly numeric cells). Returns
    (header_row_index, first_data_row_index); falls back to the geometric
    scanner when no row scores positively.
    """
    # With no schema expectation, fall back to keywords common to most sheets.
    keywords = expected_keywords or ['year', 'month', 'date', 'rate', 'bank', 'shilling', 'total']

    n_cols = len(df_raw.columns)
    scores = {}
    for idx in range(min(30, len(df_raw))):
        cells = df_raw.iloc[idx].astype(str)
        joined = " ".join(cells).lower()

        # Reward: +3 per expected keyword found anywhere in the row.
        row_score = sum(3 for kw in keywords if kw.lower() in joined)

        # Penalty: a mostly-numeric row is data, not a header.
        numeric_cells = sum(1 for cell in cells if cell.replace(',', '').replace('.', '').isdigit())
        if numeric_cells > n_cols * 0.5:
            row_score -= 10

        scores[idx] = row_score

    winner = max(scores, key=scores.get)
    if scores[winner] <= 0:
        return _geometric_scan(df_raw)

    return winner, winner + 1
117
+
118
def _geometric_scan(df_raw):
    """Fallback: Find first dense block of numbers.

    Scores every cell as numeric/non-numeric, takes the densest rows as the
    data block, and treats the first non-blank row above it as the header.
    Returns (header_row_index, first_data_row_index); defaults to (0, 1)
    when no numeric block is found.
    """
    def is_data(x):
        # 1 if the cell parses as a number once thousands separators are stripped.
        try:
            float(str(x).replace(',', ''))
            return 1
        except: return 0
    # Per-row count of numeric-looking cells.
    scores = df_raw.map(is_data).sum(axis=1)
    if scores.empty or scores.max() <= 1: return 0, 1
    # Rows at least half as dense as the densest row count as data.
    data_rows = scores[scores >= scores.max() * 0.5].index.tolist()
    if not data_rows: return 0, 1
    data_start = data_rows[0]
    header_idx = max(0, data_start - 1)
    # Search up for content
    # NOTE(review): `.str.join("")` on a row of strings yields each string
    # unchanged (joins its characters), so `.any()` checks for any non-blank
    # cell — assumes cells are already strings (read_csv_robust uses dtype=str).
    while header_idx > 0:
        if df_raw.iloc[header_idx].str.join("").str.strip().any(): break
        header_idx -= 1
    return header_idx, data_start
136
+
137
+ # --- 3. HYBRID PROMPT STRATEGY ---
138
+
139
def get_clean_headers(raw_headers: List[str], first_row: List[str], filename: str, client: Client, model: str) -> List[str]:
    """Map raw spreadsheet headers to clean snake_case names via the LLM.

    Matches *filename* against SCHEMA_DEFINITIONS to anchor the prompt with an
    expected schema, then asks the model for a JSON list of names. Falls back
    to the expected schema (padded/truncated) or to generic "col_N" names on
    any failure. Always returns a list the caller can assign to df.columns.

    NOTE(review): `filename` is not interpolated into the prompt below —
    confirm whether the "File:" line was meant to use it.
    """
    # 1. Identify File Type & Expectations
    expected_cols = []
    file_type = "generic"
    for key, cols in SCHEMA_DEFINITIONS.items():
        if key in filename.lower():
            file_type = key
            expected_cols = cols
            break

    # 2. Build Prompt (data cells truncated to 15 chars to keep the prompt small)
    valid_raw = [str(h).strip() for h in raw_headers]
    valid_data = [str(d).strip()[:15] for d in first_row]

    prompt = f"""
    You are a Financial Data Engineer.

    File: "(unknown)"
    Detected Type: "{file_type}"
    Expected Schema: {expected_cols}

    Current Headers (Row N): {valid_raw}
    First Data Row (Row N+1): {valid_data}

    Task: Return a list of {len(raw_headers)} clean snake_case column names.

    CRITICAL RULES:
    1. PRIORITIZE THE EXPECTED SCHEMA. If the data looks like it matches the expectation, use those names.
    2. If Expected Schema has 5 cols but file has 7, keep the 5 and name the others based on context (e.g., 'total').
    3. If header is a Year ("1999"), keep it as "year_1999".
    4. If header is empty/garbage, use the Data Row to guess (e.g. "Kenya Commercial Bank" -> "bank_name").

    Respond ONLY with a JSON list of strings.
    """

    try:
        # format='json' instructs Ollama to emit parseable JSON only.
        res = client.chat(model=model, messages=[{'role': 'user', 'content': prompt}], format='json')
        content = json.loads(res['message']['content'])

        # Unwrap a {"headers": [...]}-style dictionary response.
        if isinstance(content, dict):
            for val in content.values():
                if isinstance(val, list): return val
        return content if isinstance(content, list) else [f"col_{i}" for i in range(len(raw_headers))]
    except:
        # FALLBACK: If LLM fails, return the Expected Schema (padded if needed)
        # NOTE(review): bare except also swallows KeyboardInterrupt — consider
        # narrowing to Exception.
        if expected_cols:
            if len(expected_cols) < len(raw_headers):
                return expected_cols + [f"extra_{i}" for i in range(len(raw_headers)-len(expected_cols))]
            return expected_cols[:len(raw_headers)]
        return [f"col_{i}" for i in range(len(raw_headers))]
189
+
190
+ # --- 4. SPECIFIC TRANSFORMS ---
191
+
192
def apply_specific_transforms(df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """Apply per-dataset reshaping rules keyed off the source *filename*.

    Rules:
      * revenue files — drop the first 3 boilerplate rows;
      * depository corporation survey — melt the wide date columns to long form;
      * any table with 'year' and 'month' columns — add a merged 'period' column.

    Returns the (possibly reshaped) DataFrame; reshaping failures are ignored
    and leave the frame unchanged.
    """
    fname = filename.lower()

    # Rule 20: Revenue & Expenditure - Remove top 3 rows
    if "revenue" in fname:
        if len(df) > 3: df = df.iloc[3:].reset_index(drop=True)

    # Rule 9: Depository Survey - Wide Table Logic
    if "depository_corporation" in fname:
        # This is a massive wide table. We usually want to melt it.
        # Assuming col 0 is Category and rest are dates
        try:
            id_vars = [df.columns[0]]
            value_vars = [c for c in df.columns if c != df.columns[0]]
            df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name="date", value_name="amount_millions")
        except Exception:
            # BUGFIX: narrowed from a bare `except`, which also swallowed
            # SystemExit/KeyboardInterrupt.
            pass

    # Rule 1/19/21/22: Year + Month merging
    # Check if we have 'year' and 'month' columns
    cols = [str(c).lower() for c in df.columns]
    if 'year' in cols and 'month' in cols:
        try:
            # Simple merge of the first 'year' and 'month' columns found.
            y_idx = cols.index('year')
            m_idx = cols.index('month')
            df['period'] = df.iloc[:, y_idx].astype(str) + '-' + df.iloc[:, m_idx].astype(str)
        except Exception:
            pass  # BUGFIX: narrowed from a bare `except` (see above)

    return df
221
+
222
+ # --- 5. PROCESSING CORE ---
223
+
224
def process_file_v6(file_path: Path, engine, client, model):
    """Ingest one CBK CSV: locate headers, clean names, transform, save to SQL.

    Pipeline: blacklist check → robust read → header-row detection →
    optional double-header merge → LLM/schema header naming →
    dataset-specific transforms → numeric coercion → write to SQLite.

    Args:
        file_path: CSV file to ingest.
        engine: SQLAlchemy engine for the target database.
        client: Ollama client for header cleaning.
        model: Ollama model tag.
    """
    # 1. Skip Check — blacklist matched by substring.
    if any(p in file_path.name.lower() for p in SKIP_PATTERNS):
        logger.warning(f" Skipping {file_path.name} (Blacklisted)")
        return

    logger.info(f"Processing {file_path.name}...")

    # 2. Read (all-string grid; empty means every encoding failed)
    df_raw = read_csv_robust(file_path)
    if df_raw.empty: return

    # 3. Identify Expectations for Header Scanning
    expected_keys = []
    for key, cols in SCHEMA_DEFINITIONS.items():
        if key in file_path.name.lower():
            expected_keys = cols
            break

    # 4. Find Header
    header_idx, data_start = find_best_header_row(df_raw, expected_keys)

    # 5. Extract Headers
    raw_headers = df_raw.iloc[header_idx].tolist()

    # Double Header Check: if the row above has substantial text, it is likely
    # the top half of a split header — prepend it cell-by-cell.
    if header_idx > 0:
        row_above = df_raw.iloc[header_idx-1].fillna("").astype(str).tolist()
        if sum(len(x) for x in row_above) > 10:
            raw_headers = [f"{p} {c}".strip() for p, c in zip(row_above, raw_headers)]

    # Defensive: header width must match the grid width.
    if len(raw_headers) != len(df_raw.columns):
        raw_headers = [f"col_{i}" for i in range(len(df_raw.columns))]

    # 6. LLM / Hybrid Map
    first_row = df_raw.iloc[data_start].tolist() if data_start < len(df_raw) else [""]*len(raw_headers)
    clean_headers = get_clean_headers(raw_headers, first_row, file_path.name, client, model)

    # Align Lengths (pad with 'extra_N', then truncate)
    if len(clean_headers) < len(df_raw.columns):
        clean_headers += [f"extra_{i}" for i in range(len(df_raw.columns) - len(clean_headers))]
    clean_headers = clean_headers[:len(df_raw.columns)]

    # 7. Build DF — data rows only, with the cleaned header names.
    df = df_raw.iloc[data_start:].copy()
    df.columns = clean_headers

    # 8. Transforms (dataset-specific reshaping)
    df = apply_specific_transforms(df, file_path.name)

    # 9. Clean & Save
    df = df.loc[:, ~df.columns.str.contains('^unnamed', case=False)]
    df.dropna(thresh=1, inplace=True)

    # Coerce numeric-looking columns; '(123)' accounting negatives become -123.
    # NOTE(review): pd.to_numeric(errors='ignore') is deprecated in recent
    # pandas — confirm the target pandas version.
    for c in df.columns:
        if any(x in str(c).lower() for x in ['rate', 'value', 'amount', 'mean', 'buy', 'sell']):
            df[c] = df[c].apply(lambda x: pd.to_numeric(str(x).replace(',', '').replace('(', '-').replace(')', ''), errors='ignore'))

    # Table name: strip the batch prefix, slugify, cap at 60 chars.
    table_name = re.sub(r'cbk_batch_\d+_\d+_', '', file_path.stem)
    table_name = re.sub(r'[^a-zA-Z0-9]', '_', table_name).lower()[:60].lstrip('_')
    # Provenance column so rows can be traced back to their file.
    df['source_file'] = file_path.name

    try:
        # 'replace' makes re-runs idempotent per table.
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        logger.info(f" Saved {len(df)} rows to '{table_name}'")
    except Exception as e:
        logger.error(f" SQL Error: {e}")
291
+
292
+ # --- MAIN ---
293
+
294
def process_cbk_files(input_dir: str, db_name="mshauri_fedha_v6.db", model="qwen2.5:14b"):
    """Ingest every CBK CSV in *input_dir* into a SQLite database.

    Args:
        input_dir: Directory of CSVs; tried as-is, then relative to $SCRATCH.
        db_name: SQLite database file for the output tables.
        model: Ollama model tag used for header cleaning.
    """
    SCRATCH = os.environ.get("SCRATCH", "/tmp")
    BASE_DIR = Path(SCRATCH)
    # Resolve input_dir: absolute/relative first, else relative to $SCRATCH.
    INPUT_PATH = Path(input_dir) if Path(input_dir).exists() else BASE_DIR / input_dir

    if not INPUT_PATH.exists(): return

    OLLAMA_BIN = BASE_DIR / "ollama_core/bin/ollama"
    CUSTOM_PORT = "25000"
    OLLAMA_HOST = f"http://127.0.0.1:{CUSTOM_PORT}"

    # Bail out entirely if no Ollama server can be reached/started.
    if not _manage_ollama_server(OLLAMA_HOST, CUSTOM_PORT, OLLAMA_BIN, model): return

    engine = create_engine(f"sqlite:///{db_name}")
    client = Client(host=OLLAMA_HOST)

    # Sorted for deterministic processing order.
    files = sorted(list(INPUT_PATH.glob("*.csv")))
    print(f"🚀 Processing {len(files)} files...")

    for f in files:
        process_file_v6(f, engine, client, model)

    print("\n Done.")
    # Summary: count the tables actually created in the database.
    with engine.connect() as conn:
        tables = conn.execute(text("SELECT name FROM sqlite_master WHERE type='table'")).fetchall()
        print(f"📊 Created {len(tables)} tables.")
320
+
321
# Intentionally a no-op: this module is imported and driven by the pipeline
# (call process_cbk_files explicitly), not run as a standalone script.
if __name__ == "__main__":
    pass