Spaces:
Sleeping
Sleeping
prototype stage
Browse files- .gitignore +108 -0
- notebooks/inspect_md.ipynb +426 -0
- notebooks/inspect_news.ipynb +476 -0
- notebooks/structure_data.ipynb +0 -0
- notebooks/test_demo.ipynb +309 -0
- notebooks/transform.ipynb +315 -0
- notebooks/unzip_stores.ipynb +197 -0
- src/extract/download_file_links.py +671 -0
- src/load/clean_db.py +244 -0
- src/load/explore_news_schema.py +74 -0
- src/load/ingest_md.py +117 -0
- src/load/ingest_news.py +162 -0
- src/load/inspect_db.py +92 -0
- src/load/mshauri_demo.py +198 -0
- src/load/start_ollama.py +84 -0
- src/transform/config.py +64 -0
- src/transform/download_files.py +141 -0
- src/transform/extract.py +279 -0
- src/transform/get_csv_from_md.py +223 -0
- src/transform/run_transform.py +133 -0
- src/transform/structure_data.py +322 -0
.gitignore
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -------------------------------------------------------------------------
|
| 2 |
+
# 1. Python & Virtual Environments (Standard)
|
| 3 |
+
# -------------------------------------------------------------------------
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*$py.class
|
| 7 |
+
*.so
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
env/
|
| 11 |
+
.Python
|
| 12 |
+
build/
|
| 13 |
+
develop-eggs/
|
| 14 |
+
dist/
|
| 15 |
+
downloads/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
|
| 28 |
+
# -------------------------------------------------------------------------
|
| 29 |
+
# 2. Security & Secrets (CRITICAL)
|
| 30 |
+
# Never commit your API keys (HuggingFace, OpenAI, etc.)
|
| 31 |
+
# -------------------------------------------------------------------------
|
| 32 |
+
.env
|
| 33 |
+
.env.local
|
| 34 |
+
.env.*
|
| 35 |
+
secrets.json
|
| 36 |
+
credentials.json
|
| 37 |
+
|
| 38 |
+
# -------------------------------------------------------------------------
|
| 39 |
+
# 3. Large AI Models & Weights (Hugging Face / PyTorch)
|
| 40 |
+
# These files are GBs in size; use DVC to track them instead.
|
| 41 |
+
# -------------------------------------------------------------------------
|
| 42 |
+
*.bin
|
| 43 |
+
*.pt
|
| 44 |
+
*.pth
|
| 45 |
+
*.ckpt
|
| 46 |
+
*.safetensors
|
| 47 |
+
*.onnx
|
| 48 |
+
models/
|
| 49 |
+
weights/
|
| 50 |
+
checkpoints/
|
| 51 |
+
lora-adapters/
|
| 52 |
+
|
| 53 |
+
# -------------------------------------------------------------------------
|
| 54 |
+
# 4. Data & RAG Stores (MshauriFedha Specific)
|
| 55 |
+
# Ignore raw PDFs and local vector databases (Chroma/Faiss).
|
| 56 |
+
# -------------------------------------------------------------------------
|
| 57 |
+
data/
|
| 58 |
+
datasets/
|
| 59 |
+
corpus/
|
| 60 |
+
# Ignore ChromaDB and Faiss local persistence folders
|
| 61 |
+
chroma_db/
|
| 62 |
+
chroma_storage/
|
| 63 |
+
faiss_indexes/
|
| 64 |
+
storage/
|
| 65 |
+
|
| 66 |
+
# -------------------------------------------------------------------------
|
| 67 |
+
# 5. CSCS & HPC Specifics
|
| 68 |
+
# Ignore the huge container images and Slurm log files.
|
| 69 |
+
# -------------------------------------------------------------------------
|
| 70 |
+
*.sif
|
| 71 |
+
*.sif.*
|
| 72 |
+
*.tar.gz
|
| 73 |
+
slurm-*.out
|
| 74 |
+
slurm-*.err
|
| 75 |
+
core.*
|
| 76 |
+
|
| 77 |
+
# -------------------------------------------------------------------------
|
| 78 |
+
# 6. Jupyter Notebooks
|
| 79 |
+
# -------------------------------------------------------------------------
|
| 80 |
+
.ipynb_checkpoints/
|
| 81 |
+
*-checkpoint.ipynb
|
| 82 |
+
# Optional: if you don't want to commit notebook outputs (just code)
|
| 83 |
+
# *.ipynb (Uncomment this if you only want to commit .py scripts)
|
| 84 |
+
|
| 85 |
+
# -------------------------------------------------------------------------
|
| 86 |
+
# 7. DVC (Data Version Control)
|
| 87 |
+
# We ignore the local cache/config but KEEP the .dvc files.
|
| 88 |
+
# -------------------------------------------------------------------------
|
| 89 |
+
/dvc_storage
|
| 90 |
+
.dvc/config.local
|
| 91 |
+
.dvc/tmp
|
| 92 |
+
.dvc/cache
|
| 93 |
+
|
| 94 |
+
# -------------------------------------------------------------------------
|
| 95 |
+
# 8. IDE & Editors
|
| 96 |
+
# -------------------------------------------------------------------------
|
| 97 |
+
.vscode/
|
| 98 |
+
.idea/
|
| 99 |
+
.DS_Store
|
| 100 |
+
|
| 101 |
+
# -------------------------------------------------------------------------
|
| 102 |
+
# 9. Docker / Deployment
|
| 103 |
+
# -------------------------------------------------------------------------
|
| 104 |
+
docker-compose.override.yml
|
| 105 |
+
.coverage
|
| 106 |
+
htmlcov/
|
| 107 |
+
*.err
|
| 108 |
+
*.out
|
notebooks/inspect_md.ipynb
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "62db404a-4930-4279-afa2-35ae4d11d857",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Inspect markdown files - KNBS and CBK\n",
|
| 9 |
+
"In this notebook, the core objective is to inspect and ingest the text from the already processed markdown files for CBK and KNBS."
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 8,
|
| 15 |
+
"id": "e8696cf9-6995-4af3-937f-9154ee6d0a99",
|
| 16 |
+
"metadata": {},
|
| 17 |
+
"outputs": [],
|
| 18 |
+
"source": [
|
| 19 |
+
"%load_ext autoreload\n",
|
| 20 |
+
"%autoreload 2"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": 4,
|
| 26 |
+
"id": "44ff42b3-2377-4e1e-accb-02706eaae797",
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [],
|
| 29 |
+
"source": [
|
| 30 |
+
"import sys\n",
|
| 31 |
+
"import logging\n",
|
| 32 |
+
"import warnings\n",
|
| 33 |
+
"import os\n",
|
| 34 |
+
"import pandas as pd\n",
|
| 35 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
| 36 |
+
"from pathlib import Path\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"# Configure logging to see output in the notebook\n",
|
| 39 |
+
"logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 5,
|
| 45 |
+
"id": "0d114027-fee3-4186-9d02-f6535c728553",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"# Fix paths for src files\n",
|
| 50 |
+
"project_root = Path(os.getcwd()).parent\n",
|
| 51 |
+
"script_dir = project_root / \"src\"\n",
|
| 52 |
+
"if str(script_dir) not in sys.path:\n",
|
| 53 |
+
" sys.path.append(str(script_dir))"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "code",
|
| 58 |
+
"execution_count": 6,
|
| 59 |
+
"id": "64a8246f-58d2-4aa7-bd2c-c6a2741f8c19",
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [],
|
| 62 |
+
"source": [
|
| 63 |
+
"from load.start_ollama import start_ollama_server\n",
|
| 64 |
+
"from load.ingest_md import ingest_markdown_reports"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"cell_type": "code",
|
| 69 |
+
"execution_count": 7,
|
| 70 |
+
"id": "e6fa2ada-beac-448a-b1d2-535fd2b5d0b1",
|
| 71 |
+
"metadata": {},
|
| 72 |
+
"outputs": [
|
| 73 |
+
{
|
| 74 |
+
"name": "stdout",
|
| 75 |
+
"output_type": "stream",
|
| 76 |
+
"text": [
|
| 77 |
+
"✅ Ollama is already running.\n"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"data": {
|
| 82 |
+
"text/plain": [
|
| 83 |
+
"True"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
"execution_count": 7,
|
| 87 |
+
"metadata": {},
|
| 88 |
+
"output_type": "execute_result"
|
| 89 |
+
}
|
| 90 |
+
],
|
| 91 |
+
"source": [
|
| 92 |
+
"start_ollama_server()"
|
| 93 |
+
]
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"execution_count": 4,
|
| 98 |
+
"id": "894e6598-3c57-4484-bc3c-be043f06b5ca",
|
| 99 |
+
"metadata": {},
|
| 100 |
+
"outputs": [],
|
| 101 |
+
"source": [
|
| 102 |
+
"# Define your paths\n",
|
| 103 |
+
"SCRATCH_DIR = os.environ.get(\"SCRATCH\")\n",
|
| 104 |
+
"KNBS_MARKDOWN_DIR = os.path.join(SCRATCH_DIR, \"mshauri-fedha/data/knbs/marker-output\")\n",
|
| 105 |
+
"VECTOR_DB_PATH = \"mshauri_fedha_chroma_db\"\n",
|
| 106 |
+
"EMBEDDING_MODEL = \"nomic-embed-text\""
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "code",
|
| 111 |
+
"execution_count": 5,
|
| 112 |
+
"id": "d8b23cd1-ea3f-4f7b-b553-dbeca35ee61a",
|
| 113 |
+
"metadata": {},
|
| 114 |
+
"outputs": [
|
| 115 |
+
{
|
| 116 |
+
"name": "stdout",
|
| 117 |
+
"output_type": "stream",
|
| 118 |
+
"text": [
|
| 119 |
+
"📄 Scanning for Markdown Reports in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/knbs/marker-output...\n"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"name": "stderr",
|
| 124 |
+
"output_type": "stream",
|
| 125 |
+
"text": [
|
| 126 |
+
"100%|██████████| 574/574 [00:00<00:00, 4626.46it/s]\n"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"name": "stdout",
|
| 131 |
+
"output_type": "stream",
|
| 132 |
+
"text": [
|
| 133 |
+
" Loaded 574 report files.\n",
|
| 134 |
+
" ✂️ Split into 32717 chunks.\n",
|
| 135 |
+
"🧠 Appending to Vector Store...\n",
|
| 136 |
+
"INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
|
| 137 |
+
]
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"name": "stderr",
|
| 141 |
+
"output_type": "stream",
|
| 142 |
+
"text": [
|
| 143 |
+
"Ingesting Reports: 100%|██████████| 32717/32717 [1:09:38<00:00, 7.83chunk/s]"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"name": "stdout",
|
| 148 |
+
"output_type": "stream",
|
| 149 |
+
"text": [
|
| 150 |
+
"\n",
|
| 151 |
+
"✅ Reports Added. Hybrid Knowledge Base is ready.\n"
|
| 152 |
+
]
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"name": "stderr",
|
| 156 |
+
"output_type": "stream",
|
| 157 |
+
"text": [
|
| 158 |
+
"\n"
|
| 159 |
+
]
|
| 160 |
+
}
|
| 161 |
+
],
|
| 162 |
+
"source": [
|
| 163 |
+
"# Run the ingestion on KNBS text\n",
|
| 164 |
+
"ingest_markdown_reports(\n",
|
| 165 |
+
" markdown_dir=KNBS_MARKDOWN_DIR,\n",
|
| 166 |
+
" vector_db_path=VECTOR_DB_PATH,\n",
|
| 167 |
+
" model=EMBEDDING_MODEL\n",
|
| 168 |
+
")"
|
| 169 |
+
]
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"cell_type": "code",
|
| 173 |
+
"execution_count": 6,
|
| 174 |
+
"id": "95eae0fe-f71f-475d-9944-2f202a30174c",
|
| 175 |
+
"metadata": {},
|
| 176 |
+
"outputs": [],
|
| 177 |
+
"source": [
|
| 178 |
+
"from langchain_community.vectorstores import Chroma\n",
|
| 179 |
+
"from langchain_community.embeddings import OllamaEmbeddings"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "code",
|
| 184 |
+
"execution_count": 10,
|
| 185 |
+
"id": "72488cf7-3778-481a-8673-b7b08dd28e5a",
|
| 186 |
+
"metadata": {},
|
| 187 |
+
"outputs": [],
|
| 188 |
+
"source": [
|
| 189 |
+
"CBK_MARKDOWN_DIR = os.path.join(SCRATCH_DIR, \"mshauri-fedha/data/cbk/marker-output\")"
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"cell_type": "code",
|
| 194 |
+
"execution_count": 11,
|
| 195 |
+
"id": "8b6ef094-487a-4148-9435-1e9546d6f5a3",
|
| 196 |
+
"metadata": {},
|
| 197 |
+
"outputs": [
|
| 198 |
+
{
|
| 199 |
+
"name": "stdout",
|
| 200 |
+
"output_type": "stream",
|
| 201 |
+
"text": [
|
| 202 |
+
"📄 Scanning for Markdown Reports in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/marker-output...\n"
|
| 203 |
+
]
|
| 204 |
+
},
|
| 205 |
+
{
|
| 206 |
+
"name": "stderr",
|
| 207 |
+
"output_type": "stream",
|
| 208 |
+
"text": [
|
| 209 |
+
"100%|██████████| 958/958 [00:11<00:00, 79.89it/s] \n"
|
| 210 |
+
]
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"name": "stdout",
|
| 214 |
+
"output_type": "stream",
|
| 215 |
+
"text": [
|
| 216 |
+
" Loaded 958 report files.\n",
|
| 217 |
+
" ✂️ Split into 4582 chunks.\n",
|
| 218 |
+
"🧠 Appending to Vector Store...\n"
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"name": "stderr",
|
| 223 |
+
"output_type": "stream",
|
| 224 |
+
"text": [
|
| 225 |
+
"Ingesting Reports: 100%|██████████| 4582/4582 [10:21<00:00, 7.37chunk/s]"
|
| 226 |
+
]
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"name": "stdout",
|
| 230 |
+
"output_type": "stream",
|
| 231 |
+
"text": [
|
| 232 |
+
"\n",
|
| 233 |
+
"✅ Reports Added. Hybrid Knowledge Base is ready.\n"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"name": "stderr",
|
| 238 |
+
"output_type": "stream",
|
| 239 |
+
"text": [
|
| 240 |
+
"\n"
|
| 241 |
+
]
|
| 242 |
+
}
|
| 243 |
+
],
|
| 244 |
+
"source": [
|
| 245 |
+
"# Run the ingestion on CBK text\n",
|
| 246 |
+
"ingest_markdown_reports(\n",
|
| 247 |
+
" markdown_dir=CBK_MARKDOWN_DIR,\n",
|
| 248 |
+
" vector_db_path=VECTOR_DB_PATH,\n",
|
| 249 |
+
" model=EMBEDDING_MODEL\n",
|
| 250 |
+
")"
|
| 251 |
+
]
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"cell_type": "markdown",
|
| 255 |
+
"id": "d85e55c2-1f09-4550-b2d2-4ae5ccae5ae3",
|
| 256 |
+
"metadata": {},
|
| 257 |
+
"source": [
|
| 258 |
+
"## Test the performance\n",
|
| 259 |
+
"Using similarity search, we test the performance of the embedding model used here."
|
| 260 |
+
]
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"cell_type": "code",
|
| 264 |
+
"execution_count": 15,
|
| 265 |
+
"id": "6aece829-bceb-498b-a549-cf29287b1bea",
|
| 266 |
+
"metadata": {},
|
| 267 |
+
"outputs": [
|
| 268 |
+
{
|
| 269 |
+
"name": "stdout",
|
| 270 |
+
"output_type": "stream",
|
| 271 |
+
"text": [
|
| 272 |
+
"🔎 Checking for tables in: 'Interest rates commercial banks 2020'\n",
|
| 273 |
+
"\n",
|
| 274 |
+
"---------------------------------------------------\n",
|
| 275 |
+
"📰 Source: knbs_batch_42_193220_2020kenyafactsfigures\n",
|
| 276 |
+
"| Value of shares traded (KSh Bn) | 209 | 147 | 172 | 176 | 154 |\n",
|
| 277 |
+
"| Equities market capitalization (KSh Bn) | 2,054 | 1,932 | 2,522 | 2,102 | 2,540 |\n",
|
| 278 |
+
"| NSE 20 Share Index (Base Jan 1966=100) | 4,040 | 3,186 | 3,712 | 2,834 | 2,654 |\n",
|
| 279 |
+
"\n",
|
| 280 |
+
"<sup>1</sup> W eig hted av erag e commercial bank interes t rates\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"# **Monetary and Financial Statistics**\n",
|
| 283 |
+
"\n",
|
| 284 |
+
"**Table 21: Commercial Banks' Deposits, Loans and Advances, 2015 - 2019**\n",
|
| 285 |
+
"\n",
|
| 286 |
+
"| 2015 | 2016 | 2017 | 2018 | 2019* | |\n",
|
| 287 |
+
"|------------------------------------------------|-------------|-------------|-------------|-------------|-------------|\n",
|
| 288 |
+
"| Commercial bank<br>s (KS<br>h million) | | | | | |\n",
|
| 289 |
+
"| Deposits<br>liabilities | 2,661,140.0 | 2,771,710.6 | 3,068,723.8 | 3,414,705.5 | 3,634,995.9 |\n",
|
| 290 |
+
"| Total loans<br>and advances | 2,873,799.6 | 3,127,888.0 | 3,318,907.0 | 3,543,932.0 | 3,838,796.6 |\n",
|
| 291 |
+
"| Public sector | 630,049.0 | 814,585.0 | 930,174.5 | 1,057,217.2 | 1,177,091.3 |\n",
|
| 292 |
+
"| Private sector | 2,243,750.7 | 2,317,025.0 | 2,415,922.9 | 2,486,714.9 | 2,664,382.5 |\n",
|
| 293 |
+
"| Number of authorised institutions in operation | | | | | |\n",
|
| 294 |
+
"| Licensed banks | 43 | 43 | 42 | 42 ...\n",
|
| 295 |
+
"\n",
|
| 296 |
+
"---------------------------------------------------\n",
|
| 297 |
+
"📰 Source: knbs_batch_42_193220_thekenyapovertyreport2020\n",
|
| 298 |
+
"| Interest rate on commercial bank loans and advances $(\\%)$ | 17.44 | 2.607 | 1 | 9 | | 12.02 |\n",
|
| 299 |
+
"| Formal Employment sector (000's) | 2,601 | | ,,,, ...\n",
|
| 300 |
+
"\n",
|
| 301 |
+
"---------------------------------------------------\n",
|
| 302 |
+
"📰 Source: knbs_batch_35_193128_2021kenyafactsfigures\n",
|
| 303 |
+
"| 100 Japanese Yen | 93.55 | 92.22 | 91.74 | 93.59 | 99.80 |\n",
|
| 304 |
+
"| 1 SA Rand | 6.93 | | | | |\n",
|
| 305 |
+
"| KSh /TSh | 21.54 | 21.63 | 22.48 | 22.63 | 21.76 |\n",
|
| 306 |
+
"| KSh/ Ush | 33.68 | 34.92 | 36.81 | 36.32 | 34.93 |\n",
|
| 307 |
+
"| Overall Weighted Index 2009=100 | 114.30 | 114.83 | 116.52 | 115.66 | 113.04 |\n",
|
| 308 |
+
"\n",
|
| 309 |
+
"<sup>\\*</sup>Provisional\n",
|
| 310 |
+
"\n",
|
| 311 |
+
"<sup>2</sup> Countries in the Euro area included in the computation of Trade Weighted Fisher's Ideal Index are: Germany, France, Switzerland, Netherlands, Belgium and Italy.\n",
|
| 312 |
+
"\n",
|
| 313 |
+
"**Table 19: Nominal Interest Rates, 2016 – 2020**\n",
|
| 314 |
+
"\n",
|
| 315 |
+
"Percentage **2016 2017 2018 2019 2020** 91-day Treasury bill rate................................... 8.44 8.01 7.34 7.17 6.90 Inter-Bank Offered Rate.................................... 5.92 7.27 8.15 6.03 5.29 Overdraft Rates.................................................. 13.49 13.54 12.17 11.67 11.51 Commercial Banks Loans and Advances...... 13.69 13.64 12.51 12.24 12.02 Savings deposits rate....................................... 6.37 6.91 5.13 4.02 2.70\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"<sup>1</sup>Weighted average commercial bank interest rates\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"Table 20: Securities Exchange, 2016 - 2020\n",
|
| 320 |
+
"\n",
|
| 321 |
+
"| 2016 | 2017 | 2018 | 2019 | 2020* | |\n",
|
| 322 |
+
"|-----------------------------------------|-------|-------|-------|-------|-------|\n",
|
| 323 |
+
"| Value of shares traded (KSh Bn) | 147 | 172 | 176 | 154 | 149 |\n",
|
| 324 |
+
"| Equities Market c...\n",
|
| 325 |
+
"\n",
|
| 326 |
+
"---------------------------------------------------\n",
|
| 327 |
+
"📰 Source: knbs_batch_32_193040_2025_facts_and_figures\n",
|
| 328 |
+
"| Kenya Development Corporation2 | - | 10 | 9 | 4 | 7 | - | 521.4 | 613.8 | 510.0 | 599.9 |\n",
|
| 329 |
+
"| Industrial and Commercial Development Corporation | 3 | - | - | - | - | 100.9 | - | - | - | - |\n",
|
| 330 |
+
"| Sub - total | 320 | 328 | 303 | 362 | 343 | 1,096.3 | 1,394.5 | 1,690.8 | 1,869.9 | 2,304.9 |\n",
|
| 331 |
+
"| All other commercial banks1 | - | - | - | - | - | 410,640 | 463,981 | 527,235 | 637,513 | 560,643 |\n",
|
| 332 |
+
"| TOTAL | 320 | 328 | 303 | 362 | 343 | 411,736 | 465,376 | 528,926 | 639,383 | 562,948 |\n",
|
| 333 |
+
"\n",
|
| 334 |
+
"*<sup>\\*</sup> Provisional* \n",
|
| 335 |
+
"\n",
|
| 336 |
+
"*<sup>1</sup> Source: Central Bank of Kenya (excludes DBK).* \n",
|
| 337 |
+
"\n",
|
| 338 |
+
"*<sup>2</sup> IDB Capital, Tourism Finance Corporation and ICDC merged to form KDC in 2020* \n",
|
| 339 |
+
"\n",
|
| 340 |
+
"\n",
|
| 341 |
+
"\n",
|
| 342 |
+
"**Table 43: Selected EPZ Performance Indicators, 2020 - 2024**\n",
|
| 343 |
+
"\n",
|
| 344 |
+
"| | Unit | 2020 | 2021 | 2022 | 2023 | 2024* |\n",
|
| 345 |
+
"|------------------------------------------|----------------|---------|---------|---------|---------|---------|\n",
|
| 346 |
+
"| Gazetted Zones | Number | 76 | 82 | 89 | 102 | 105 |\n",
|
| 347 |
+
"| ...\n",
|
| 348 |
+
"\n",
|
| 349 |
+
"---------------------------------------------------\n",
|
| 350 |
+
"📰 Source: knbs_batch_34_193116_2022economicsurvey\n",
|
| 351 |
+
"| Savings deposits. | - | - | - | - | - | - | |\n",
|
| 352 |
+
"| Loan and Advances (maximum) | 13.25 | 12.10 | 12.19 | 12.04 | 12.04 | 12.16 | |\n",
|
| 353 |
+
"| Overdraft. | - | - | - | - | - | - | |\n",
|
| 354 |
+
"| Loans-Deposits Spread | - | - | - | - | 5.69 | - | |\n",
|
| 355 |
+
"\n",
|
| 356 |
+
"*Source: Central Bank of Kenya.*\n",
|
| 357 |
+
"\n",
|
| 358 |
+
"*Selected financial aggregates values are deflated using December Consumer Price Indices*\n",
|
| 359 |
+
"\n",
|
| 360 |
+
"*Weighted average commercial bank interest rates*\n",
|
| 361 |
+
"\n",
|
| 362 |
+
"*<sup>\\*</sup>Provisional*\n",
|
| 363 |
+
"\n",
|
| 364 |
+
"**4.12.** Table 4.7 shows the selected real principal interest rates from 2017 to 2021. Real interest rates reflect the real cost of borrowing, savings and return on investment. The weighted average real interest rate for commercial bank deposits increased to 0.77 per cent in 2021 from 0.68 per cent in 2020. Commercial banks loans and advances rate increased from 6.40 per cent in December 2020 to 6.43 per cent in December 2021. The real average interest rate for the 91-day Treasury Bills increased from 1.28 per cent in December 2020 to 1.53 per cent in December 2021 while the inter-bank rate declined further from negative 0.33 per cent in December 2020 to negative 0.63 per cent in December 2021.\n",
|
| 365 |
+
"\n",
|
| 366 |
+
"**Table 4.7: Selected Real Principal Interest Rates, 2017 – 2021**\n",
|
| 367 |
+
"\n",
|
| 368 |
+
"*Per cent*\n",
|
| 369 |
+
"\n",
|
| 370 |
+
"| Average Interest Rate for 91-day Treasury Bills | Year | Nominal Interest | Inflation Rate | Real Interest1 |\n",
|
| 371 |
+
"|---------------------------...\n",
|
| 372 |
+
"\n"
|
| 373 |
+
]
|
| 374 |
+
}
|
| 375 |
+
],
|
| 376 |
+
"source": [
|
| 377 |
+
"# Connect\n",
|
| 378 |
+
"embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=\"http://127.0.0.1:25000\")\n",
|
| 379 |
+
"vectorstore = Chroma(persist_directory=VECTOR_DB_PATH, embedding_function=embeddings)\n",
|
| 380 |
+
"\n",
|
| 381 |
+
"# Query for a known table\n",
|
| 382 |
+
"query = \"Interest rates commercial banks 2020\" \n",
|
| 383 |
+
"\n",
|
| 384 |
+
"results = vectorstore.similarity_search(query, k=5)\n",
|
| 385 |
+
"\n",
|
| 386 |
+
"print(f\"🔎 Checking for tables in: '{query}'\\n\")\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"for i, doc in enumerate(results):\n",
|
| 389 |
+
" content = doc.page_content\n",
|
| 390 |
+
" print(\"---------------------------------------------------\")\n",
|
| 391 |
+
" print(f\"Source: {doc.metadata.get('source', 'N/A').split('/')[-1].split('.')[0].replace('-', '')}\")\n",
|
| 392 |
+
" # Print first 500 chars to see if headers align\n",
|
| 393 |
+
" print(content[:1500] + \"...\\n\")"
|
| 394 |
+
]
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"cell_type": "code",
|
| 398 |
+
"execution_count": null,
|
| 399 |
+
"id": "dc9072f0-fcd2-4b24-9050-795e64b26ff6",
|
| 400 |
+
"metadata": {},
|
| 401 |
+
"outputs": [],
|
| 402 |
+
"source": []
|
| 403 |
+
}
|
| 404 |
+
],
|
| 405 |
+
"metadata": {
|
| 406 |
+
"kernelspec": {
|
| 407 |
+
"display_name": "Python 3 (ipykernel)",
|
| 408 |
+
"language": "python",
|
| 409 |
+
"name": "python3"
|
| 410 |
+
},
|
| 411 |
+
"language_info": {
|
| 412 |
+
"codemirror_mode": {
|
| 413 |
+
"name": "ipython",
|
| 414 |
+
"version": 3
|
| 415 |
+
},
|
| 416 |
+
"file_extension": ".py",
|
| 417 |
+
"mimetype": "text/x-python",
|
| 418 |
+
"name": "python",
|
| 419 |
+
"nbconvert_exporter": "python",
|
| 420 |
+
"pygments_lexer": "ipython3",
|
| 421 |
+
"version": "3.12.3"
|
| 422 |
+
}
|
| 423 |
+
},
|
| 424 |
+
"nbformat": 4,
|
| 425 |
+
"nbformat_minor": 5
|
| 426 |
+
}
|
notebooks/inspect_news.ipynb
ADDED
|
@@ -0,0 +1,476 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "4b0cd97a-60f0-4582-a8c3-b4d9dbf7ab03",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Inspect Business News\n",
|
| 9 |
+
"In this notebook, the study will seek to inspect news files (stored as csv files), and ingest its content to the vectordb"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": 1,
|
| 15 |
+
"id": "ed41b128-7baa-43e2-9b1a-2cfdc955440a",
|
| 16 |
+
"metadata": {},
|
| 17 |
+
"outputs": [],
|
| 18 |
+
"source": [
|
| 19 |
+
"%load_ext autoreload\n",
|
| 20 |
+
"%autoreload 2"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": 6,
|
| 26 |
+
"id": "c80c78f8-8118-46b5-b954-e0b15d935ced",
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [],
|
| 29 |
+
"source": [
|
| 30 |
+
"import pandas as pd\n",
|
| 31 |
+
"import glob\n",
|
| 32 |
+
"import os\n",
|
| 33 |
+
"import sys\n",
|
| 34 |
+
"import warnings\n",
|
| 35 |
+
"from pathlib import Path\n",
|
| 36 |
+
"warnings.filterwarnings(\"ignore\")"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": 7,
|
| 42 |
+
"id": "7801b40d-c6a9-44cd-8afb-e544731aeb52",
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"# Fix paths for src files\n",
|
| 47 |
+
"project_root = Path(os.getcwd()).parent\n",
|
| 48 |
+
"script_dir = project_root / \"src\"\n",
|
| 49 |
+
"if str(script_dir) not in sys.path:\n",
|
| 50 |
+
" sys.path.append(str(script_dir))"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"execution_count": 8,
|
| 56 |
+
"id": "2f940902-b38d-4e6e-9566-55f511dc0bc9",
|
| 57 |
+
"metadata": {},
|
| 58 |
+
"outputs": [],
|
| 59 |
+
"source": [
|
| 60 |
+
"from load.explore_news_schema import analyze_schemas"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"cell_type": "code",
|
| 65 |
+
"execution_count": 4,
|
| 66 |
+
"id": "788c9e1d-5c17-4387-9153-77a31cd26eec",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"outputs": [
|
| 69 |
+
{
|
| 70 |
+
"name": "stdout",
|
| 71 |
+
"output_type": "stream",
|
| 72 |
+
"text": [
|
| 73 |
+
"🔍 Scanning 22 files in '/capstor/scratch/cscs/tligawa/mshauri-fedha/data/news'...\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"--- Schema Report ---\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"TYPE 1: Found in 5 files\n",
|
| 78 |
+
"Columns: ['description', 'published date', 'publisher', 'title', 'url']\n",
|
| 79 |
+
"Examples: ['google_news_10-11-2025.csv', 'google_news_10-11-2025-19-27.csv', 'google_news_19-11-2025-19-49.csv'] ... (+2 others)\n",
|
| 80 |
+
"\n",
|
| 81 |
+
"TYPE 2: Found in 7 files\n",
|
| 82 |
+
"Columns: ['authors', 'date', 'full_content', 'image', 'source', 'summary', 'title', 'url', 'word_count']\n",
|
| 83 |
+
"Examples: ['kenya_news_full_27-10-2025.csv', 'kenya_news_full_17-11-2025-17-52.csv', 'newsdata_10-11-2025.csv'] ... (+4 others)\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"TYPE 3: Found in 10 files\n",
|
| 86 |
+
"Columns: ['content', 'date', 'source', 'title', 'url']\n",
|
| 87 |
+
"Examples: ['gnews_19-11-2025-19-49.csv', 'the_news_10-11-2025.csv', 'the_news_19-11-2025-19-49.csv'] ... (+7 others)\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"--- Date Format Sample ---\n",
|
| 90 |
+
"Sample from column 'published date' in google_news_10-11-2025.csv:\n",
|
| 91 |
+
"['Sun, 09 Nov 2025 03:15:00 GMT', 'Sun, 09 Nov 2025 18:45:00 GMT', 'Tue, 04 Nov 2025 06:00:00 GMT', 'Tue, 04 Nov 2025 17:33:08 GMT', 'Wed, 05 Nov 2025 09:38:38 GMT']\n"
|
| 92 |
+
]
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"source": [
|
| 96 |
+
"# Path to where you just downloaded the files\n",
|
| 97 |
+
"SCRATCH_DIR = os.environ.get(\"SCRATCH\")\n",
|
| 98 |
+
"NEWS_DIR = os.path.join(SCRATCH_DIR, \"mshauri-fedha/data/news\")\n",
|
| 99 |
+
"\n",
|
| 100 |
+
"# Run the exploration\n",
|
| 101 |
+
"analyze_schemas(NEWS_DIR)"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "code",
|
| 106 |
+
"execution_count": 9,
|
| 107 |
+
"id": "f65ee0d4-2bac-46f4-8cd4-44ac9c903ee3",
|
| 108 |
+
"metadata": {},
|
| 109 |
+
"outputs": [],
|
| 110 |
+
"source": [
|
| 111 |
+
"from load.ingest_news import ingest_news_data"
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"cell_type": "code",
|
| 116 |
+
"execution_count": 10,
|
| 117 |
+
"id": "c7864bbb-e731-45b4-bd9e-4dbf9a063653",
|
| 118 |
+
"metadata": {},
|
| 119 |
+
"outputs": [],
|
| 120 |
+
"source": [
|
| 121 |
+
"from load.start_ollama import start_ollama_server, pull_embedding_model"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"cell_type": "code",
|
| 126 |
+
"execution_count": 7,
|
| 127 |
+
"id": "279c443b-4502-40d9-9b29-06e1929ba842",
|
| 128 |
+
"metadata": {},
|
| 129 |
+
"outputs": [
|
| 130 |
+
{
|
| 131 |
+
"name": "stdout",
|
| 132 |
+
"output_type": "stream",
|
| 133 |
+
"text": [
|
| 134 |
+
"🚀 Starting Ollama Server...\n",
|
| 135 |
+
"⏳ Waiting for server to boot...\n",
|
| 136 |
+
"✅ Server started successfully.\n"
|
| 137 |
+
]
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"data": {
|
| 141 |
+
"text/plain": [
|
| 142 |
+
"True"
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
"execution_count": 7,
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"output_type": "execute_result"
|
| 148 |
+
}
|
| 149 |
+
],
|
| 150 |
+
"source": [
|
| 151 |
+
"start_ollama_server()"
|
| 152 |
+
]
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"cell_type": "code",
|
| 156 |
+
"execution_count": 8,
|
| 157 |
+
"id": "03b5c2c0-4d6a-4e06-ab88-806ca5eaa2d6",
|
| 158 |
+
"metadata": {},
|
| 159 |
+
"outputs": [
|
| 160 |
+
{
|
| 161 |
+
"name": "stdout",
|
| 162 |
+
"output_type": "stream",
|
| 163 |
+
"text": [
|
| 164 |
+
"⬇️ Requesting pull for 'nomic-embed-text'...\n",
|
| 165 |
+
" success manifest digest00%\n",
|
| 166 |
+
"✅ Model 'nomic-embed-text' installed successfully!\n"
|
| 167 |
+
]
|
| 168 |
+
}
|
| 169 |
+
],
|
| 170 |
+
"source": [
|
| 171 |
+
"# pull embedding model\n",
|
| 172 |
+
"pull_embedding_model(\"nomic-embed-text\")"
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "code",
|
| 177 |
+
"execution_count": 9,
|
| 178 |
+
"id": "3a4a7e63-934c-4b0d-824f-f525278139dc",
|
| 179 |
+
"metadata": {},
|
| 180 |
+
"outputs": [
|
| 181 |
+
{
|
| 182 |
+
"name": "stdout",
|
| 183 |
+
"output_type": "stream",
|
| 184 |
+
"text": [
|
| 185 |
+
"🚀 Found 22 news files. Processing...\n"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"name": "stderr",
|
| 190 |
+
"output_type": "stream",
|
| 191 |
+
"text": [
|
| 192 |
+
"Reading CSVs: 100%|██████████| 22/22 [00:00<00:00, 179.19file/s]"
|
| 193 |
+
]
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"name": "stdout",
|
| 197 |
+
"output_type": "stream",
|
| 198 |
+
"text": [
|
| 199 |
+
" 📉 Condensed into 198 unique articles.\n",
|
| 200 |
+
"🧠 Embedding 455 chunks into Vector DB...\n"
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"name": "stderr",
|
| 205 |
+
"output_type": "stream",
|
| 206 |
+
"text": [
|
| 207 |
+
"\n",
|
| 208 |
+
"/users/tligawa/mshauri-fedha/notebooks/ingest_news.py:149: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.\n",
|
| 209 |
+
" embeddings = OllamaEmbeddings(model=model, base_url=\"http://127.0.0.1:25000\")\n",
|
| 210 |
+
"/users/tligawa/mshauri-fedha/notebooks/ingest_news.py:150: LangChainDeprecationWarning: The class `Chroma` was deprecated in LangChain 0.2.9 and will be removed in 1.0. An updated version of the class exists in the `langchain-chroma package and should be used instead. To use it run `pip install -U `langchain-chroma` and import as `from `langchain_chroma import Chroma``.\n",
|
| 211 |
+
" vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)\n"
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"name": "stdout",
|
| 216 |
+
"output_type": "stream",
|
| 217 |
+
"text": [
|
| 218 |
+
"Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"name": "stderr",
|
| 223 |
+
"output_type": "stream",
|
| 224 |
+
"text": [
|
| 225 |
+
"Embedding News: 100%|██████████| 455/455 [00:21<00:00, 21.06chunk/s]"
|
| 226 |
+
]
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"name": "stdout",
|
| 230 |
+
"output_type": "stream",
|
| 231 |
+
"text": [
|
| 232 |
+
"\n",
|
| 233 |
+
"✅ News Ingestion Complete.\n"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"name": "stderr",
|
| 238 |
+
"output_type": "stream",
|
| 239 |
+
"text": [
|
| 240 |
+
"\n"
|
| 241 |
+
]
|
| 242 |
+
}
|
| 243 |
+
],
|
| 244 |
+
"source": [
|
| 245 |
+
"VECTOR_DB = \"mshauri_fedha_chroma_db\"\n",
|
| 246 |
+
"EMBEDDING_MODEL = \"nomic-embed-text\" # Make sure this matches your existing DB model\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"# Run\n",
|
| 249 |
+
"ingest_news_data(NEWS_DIR, VECTOR_DB, EMBEDDING_MODEL)"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"cell_type": "code",
|
| 254 |
+
"execution_count": null,
|
| 255 |
+
"id": "d993f352-44a4-4c73-ae2f-cc3b2e85a882",
|
| 256 |
+
"metadata": {},
|
| 257 |
+
"outputs": [],
|
| 258 |
+
"source": [
|
| 259 |
+
"from langchain_community.vectorstores import Chroma\n",
|
| 260 |
+
"from langchain_community.embeddings import OllamaEmbeddings"
|
| 261 |
+
]
|
| 262 |
+
},
|
| 263 |
+
{
|
| 264 |
+
"cell_type": "code",
|
| 265 |
+
"execution_count": null,
|
| 266 |
+
"id": "ee969dc7-6ecb-45e3-ab89-875536177543",
|
| 267 |
+
"metadata": {},
|
| 268 |
+
"outputs": [],
|
| 269 |
+
"source": [
|
| 270 |
+
"# --- CONFIG ---\n",
|
| 271 |
+
"VECTOR_DB_PATH = \"mshauri_fedha_chroma_db\"\n",
|
| 272 |
+
"EMBEDDING_MODEL = \"nomic-embed-text\"\n",
|
| 273 |
+
"OLLAMA_URL = \"http://127.0.0.1:25000\""
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"cell_type": "code",
|
| 278 |
+
"execution_count": null,
|
| 279 |
+
"id": "e681dd43-f0f3-47ec-8445-4d188eb7886a",
|
| 280 |
+
"metadata": {},
|
| 281 |
+
"outputs": [],
|
| 282 |
+
"source": [
|
| 283 |
+
"# Connect to DB\n",
|
| 284 |
+
"print(\"Connecting to Vector Store...\")\n",
|
| 285 |
+
"embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=OLLAMA_URL)\n",
|
| 286 |
+
"vectorstore = Chroma(persist_directory=VECTOR_DB_PATH, embedding_function=embeddings)"
|
| 287 |
+
]
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"cell_type": "code",
|
| 291 |
+
"execution_count": null,
|
| 292 |
+
"id": "5decfff3-26d7-4784-b6b0-7fd98a34f3ca",
|
| 293 |
+
"metadata": {},
|
| 294 |
+
"outputs": [],
|
| 295 |
+
"source": [
|
| 296 |
+
"# Get Stats\n",
|
| 297 |
+
"count = vectorstore._collection.count()\n",
|
| 298 |
+
"print(f\"Total Documents stored: {count}\")"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"cell_type": "code",
|
| 303 |
+
"execution_count": 10,
|
| 304 |
+
"id": "d55aab85-10c7-44a0-83cf-69c125416b61",
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"outputs": [
|
| 307 |
+
{
|
| 308 |
+
"name": "stdout",
|
| 309 |
+
"output_type": "stream",
|
| 310 |
+
"text": [
|
| 311 |
+
"🔌 Connecting to Vector Store...\n",
|
| 312 |
+
"✅ Total Documents stored: 455\n",
|
| 313 |
+
"\n",
|
| 314 |
+
"👀 Random Sample Document:\n",
|
| 315 |
+
"--- Metadata ---\n",
|
| 316 |
+
"Date: 2025-11-03\n",
|
| 317 |
+
"Source: african markets\n",
|
| 318 |
+
"Type: news\n",
|
| 319 |
+
"\n",
|
| 320 |
+
"--- Content (First 300 chars) ---\n",
|
| 321 |
+
"Title: BGFI Holding finally gets regulatory approval for its BVMAC IPO: inside a tumultuous IPO journey - african markets\n",
|
| 322 |
+
"Date: 2025-11-03\n",
|
| 323 |
+
"Source: african markets\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"BGFI Holding finally gets regulatory approval for its BVMAC IPO: inside a tumultuous IPO journey african markets...\n"
|
| 326 |
+
]
|
| 327 |
+
}
|
| 328 |
+
],
|
| 329 |
+
"source": [
|
| 330 |
+
"# Peek at a Sample\n",
|
| 331 |
+
"print(\"\\n Random Sample Document:\")\n",
|
| 332 |
+
"# We fetch 1 random ID just to peek\n",
|
| 333 |
+
"result = vectorstore.get(limit=1)\n",
|
| 334 |
+
"\n",
|
| 335 |
+
"if result['ids']:\n",
|
| 336 |
+
" meta = result['metadatas'][0]\n",
|
| 337 |
+
" content = result['documents'][0]\n",
|
| 338 |
+
" \n",
|
| 339 |
+
" print(f\"--- Metadata ---\")\n",
|
| 340 |
+
" print(f\"Date: {meta.get('date')}\")\n",
|
| 341 |
+
" print(f\"Source: {meta.get('source')}\")\n",
|
| 342 |
+
" print(f\"Type: {meta.get('type')}\")\n",
|
| 343 |
+
" \n",
|
| 344 |
+
" print(f\"\\n--- Content (First 300 chars) ---\")\n",
|
| 345 |
+
" print(content[:300] + \"...\")\n",
|
| 346 |
+
"else:\n",
|
| 347 |
+
" print(\"Database is empty!\")"
|
| 348 |
+
]
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"cell_type": "code",
|
| 352 |
+
"execution_count": 11,
|
| 353 |
+
"id": "f1192ee8-f09a-4e11-84df-aacea487485e",
|
| 354 |
+
"metadata": {},
|
| 355 |
+
"outputs": [
|
| 356 |
+
{
|
| 357 |
+
"name": "stdout",
|
| 358 |
+
"output_type": "stream",
|
| 359 |
+
"text": [
|
| 360 |
+
"\n",
|
| 361 |
+
"🔎 Searching for: 'How have the protests impacted the Kenyan economy?'...\n",
|
| 362 |
+
"Found 10 relevant articles:\n",
|
| 363 |
+
"\n",
|
| 364 |
+
"Result #1 -------------------------\n",
|
| 365 |
+
"📅 Date: 2025-09-29\n",
|
| 366 |
+
"📰 Source: Devdiscourse\n",
|
| 367 |
+
"📝 Excerpt: Title: Madagascar's Government Dissolution Amidst Gen Z-Inspired Protests: A Call for Dialogue and Reform Date: 2025-09-29 Source: Devdiscourse In response to youth-led protests over worsening water and power shortages, Malagasy President Andry Rajoelina announced the dissolution of the government on Monday. The unrest, largely influenced by Gen Z movements in Kenya and Nepal, marks the largest such demonstrations in Madagascar in years. These rallies significantly challenge Rajoelina's leadership since his recent 2023 re-election. The president offered an apology for governmental shortcomings and vowed to engage in dialogue with the youth while ensuring support for affected businesses. The protests have seen significant casualties, with both protestors and bystanders affected, although official figures remain contested. (With inputs from agencies.)...\n",
|
| 368 |
+
"------------------------------------\n",
|
| 369 |
+
"\n",
|
| 370 |
+
"Result #2 -------------------------\n",
|
| 371 |
+
"📅 Date: 2025-10-27\n",
|
| 372 |
+
"📰 Source: capitalfm\n",
|
| 373 |
+
"📝 Excerpt: Title: Motorists protest ‘hidden’ concession in Rironi-Mau Summit Expressway Date: 2025-10-27 Source: capitalfm...\n",
|
| 374 |
+
"------------------------------------\n",
|
| 375 |
+
"\n",
|
| 376 |
+
"Result #3 -------------------------\n",
|
| 377 |
+
"📅 Date: 2024-06-21\n",
|
| 378 |
+
"📰 Source: riotimesonline.com\n",
|
| 379 |
+
"📝 Excerpt: In Kenya, a significant backlash has emerged against the 2024 Finance Bill, which proposes various tax increases. The bill was approved by the Kenyan parliament. This happened despite modifications by Parliament’s Finance Committee aimed at mitigating public dissatisfaction by dropping several contentious tax proposals. This legislative move coincided with a wave of protests primarily driven by Gen Z and millennials, marking a pivotal moment in the country’s political landscape. Protests erupted across multiple Kenyan cities, including Nairobi and key areas within President William Ruto’s Rift Valley strongholds like Eldoret and Kericho. These demonstrations were notable not only for their scope but also for their organization. They were orchestrated online without the backing of established political parties. The phrase “Tuko wengi,” meaning “We are many,” echoed through the streets of Eldoret, symbolizing the protesters’ unity and significant numbers. The demographic profile of the protesters is particularly noteworthy. Approximately 75% of Kenya’s population is under 35, with a median age of 19. Young Kenyans Mobilize This young populace could be a transformative force in future electoral processes, potentially starting with the 2027 general elections. Their active engagement in these protests reflects a broader trend of increasing political mobilization through digital platforms. This trend facilitates widespread participation and amplifies their collective voice. This series of protests underscores a deep-seated discontent among the youth, who perceive the tax hikes as detrimental to their economic prospects. Their willingness to publicly express their dissatisfaction highlights a shift towards more grassroots political involvement. This suggests that these young, digitally-savvy generations could significantly shape the future of Kenyan politics...\n",
|
| 380 |
+
"------------------------------------\n",
|
| 381 |
+
"\n",
|
| 382 |
+
"Result #4 -------------------------\n",
|
| 383 |
+
"📅 Date: 2023-07-19\n",
|
| 384 |
+
"📰 Source: abcnews.go.com\n",
|
| 385 |
+
"📝 Excerpt: . Luis Tato/AFP via Getty Images At least five protesters were injured on Wednesday as demonstrators clashed with police. Amnesty International Kenya said, said that \"para-military police officers and armored water cannon trucks [are] already patrolling and engaging protestors across several towns and neighborhoods.\" In Kibera -- a stronghold of the opposition -- protests turned violent, with demonstrators setting fire to tires and furniture, stones being pelted, and tear gas being deployed by police. In the most recent round of anti-government protests at least 23 people are reported to have been killed according to the U.N., with over 300 arrested. Protests have also been reported in Kenya's Kisumu, Kisii and Migori counties. A Kenya Police Officer shoots a tear gas canister to disperse some protesters as they gather to demonstrate in Nairobi, Kenya, on July 12, 2023. Luis Tato/AFP via Getty Images Kenya's Ministry of Education also announced that all primary and secondary schools in Nairobi and the coastal city Mombasa are to close on Wednesday as a \"precautionary measure\" following \"credible security intelligence.\" Several businesses also remain closed. The protests come after Ruto last month signed into law a contentious finance bill at Nairobi's State House that proposed doubling the tax levied on fuel from 8% to 16%. The bill aimed to aid in offsetting Kenya's external debt, officials said. However, the bill will have a ripple effect on the price of basic commodities, compounding on the economic strain of Kenyans already struggling with the rising cost of living. Riot police detain a supporter of Kenya's opposition leader Raila Odinga as he participates in an anti-government protest against the imposition of tax hikes by the government in Nairobi, Kenya, July 19, 2023. 
Thomas Mukoya/Reuters Implementation of the Bill -- which was due to come into effect on July 1 -- was halted by Kenya's High Court following a case brought by opposition Sen...\n",
|
| 386 |
+
"------------------------------------\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"Result #5 -------------------------\n",
|
| 389 |
+
"📅 Date: 2023-07-19\n",
|
| 390 |
+
"📰 Source: abcnews.go.com\n",
|
| 391 |
+
"📝 Excerpt: An opposition leader called for three days of protests against a finance bill. 6 dead as Kenya rocked by nationwide anti-government protests over gas tax, Amnesty says A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. A riot policeman reloads a teargas grenade launcher during clashes with protesters in the Kibera area of Nairobi, Kenya, July 19, 2023. LONDON -- Kenya was bracing for days of anti-government protests led by the government's political opposition over a contentious new finance bill and the rising cost of living At least six people were shot and killed and at least a dozen others were injured on Wednesday, the first day of a planned three-day protest against higher taxes, Mathias Kinyoda, of Amnesty International Kenya, told ABC News. At least 87 demonstrators were arrested nationwide, he said. The protests were called by opposition leader Raila Odinga. The unrest was set to take place despite Kenya's President William Ruto vowing no protests would take place in the East African Nation. \"We are here, first and foremost, to confirm that the peaceful protests planned for Wednesday, Thursday and Friday this week are on as earlier declared by our leadership,\" read a statement by Odinga's party, Azimio La Umoja, sent to ABC News. A Kenya Police Officer runs away from a group of opposition supporters chasing him and throwing stones during anti-government protests in Nairobi on July 19, 2023. Luis Tato/AFP via Getty Images At least five protesters were injured on Wednesday as demonstrators clashed with police...\n",
|
| 392 |
+
"------------------------------------\n",
|
| 393 |
+
"\n",
|
| 394 |
+
"Result #6 -------------------------\n",
|
| 395 |
+
"📅 Date: 2022-07-20\n",
|
| 396 |
+
"📰 Source: thesouthafrican.com\n",
|
| 397 |
+
"📝 Excerpt: Inflation in Kenya: why and how to fix it; Image: Adobe stock Inflation has hit many countries recently, from the United States to Sri Lanka. In Kenya, too, the rising prices of basic commodities have left most citizens wondering what’s going on. The price of a 2kg packet of maize and wheat flour hit 200 shillings (US$2) from a low of 120 shillings in about three months. That is a 67% increase. The 12-month overall inflation rate reached 7.91% in June 2022. Politicians eyeing Kenya’s 9 August polls have been offering solutions in exchange for votes. ADVERTISEMENT Kenya’s average annual per capita income is US$5 270. With inflation, citizens lose even this limited purchasing power. The same money buys less. Wages and salaries do not go up fast enough. Citizens’ discontent can change the way they vote in democratic countries or lead to violence in undemocratic ones. The Kenya African National Union, which ruled the country from independence in 1963, was voted out in 2002 partly because of citizens’ discontent over the state of the economy. And in the US, economic discontent has been a big factor in voting; it led to Donald Trump’s win in the 2016 presidential polls. That’s why politicians are so quick to promise relief. But can they provide it? The two key drivers of inflation in Kenya’s consumer price index are food and energy. Russia’s war on Ukraine has raised the price of oil to the highest level in history, which spills over to the rest of global economy. And about 30% to 50% of Kenya’s imported wheat comes from Russia and Ukraine. Ukraine is exporting 60% less wheat this year compared with 2021, leading to a rise in price of wheat and its derivatives like bread. Unreliable rains have cut domestic production of maize and other food crops in Kenya. Production is about 15% to 20% below the five-year average. ADVERTISEMENT Some of the drivers of Kenya’s inflation are local while others are external and beyond its control...\n",
|
| 398 |
+
"------------------------------------\n",
|
| 399 |
+
"\n",
|
| 400 |
+
"Result #7 -------------------------\n",
|
| 401 |
+
"📅 Date: 2025-10-21\n",
|
| 402 |
+
"📰 Source: Crypto News\n",
|
| 403 |
+
"📝 Excerpt: . While Tether hasn’t broken down that figure by region, its gaze is fixed on Africa, where it sees its next chapter of growth unfolding. The firm points to a Chainalysis report revealing a 52% explosion in on-chain transaction volume across Sub-Saharan Africa, which rocketed past $205 billion in a single year. Behind that surge are small business owners and individuals turning to digital assets as a lifeline. They’re navigating the same harsh realities the data confirms: soaring inflation, unpredictable local currencies, and banking systems that have left many behind. To put faces to these numbers, Tether released a short documentary from Kenya. The film highlights local merchants using USDT to pay international suppliers and families relying on it to receive remittances from abroad. It’s a grassroots look at how a global digital dollar is providing a tangible anchor in economies often defined by their volatility....\n",
|
| 404 |
+
"------------------------------------\n",
|
| 405 |
+
"\n",
|
| 406 |
+
"Result #8 -------------------------\n",
|
| 407 |
+
"📅 Date: 2022-07-20\n",
|
| 408 |
+
"📰 Source: thesouthafrican.com\n",
|
| 409 |
+
"📝 Excerpt: . Production is about 15% to 20% below the five-year average. ADVERTISEMENT Some of the drivers of Kenya’s inflation are local while others are external and beyond its control. My view, based on my analysis of the Kenyan economy and other countries, is that inflation can be managed but there are no quick fixes. ALSO READ: Throwback: Family turns ‘white’ after freak accident (pics) Drivers of inflation In Kenya, a confluence of many factors has inflated prices, particularly after the Ukraine war and the pandemic. One is elections. Lots of money is spent during electioneering. Some of it is just given out with no commensurate productivity. Kenya saw this in 1990, when money in circulation rose before the 1992 elections and so did the rate of inflation. The second factor is corruption and mismanagement. Whether it’s in procurement where prices are inflated, or when goods are not supplied or substandard ones are supplied, consumers pay the price. The cost of corruption to the economy has been put by President Uhuru Kenyatta at 2 billion shillings, translating to about 7% of GDP annually. If people make illegal water or power connections, honest people pay for that. If a tender for building a road is inflated, someone pays for it. If government and its agencies over-employ, someone pays for that. If it takes longer to get services like port clearance and building approvals, someone pays for it. If bribes are exchanged, someone pays for it. The 50-shilling note given to police at a roadblock is paid by someone else. A third driver of inflation is a weak currency. Kenya’s currency has declined by 3.5% since the start of the year, partly because of decisions taken in other countries that affect the value of their currencies. A fourth driver of inflation is tax. It raises the price of goods and services. The Finance Act 2022 brought in new taxes and raised the rates of other existing ones. 
It seems the government did this to raise money without incurring more debts...\n",
|
| 410 |
+
"------------------------------------\n",
|
| 411 |
+
"\n",
|
| 412 |
+
"Result #9 -------------------------\n",
|
| 413 |
+
"📅 Date: 2023-06-23\n",
|
| 414 |
+
"📰 Source: france24.com\n",
|
| 415 |
+
"📝 Excerpt: Title: Kenya: Tax on all salaries to finance housing Date: 2023-06-23 Source: france24.com Kenya's parliament's voted throuh a raft of tax hikes in the first budget of president William Ruto. The contentious changes will double the tax on fuel to 16% to generate another 61 billion shillings or a little under 400,000 euros, for the government. The bill still has to be signed by Ruto and opposition members have threatened to call fresh protests if he does. The bill also proposes a new housing levy which Ruto says will help build more affordable houses but many Kenyans hit hard by the rising cost of living say they can't shoulder the extra cost. FRANCE 24's correspondent Bastien Renouil reports. Video by: Bastien RENOUIL...\n",
|
| 416 |
+
"------------------------------------\n",
|
| 417 |
+
"\n",
|
| 418 |
+
"Result #10 -------------------------\n",
|
| 419 |
+
"📅 Date: 2025-11-10\n",
|
| 420 |
+
"📰 Source: capitalfm\n",
|
| 421 |
+
"📝 Excerpt: NAIROBI, Kenya, Nov 10 — President William Ruto says he has “no regrets” over his directive to police to shoot protesters “in the feet,” defending the use of force during violent anti-government demonstrations last year. Speaking in an interview with Al Jazeera aired on Sunday, Ruto maintained that police acted lawfully when responding to riots that left dozens dead and businesses destroyed. “I don’t regret those comments at all because the law allows the police to use force when other people’s lives are in danger,” he said. When challenged on whether shooting protesters was excessive, he replied, “That is according to you. I think the police know what they need to do.” The President said his administration had to “balance” between allowing peaceful demonstrations and curbing criminal activity during the unrest. “We have had to balance between dealing with violent criminals on one end and managing protests on the other,” he told James Bays. Ruto says police to immobilize vandals, take them to court » Capital News July 9 directive Ruto’s comments referred to a July 9 address in Nairobi’s Kilimani area, where he warned vandals and looters that they would be immobilized and taken to court. “Anybody torching and destroying another person’s business should be shot in the feet and taken to the hospital pending court appearance,” Ruto declared at the time. “We want people to do business. Enough is enough.” The President spoke amid escalating anti-government protests that saw supermarkets and small businesses looted and torched in several towns, including Meru, Kitengela, and Kahawa Sukari. Magunas Supermarket in Meru was among the worst hit — ransacked and later set ablaze. According to the Kenya National Commission on Human Rights (KNCHR), thirty-one people were killed in the early wave of protests, with the toll later rising to sixty-five as demonstrations continued through June and July...\n",
|
| 422 |
+
"------------------------------------\n",
|
| 423 |
+
"\n"
|
| 424 |
+
]
|
| 425 |
+
}
|
| 426 |
+
],
|
| 427 |
+
"source": [
|
| 428 |
+
"# --- TEST QUERY ---\n",
|
| 429 |
+
"query = \"How have the protests impacted the Kenyan economy?\"\n",
|
| 430 |
+
"\n",
|
| 431 |
+
"print(f\"\\n🔎 Searching for: '{query}'...\")\n",
|
| 432 |
+
"\n",
|
| 433 |
+
"# Perform Similarity Search\n",
|
| 434 |
+
"results = vectorstore.similarity_search(query, k=10)\n",
|
| 435 |
+
"\n",
|
| 436 |
+
"print(f\"Found {len(results)} relevant articles:\\n\")\n",
|
| 437 |
+
"\n",
|
| 438 |
+
"for i, doc in enumerate(results):\n",
|
| 439 |
+
" print(f\"Result #{i+1} -------------------------\")\n",
|
| 440 |
+
" print(f\"Date: {doc.metadata.get('date', 'N/A')}\")\n",
|
| 441 |
+
" print(f\"Source: {doc.metadata.get('source', 'N/A')}\")\n",
|
| 442 |
+
" print(f\"Excerpt: {doc.page_content[:2500].replace(chr(10), ' ')}...\") # Remove newlines for clean print\n",
|
| 443 |
+
" print(\"------------------------------------\\n\")"
|
| 444 |
+
]
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"cell_type": "code",
|
| 448 |
+
"execution_count": null,
|
| 449 |
+
"id": "44fc797c-1462-48a6-9eba-60e06afbd6c9",
|
| 450 |
+
"metadata": {},
|
| 451 |
+
"outputs": [],
|
| 452 |
+
"source": []
|
| 453 |
+
}
|
| 454 |
+
],
|
| 455 |
+
"metadata": {
|
| 456 |
+
"kernelspec": {
|
| 457 |
+
"display_name": "Python 3 (ipykernel)",
|
| 458 |
+
"language": "python",
|
| 459 |
+
"name": "python3"
|
| 460 |
+
},
|
| 461 |
+
"language_info": {
|
| 462 |
+
"codemirror_mode": {
|
| 463 |
+
"name": "ipython",
|
| 464 |
+
"version": 3
|
| 465 |
+
},
|
| 466 |
+
"file_extension": ".py",
|
| 467 |
+
"mimetype": "text/x-python",
|
| 468 |
+
"name": "python",
|
| 469 |
+
"nbconvert_exporter": "python",
|
| 470 |
+
"pygments_lexer": "ipython3",
|
| 471 |
+
"version": "3.12.3"
|
| 472 |
+
}
|
| 473 |
+
},
|
| 474 |
+
"nbformat": 4,
|
| 475 |
+
"nbformat_minor": 5
|
| 476 |
+
}
|
notebooks/structure_data.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/test_demo.ipynb
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "d5c236df-3b17-4889-bfa0-62875afabb70",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Demo notebook"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": 9,
|
| 14 |
+
"id": "ae890231-b53d-451b-912c-ad84bd1f3360",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"!pip install langchain-ollama --quiet"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": 10,
|
| 24 |
+
"id": "ed3dafc1-5641-42ee-94e4-299295939a8f",
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"outputs": [],
|
| 27 |
+
"source": [
|
| 28 |
+
"import os\n",
|
| 29 |
+
"import sys\n",
|
| 30 |
+
"from pathlib import Path"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": 11,
|
| 36 |
+
"id": "25b50f89-d265-42b1-a97b-8e7790856595",
|
| 37 |
+
"metadata": {},
|
| 38 |
+
"outputs": [],
|
| 39 |
+
"source": [
|
| 40 |
+
"# Fix paths for src files\n",
|
| 41 |
+
"project_root = Path(os.getcwd()).parent\n",
|
| 42 |
+
"script_dir = project_root / \"src\"\n",
|
| 43 |
+
"if str(script_dir) not in sys.path:\n",
|
| 44 |
+
" sys.path.append(str(script_dir))"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"cell_type": "code",
|
| 49 |
+
"execution_count": 12,
|
| 50 |
+
"id": "dea362be-192d-4929-b140-3778cba1df25",
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [],
|
| 53 |
+
"source": [
|
| 54 |
+
"import warnings\n",
|
| 55 |
+
"warnings.filterwarnings(\"ignore\")"
|
| 56 |
+
]
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"cell_type": "code",
|
| 60 |
+
"execution_count": 13,
|
| 61 |
+
"id": "333e0906-23f5-4836-8beb-188d2155e879",
|
| 62 |
+
"metadata": {},
|
| 63 |
+
"outputs": [],
|
| 64 |
+
"source": [
|
| 65 |
+
"from load.mshauri_demo import create_mshauri_agent, ask_mshauri"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"cell_type": "code",
|
| 70 |
+
"execution_count": 14,
|
| 71 |
+
"id": "d8f69073-a21a-4b9d-89c5-d7554d7ac605",
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [
|
| 74 |
+
{
|
| 75 |
+
"name": "stdout",
|
| 76 |
+
"output_type": "stream",
|
| 77 |
+
"text": [
|
| 78 |
+
"⚙️ Initializing Mshauri Fedha (Model: qwen3:32b)...\n",
|
| 79 |
+
"✅ Mshauri Agent Ready (Zero-Dependency Mode).\n"
|
| 80 |
+
]
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"source": [
|
| 84 |
+
"# Initialize agent\n",
|
| 85 |
+
"agent = create_mshauri_agent()"
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"cell_type": "code",
|
| 90 |
+
"execution_count": 4,
|
| 91 |
+
"id": "7a607e94-d378-4127-9c97-1d0a5a3e629f",
|
| 92 |
+
"metadata": {},
|
| 93 |
+
"outputs": [],
|
| 94 |
+
"source": [
|
| 95 |
+
"query1 = \"What was the total value of commodity tea exports in 1998?\"\n",
|
| 96 |
+
"query2 = \"Why has the cost of living increased according to the reports?\"\n",
|
| 97 |
+
"query3 = \"What is the latest inflation rate?\"\n",
|
| 98 |
+
"query4 = \"What is the annual GDP for 2020?\"\n",
|
| 99 |
+
"query5 = \"How is the Kenyan economy performing compared to other African countries and countries like USA and Australia?\"\n",
|
| 100 |
+
"query6 = \"What sector of the Kenyan economy has been constantly improving? Show the numbers\"\n",
|
| 101 |
+
"query7 = \"summarize recent loan default trends in microfinance institutions.\"\n",
|
| 102 |
+
"query8 = \"What was the total public debt for 1999?\"\n",
|
| 103 |
+
"query9 = \"Is the Kenyan economy improving? Considering the quality of life of its citizens\"\n",
|
| 104 |
+
"query10 = \"Why did the shilling depreciate?\""
|
| 105 |
+
]
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"cell_type": "code",
|
| 109 |
+
"execution_count": 15,
|
| 110 |
+
"id": "5d5a7ff3-aa69-4083-81cf-d6e34c41be6f",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"outputs": [],
|
| 113 |
+
"source": [
|
| 114 |
+
"from load.start_ollama import start_ollama_server"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"cell_type": "code",
|
| 119 |
+
"execution_count": 7,
|
| 120 |
+
"id": "dc20cc1d-da64-4394-85d5-7b77ad0e88c5",
|
| 121 |
+
"metadata": {},
|
| 122 |
+
"outputs": [
|
| 123 |
+
{
|
| 124 |
+
"name": "stdout",
|
| 125 |
+
"output_type": "stream",
|
| 126 |
+
"text": [
|
| 127 |
+
"🚀 Starting Ollama Server...\n",
|
| 128 |
+
"⏳ Waiting for server to boot...\n",
|
| 129 |
+
"✅ Server started successfully.\n"
|
| 130 |
+
]
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"data": {
|
| 134 |
+
"text/plain": [
|
| 135 |
+
"True"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
"execution_count": 7,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"output_type": "execute_result"
|
| 141 |
+
}
|
| 142 |
+
],
|
| 143 |
+
"source": [
|
| 144 |
+
"# Start ollama server\n",
|
| 145 |
+
"start_ollama_server()"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"cell_type": "code",
|
| 150 |
+
"execution_count": 8,
|
| 151 |
+
"id": "e3942375-e589-4469-94ef-b0bae5e0684f",
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"outputs": [
|
| 154 |
+
{
|
| 155 |
+
"name": "stdout",
|
| 156 |
+
"output_type": "stream",
|
| 157 |
+
"text": [
|
| 158 |
+
"\n",
|
| 159 |
+
"❓ User: Why did the shilling depreciate?\n",
|
| 160 |
+
"----------------------------------------\n",
|
| 161 |
+
"🚀 Starting Agent Loop for: 'Why did the shilling depreciate?'\n",
|
| 162 |
+
"\n",
|
| 163 |
+
"🧠 Step 1: Thought: I need to determine the reasons behind the Kenyan shilling's depreciation. This requires analyzing economic factors like inflation, trade deficits, political stability, or global market trends. I'll first check the database for relevant quantitative data, then use financial reports/news if needed.\n",
|
| 164 |
+
"\n",
|
| 165 |
+
"Action: sql_db_list_tables \n",
|
| 166 |
+
"Action Input:\n",
|
| 167 |
+
"🛠️ Calling 'sql_db_list_tables' with: \n",
|
| 168 |
+
"\n",
|
| 169 |
+
"🧠 Step 2: Thought: I need to analyze economic indicators like trade deficits, inflation, or central bank policies. First, check the foreign trade summary for import/export trends and exchange rate history.\n",
|
| 170 |
+
"\n",
|
| 171 |
+
"Action: sql_db_schema \n",
|
| 172 |
+
"Action Input: foreign_trade_summary, cbk_indicative_rates\n",
|
| 173 |
+
"🛠️ Calling 'sql_db_schema' with: foreign_trade_summary, cbk_indicative_rates\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"🧠 Step 3: Thought: I need to analyze trade deficits and exchange rate trends. First, check recent trade balances and exchange rates.\n",
|
| 176 |
+
"\n",
|
| 177 |
+
"Action: sql_db_query_checker \n",
|
| 178 |
+
"Action Input: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
|
| 179 |
+
"🛠️ Calling 'sql_db_query_checker' with: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
|
| 180 |
+
"\n",
|
| 181 |
+
"🧠 Step 4: Action: sql_db_query \n",
|
| 182 |
+
"Action Input: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
|
| 183 |
+
"🛠️ Calling 'sql_db_query' with: SELECT year, month, trade_balance FROM foreign_trade_summary ORDER BY year DESC, month DESC LIMIT 12\n",
|
| 184 |
+
"\n",
|
| 185 |
+
"🧠 Step 5: Thought: The trade balance data shows a consistent deficit, indicating higher imports than exports, which increases demand for foreign currency. This likely contributes to shilling depreciation. To confirm, check exchange rate trends over time.\n",
|
| 186 |
+
"\n",
|
| 187 |
+
"Action: sql_db_query_checker \n",
|
| 188 |
+
"Action Input: SELECT date, mean_rate FROM cbk_indicative_rates WHERE currency = 'US DOLLAR' ORDER BY date DESC LIMIT 12\n",
|
| 189 |
+
"🛠️ Calling 'sql_db_query_checker' with: SELECT date, mean_rate FROM cbk_indicative_rates WHERE currency = 'US DOLLAR' ORDER BY date DESC LIMIT 12\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"🧠 Step 6: Thought: The trade balance data shows persistent deficits, increasing demand for foreign currency. Exchange rate trends for the US dollar (mean rate rising) confirm shilling depreciation. To fully answer \"why,\" I should also check for inflation data or policy changes, but since the database lacks an inflation table, I'll rely on available data and external sources.\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"Action: search_financial_reports_and_news \n",
|
| 194 |
+
"Action Input: Kenyan shilling depreciation causes 2024-2025\n",
|
| 195 |
+
"🛠️ Calling 'search_financial_reports_and_news' with: Kenyan shilling depreciation causes 2024-2025\n"
|
| 196 |
+
]
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"name": "stderr",
|
| 200 |
+
"output_type": "stream",
|
| 201 |
+
"text": [
|
| 202 |
+
"/users/tligawa/mshauri-fedha/notebooks/mshauri_demo.py:159: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the `langchain-ollama package and should be used instead. To use it run `pip install -U `langchain-ollama` and import as `from `langchain_ollama import OllamaEmbeddings``.\n",
|
| 203 |
+
" embeddings = OllamaEmbeddings(model=DEFAULT_EMBED_MODEL, base_url=ollama_url)\n",
|
| 204 |
+
"/users/tligawa/mshauri-fedha/notebooks/mshauri_demo.py:160: LangChainDeprecationWarning: The class `Chroma` was deprecated in LangChain 0.2.9 and will be removed in 1.0. An updated version of the class exists in the `langchain-chroma package and should be used instead. To use it run `pip install -U `langchain-chroma` and import as `from `langchain_chroma import Chroma``.\n",
|
| 205 |
+
" vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)\n"
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"name": "stdout",
|
| 210 |
+
"output_type": "stream",
|
| 211 |
+
"text": [
|
| 212 |
+
"\n",
|
| 213 |
+
"🧠 Step 7: The depreciation of the Kenyan shilling (KES) can be attributed to a combination of economic factors, supported by data from trade, monetary policy, and sectoral performance:\n",
|
| 214 |
+
"\n",
|
| 215 |
+
"### 1. **Persistent Trade Deficits** \n",
|
| 216 |
+
" - **Foreign Trade Data**: The *foreign_trade_summary* table reveals consistent negative trade balances (e.g., large deficits in 2022–2023). Kenya imports significantly more goods (e.g., machinery, fuel, and manufactured products) than it exports, increasing demand for foreign currency and weakening the shilling. \n",
|
| 217 |
+
" - **Impact**: A trade deficit reduces foreign exchange reserves, making the currency more vulnerable to depreciation.\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"### 2. **Low Central Bank Intervention** \n",
|
| 220 |
+
" - **CBK Policy**: News articles highlight the Central Bank of Kenya (CBK) has limited interventions in the foreign exchange market. Without active measures to stabilize the shilling (e.g., selling foreign reserves), market forces drive the currency lower. \n",
|
| 221 |
+
" - **Market Sentiment**: Analysts predict the shilling could fall below **KES 90 to the dollar** by year-end, reflecting weak confidence in sustained CBK support.\n",
|
| 222 |
+
"\n",
|
| 223 |
+
"### 3. **Low Interest Rates** \n",
|
| 224 |
+
" - **Monetary Policy (Table 16)**: Savings deposit rates (e.g., 3.56% in 2022) and inter-bank rates (5.39% in 2022) are historically low, reducing incentives for foreign investors to hold shillings. This drives capital outflows, further pressuring the currency. \n",
|
| 225 |
+
" - **Comparison**: Overdraft and loan rates (12–12.67%) remain high for borrowers but fail to attract external investment, as returns in shillings are eroded by depreciation.\n",
|
| 226 |
+
"\n",
|
| 227 |
+
"### 4. **Weak Export Sector Growth** \n",
|
| 228 |
+
" - **Agriculture Data (Table 33)**: While tea and pyrethrum production has grown, key exports like maize and wheat show volatility. Agriculture contributes ~21% of GDP but lacks diversification or value addition to boost export earnings. \n",
|
| 229 |
+
" - **Non-Agricultural Exports**: Limited growth in manufacturing and services (e.g., tourism) exacerbates the trade deficit, reducing foreign exchange inflows.\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"### 5. **Global and Regional Factors** \n",
|
| 232 |
+
" - **Commodity Prices**: Higher global fuel and food prices increase import costs, worsening the trade deficit. \n",
|
| 233 |
+
" - **Regional Currency Dynamics**: The shilling’s depreciation relative to neighbors (e.g., Uganda, Tanzania) reflects broader East African economic challenges, including inflation and weak regional trade integration.\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"### Conclusion \n",
|
| 236 |
+
"The shilling’s depreciation stems from structural issues: **trade imbalances**, **limited CBK intervention**, **low interest rates**, and **stagnant export growth**. Without addressing these fundamentals (e.g., boosting exports, attracting foreign investment, or stabilizing the exchange rate), the shilling is likely to remain under pressure. Policymakers must balance short-term market interventions with long-term structural reforms to restore currency stability.\n",
|
| 237 |
+
"----------------------------------------\n",
|
| 238 |
+
"💡 Mshauri: The depreciation of the Kenyan shilling (KES) can be attributed to a combination of economic factors, supported by data from trade, monetary policy, and sectoral performance:\n",
|
| 239 |
+
"\n",
|
| 240 |
+
"### 1. **Persistent Trade Deficits** \n",
|
| 241 |
+
" - **Foreign Trade Data**: The *foreign_trade_summary* table reveals consistent negative trade balances (e.g., large deficits in 2022–2023). Kenya imports significantly more goods (e.g., machinery, fuel, and manufactured products) than it exports, increasing demand for foreign currency and weakening the shilling. \n",
|
| 242 |
+
" - **Impact**: A trade deficit reduces foreign exchange reserves, making the currency more vulnerable to depreciation.\n",
|
| 243 |
+
"\n",
|
| 244 |
+
"### 2. **Low Central Bank Intervention** \n",
|
| 245 |
+
" - **CBK Policy**: News articles highlight the Central Bank of Kenya (CBK) has limited interventions in the foreign exchange market. Without active measures to stabilize the shilling (e.g., selling foreign reserves), market forces drive the currency lower. \n",
|
| 246 |
+
" - **Market Sentiment**: Analysts predict the shilling could fall below **KES 90 to the dollar** by year-end, reflecting weak confidence in sustained CBK support.\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"### 3. **Low Interest Rates** \n",
|
| 249 |
+
" - **Monetary Policy (Table 16)**: Savings deposit rates (e.g., 3.56% in 2022) and inter-bank rates (5.39% in 2022) are historically low, reducing incentives for foreign investors to hold shillings. This drives capital outflows, further pressuring the currency. \n",
|
| 250 |
+
" - **Comparison**: Overdraft and loan rates (12–12.67%) remain high for borrowers but fail to attract external investment, as returns in shillings are eroded by depreciation.\n",
|
| 251 |
+
"\n",
|
| 252 |
+
"### 4. **Weak Export Sector Growth** \n",
|
| 253 |
+
" - **Agriculture Data (Table 33)**: While tea and pyrethrum production has grown, key exports like maize and wheat show volatility. Agriculture contributes ~21% of GDP but lacks diversification or value addition to boost export earnings. \n",
|
| 254 |
+
" - **Non-Agricultural Exports**: Limited growth in manufacturing and services (e.g., tourism) exacerbates the trade deficit, reducing foreign exchange inflows.\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"### 5. **Global and Regional Factors** \n",
|
| 257 |
+
" - **Commodity Prices**: Higher global fuel and food prices increase import costs, worsening the trade deficit. \n",
|
| 258 |
+
" - **Regional Currency Dynamics**: The shilling’s depreciation relative to neighbors (e.g., Uganda, Tanzania) reflects broader East African economic challenges, including inflation and weak regional trade integration.\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"### Conclusion \n",
|
| 261 |
+
"The shilling’s depreciation stems from structural issues: **trade imbalances**, **limited CBK intervention**, **low interest rates**, and **stagnant export growth**. Without addressing these fundamentals (e.g., boosting exports, attracting foreign investment, or stabilizing the exchange rate), the shilling is likely to remain under pressure. Policymakers must balance short-term market interventions with long-term structural reforms to restore currency stability.\n"
|
| 262 |
+
]
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"data": {
|
| 266 |
+
"text/plain": [
|
| 267 |
+
"'The depreciation of the Kenyan shilling (KES) can be attributed to a combination of economic factors, supported by data from trade, monetary policy, and sectoral performance:\\n\\n### 1. **Persistent Trade Deficits** \\n - **Foreign Trade Data**: The *foreign_trade_summary* table reveals consistent negative trade balances (e.g., large deficits in 2022–2023). Kenya imports significantly more goods (e.g., machinery, fuel, and manufactured products) than it exports, increasing demand for foreign currency and weakening the shilling. \\n - **Impact**: A trade deficit reduces foreign exchange reserves, making the currency more vulnerable to depreciation.\\n\\n### 2. **Low Central Bank Intervention** \\n - **CBK Policy**: News articles highlight the Central Bank of Kenya (CBK) has limited interventions in the foreign exchange market. Without active measures to stabilize the shilling (e.g., selling foreign reserves), market forces drive the currency lower. \\n - **Market Sentiment**: Analysts predict the shilling could fall below **KES 90 to the dollar** by year-end, reflecting weak confidence in sustained CBK support.\\n\\n### 3. **Low Interest Rates** \\n - **Monetary Policy (Table 16)**: Savings deposit rates (e.g., 3.56% in 2022) and inter-bank rates (5.39% in 2022) are historically low, reducing incentives for foreign investors to hold shillings. This drives capital outflows, further pressuring the currency. \\n - **Comparison**: Overdraft and loan rates (12–12.67%) remain high for borrowers but fail to attract external investment, as returns in shillings are eroded by depreciation.\\n\\n### 4. **Weak Export Sector Growth** \\n - **Agriculture Data (Table 33)**: While tea and pyrethrum production has grown, key exports like maize and wheat show volatility. Agriculture contributes ~21% of GDP but lacks diversification or value addition to boost export earnings. 
\\n - **Non-Agricultural Exports**: Limited growth in manufacturing and services (e.g., tourism) exacerbates the trade deficit, reducing foreign exchange inflows.\\n\\n### 5. **Global and Regional Factors** \\n - **Commodity Prices**: Higher global fuel and food prices increase import costs, worsening the trade deficit. \\n - **Regional Currency Dynamics**: The shilling’s depreciation relative to neighbors (e.g., Uganda, Tanzania) reflects broader East African economic challenges, including inflation and weak regional trade integration.\\n\\n### Conclusion \\nThe shilling’s depreciation stems from structural issues: **trade imbalances**, **limited CBK intervention**, **low interest rates**, and **stagnant export growth**. Without addressing these fundamentals (e.g., boosting exports, attracting foreign investment, or stabilizing the exchange rate), the shilling is likely to remain under pressure. Policymakers must balance short-term market interventions with long-term structural reforms to restore currency stability.'"
|
| 268 |
+
]
|
| 269 |
+
},
|
| 270 |
+
"execution_count": 8,
|
| 271 |
+
"metadata": {},
|
| 272 |
+
"output_type": "execute_result"
|
| 273 |
+
}
|
| 274 |
+
],
|
| 275 |
+
"source": [
|
| 276 |
+
"ask_mshauri(agent, query10)"
|
| 277 |
+
]
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"cell_type": "code",
|
| 281 |
+
"execution_count": null,
|
| 282 |
+
"id": "79f0ba2b-7f35-4a2c-80fe-4d9c3731c558",
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"outputs": [],
|
| 285 |
+
"source": []
|
| 286 |
+
}
|
| 287 |
+
],
|
| 288 |
+
"metadata": {
|
| 289 |
+
"kernelspec": {
|
| 290 |
+
"display_name": "Python 3 (ipykernel)",
|
| 291 |
+
"language": "python",
|
| 292 |
+
"name": "python3"
|
| 293 |
+
},
|
| 294 |
+
"language_info": {
|
| 295 |
+
"codemirror_mode": {
|
| 296 |
+
"name": "ipython",
|
| 297 |
+
"version": 3
|
| 298 |
+
},
|
| 299 |
+
"file_extension": ".py",
|
| 300 |
+
"mimetype": "text/x-python",
|
| 301 |
+
"name": "python",
|
| 302 |
+
"nbconvert_exporter": "python",
|
| 303 |
+
"pygments_lexer": "ipython3",
|
| 304 |
+
"version": "3.12.3"
|
| 305 |
+
}
|
| 306 |
+
},
|
| 307 |
+
"nbformat": 4,
|
| 308 |
+
"nbformat_minor": 5
|
| 309 |
+
}
|
notebooks/transform.ipynb
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "84e9f72e-84ff-49e5-b8ba-faa6ee9bc4df",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"%load_ext autoreload\n",
|
| 11 |
+
"%autoreload 2"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": 2,
|
| 17 |
+
"id": "2b495825-0d1a-4a46-9297-6ceae1ccd2a2",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"outputs": [],
|
| 20 |
+
"source": [
|
| 21 |
+
"import sys\n",
|
| 22 |
+
"import os\n",
|
| 23 |
+
"import shutil\n",
|
| 24 |
+
"import subprocess\n",
|
| 25 |
+
"import time\n",
|
| 26 |
+
"import requests\n",
|
| 27 |
+
"import torch\n",
|
| 28 |
+
"from pathlib import Path\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"# Fix paths so we can import 'extract.py'\n",
|
| 31 |
+
"project_root = Path(os.getcwd()).parent\n",
|
| 32 |
+
"script_dir = project_root / \"src/transform\"\n",
|
| 33 |
+
"if str(script_dir) not in sys.path:\n",
|
| 34 |
+
" sys.path.append(str(script_dir))\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"# Import your optimized processor\n",
|
| 37 |
+
"from extract import MarkerFolderProcessor, configure_parallelism"
|
| 38 |
+
]
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"cell_type": "code",
|
| 42 |
+
"execution_count": 3,
|
| 43 |
+
"id": "8d04e7ad-abf2-40e4-b308-fc0863464935",
|
| 44 |
+
"metadata": {},
|
| 45 |
+
"outputs": [
|
| 46 |
+
{
|
| 47 |
+
"name": "stdout",
|
| 48 |
+
"output_type": "stream",
|
| 49 |
+
"text": [
|
| 50 |
+
"✅ Setup complete.\n"
|
| 51 |
+
]
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"source": [
|
| 55 |
+
"# Paths\n",
|
| 56 |
+
"SCRATCH = Path(os.environ.get(\"SCRATCH\"))\n",
|
| 57 |
+
"INPUT_PDFS = SCRATCH / \"mshauri-fedha/data/cbk/pdfs\"\n",
|
| 58 |
+
"OUTPUT_DIR = SCRATCH / \"mshauri-fedha/data/cbk/marker-output\"\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"# Ollama Setup\n",
|
| 61 |
+
"OLLAMA_HOME = SCRATCH / \"ollama_core\"\n",
|
| 62 |
+
"OLLAMA_BIN = OLLAMA_HOME / \"bin/ollama\"\n",
|
| 63 |
+
"OLLAMA_MODELS_DIR = OLLAMA_HOME / \"models\" \n",
|
| 64 |
+
"OLLAMA_HOST = \"http://localhost:11434\"\n",
|
| 65 |
+
"\n",
|
| 66 |
+
"print(\"✅ Setup complete.\")"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"execution_count": 4,
|
| 72 |
+
"id": "2a7846b4-2041-4b4f-9210-16a891d6c9f4",
|
| 73 |
+
"metadata": {},
|
| 74 |
+
"outputs": [
|
| 75 |
+
{
|
| 76 |
+
"name": "stdout",
|
| 77 |
+
"output_type": "stream",
|
| 78 |
+
"text": [
|
| 79 |
+
"🔍 GH200/A100 Detected: 4 GPUs | 94.5 GB VRAM\n",
|
| 80 |
+
"⚙️ Stability Config: 5 workers/GPU | 20 Total Slots\n"
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
"source": [
|
| 85 |
+
"total_slots, workers_per_gpu, num_gpus = configure_parallelism()"
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"cell_type": "code",
|
| 90 |
+
"execution_count": 5,
|
| 91 |
+
"id": "039a0a95-91e2-495c-a0ab-d2185f98461c",
|
| 92 |
+
"metadata": {},
|
| 93 |
+
"outputs": [],
|
| 94 |
+
"source": [
|
| 95 |
+
"# Kill any old server first\n",
|
| 96 |
+
"subprocess.run([\"pkill\", \"-f\", \"ollama serve\"], stderr=subprocess.DEVNULL)\n",
|
| 97 |
+
"time.sleep(2)\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"server_env = os.environ.copy()\n",
|
| 100 |
+
"server_env[\"OLLAMA_NUM_PARALLEL\"] = str(32)  # Server-side parallel request slots (headroom above the 20 worker slots)\n",
|
| 101 |
+
"server_env[\"OLLAMA_MAX_LOADED_MODELS\"] = \"1\"\n",
|
| 102 |
+
"server_env[\"OLLAMA_MAX_QUEUE\"] = \"2048\"\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"# Start new server\n",
|
| 105 |
+
"process = subprocess.Popen(\n",
|
| 106 |
+
" [str(OLLAMA_BIN), \"serve\"], \n",
|
| 107 |
+
" stdout=subprocess.DEVNULL, \n",
|
| 108 |
+
" stderr=subprocess.DEVNULL,\n",
|
| 109 |
+
" env=server_env\n",
|
| 110 |
+
")"
|
| 111 |
+
]
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"cell_type": "code",
|
| 115 |
+
"execution_count": 6,
|
| 116 |
+
"id": "accd6a19-e216-450e-9cca-beaeaa7749a9",
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"outputs": [
|
| 119 |
+
{
|
| 120 |
+
"name": "stdout",
|
| 121 |
+
"output_type": "stream",
|
| 122 |
+
"text": [
|
| 123 |
+
"⏳ Waiting for server heartbeat...\n",
|
| 124 |
+
"✅ Server is UP and listening!\n"
|
| 125 |
+
]
|
| 126 |
+
}
|
| 127 |
+
],
|
| 128 |
+
"source": [
|
| 129 |
+
"# Robust Wait Loop\n",
|
| 130 |
+
"print(\"⏳ Waiting for server heartbeat...\")\n",
|
| 131 |
+
"server_ready = False\n",
|
| 132 |
+
"for _ in range(60): # Wait 60 seconds max\n",
|
| 133 |
+
" try:\n",
|
| 134 |
+
" if requests.get(OLLAMA_HOST).status_code == 200:\n",
|
| 135 |
+
" server_ready = True\n",
|
| 136 |
+
" break\n",
|
| 137 |
+
" except:\n",
|
| 138 |
+
" time.sleep(1)\n",
|
| 139 |
+
"\n",
|
| 140 |
+
"if server_ready:\n",
|
| 141 |
+
" print(\"✅ Server is UP and listening!\")\n",
|
| 142 |
+
"else:\n",
|
| 143 |
+
" raise RuntimeError(\"❌ Server failed to start. Check logs.\")"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"cell_type": "code",
|
| 148 |
+
"execution_count": 7,
|
| 149 |
+
"id": "454617a5-c1ef-489f-b9c0-8e6b4fe39b47",
|
| 150 |
+
"metadata": {},
|
| 151 |
+
"outputs": [
|
| 152 |
+
{
|
| 153 |
+
"name": "stdout",
|
| 154 |
+
"output_type": "stream",
|
| 155 |
+
"text": [
|
| 156 |
+
"⬇️ Checking/Pulling qwen2.5:7b...\n",
|
| 157 |
+
"📝 Creating 'qwen2.5-7b-16k' (16k Context)...\n"
|
| 158 |
+
]
|
| 159 |
+
}
|
| 160 |
+
],
|
| 161 |
+
"source": [
|
| 162 |
+
"# pull model\n",
|
| 163 |
+
"BASE_MODEL = \"qwen2.5:7b\" \n",
|
| 164 |
+
"CUSTOM_MODEL_NAME = \"qwen2.5-7b-16k\"\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"print(f\"⬇️ Checking/Pulling {BASE_MODEL}...\")\n",
|
| 167 |
+
"subprocess.run(\n",
|
| 168 |
+
" [str(OLLAMA_BIN), \"pull\", BASE_MODEL], \n",
|
| 169 |
+
" check=True, \n",
|
| 170 |
+
" stdout=subprocess.DEVNULL,\n",
|
| 171 |
+
" stderr=subprocess.DEVNULL,\n",
|
| 172 |
+
" env=os.environ.copy()\n",
|
| 173 |
+
")\n",
|
| 174 |
+
"\n",
|
| 175 |
+
"print(f\"📝 Creating '{CUSTOM_MODEL_NAME}' (16k Context)...\")\n",
|
| 176 |
+
"modelfile_content = f\"FROM {BASE_MODEL}\\nPARAMETER num_ctx 16384\"\n",
|
| 177 |
+
"with open(\"Modelfile_qwen_16k\", \"w\") as f:\n",
|
| 178 |
+
" f.write(modelfile_content)"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"cell_type": "code",
|
| 183 |
+
"execution_count": 8,
|
| 184 |
+
"id": "0fe26ed7-f31f-43eb-acca-1795b5528219",
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [
|
| 187 |
+
{
|
| 188 |
+
"name": "stdout",
|
| 189 |
+
"output_type": "stream",
|
| 190 |
+
"text": [
|
| 191 |
+
"✅ Model Ready.\n"
|
| 192 |
+
]
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"name": "stderr",
|
| 196 |
+
"output_type": "stream",
|
| 197 |
+
"text": [
|
| 198 |
+
"\u001b[?2026h\u001b[?25l\u001b[1Ggathering model components \u001b[K\n",
|
| 199 |
+
"using existing layer sha256:2bada8a7450677000f678be90653b85d364de7db25eb5ea54136ada5f3933730 \u001b[K\n",
|
| 200 |
+
"using existing layer sha256:66b9ea09bd5b7099cbb4fc820f31b575c0366fa439b08245566692c6784e281e \u001b[K\n",
|
| 201 |
+
"using existing layer sha256:eb4402837c7829a690fa845de4d7f3fd842c2adee476d5341da8a46ea9255175 \u001b[K\n",
|
| 202 |
+
"using existing layer sha256:832dd9e00a68dd83b3c3fb9f5588dad7dcf337a0db50f7d9483f310cd292e92e \u001b[K\n",
|
| 203 |
+
"using existing layer sha256:db8fbfd0cb288a053f83ac9014ca9bac2558b1bbcd80b5c408a548e7acba8a24 \u001b[K\n",
|
| 204 |
+
"writing manifest ⠋ \u001b[K\u001b[?25h\u001b[?2026l\u001b[?2026h\u001b[?25l\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[A\u001b[1Ggathering model components \u001b[K\n",
|
| 205 |
+
"using existing layer sha256:2bada8a7450677000f678be90653b85d364de7db25eb5ea54136ada5f3933730 \u001b[K\n",
|
| 206 |
+
"using existing layer sha256:66b9ea09bd5b7099cbb4fc820f31b575c0366fa439b08245566692c6784e281e \u001b[K\n",
|
| 207 |
+
"using existing layer sha256:eb4402837c7829a690fa845de4d7f3fd842c2adee476d5341da8a46ea9255175 \u001b[K\n",
|
| 208 |
+
"using existing layer sha256:832dd9e00a68dd83b3c3fb9f5588dad7dcf337a0db50f7d9483f310cd292e92e \u001b[K\n",
|
| 209 |
+
"using existing layer sha256:db8fbfd0cb288a053f83ac9014ca9bac2558b1bbcd80b5c408a548e7acba8a24 \u001b[K\n",
|
| 210 |
+
"writing manifest \u001b[K\n",
|
| 211 |
+
"success \u001b[K\u001b[?25h\u001b[?2026l\n"
|
| 212 |
+
]
|
| 213 |
+
}
|
| 214 |
+
],
|
| 215 |
+
"source": [
|
| 216 |
+
"# run model\n",
|
| 217 |
+
"subprocess.run(\n",
|
| 218 |
+
"    [str(OLLAMA_BIN), \"create\", CUSTOM_MODEL_NAME, \"-f\", \"Modelfile_qwen_16k\"], \n",
|
| 219 |
+
" check=True, \n",
|
| 220 |
+
" stdout=subprocess.DEVNULL, \n",
|
| 221 |
+
" env=os.environ.copy()\n",
|
| 222 |
+
")\n",
|
| 223 |
+
"print(\"✅ Model Ready.\")"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"cell_type": "code",
|
| 228 |
+
"execution_count": 9,
|
| 229 |
+
"id": "4750bd0f-3cd2-4d62-a6c4-75c2f19e45f1",
|
| 230 |
+
"metadata": {},
|
| 231 |
+
"outputs": [],
|
| 232 |
+
"source": [
|
| 233 |
+
"os.chdir(SCRATCH)"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "code",
|
| 238 |
+
"execution_count": null,
|
| 239 |
+
"id": "9581ee47-f690-46c8-b331-084411fb8535",
|
| 240 |
+
"metadata": {},
|
| 241 |
+
"outputs": [
|
| 242 |
+
{
|
| 243 |
+
"name": "stdout",
|
| 244 |
+
"output_type": "stream",
|
| 245 |
+
"text": [
|
| 246 |
+
"✅ Detected 4 GPUs (Dynamic Mode)\n",
|
| 247 |
+
"🚀 Processing PDFs from: /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/pdfs\n",
|
| 248 |
+
"📦 Created 1089 batches of 5 files each.\n",
|
| 249 |
+
"🚀 Launching 20 workers on 4 GPUs...\n"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"name": "stderr",
|
| 254 |
+
"output_type": "stream",
|
| 255 |
+
"text": [
|
| 256 |
+
"20:26:57 - [GPU-3:Dev3] - Initializing Worker 3...\n",
|
| 257 |
+
"20:26:58 - [GPU-0:Dev0] - Initializing Worker 0...\n",
|
| 258 |
+
"20:27:05 - [GPU-1:Dev1] - Initializing Worker 1...\n",
|
| 259 |
+
"20:27:06 - [GPU-2:Dev2] - Initializing Worker 2...\n",
|
| 260 |
+
"20:27:09 - [GPU-4:Dev0] - Initializing Worker 4...\n",
|
| 261 |
+
"20:27:11 - [GPU-5:Dev1] - Initializing Worker 5...\n",
|
| 262 |
+
"20:27:12 - [GPU-6:Dev2] - Initializing Worker 6...\n",
|
| 263 |
+
"20:27:12 - [GPU-9:Dev1] - Initializing Worker 9...\n",
|
| 264 |
+
"20:27:14 - [GPU-7:Dev3] - Initializing Worker 7...\n",
|
| 265 |
+
"20:27:15 - [GPU-8:Dev0] - Initializing Worker 8...\n"
|
| 266 |
+
]
|
| 267 |
+
}
|
| 268 |
+
],
|
| 269 |
+
"source": [
|
| 270 |
+
"# Initialize the Processor\n",
|
| 271 |
+
"processor = MarkerFolderProcessor(\n",
|
| 272 |
+
" output_dir=OUTPUT_DIR,\n",
|
| 273 |
+
" ollama_url=OLLAMA_HOST,\n",
|
| 274 |
+
" ollama_model=CUSTOM_MODEL_NAME,\n",
|
| 275 |
+
" batch_multiplier=4, \n",
|
| 276 |
+
" workers_per_gpu=workers_per_gpu,\n",
|
| 277 |
+
" num_gpus=num_gpus \n",
|
| 278 |
+
")\n",
|
| 279 |
+
"\n",
|
| 280 |
+
"# 3. Run the extraction\n",
|
| 281 |
+
"print(f\"🚀 Processing PDFs from: {INPUT_PDFS}\")\n",
|
| 282 |
+
"processor.process_folder(INPUT_PDFS, batch_size=5)"
|
| 283 |
+
]
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"cell_type": "code",
|
| 287 |
+
"execution_count": null,
|
| 288 |
+
"id": "931650d0-50f1-48c1-a1a1-a561392e004b",
|
| 289 |
+
"metadata": {},
|
| 290 |
+
"outputs": [],
|
| 291 |
+
"source": []
|
| 292 |
+
}
|
| 293 |
+
],
|
| 294 |
+
"metadata": {
|
| 295 |
+
"kernelspec": {
|
| 296 |
+
"display_name": "Python 3 (ipykernel)",
|
| 297 |
+
"language": "python",
|
| 298 |
+
"name": "python3"
|
| 299 |
+
},
|
| 300 |
+
"language_info": {
|
| 301 |
+
"codemirror_mode": {
|
| 302 |
+
"name": "ipython",
|
| 303 |
+
"version": 3
|
| 304 |
+
},
|
| 305 |
+
"file_extension": ".py",
|
| 306 |
+
"mimetype": "text/x-python",
|
| 307 |
+
"name": "python",
|
| 308 |
+
"nbconvert_exporter": "python",
|
| 309 |
+
"pygments_lexer": "ipython3",
|
| 310 |
+
"version": "3.12.3"
|
| 311 |
+
}
|
| 312 |
+
},
|
| 313 |
+
"nbformat": 4,
|
| 314 |
+
"nbformat_minor": 5
|
| 315 |
+
}
|
notebooks/unzip_stores.ipynb
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "4a41dc7b-f751-4818-912d-21241047c485",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"%load_ext autoreload\n",
|
| 11 |
+
"%autoreload 2"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": 2,
|
| 17 |
+
"id": "729a7545-088d-4521-8948-60162d80b1e7",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"outputs": [],
|
| 20 |
+
"source": [
|
| 21 |
+
"import os\n",
|
| 22 |
+
"import shutil\n",
|
| 23 |
+
"import zipfile\n",
|
| 24 |
+
"from pathlib import Path\n",
|
| 25 |
+
"from tqdm import tqdm"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 3,
|
| 31 |
+
"id": "96eb6a92-9786-48b6-88e4-0441d1a531c5",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [],
|
| 34 |
+
"source": [
|
| 35 |
+
"# --- CONFIGURATION ---\n",
|
| 36 |
+
"# 1. Source (Permanent Storage)\n",
|
| 37 |
+
"PROJECT_DIR = Path(os.environ.get(\"PROJECT\")) # Auto-detects $PROJECT\n",
|
| 38 |
+
"SOURCE_ZIPS = PROJECT_DIR / \"tligawa/mshauri-fedha-store/cbk/zipped-store\"\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"# 2. Destination (Fast Scratch Storage)\n",
|
| 41 |
+
"SCRATCH_DIR = Path(os.environ.get(\"SCRATCH\")) # Auto-detects $SCRATCH\n",
|
| 42 |
+
"WORK_DIR = SCRATCH_DIR / \"mshauri-fedha/data/cbk\"\n",
|
| 43 |
+
"FINAL_PDF_DIR = WORK_DIR / \"text\"\n",
|
| 44 |
+
"TEMP_EXTRACT_DIR = WORK_DIR / \"temp-unzip-cbk\""
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"cell_type": "code",
|
| 49 |
+
"execution_count": 4,
|
| 50 |
+
"id": "3148a17b-8e99-448e-95de-bb2c60828049",
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [
|
| 53 |
+
{
|
| 54 |
+
"data": {
|
| 55 |
+
"text/plain": [
|
| 56 |
+
"True"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
"execution_count": 4,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"output_type": "execute_result"
|
| 62 |
+
}
|
| 63 |
+
],
|
| 64 |
+
"source": [
|
| 65 |
+
"os.path.exists(WORK_DIR)"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"cell_type": "code",
|
| 70 |
+
"execution_count": 5,
|
| 71 |
+
"id": "cc97d707-f266-4f6f-9346-19e630101923",
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [
|
| 74 |
+
{
|
| 75 |
+
"name": "stdout",
|
| 76 |
+
"output_type": "stream",
|
| 77 |
+
"text": [
|
| 78 |
+
"🚀 Found 111 batches in /capstor/store/cscs/director2/g164/tligawa/mshauri-fedha-store/cbk/zipped-store\n",
|
| 79 |
+
"📂 Flattening to: /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/text ...\n"
|
| 80 |
+
]
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"source": [
|
| 84 |
+
"# Setup directories\n",
|
| 85 |
+
"if FINAL_PDF_DIR.exists():\n",
|
| 86 |
+
" print(f\"⚠️ Warning: Target folder {FINAL_PDF_DIR} already exists.\")\n",
|
| 87 |
+
"else:\n",
|
| 88 |
+
" FINAL_PDF_DIR.mkdir(parents=True, exist_ok=True)\n",
|
| 89 |
+
" \n",
|
| 90 |
+
"if TEMP_EXTRACT_DIR.exists(): shutil.rmtree(TEMP_EXTRACT_DIR)\n",
|
| 91 |
+
"TEMP_EXTRACT_DIR.mkdir(parents=True, exist_ok=True)\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"# --- EXECUTION ---\n",
|
| 94 |
+
"zips = sorted(list(SOURCE_ZIPS.glob(\"*.zip\")))\n",
|
| 95 |
+
"print(f\"🚀 Found {len(zips)} batches in {SOURCE_ZIPS}\")\n",
|
| 96 |
+
"print(f\"📂 Flattening to: {FINAL_PDF_DIR} ...\")"
|
| 97 |
+
]
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"cell_type": "code",
|
| 101 |
+
"execution_count": 6,
|
| 102 |
+
"id": "0c2a9b27-133a-4960-8920-45ee57eb3d8a",
|
| 103 |
+
"metadata": {},
|
| 104 |
+
"outputs": [
|
| 105 |
+
{
|
| 106 |
+
"name": "stderr",
|
| 107 |
+
"output_type": "stream",
|
| 108 |
+
"text": [
|
| 109 |
+
"Unzipping & Flattening: 100%|██████████| 111/111 [00:22<00:00, 4.84it/s]"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"name": "stdout",
|
| 114 |
+
"output_type": "stream",
|
| 115 |
+
"text": [
|
| 116 |
+
"\n",
|
| 117 |
+
"✨ Done! 58 files are ready in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/text\n",
|
| 118 |
+
"🧹 Cleaning up temp dirs...\n"
|
| 119 |
+
]
|
| 120 |
+
},
|
| 121 |
+
{
|
| 122 |
+
"name": "stderr",
|
| 123 |
+
"output_type": "stream",
|
| 124 |
+
"text": [
|
| 125 |
+
"\n"
|
| 126 |
+
]
|
| 127 |
+
}
|
| 128 |
+
],
|
| 129 |
+
"source": [
|
| 130 |
+
"# Unzip and flatten\n",
|
| 131 |
+
"count = 0\n",
|
| 132 |
+
"for zip_path in tqdm(zips, desc=\"Unzipping & Flattening\"):\n",
|
| 133 |
+
" batch_name = zip_path.stem # e.g., \"knbs_batch_1\"\n",
|
| 134 |
+
" \n",
|
| 135 |
+
" try:\n",
|
| 136 |
+
" # 1. Unzip to a temp folder\n",
|
| 137 |
+
" with zipfile.ZipFile(zip_path, 'r') as z:\n",
|
| 138 |
+
" z.extractall(TEMP_EXTRACT_DIR)\n",
|
| 139 |
+
" \n",
|
| 140 |
+
" # 2. Find the 'pdfs' subfolder inside that batch\n",
|
| 141 |
+
" # We look recursively because structure might vary slightly\n",
|
| 142 |
+
" pdf_files = list(TEMP_EXTRACT_DIR.rglob(\"*.txt\"))\n",
|
| 143 |
+
" \n",
|
| 144 |
+
" # 3. Move and Rename\n",
|
| 145 |
+
" for pdf in pdf_files:\n",
|
| 146 |
+
" # Create unique name: batch_name + original_name\n",
|
| 147 |
+
" # Example: knbs_batch_1_annual_report_2020.pdf\n",
|
| 148 |
+
" new_name = f\"{batch_name}_{pdf.name}\"\n",
|
| 149 |
+
" dest_path = FINAL_PDF_DIR / new_name\n",
|
| 150 |
+
" \n",
|
| 151 |
+
" shutil.move(str(pdf), str(dest_path))\n",
|
| 152 |
+
" count += 1\n",
|
| 153 |
+
" \n",
|
| 154 |
+
" except Exception as e:\n",
|
| 155 |
+
" print(f\"❌ Error processing {zip_path.name}: {e}\")\n",
|
| 156 |
+
" finally:\n",
|
| 157 |
+
" # Clean temp folder for next batch\n",
|
| 158 |
+
" for item in TEMP_EXTRACT_DIR.iterdir():\n",
|
| 159 |
+
" if item.is_dir(): shutil.rmtree(item)\n",
|
| 160 |
+
" else: item.unlink()\n",
|
| 161 |
+
"\n",
|
| 162 |
+
"print(f\"\\n✨ Done! {count} files are ready in {FINAL_PDF_DIR}\")\n",
|
| 163 |
+
"print(f\"🧹 Cleaning up temp dirs...\")\n",
|
| 164 |
+
"if TEMP_EXTRACT_DIR.exists(): shutil.rmtree(TEMP_EXTRACT_DIR)"
|
| 165 |
+
]
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"cell_type": "code",
|
| 169 |
+
"execution_count": null,
|
| 170 |
+
"id": "a04d70da-e9e9-4b1b-a393-93b5b76fcd8b",
|
| 171 |
+
"metadata": {},
|
| 172 |
+
"outputs": [],
|
| 173 |
+
"source": []
|
| 174 |
+
}
|
| 175 |
+
],
|
| 176 |
+
"metadata": {
|
| 177 |
+
"kernelspec": {
|
| 178 |
+
"display_name": "Python 3 (ipykernel)",
|
| 179 |
+
"language": "python",
|
| 180 |
+
"name": "python3"
|
| 181 |
+
},
|
| 182 |
+
"language_info": {
|
| 183 |
+
"codemirror_mode": {
|
| 184 |
+
"name": "ipython",
|
| 185 |
+
"version": 3
|
| 186 |
+
},
|
| 187 |
+
"file_extension": ".py",
|
| 188 |
+
"mimetype": "text/x-python",
|
| 189 |
+
"name": "python",
|
| 190 |
+
"nbconvert_exporter": "python",
|
| 191 |
+
"pygments_lexer": "ipython3",
|
| 192 |
+
"version": "3.12.3"
|
| 193 |
+
}
|
| 194 |
+
},
|
| 195 |
+
"nbformat": 4,
|
| 196 |
+
"nbformat_minor": 5
|
| 197 |
+
}
|
src/extract/download_file_links.py
ADDED
|
@@ -0,0 +1,671 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import dependencies
|
| 2 |
+
from typing import Any, Union, List, Dict
|
| 3 |
+
import time
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import logging
|
| 6 |
+
import requests
|
| 7 |
+
from gnews import GNews
|
| 8 |
+
import feedparser
|
| 9 |
+
from io import BytesIO
|
| 10 |
+
import time, re
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
+
import urllib3
|
| 13 |
+
import certifi
|
| 14 |
+
from urllib.parse import urljoin, urlparse
|
| 15 |
+
from urllib.robotparser import RobotFileParser
|
| 16 |
+
from collections import Counter
|
| 17 |
+
from tqdm.auto import tqdm
|
| 18 |
+
from newspaper import Article
|
| 19 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 20 |
+
|
| 21 |
+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
| 22 |
+
|
| 23 |
+
# Set up basic logging
|
| 24 |
+
logging.basicConfig(
|
| 25 |
+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
| 26 |
+
)
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class CBKExplorer:
    """Polite crawler/inspector for the Central Bank of Kenya (CBK) website.

    Keeps a persistent ``requests.Session`` with a descriptive User-Agent,
    checks robots.txt before exploring, and harvests links to downloadable
    files (PDF/XLS/XLSX/CSV) from CBK pages.
    """

    def __init__(self, github_username):
        # Identify the bot via its GitHub project page, per crawler etiquette.
        self.user_agent = f"MshauriFedhaBot/0.1 (+https://github.com/{github_username}/mshaurifedha)"
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": self.user_agent})

    def is_allowed_by_robots(self, base_url, target_url):
        """Check robots.txt for permission."""
        parsed = urlparse(base_url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = RobotFileParser()
        try:
            rp.set_url(robots_url)
            rp.read()
            return rp.can_fetch(self.user_agent, target_url)
        except Exception as e:
            # Unreadable robots.txt -> report and default to "not allowed".
            print(f"[robots] Could not read robots.txt ({e}). Proceed cautiously.")
            return False

    def fetch(self, url, timeout=25, allow_proxy_fallback=True):
        """
        Robust fetch that tries:
          1) requests with certifi bundle (secure)
          2) http fallback (if https fails)
          3) requests with verify=False (insecure)
          4) optional external proxy fetch (r.jina.ai) as last resort

        Returns (response, soup) or (None, None)
        """
        # helper to parse response->soup; (None, None) on any HTTP error status
        def resp_to_soup(r):
            try:
                r.raise_for_status()
                return r, BeautifulSoup(r.text, "lxml")
            except Exception:
                return None, None

        # 1) Try with certifi (preferred)
        try:
            r = self.session.get(url, timeout=timeout, verify=certifi.where())
            ok_resp, soup = resp_to_soup(r)
            if ok_resp:
                return ok_resp, soup
        except requests.exceptions.SSLError as ssl_err:
            print(f"[fetch] SSL error with certifi for {url}: {ssl_err}")
        except Exception as e:
            print(f"[fetch] Primary attempt failed for {url}: {e}")

        # 2) Try http fallback if URL is https
        try:
            parsed = urlparse(url)
            if parsed.scheme == "https":
                http_url = url.replace("https://", "http://", 1)
                try:
                    r = self.session.get(http_url, timeout=timeout)
                    ok_resp, soup = resp_to_soup(r)
                    if ok_resp:
                        print(f"[fetch] HTTP fallback succeeded for {http_url}")
                        return ok_resp, soup
                except Exception as e:
                    print(f"[fetch] HTTP fallback failed for {http_url}: {e}")
        except Exception as e:
            print(f"[fetch] HTTP fallback: error preparing URL: {e}")

        # 3) Try insecure (verify=False) as last direct option
        try:
            print(f"[fetch] Trying insecure fetch (verify=False) for {url} — not recommended for sensitive data.")
            r = self.session.get(url, timeout=timeout, verify=False)
            ok_resp, soup = resp_to_soup(r)
            if ok_resp:
                return ok_resp, soup
        except Exception as e:
            print(f"[fetch] Insecure fetch also failed for {url}: {e}")

        # 4) Optional: external proxy/relay (last resort)
        if allow_proxy_fallback:
            try:
                # Jina.ai simple fetch service: returns rendered HTML as text
                # NOTE: this is an external service — use only for public/cached pages.
                proxy_url = "https://r.jina.ai/http://" + url.replace("https://", "").replace("http://", "")
                print(f"[fetch] Trying proxy fetch via {proxy_url}")
                r = requests.get(proxy_url, timeout=30)  # using plain requests (no verify issues; it's https to jina)
                if r.status_code == 200 and r.text:
                    return r, BeautifulSoup(r.text, "lxml")
                else:
                    print(f"[fetch] Proxy fetch returned status {r.status_code}")
            except Exception as e:
                print(f"[fetch] Proxy fetch failed: {e}")

        # give up
        print(f"[fetch] All fetch strategies failed for {url}")
        return None, None

    def abs_link(self, base, href):
        """Make absolute link from relative href."""
        if not href:
            return None
        return urljoin(base, href)

    def explore_url(self, url, print_anchors=40):
        """Explore a CBK URL: meta, headings, nav links, anchor samples, file-like links."""
        print("URL:", url)
        # NOTE(review): the message below says "robots.py" but the check is against robots.txt.
        print("Allowed by robots.py? ->", self.is_allowed_by_robots(url, url))
        resp, soup = self.fetch(url)
        if not resp:
            return None

        # Basic meta
        print("Status code:", resp.status_code)
        title = soup.title.string.strip() if soup.title else ""
        print("Title:", title)
        desc = ""
        meta_desc = soup.find("meta", attrs={"name":"description"}) or soup.find("meta", attrs={"property":"og:description"})
        if meta_desc and meta_desc.get("content"):
            desc = meta_desc["content"].strip()
        print("Meta description:", desc[:300])

        # Headings
        h1s = [h.get_text(strip=True) for h in soup.find_all("h1")]
        h2s = [h.get_text(strip=True) for h in soup.find_all("h2")]
        print("H1s:", h1s[:5])
        print("H2s:", h2s[:8])

        # Nav / header anchors
        navs = soup.find_all("nav")
        if navs:
            print(f"Found {len(navs)} <nav> block(s). Sample nav links:")
            nav_links = []
            for nav in navs:
                for a in nav.find_all("a", href=True):
                    nav_links.append((a.get_text(strip=True), self.abs_link(url, a["href"])))
            for t, link in nav_links[:20]:
                print(" -", t or "<no-text>", "->", link)
        else:
            print("No <nav> block found (or it's rendered by JS).")

        # Sample anchors across page
        anchors = []
        for a in soup.find_all("a", href=True):
            text = a.get_text(strip=True)
            href = a["href"].strip()
            anchors.append((text, self.abs_link(url, href)))
        anchors = [a for a in anchors if a[1] is not None]
        print(f"Total anchors on page: {len(anchors)}. Showing first {min(print_anchors,len(anchors))}:")
        for t, link in anchors[:print_anchors]:
            print(" *", (t[:60] or "<no-text>"), "->", link)

        # Class name frequencies (helps pick CSS selectors for later scraping)
        classes = []
        for tag in soup.find_all(True):
            cls = tag.get("class")
            if cls:
                classes.extend(cls if isinstance(cls, list) else [cls])
        class_counts = Counter(classes)
        print("Top 15 classes used on page (class_name:count):")
        for k,v in class_counts.most_common(15):
            print("  ", k, ":", v)

        # Links that look like files
        file_like = []
        for text, link in anchors:
            if re.search(r"\.pdf$|\.xls$|\.xlsx$|\.csv$", link, re.IGNORECASE):
                file_like.append((text, link))
        print("File-like links found on page:", len(file_like))
        for t, l in file_like[:20]:
            print("  FILE:", (t[:80] or "<no-text>"), "->", l)

        return {"title": title, "anchors": anchors, "file_links": file_like, "class_counts": class_counts}

    def inspect_pages(self, urls):
        """Run explore_url over each URL with a polite pause; return {url: result}."""
        results = {}
        for u in urls:
            print("\n" + "="*80)
            print("Inspecting:", u)
            out = self.explore_url(u, print_anchors=80)
            results[u] = out
            time.sleep(1.0)  # polite pause
        return results

    def collect_file_links(self, url, allowed_exts=(".pdf", ".xls", ".xlsx", ".csv")):
        """Collect same-domain file links (matching allowed_exts) from one page.

        Returns a DataFrame with columns page/text/file_url (empty on fetch failure).
        """
        _, soup = self.fetch(url)
        if not soup:
            return pd.DataFrame() # instead of returning []

        found = []
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            ab = self.abs_link(url, href)
            if not ab:
                continue
            # only same domain (safety)
            if urlparse(ab).netloc.endswith("centralbank.go.ke") or urlparse(ab).netloc == "":
                if any(ab.lower().endswith(ext) for ext in allowed_exts):
                    found.append({"page":url, "text": a.get_text(strip=True), "file_url":ab})

        # dedupe on file_url, keeping first occurrence
        seen = set()
        dedup = []
        for row in found:
            if row["file_url"] not in seen:
                dedup.append(row)
                seen.add(row["file_url"])

        df = pd.DataFrame(dedup)
        print(f"Found {len(df)} file links on {url}")
        return df


    def crawl_links_for_files(self, start_url, allowed_exts=(".pdf", ".xls", ".xlsx", ".csv"), max_pages=50):
        """One-hop crawl: follow same-domain links from start_url (capped at max_pages)
        and gather each linked page's file links into one DataFrame."""
        _, soup = self.fetch(start_url)
        if not soup:
            return []
        pages = []
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            ab = self.abs_link(start_url, href)
            if not ab:
                continue
            # only same domain
            if urlparse(ab).netloc.endswith("centralbank.go.ke"):
                pages.append(ab)
        pages = list(dict.fromkeys(pages))[:max_pages]  # dedupe, keep order, cap
        print(f"Will inspect {len(pages)} linked pages from {start_url}")
        results = []
        for p in tqdm(pages):
            df = self.collect_file_links(p, allowed_exts=allowed_exts)
            if not df.empty:
                results.append(df)
            time.sleep(0.8)
        if results:
            return pd.concat(results, ignore_index=True)
        return pd.DataFrame()
|
| 262 |
+
|
| 263 |
+
def download_files(file_links, root_dir, save_dir,
                   allowed_exts=(".pdf", ".xls", ".xlsx", ".csv"),
                   overwrite=False):
    """
    Download multiple files from a list of (title, url) pairs.

    Args:
        file_links: list of (title, url) tuples, or list of dicts {"text":..., "file_url":...}
        root_dir: base folder to save under
        save_dir: subdirectory under root_dir
        allowed_exts: file extensions to allow
        overwrite: if True, re-download even if file exists

    Returns:
        metadata: list of dicts (title, url, local_path, size, status)
    """
    save_dir_path = os.path.join(root_dir, save_dir)
    os.makedirs(save_dir_path, exist_ok=True)
    metadata = []

    # Normalize file_links into [(title, url), ...].
    # Fix: tolerate None/missing title or url — the original crashed with
    # AttributeError on .strip() when a dict lacked "file_url" or title was None.
    norm_links = []
    for item in file_links:
        if isinstance(item, tuple):
            title, url = item
        elif isinstance(item, dict):
            title, url = item.get("text", "file"), item.get("file_url")
        else:
            continue
        if not url:
            # Nothing to download without a URL.
            continue
        norm_links.append(((title or "file").strip(), url.strip()))

    for title, url in norm_links:
        # filter by extension
        if not any(url.lower().endswith(ext) for ext in allowed_exts):
            continue

        # guess extension from URL path (".bin" if the path has none)
        ext = os.path.splitext(urlparse(url).path)[1] or ".bin"
        # clean filename: filesystem-safe, capped at 100 chars
        safe_title = re.sub(r"[^A-Za-z0-9._-]+", "_", title)[:100]
        fname = f"{safe_title}{ext}"
        path = os.path.join(save_dir_path, fname)

        if os.path.exists(path) and not overwrite:
            print(f"[skip] {fname} already exists.")
            status = "skipped"
        else:
            try:
                print(f"[download] {title} -> {fname}")
                r = requests.get(url, stream=True, timeout=60)
                r.raise_for_status()
                # Stream to disk in 8 KiB chunks to keep memory bounded.
                with open(path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            f.write(chunk)
                status = "ok"
            except Exception as e:
                print(f"[error] Failed: {url} ({e})")
                status = "error"

        size = os.path.getsize(path) if os.path.exists(path) else 0
        metadata.append({
            "title": title,
            "url": url,
            "local_path": path,
            "size": size,
            "status": status
        })

    return metadata
|
| 333 |
+
|
| 334 |
+
import os, subprocess, importlib, sys
|
| 335 |
+
|
| 336 |
+
def load_repo(repo):
    """Clone (or fast-forward) a GitHub repo and import it as a module.

    Args:
        repo: "owner/name" GitHub slug; the local folder and module are "name".

    Returns:
        The freshly (re)loaded module object.
    """
    local = repo.split("/")[-1]
    # Clone on first use; pull updates on subsequent calls.
    if os.path.exists(local):
        subprocess.run(["git", "-C", local, "pull"], check=True)
    else:
        subprocess.run(["git", "clone", f"https://github.com/{repo}.git"], check=True)
    # Make the checkout importable, then force a fresh load of its code.
    if local not in sys.path:
        sys.path.insert(0, local)
    module = importlib.import_module(local)
    importlib.reload(module)
    return module
|
| 347 |
+
|
| 348 |
+
def fetch_kenya_gnews(api_key):
    """Fetch Kenyan business headlines from the GNews API.

    Args:
        api_key: GNews API token (free tier: 100 requests/day).

    Returns:
        pandas.DataFrame with columns title/content/url/date/source
        (empty if the API returned no articles).
    """
    # Free tier: 100 requests/day
    url = f"https://gnews.io/api/v4/top-headlines?category=business&country=ke&token={api_key}"

    # Fix: timeout added — the original call could hang forever on a stalled
    # connection; 10s matches the other API fetchers in this module.
    response = requests.get(url, timeout=10)
    data = response.json()

    articles = []
    for article in data.get('articles', []):
        articles.append({
            'title': article.get('title'),
            'content': article.get('description'),
            'url': article.get('url'),
            'date': article.get('publishedAt'),
            'source': article.get('source', {}).get('name')
        })

    df = pd.DataFrame(articles)
    return df
|
| 367 |
+
|
| 368 |
+
def is_valid_url(url):
    """Return True if *url* parses with both a scheme and a network location."""
    try:
        result = urlparse(url)
        # A usable absolute URL needs both a scheme (http/https) and a host.
        return all([result.scheme, result.netloc])
    except (ValueError, TypeError, AttributeError):
        # Fix: narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) to the errors urlparse raises on junk input.
        return False
|
| 374 |
+
|
| 375 |
+
def fetch_kenya_thenewsapi(api_key):
    """Query TheNewsAPI for Kenya-economy articles and return them as a DataFrame."""
    url = f"https://api.thenewsapi.com/v1/news/all?api_token={api_key}&search=kenya+economy&language=en"

    data = requests.get(url).json()

    # Flatten the API payload into one uniform record per article.
    articles = [
        {
            'title': item.get('title'),
            'content': item.get('description'),
            'url': item.get('url'),
            'date': item.get('published_at'),
            'source': item.get('source'),
        }
        for item in data.get('data', [])
    ]

    return pd.DataFrame(articles)
|
| 394 |
+
|
| 395 |
+
def scrape_google_news_kenya():
    """Pull the last week of Kenya economy/finance headlines via the GNews scraper."""
    client = GNews(
        language='en',
        country='KE',
        period='7d',
        max_results=50
    )

    # Search for Kenya business news
    hits = client.get_news('Kenya economy OR inflation OR central bank')

    return pd.DataFrame(hits)
|
| 408 |
+
|
| 409 |
+
# Install: pip install gnews
|
| 410 |
+
|
| 411 |
+
def scrape_african_business_rss():
    """Collect recent entries from several African business RSS feeds.

    Returns:
        pandas.DataFrame with title/url/date/summary/source per entry
        (up to 20 entries per feed).
    """
    feeds = [
        'https://african.business/feed/',  # African Business Magazine
        'https://www.cnbcafrica.com/feed/',  # CNBC Africa
        'https://allafrica.com/tools/headlines/rdf/economy/headlines.rdf',  # AllAfrica Economy
    ]

    rows = []
    for feed_url in feeds:
        parsed = feedparser.parse(feed_url)
        # Cap at 20 entries per feed to keep the frame small.
        rows.extend(
            {
                'title': entry.get('title', ''),
                'url': entry.get('link', ''),
                'date': entry.get('published', ''),
                'summary': entry.get('summary', ''),
                'source': parsed.feed.get('title', ''),
            }
            for entry in parsed.entries[:20]
        )

    return pd.DataFrame(rows)
|
| 434 |
+
|
| 435 |
+
def scrape_article(url: str, metadata: dict) -> dict:
    """Scrape single article"""
    try:
        piece = Article(url)
        piece.download()
        piece.parse()

        # Discard stubs/paywalled pages: body text of 200 chars or fewer.
        if len(piece.text) <= 200:
            return None

        return {
            'title': piece.title,
            'full_content': piece.text,
            'summary': metadata.get('summary', ''),
            'url': url,
            'date': metadata.get('date'),
            'source': metadata.get('source'),
            'authors': ', '.join(piece.authors) if piece.authors else '',
            'image': piece.top_image,
            'word_count': len(piece.text.split()),
            'status': 'success'
        }
    except Exception as e:
        # Download/parse failures are reported, not raised.
        return {'url': url, 'status': 'failed', 'error': str(e)}
|
| 458 |
+
|
| 459 |
+
def fetch_newsdata_multi(api_key: str) -> List[Dict]:
    """Multiple NewsData.io requests with pagination"""
    collected: List[Dict] = []

    # Different queries to maximize coverage
    search_terms = (
        'kenya economy',
        'kenya inflation',
        'kenya central bank',
        'kenya business',
        'kenya finance',
    )

    for term in search_terms:
        try:
            next_page = None
            pages_left = 3  # Up to 3 pages per query
            while pages_left:
                pages_left -= 1
                query_params = {
                    'apikey': api_key,
                    'q': term,
                    'country': 'ke',
                    'language': 'en',
                }
                if next_page:
                    query_params['page'] = next_page

                payload = requests.get(
                    'https://newsdata.io/api/1/latest',
                    params=query_params, timeout=10
                ).json()

                if payload.get('status') != 'success':
                    break

                for entry in payload.get('results', []):
                    if is_valid_url(entry.get('link')):
                        collected.append({
                            'url': entry.get('link'),
                            'summary': entry.get('description', ''),
                            'date': entry.get('pubDate'),
                            'source': entry.get('source_id'),
                        })

                next_page = payload.get('nextPage')
                if not next_page:
                    break
                # Pause only when another page will actually be fetched.
                time.sleep(1)
        except Exception as e:
            print(f"NewsData query '{term}': {e}")
            continue

    return collected
|
| 510 |
+
|
| 511 |
+
def fetch_gnews_multi(api_key: str) -> List[Dict]:
    """Multiple GNews requests"""
    collected = []

    # One request per search term to broaden coverage
    terms = (
        'kenya economy',
        'kenya inflation',
        'kenya business',
        'nairobi stock exchange',
    )

    for term in terms:
        try:
            payload = requests.get(
                'https://gnews.io/api/v4/search',
                params={
                    'apikey': api_key,
                    'q': term,
                    'country': 'ke',
                    'lang': 'en',
                    'max': 10,  # Free tier max
                },
                timeout=10,
            ).json()

            for entry in payload.get('articles', []):
                link = entry.get('url')
                if is_valid_url(link):
                    collected.append({
                        'url': link,
                        'summary': entry.get('description', ''),
                        'date': entry.get('publishedAt'),
                        'source': entry.get('source', {}).get('name'),
                    })

            time.sleep(1)  # throttle between searches
        except Exception as e:
            print(f"GNews search '{term}': {e}")
            continue

    return collected
|
| 551 |
+
|
| 552 |
+
def fetch_thenewsapi_multi(api_key: str) -> List[Dict]:
    """Multiple TheNewsAPI requests (only 3 articles per request!)"""
    gathered = []

    # Many narrow searches compensate for the 3-article free-tier cap
    search_terms = (
        'kenya economy',
        'kenya business',
        'kenya inflation',
        'kenya central bank',
        'kenya finance',
        'nairobi economy',
        'kenya investment',
        'kenya banking',
    )

    for term in search_terms:
        try:
            resp = requests.get(
                'https://api.thenewsapi.com/v1/news/all',
                params={
                    'api_token': api_key,
                    'search': term,
                    'language': 'en',
                    'limit': 3,  # Free tier limit
                },
                timeout=10,
            )
            payload = resp.json()

            for item in payload.get('data', []):
                link = item.get('url')
                if is_valid_url(link):
                    gathered.append({
                        'url': link,
                        'summary': item.get('description', ''),
                        'date': item.get('published_at'),
                        'source': item.get('source'),
                    })

            time.sleep(1)  # throttle between searches
        except Exception as e:
            print(f"TheNewsAPI search '{term}': {e}")
            continue

    return gathered
|
| 595 |
+
|
| 596 |
+
def scrape_kenya_news_maximum(
    newsdata_key: str = None,
    gnews_key: str = None,
    thenewsapi_key: str = None,
    max_workers: int = 8
) -> pd.DataFrame:
    """Get MAXIMUM articles from all sources"""

    print("🔍 Fetching maximum articles from all APIs...\n")

    candidates = []

    # Pull URL lists from every provider that has a key configured
    providers = [
        (newsdata_key, "📰 NewsData.io: ", fetch_newsdata_multi, "{n} URLs"),
        (gnews_key, "📰 GNews.io: ", fetch_gnews_multi, "{n} URLs"),
        (thenewsapi_key, "📰 TheNewsAPI: ", fetch_thenewsapi_multi,
         "{n} URLs (limited to 3/request on free)"),
    ]
    for key, label, fetcher, template in providers:
        if not key:
            continue
        print(label, end="", flush=True)
        batch = fetcher(key)
        candidates.extend(batch)
        print(template.format(n=len(batch)))

    if not candidates:
        print("\n No articles found")
        return pd.DataFrame()

    # Deduplicate by URL, keeping first-seen order
    seen = set()
    unique = []
    for cand in candidates:
        if cand['url'] in seen:
            continue
        seen.add(cand['url'])
        unique.append(cand)

    print(f"\n Total unique URLs: {len(unique)}\n")

    results = []
    failed = 0

    # Scrape every unique URL in parallel with a progress bar
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(scrape_article, c['url'], c): c for c in unique}

        with tqdm(total=len(futures), desc="📄 Scraping", unit="article") as pbar:
            for future in as_completed(futures):
                outcome = future.result()

                if outcome and outcome.get('status') == 'success':
                    results.append(outcome)
                elif outcome:
                    failed += 1
                    if failed <= 3:  # Only show first 3 errors
                        print(f"\n {outcome['url'][:50]}... | {outcome['error']}")

                pbar.update(1)
                time.sleep(0.2)

    # Assemble the final frame (drop the internal 'status' column)
    if results:
        df = pd.DataFrame(results)
        df = df.drop('status', axis=1, errors='ignore')

        print(f"\n {len(results)} articles scraped | {failed} failed | {len(results)/(len(results)+failed)*100:.1f}% success")
        print(f" Avg: {df['word_count'].mean():.0f} words | {df['source'].nunique()} sources")
        return df

    return pd.DataFrame()
|
src/load/clean_db.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# clean_database.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sqlalchemy import create_engine, text
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
# Set up a logger that can be configured by the importer
|
| 7 |
+
logger = logging.getLogger("DBCleaner")
|
| 8 |
+
|
| 9 |
+
def drop_blacklisted_tables(engine):
    """Drops tables matching the blacklist patterns."""
    drop_patterns = (
        "bop_annual",
        "commercial_banks_average_lending_rates",
        "depository_corporation",
        "exchange_rates_end_period",
        "exchange_rates_period_average",
        "forex_bureau_rates_sheet",
        "lr_return_template",
        "nsfr_return_template",
    )

    with engine.connect() as conn:
        rows = conn.execute(text("SELECT name FROM sqlite_master WHERE type='table'")).fetchall()
        existing = [row[0] for row in rows]

        # Substring match against any blacklist pattern
        tables_to_drop = [name for name in existing
                          if any(pattern in name for pattern in drop_patterns)]

        if not tables_to_drop:
            logger.info("No tables found matching blacklist patterns.")
            return

        logger.info(f"🗑️ Dropping {len(tables_to_drop)} tables...")
        for name in tables_to_drop:
            conn.execute(text(f'DROP TABLE "{name}"'))
            logger.info(f" - Dropped: {name}")
        conn.commit()
|
| 39 |
+
|
| 40 |
+
def clean_table(engine, table_name, drop_top_rows=0, rename_map=None, rename_by_index=None, static_date=None):
    """
    Generic cleaner for specific table fixes.

    Args:
        engine: SQLAlchemy engine bound to the SQLite database.
        table_name: Table to clean; rewritten in place (if_exists='replace').
        drop_top_rows: Number of leading rows to discard (stray header rows).
        rename_map: {old_name: new_name} column renames.
        rename_by_index: {position: new_name} renames by column position.
        static_date: If given, force a 'date' column holding this value.
    """
    try:
        # Check if table exists first. Use a bound parameter instead of
        # f-string interpolation so names with quotes can't break the SQL.
        with engine.connect() as conn:
            exists = conn.execute(
                text("SELECT name FROM sqlite_master WHERE type='table' AND name=:t"),
                {"t": table_name},
            ).scalar()
            if not exists:
                logger.warning(f" Table '{table_name}' not found. Skipping.")
                return

        df = pd.read_sql(f'SELECT * FROM "{table_name}"', engine)
        if df.empty:
            return

        # Drop columns that are completely empty
        df = df.dropna(axis=1, how='all')

        # Drop top rows if requested
        if drop_top_rows > 0:
            df = df.iloc[drop_top_rows:].reset_index(drop=True)

        # Rename by Index (useful for 'col_1', 'col_2')
        if rename_by_index:
            curr_cols = list(df.columns)
            new_cols = curr_cols.copy()
            for idx, new_name in rename_by_index.items():
                if idx < len(curr_cols):
                    new_cols[idx] = new_name
            df.columns = new_cols

        # Rename by Map
        if rename_map:
            df.rename(columns=rename_map, inplace=True)

        # Inject Static Date if missing (or overwrite the existing column)
        if static_date:
            if 'date' not in df.columns:
                df.insert(0, 'date', static_date)
            else:
                df['date'] = static_date

        # Save back to DB (Replace mode)
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        logger.info(f" Fixed '{table_name}': {len(df)} rows")

    except Exception as e:
        logger.error(f" Error cleaning '{table_name}': {e}")
|
| 88 |
+
|
| 89 |
+
def run_specific_fixes(engine):
    """Orchestrates the specific cleaning rules."""
    logger.info("🔧 Running specific table fixes...")

    # (table_name, clean_table kwargs) applied in order
    fixes = [
        # 1. Historical Rates
        ("download_all_historical_rates",
         dict(rename_by_index={2: "mean_rate", 3: "buy_rate", 4: "sell_rate"})),
        # 2. Foreign Trade Summary
        ("foreign_trade_summary", dict(drop_top_rows=1)),
        # 3. Forex Bureau Rates
        ("forex_bureau_rates", dict(rename_map={"bureau_name": "currency"})),
        # 4. Indicative Rates (Indicative Sheet)
        ("indicative_rates_sheet_indicative",
         dict(static_date="2017-11-16",
              rename_by_index={0: "currency", 1: "mean_rate",
                               2: "buy_rate", 3: "sell_rate"})),
        # 5. Indicative Rates (Press Sheet)
        ("indicative_rates_sheet_press",
         dict(static_date="2017-11-16",
              rename_by_index={
                  0: "bank_name",
                  1: "usd_buy", 2: "usd_sell", 3: "usd_margin",
                  4: "gbp_buy", 5: "gbp_sell", 6: "gbp_margin",
              })),
        # 6. Selected Domestic Exports
        ("value_of_selected_domestic_exports", dict(drop_top_rows=2)),
        # 7. Imports by Commodity
        ("value_of_direct_imports_by_commodities", dict(drop_top_rows=1)),
    ]

    for table_name, kwargs in fixes:
        clean_table(engine, table_name, **kwargs)
|
| 123 |
+
|
| 124 |
+
def clean_database_pipeline(db_name):
    """Main entry point for external calls."""
    # Build a SQLite engine for the target file and run both cleanup stages.
    engine = create_engine(f"sqlite:///{db_name}")

    logger.info(f" Starting cleanup on {db_name}...")
    drop_blacklisted_tables(engine)
    run_specific_fixes(engine)
    logger.info(" Cleanup Complete.")
|
| 133 |
+
|
| 134 |
+
def drop_tables(engine):
    """Drops the specific list of tables requested."""
    doomed = (
        'forex_bureau_rates',
        'forex_bureaus_rates_sheet_chief_dealers',
        'forex_bureaus_rates_sheet_director',
        'forex_bureaus_rates_sheet_directors',
        'forex_bureaus_rates_sheet_fbx',
        'forex_bureaus_rates_sheet_fbx1',
        'forex_bureaus_rates_sheet_fbx2',
        'forex_bureaus_rates_sheet_fxb1',
        'forex_bureaus_rates_sheet_fxb2',
        'forex_bureaus_rates_sheet_fxb22',
        'forex_bureaus_rates_sheet_market_intelligence',
        'forex_bureaus_rates_sheet_sheet1',
        'forex_bureaus_rates_sheet_sheet2',
        'forex_bureaus_rates_sheet_sheet3',
        'forex_bureaus_rates_sheet_sheet4',
        'issues_of_treasury_bills',
        'issues_of_treasury_bonds',
    )

    print("🗑️ Dropping Tables...")
    with engine.connect() as conn:
        for name in doomed:
            # IF EXISTS makes each drop idempotent; report any other failure.
            try:
                conn.execute(text(f'DROP TABLE IF EXISTS "{name}"'))
                print(f" - Dropped: {name}")
            except Exception as e:
                print(f" Could not drop {name}: {e}")
        conn.commit()
|
| 165 |
+
|
| 166 |
+
def fix_foreign_trade(engine):
    """Renames first column to 'year'."""
    table_name = "foreign_trade_summary"
    try:
        df = pd.read_sql(f'SELECT * FROM "{table_name}"', engine)
        if 'kenyan_shillings_million_year' not in df.columns:
            print(f" '{table_name}': Target column not found.")
            return
        df = df.rename(columns={'kenyan_shillings_million_year': 'year'})
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f" Fixed '{table_name}': Renamed 'year' column.")
    except Exception as e:
        print(f" Error fixing {table_name}: {e}")
|
| 179 |
+
|
| 180 |
+
def fix_indicative_rates_shift(engine):
    """
    Applies the 'Shift Right + Fixed Date' logic.
    Inserts 2017-11-16 at position 0, shifting existing data to the right.
    """
    fixed_date = "2017-11-16"
    targets = (
        "indicative_rates_sheet_indicative",
        "indicative_rates_sheet_press",
    )

    for table in targets:
        try:
            df = pd.read_sql(f'SELECT * FROM "{table}"', engine)
            if df.empty:
                continue

            # Insert the fixed date at column 0; the old first column shifts right
            df.insert(0, 'fixed_date', fixed_date)

            # Pick the expected header layout for this sheet type
            if "press" in table:
                # Press sheet: bank plus buy/sell/margin per currency
                expected = ["date", "bank_name",
                            "usd_buy", "usd_sell", "usd_margin",
                            "gbp_buy", "gbp_sell", "gbp_margin",
                            "euro_buy", "euro_sell", "euro_margin"]
            else:
                # Indicative sheet: Currency, Mean, Buy, Sell
                expected = ["date", "currency", "mean_rate", "buy_rate", "sell_rate"]

            # Map headers safely: pad with generic names if the frame is wider,
            # truncate if it is narrower
            width = len(df.columns)
            padded = expected + [f"col_{i}" for i in range(width - len(expected))]
            df.columns = padded[:width]

            df.to_sql(table, engine, if_exists='replace', index=False)
            print(f" Fixed '{table}': Applied Date Shift & Header Rename.")

        except Exception as e:
            print(f" Error fixing {table}: {e}")
|
| 228 |
+
|
| 229 |
+
def fix_cbk_indicative_swap(engine):
    """Swaps 'date' and 'currency' column names."""
    table_name = "cbk_indicative_rates"
    try:
        df = pd.read_sql(f'SELECT * FROM "{table_name}"', engine)

        # Build both directions at once; pandas rename applies them atomically
        swap = {}
        if 'date' in df.columns:
            swap['date'] = 'currency'
        if 'currency' in df.columns:
            swap['currency'] = 'date'

        if swap:
            df = df.rename(columns=swap)
            df.to_sql(table_name, engine, if_exists='replace', index=False)
            print(f" Fixed '{table_name}': Swapped 'date' <-> 'currency'.")
    except Exception as e:
        print(f" Error fixing {table_name}: {e}")
|
src/load/explore_news_schema.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import glob
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
# Configure logging to show up in the notebook
|
| 8 |
+
logging.basicConfig(
|
| 9 |
+
level=logging.INFO,
|
| 10 |
+
format='%(message)s',
|
| 11 |
+
stream=sys.stdout,
|
| 12 |
+
force=True
|
| 13 |
+
)
|
| 14 |
+
logger = logging.getLogger("SchemaExplorer")
|
| 15 |
+
|
| 16 |
+
def analyze_schemas(news_dir: str):
    """
    Scans all CSV files in the given directory and groups them by their column structure.

    Logs one "TYPE" entry per unique (order-insensitive) column set, then
    samples the first date-like column from the first READABLE file.

    Args:
        news_dir: Directory containing the news CSV exports.
    """
    if not os.path.exists(news_dir):
        logger.error(f" Directory not found: {news_dir}")
        return

    csv_files = glob.glob(os.path.join(news_dir, "*.csv"))
    logger.info(f"🔍 Scanning {len(csv_files)} files in '{news_dir}'...\n")

    if not csv_files:
        logger.warning(" No CSV files found.")
        return

    # Dictionary to store unique schemas: { (col1, col2): [file1, file2] }
    schemas = {}

    for f in csv_files:
        try:
            # Read only the header (fast)
            df = pd.read_csv(f, nrows=0)

            # Sort columns so ordering differences don't split groups
            cols = tuple(sorted(df.columns.tolist()))
            schemas.setdefault(cols, []).append(os.path.basename(f))

        except Exception as e:
            logger.error(f" Error reading {os.path.basename(f)}: {e}")

    # Report Findings
    logger.info("--- Schema Report ---")
    for i, (cols, files) in enumerate(schemas.items()):
        logger.info(f"\nTYPE {i+1}: Found in {len(files)} files")
        logger.info(f"Columns: {list(cols)}")
        if len(files) < 5:
            logger.info(f"Examples: {files}")
        else:
            logger.info(f"Examples: {files[:3]} ... (+{len(files)-3} others)")

    # Date Format Check: previously hard-coded to csv_files[0], which failed
    # whenever only the first file was unreadable. Try files until one parses.
    logger.info("\n--- Date Format Sample ---")
    for sample_file in csv_files:
        try:
            sample = pd.read_csv(sample_file, nrows=5)
        except Exception as e:
            logger.error(f"Could not read sample for date check: {e}")
            continue

        # Look for a column containing 'date', 'time' or 'published'
        date_col = next(
            (c for c in sample.columns
             if 'date' in c.lower() or 'time' in c.lower() or 'published' in c.lower()),
            None,
        )

        if date_col:
            logger.info(f"Sample from column '{date_col}' in {os.path.basename(sample_file)}:")
            logger.info(sample[date_col].head().tolist())
        else:
            logger.warning("No obvious 'date' column found in sample.")
        break
|
src/load/ingest_md.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import sys
|
| 4 |
+
import time
|
| 5 |
+
import requests
|
| 6 |
+
import subprocess
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from langchain_community.document_loaders import DirectoryLoader, TextLoader # <--- SWITCHED
|
| 10 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 11 |
+
from langchain_community.vectorstores import Chroma
|
| 12 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
| 13 |
+
|
| 14 |
+
# --- LOGGING ---
|
| 15 |
+
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(message)s', force=True)
|
| 16 |
+
logger = logging.getLogger("ReportIngest")
|
| 17 |
+
|
| 18 |
+
def _ensure_ollama_running(port="25000"):
    """Ensure a local Ollama server is reachable on `port`, starting one if needed.

    Probes http://127.0.0.1:<port>; on failure, launches the Ollama binary
    from $SCRATCH/ollama_core (falling back to /tmp) with OLLAMA_HOST and
    OLLAMA_MODELS pointed at the scratch area, then polls until it answers.

    Returns:
        True once the probe succeeded or the server process was spawned.
    """
    host = f"http://127.0.0.1:{port}"
    try:
        # timeout added: the original probe could hang indefinitely
        if requests.get(host, timeout=2).status_code == 200:
            return True
    except Exception:
        # Narrowed from bare `except:` (which also swallowed KeyboardInterrupt);
        # server not up yet -- fall through and start it.
        pass

    print(" Starting Ollama Server...")
    scratch = os.environ.get("SCRATCH", "/tmp")
    base = Path(scratch)
    bin_path = base / "ollama_core/bin/ollama"

    env = os.environ.copy()
    env["OLLAMA_HOST"] = f"127.0.0.1:{port}"
    env["OLLAMA_MODELS"] = str(base / "ollama_core/models")

    subprocess.Popen([str(bin_path), "serve"],
                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env)

    # Poll for readiness (max ~10s) instead of a blind 5s sleep.
    for _ in range(20):
        time.sleep(0.5)
        try:
            if requests.get(host, timeout=2).status_code == 200:
                break
        except Exception:
            continue
    return True
|
| 37 |
+
|
| 38 |
+
def ingest_markdown_reports(
    markdown_dir="mshauri-fedha/data/knbs/marker-output",
    vector_db_path="mshauri_fedha_chroma_db",
    model="nomic-embed-text",
    ollama_port="25000"
):
    """Load Markdown reports, chunk them, and append them to the Chroma store.

    Args:
        markdown_dir: Directory scanned recursively for *.md files.
        vector_db_path: Chroma persist directory (appended to, not replaced).
        model: Ollama embedding model name.
        ollama_port: Port of the local Ollama server (started if absent).
    """
    _ensure_ollama_running(ollama_port)

    if not os.path.exists(markdown_dir):
        logger.error(f" Directory not found: {markdown_dir}")
        return

    print(f"📄 Scanning for Markdown Reports in {markdown_dir}...")

    # --- 1. LOAD FILES ---
    # TextLoader is faster and doesn't trigger 'unstructured' warnings
    loader = DirectoryLoader(
        markdown_dir,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={'autodetect_encoding': True},  # Safe for varying file encodings
        show_progress=True,
        use_multithreading=True
    )

    # Catch errors during loading (e.g., empty files)
    try:
        raw_docs = loader.load()
    except Exception as e:
        print(f" Warning during loading: {e}")
        raw_docs = []  # Fallback: treat as "nothing loaded"

    if not raw_docs:
        print(" No valid markdown files found.")
        return

    print(f" Loaded {len(raw_docs)} report files.")

    # --- 2. CHUNKING ---
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n## ", "\n### ", "\n", " ", ""]
    )
    docs = text_splitter.split_documents(raw_docs)

    # --- 3. METADATA ---
    # Normalize 'source' to the bare filename unconditionally. The previous
    # `if "source" not in d.metadata` guard meant a source already set by the
    # loader was kept verbatim and basename() only ever saw the fallback string.
    for d in docs:
        d.metadata["type"] = "report"
        d.metadata["source"] = os.path.basename(d.metadata.get("source", "Official Report"))

    print(f" ✂️ Split into {len(docs)} chunks.")

    # --- 4. EMBEDDING ---
    print(" Appending to Vector Store...")
    embeddings = OllamaEmbeddings(
        model=model,
        base_url=f"http://127.0.0.1:{ollama_port}"
    )

    vectorstore = Chroma(
        persist_directory=vector_db_path,
        embedding_function=embeddings
    )

    # Batch Add to keep each embedding request small
    batch_size = 100

    with tqdm(total=len(docs), desc="Ingesting Reports", unit="chunk") as pbar:
        for i in range(0, len(docs), batch_size):
            batch = docs[i:i+batch_size]
            vectorstore.add_documents(batch)
            pbar.update(len(batch))

    print("\n Reports Added. Hybrid Knowledge Base is ready.")

if __name__ == "__main__":
    ingest_markdown_reports()
|
src/load/ingest_news.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import ast
|
| 5 |
+
import logging
|
| 6 |
+
import glob
|
| 7 |
+
import time
|
| 8 |
+
import requests
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from tqdm import tqdm # <--- New Import
|
| 13 |
+
from dateutil import parser
|
| 14 |
+
from langchain_core.documents import Document
|
| 15 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 16 |
+
from langchain_community.vectorstores import Chroma
|
| 17 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
| 18 |
+
|
| 19 |
+
# --- CONFIG ---
|
| 20 |
+
MIN_CONTENT_LENGTH = 100
|
| 21 |
+
|
| 22 |
+
# --- LOGGING ---
|
| 23 |
+
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(message)s', force=True)
|
| 24 |
+
logger = logging.getLogger("NewsIngest")
|
| 25 |
+
|
| 26 |
+
def _ensure_ollama_running(port="25000"):
    """Ensure a local Ollama server is reachable on `port`, starting one if needed.

    Probes http://127.0.0.1:<port>; on failure, launches the Ollama binary
    from $SCRATCH/ollama_core (falling back to /tmp) with OLLAMA_HOST and
    OLLAMA_MODELS pointed at the scratch area, then polls until it answers.

    Returns:
        True once the probe succeeded or the server process was spawned.
    """
    host = f"http://127.0.0.1:{port}"
    try:
        # timeout added: the original probe could hang indefinitely
        if requests.get(host, timeout=2).status_code == 200:
            return True
    except Exception:
        # Narrowed from bare `except:` (which also swallowed KeyboardInterrupt);
        # server not up yet -- fall through and start it.
        pass

    print(" Starting Ollama Server...")
    scratch = os.environ.get("SCRATCH", "/tmp")
    base = Path(scratch)
    bin_path = base / "ollama_core/bin/ollama"

    env = os.environ.copy()
    env["OLLAMA_HOST"] = f"127.0.0.1:{port}"
    env["OLLAMA_MODELS"] = str(base / "ollama_core/models")

    subprocess.Popen([str(bin_path), "serve"],
                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=env)

    # Poll for readiness (max ~10s) instead of a blind 5s sleep.
    for _ in range(20):
        time.sleep(0.5)
        try:
            if requests.get(host, timeout=2).status_code == 200:
                break
        except Exception:
            continue
    return True
|
| 45 |
+
|
| 46 |
+
# --- CLEANING HELPERS ---
|
| 47 |
+
def clean_text(text):
    """Strip URLs, HTML tags, leading punctuation and excess whitespace."""
    if not isinstance(text, str):
        return ""
    # Remove http(s)/ftp links and bare www addresses
    stripped = re.sub(r'(?:https?|ftp)://\S+|www\.\S+', '', text)
    # Remove HTML tags
    stripped = re.sub(r'<[^>]+>', '', stripped)
    # Trim leading punctuation/underscores
    stripped = re.sub(r'^[\W_]+', '', stripped)
    # Collapse runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', stripped).strip()
|
| 54 |
+
|
| 55 |
+
def parse_standard_date(date_str):
    """Parse an arbitrary date string into 'YYYY-MM-DD', or 'Unknown Date'.

    Uses dateutil's permissive parser; NaN/None and unparseable values fall
    back to the sentinel string.
    """
    try:
        if pd.isna(date_str):
            return "Unknown Date"
        dt = parser.parse(str(date_str))
        return dt.strftime("%Y-%m-%d")
    except Exception:
        # Narrowed from bare `except:` so Ctrl-C/SystemExit aren't swallowed.
        return "Unknown Date"
|
| 61 |
+
|
| 62 |
+
def extract_publisher_from_dict(pub_str):
    """Extract a publisher title from a possibly dict-shaped string.

    Some feeds deliver the publisher as a stringified dict
    (e.g. "{'title': 'BBC', ...}"); others as a plain string. Malformed
    dict strings fall back to 'Google News'.
    """
    try:
        if isinstance(pub_str, str) and "{" in pub_str:
            data = ast.literal_eval(pub_str)
            return data.get('title', 'Google News')
        return str(pub_str)
    except Exception:
        # Narrowed from bare `except:`; literal_eval raises ValueError/SyntaxError
        # on malformed input, and .get fails if the literal isn't a dict.
        return "Google News"
|
| 69 |
+
|
| 70 |
+
def normalize_news_df(df, filename):
    """Map one of three known CSV layouts onto a common article schema.

    Layouts (checked in order):
      1. 'publisher' + 'description' columns (publisher is a dict-shaped string)
      2. 'full_content' column, falling back to 'summary' for short content
      3. generic 'content' + 'source' columns

    Returns a DataFrame with columns title/content/date/source/url/file_origin;
    rows whose cleaned content is shorter than MIN_CONTENT_LENGTH are dropped.
    """
    cols = df.columns.tolist()
    rows_out = []

    def build(row, title_col, content_col, date_col, source_val):
        # Returns None when the cleaned content is too short to keep.
        title = clean_text(row.get(title_col, ''))
        body = clean_text(row.get(content_col, ''))
        if len(body) < MIN_CONTENT_LENGTH:
            return None
        return {
            'title': title,
            'content': body,
            'date': parse_standard_date(row.get(date_col, '')),
            'source': source_val,
            'url': row.get('url', ''),
            'file_origin': filename,
        }

    if 'publisher' in cols and 'description' in cols:
        for _, row in df.iterrows():
            publisher = extract_publisher_from_dict(row.get('publisher', ''))
            record = build(row, 'title', 'description', 'published date', publisher)
            if record:
                rows_out.append(record)
    elif 'full_content' in cols:
        for _, row in df.iterrows():
            full = row.get('full_content')
            content_col = 'full_content' if isinstance(full, str) and len(str(full)) > 50 else 'summary'
            record = build(row, 'title', content_col, 'date', str(row.get('source', 'Unknown')))
            if record:
                rows_out.append(record)
    elif 'content' in cols and 'source' in cols:
        for _, row in df.iterrows():
            record = build(row, 'title', 'content', 'date', str(row.get('source', 'Unknown')))
            if record:
                rows_out.append(record)

    return pd.DataFrame(rows_out)
|
| 102 |
+
|
| 103 |
+
def ingest_news_data(news_dir, vector_db_path="mshauri_fedha_chroma_db", model="nomic-embed-text"):
    """Read every news CSV in `news_dir`, dedupe articles, chunk them, and embed into Chroma.

    Pipeline: ensure the local Ollama server is up -> normalize each CSV ->
    deduplicate on (title, date), keeping the longest rendering -> split into
    2000-char chunks -> embed in batches of 100 into the persistent vector DB.
    Progress is reported with tqdm; the function returns None.
    """
    _ensure_ollama_running()

    csv_files = glob.glob(os.path.join(news_dir, "*.csv"))
    if not csv_files:
        print("No files found.")
        return

    print(f" Found {len(csv_files)} news files. Processing...")

    all_articles = []

    # Progress bar for loading files
    for f in tqdm(csv_files, desc="Reading CSVs", unit="file"):
        try:
            df = pd.read_csv(f)
            clean_df = normalize_news_df(df, os.path.basename(f))
            if not clean_df.empty:
                all_articles.extend(clean_df.to_dict('records'))
        except Exception as e:
            # Previously this was `pass`, which silently dropped unreadable
            # files; report the skip without corrupting the progress bar.
            tqdm.write(f" Skipping {os.path.basename(f)}: {e}")

    # Deduplication: key on (title, date); keep the longest page rendering.
    unique_docs = {}
    for art in all_articles:
        key = f"{art['title']}_{art['date']}"
        # Avoid repeating the title when it is already embedded in the body.
        if art['title'] in art['content']:
            page_content = f"Date: {art['date']}\nSource: {art['source']}\n\n{art['content']}"
        else:
            page_content = f"Title: {art['title']}\nDate: {art['date']}\nSource: {art['source']}\n\n{art['content']}"

        # Single construction path replaces the original duplicated if/else
        # branches that built identical Documents.
        if key not in unique_docs or len(page_content) > len(unique_docs[key].page_content):
            unique_docs[key] = Document(
                page_content=page_content,
                metadata={"source": art['source'], "date": art['date'], "type": "news"},
            )

    raw_docs = list(unique_docs.values())
    print(f" Condensed into {len(raw_docs)} unique articles.")

    # Chunking
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200, separators=["\n\n", "\n", ". ", " "])
    final_docs = text_splitter.split_documents(raw_docs)

    if final_docs:
        print(f" Embedding {len(final_docs)} chunks into Vector DB...")
        embeddings = OllamaEmbeddings(model=model, base_url="http://127.0.0.1:25000")
        vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)

        batch_size = 100
        # Progress bar for embedding
        with tqdm(total=len(final_docs), desc="Embedding News", unit="chunk") as pbar:
            for i in range(0, len(final_docs), batch_size):
                batch = final_docs[i:i + batch_size]
                vectorstore.add_documents(batch)
                pbar.update(len(batch))

        print("\n News Ingestion Complete.")
    else:
        print("No valid articles extracted.")
|
src/load/inspect_db.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sqlalchemy import create_engine, text
|
| 4 |
+
|
| 5 |
+
# --- CONFIGURATION ---
|
| 6 |
+
DB_NAME = "mshauri_fedha.db"
|
| 7 |
+
DB_CONNECTION = f"sqlite:///{DB_NAME}"
|
| 8 |
+
|
| 9 |
+
def list_all_tables(engine):
    """Print a summary (id | row count | name) of every table in the SQLite DB.

    Reads table names from ``sqlite_master`` and counts rows per table.
    Returns the ordered list of table names, or [] when the database is
    empty or unreachable.
    """
    print(f"\n --- DATABASE SUMMARY: {DB_NAME} ---")
    try:
        with engine.connect() as conn:
            # Query the master table for all table names
            query = text("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
            tables = conn.execute(query).fetchall()

            if not tables:
                print(" Database is empty.")
                return []

            table_list = [t[0] for t in tables]

            print(f"{'ID':<4} | {'Rows':<8} | {'Table Name'}")
            print("-" * 60)

            for i, t_name in enumerate(table_list):
                # Count rows for verification; a broken/virtual table must not
                # abort the whole listing.
                try:
                    count = conn.execute(text(f'SELECT COUNT(*) FROM "{t_name}"')).scalar()
                    print(f"{i:<4} | {count:<8} | {t_name}")
                except Exception:
                    # Narrowed from a bare `except:` so Ctrl-C still works
                    # while the listing is running.
                    print(f"{i:<4} | {'ERROR':<8} | {t_name}")

            return table_list
    except Exception as e:
        print(f" Connection failed: {e}")
        return []
|
| 38 |
+
|
| 39 |
+
def inspect_table(engine, table_name):
    """Print the column list and the first five rows of `table_name`.

    Read failures (missing table, bad connection) are reported, not raised.
    """
    print(f"\n🔎 Inspecting Table: '{table_name}'")
    try:
        # Pull a small sample; the double-quoted identifier tolerates
        # table names with spaces or keywords.
        sample = pd.read_sql(f'SELECT * FROM "{table_name}" LIMIT 5', engine)

        if sample.empty:
            print(" Table is empty.")
        else:
            print(f"Columns: {list(sample.columns)}")
            print("\n--- First 5 Rows ---")
            # to_string() makes it readable in terminal without truncation
            print(sample.to_string(index=False))
            print("-" * 50)
    except Exception as e:
        print(f" Could not read table: {e}")
|
| 56 |
+
|
| 57 |
+
def main():
    """Interactive entry point: list the DB's tables, then prompt in a loop
    for a table (by id or name) to inspect until the user quits.
    """
    if not os.path.exists(DB_NAME):
        print(f" Error: Database file '{DB_NAME}' not found in current directory.")
        print(f"Current Directory: {os.getcwd()}")
        return

    engine = create_engine(DB_CONNECTION)
    tables = list_all_tables(engine)

    if not tables:
        return

    while True:
        try:
            user_input = input("\nEnter Table ID (or Name) to inspect, or 'q' to quit: ").strip()
            if user_input.lower() == 'q':
                break

            target_table = None

            # Handle numeric ID input (index into the listing printed above)
            if user_input.isdigit():
                idx = int(user_input)
                if 0 <= idx < len(tables):
                    target_table = tables[idx]
            # Handle name input
            elif user_input in tables:
                target_table = user_input

            if target_table:
                inspect_table(engine, target_table)
            else:
                print(" Invalid selection.")
        except (KeyboardInterrupt, EOFError):
            # EOFError added: input() raises it when stdin is closed
            # (piped input exhausted) — exit cleanly instead of crashing.
            break
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
|
| 92 |
+
main()
|
src/load/mshauri_demo.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import sys
|
| 4 |
+
# These imports are stable and have worked in your previous logs
|
| 5 |
+
from langchain_ollama import ChatOllama
|
| 6 |
+
from langchain_community.utilities import SQLDatabase
|
| 7 |
+
from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit
|
| 8 |
+
from langchain_community.vectorstores import Chroma
|
| 9 |
+
from langchain_community.embeddings import OllamaEmbeddings
|
| 10 |
+
|
| 11 |
+
# --- CONFIGURATION ---
|
| 12 |
+
DEFAULT_SQL_DB = "sqlite:///mshauri_fedha_v6.db"
|
| 13 |
+
DEFAULT_VECTOR_DB = "mshauri_fedha_chroma_db"
|
| 14 |
+
DEFAULT_EMBED_MODEL = "nomic-embed-text"
|
| 15 |
+
DEFAULT_LLM_MODEL = "qwen3:32b"
|
| 16 |
+
DEFAULT_OLLAMA_URL = "http://127.0.0.1:25000"
|
| 17 |
+
|
| 18 |
+
# --- 1. REPLACEMENT CLASS FOR 'Tool' ---
|
| 19 |
+
class SimpleTool:
    """A tiny replacement for ``langchain.tools.Tool``.

    Bundles a callable with the name/description metadata the ReAct prompt
    needs, and exposes the same ``.run(...)`` entry point.
    """

    def __init__(self, name, func, description):
        self.name = name
        self.description = description
        self.func = func

    def run(self, input_data):
        """Invoke the wrapped callable with ``input_data`` and return its result."""
        return self.func(input_data)
|
| 28 |
+
|
| 29 |
+
# --- 2. REPLACEMENT CLASS FOR THE AGENT ---
|
| 30 |
+
class SimpleReActAgent:
    """A manual ReAct loop that doesn't rely on langchain.agents.

    Drives a Thought/Action/Observation cycle: the LLM is re-prompted with an
    ever-growing scratchpad until it emits "Final Answer:" or 10 steps elapse.
    """
    def __init__(self, llm, tools, verbose=True):
        # llm: any object with .invoke(prompt, stop=[...]) returning an object
        #      with a .content string (e.g. ChatOllama).
        # tools: iterable of tool objects exposing .name / .description and
        #        either .invoke(...) or .run(...).
        self.llm = llm
        self.tools = {t.name: t for t in tools}
        self.verbose = verbose
        # Create the tool description string for the prompt
        self.tool_desc = "\n".join([f"{t.name}: {t.description}" for t in tools])
        self.tool_names = ", ".join([t.name for t in tools])

        # Hardcoded ReAct Prompt (classic Thought/Action/Observation format).
        self.prompt_template = """Answer the following questions as best you can. You have access to the following tools:

{tool_desc}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}"""

    def invoke(self, inputs):
        """Run the ReAct loop for inputs["input"]; return {"output": <answer string>}.

        Terminates on "Final Answer:", on a response with neither Action nor
        Final Answer, or after 10 steps ("Agent timed out.").
        """
        query = inputs["input"]
        scratchpad = ""

        print(f" Starting Agent Loop for: '{query}'")

        for step in range(10): # Max 10 steps
            # Fill the prompt with the tool metadata and the accumulated trace.
            prompt = self.prompt_template.format(
                tool_desc=self.tool_desc,
                tool_names=self.tool_names,
                input=query,
                agent_scratchpad=scratchpad
            )

            # Call LLM
            # stop=["\nObservation:"] prevents the LLM from hallucinating the tool output
            response = self.llm.invoke(prompt, stop=["\nObservation:"])
            response_text = response.content

            if self.verbose:
                print(f"\n🧠 Step {step+1}: {response_text.strip()}")

            scratchpad += response_text

            # Check for completion
            if "Final Answer:" in response_text:
                # Everything after the (last) "Final Answer:" marker is the answer.
                return {"output": response_text.split("Final Answer:")[-1].strip()}

            # Parse Action: tool name up to newline, input to end of match.
            action_match = re.search(r"Action:\s*(.*?)\n", response_text)
            input_match = re.search(r"Action Input:\s*(.*)", response_text)

            if action_match and input_match:
                action_name = action_match.group(1).strip()
                action_input = input_match.group(1).strip()

                # Execute Tool
                if action_name in self.tools:
                    if self.verbose:
                        print(f"🛠️ Calling '{action_name}' with: {action_input}")

                    try:
                        # Handle both SimpleTool (.run) and LangChain Tools (.invoke or .run)
                        tool = self.tools[action_name]
                        if hasattr(tool, 'invoke'):
                            tool_result = tool.invoke(action_input)
                        else:
                            tool_result = tool.run(action_input)

                    except Exception as e:
                        # Tool failures are fed back as observations so the LLM
                        # can recover instead of crashing the loop.
                        tool_result = f"Error executing tool: {e}"

                    observation = f"\nObservation: {tool_result}\n"
                else:
                    observation = f"\nObservation: Error: Tool '{action_name}' not found. Available: {self.tool_names}\n"

                scratchpad += observation
            else:
                # Fallback: if no action found but also no Final Answer
                if "Action:" in response_text:
                    scratchpad += "\nObservation: You provided an Action but no Action Input. Please provide the input.\n"
                else:
                    # No action and no final answer: treat the raw text as the answer.
                    return {"output": response_text.strip()}

        return {"output": "Agent timed out."}
|
| 127 |
+
|
| 128 |
+
# --- 3. MAIN SETUP FUNCTION ---
|
| 129 |
+
|
| 130 |
+
def create_mshauri_agent(
    sql_db_path=DEFAULT_SQL_DB,
    vector_db_path=DEFAULT_VECTOR_DB,
    llm_model=DEFAULT_LLM_MODEL,
    ollama_url=DEFAULT_OLLAMA_URL
):
    """Wire up the dual-source agent: SQL toolkit + vector-store retriever.

    Parameters keep their defaults from the module-level configuration.
    Returns a SimpleReActAgent ready for .invoke(), or None when the LLM
    cannot be initialized.
    """
    print(f" Initializing Mshauri Fedha (Model: {llm_model})...")

    # 1. Initialize LLM
    try:
        llm = ChatOllama(model=llm_model, base_url=ollama_url, temperature=0.1)
    except Exception as e:
        print(f" Error connecting to Ollama: {e}")
        return None

    # 2. LEFT BRAIN (SQL)
    if "sqlite" in sql_db_path:
        real_path = sql_db_path.replace("sqlite:///", "")
        if not os.path.exists(real_path):
            # SQLite will happily create an empty DB, so warn early.
            print(f" Warning: SQL Database not found at {real_path}")

    db = SQLDatabase.from_uri(sql_db_path)
    # The Toolkit returns standard LangChain tools, which our SimpleReActAgent can handle
    sql_toolkit = SQLDatabaseToolkit(db=db, llm=llm)
    sql_tools = sql_toolkit.get_tools()

    # 3. RIGHT BRAIN (Vector)
    # Build the embedder and vector store ONCE at agent-creation time; the
    # previous version reconstructed both on every tool call.
    embeddings = OllamaEmbeddings(model=DEFAULT_EMBED_MODEL, base_url=ollama_url)
    vectorstore = Chroma(persist_directory=vector_db_path, embedding_function=embeddings)

    def search_docs(query):
        # Top-4 similarity hits, concatenated for the LLM's observation.
        docs = vectorstore.similarity_search(query, k=4)
        return "\n\n".join([d.page_content for d in docs])

    # Use our SimpleTool wrapper instead of importing from langchain
    retriever_tool = SimpleTool(
        name="search_financial_reports_and_news",
        func=search_docs,
        description="Searches CBK/KNBS reports and business news. Use this for qualitative questions (why, how, trends) or when SQL data is missing."
    )

    # 4. CREATE AGENT
    tools = sql_tools + [retriever_tool]
    agent = SimpleReActAgent(llm, tools)

    print(" Mshauri Agent Ready (Zero-Dependency Mode).")
    return agent
|
| 177 |
+
|
| 178 |
+
def ask_mshauri(agent, query):
    """Send `query` through the agent, print the exchange, and return the answer.

    Returns the answer string, or None when the agent is missing or the run
    raises an exception.
    """
    if not agent:
        print(" Agent not initialized.")
        return

    print(f"\n User: {query}")
    print("-" * 40)

    try:
        result = agent.invoke({"input": query})
        print("-" * 40)
        print(f" Mshauri: {result['output']}")
        return result['output']
    except Exception as e:
        print(f" Error during execution: {e}")
        return None
|
| 194 |
+
|
| 195 |
+
if __name__ == "__main__":
|
| 196 |
+
# Quick Test
|
| 197 |
+
agent = create_mshauri_agent()
|
| 198 |
+
ask_mshauri(agent, "What is the inflation rate?")
|
src/load/start_ollama.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import time
|
| 4 |
+
import requests
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
def start_ollama_server():
    """Checks if Ollama is running on port 25000; if not, starts it.

    Returns True when the server is reachable (already running or freshly
    started), False otherwise. The binary/model paths assume the CSCS
    $SCRATCH layout, falling back to /tmp.
    """
    OLLAMA_PORT = "25000"
    OLLAMA_HOST = f"http://127.0.0.1:{OLLAMA_PORT}"

    # 1. Check if already running (timeout added so a dead host fails fast).
    try:
        if requests.get(OLLAMA_HOST, timeout=5).status_code == 200:
            print(" Ollama is already running.")
            return True
    except requests.RequestException:
        # Not reachable yet — fall through and start it ourselves.
        pass

    print(" Starting Ollama Server...")

    # 2. Define Paths (CSCS Environment)
    SCRATCH = os.environ.get("SCRATCH", "/tmp")
    BASE_DIR = Path(SCRATCH)
    OLLAMA_BIN = BASE_DIR / "ollama_core/bin/ollama"
    MODELS_DIR = BASE_DIR / "ollama_core/models"

    # 3. Setup Environment (bind to the custom port, point at local model dir)
    server_env = os.environ.copy()
    server_env["OLLAMA_HOST"] = f"127.0.0.1:{OLLAMA_PORT}"
    server_env["OLLAMA_MODELS"] = str(MODELS_DIR)

    # 4. Start Background Process
    try:
        subprocess.Popen(
            [str(OLLAMA_BIN), "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            env=server_env
        )
        print(" Waiting for server to boot...")
        time.sleep(10)  # Give it time to initialize

        # 5. Verify
        if requests.get(OLLAMA_HOST, timeout=5).status_code == 200:
            print(" Server started successfully.")
            return True
        # Previously this path fell through and returned None implicitly;
        # make the failure explicit for callers that test the result.
        print(" Server did not become ready.")
        return False
    except Exception as e:
        print(f" Failed to start server: {e}")
        return False
|
| 51 |
+
|
| 52 |
+
import requests
|
| 53 |
+
import json
|
| 54 |
+
import sys
|
| 55 |
+
|
| 56 |
+
def pull_embedding_model(model_name="nomic-embed-text"):
    """Ask the local Ollama server to pull `model_name`, streaming progress to stdout.

    Uses the /api/pull endpoint of the server started on port 25000 and renders
    the NDJSON progress events as an in-place percentage line.
    """
    url = "http://127.0.0.1:25000/api/pull"
    print(f" Requesting pull for '{model_name}'...")

    try:
        # Send the pull request to the running server.
        # (connect, read) timeout: fail fast if the server is down, but put no
        # limit on the stream itself — model downloads can take many minutes.
        response = requests.post(url, json={"name": model_name}, stream=True, timeout=(10, None))
        response.raise_for_status()

        # Stream the progress so you know it's working
        for line in response.iter_lines():
            if line:
                data = json.loads(line)
                status = data.get('status', '')
                completed = data.get('completed', 0)
                total = data.get('total', 1)

                # Print progress bar or status ('\r' keeps it on one line).
                if total > 1 and completed > 0:
                    percent = int((completed / total) * 100)
                    sys.stdout.write(f"\r {status}: {percent}%")
                else:
                    sys.stdout.write(f"\r {status}")
                sys.stdout.flush()

        print(f"\n Model '{model_name}' installed successfully!")

    except Exception as e:
        print(f"\n Failed to pull model: {e}")
|
src/transform/config.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
@dataclass
class ProcessingConfig:
    """Central configuration for Data Sources.

    Derives all drive-side and /tmp-side working paths from (root_dir,
    source_name) in __post_init__; setup() (re)creates a clean local
    workspace and writes the PDF-canary helper script.
    """
    root_dir: str
    source_name: str # e.g., 'knbs' or 'cbk'

    # Settings
    batch_size: int = 20        # files per download/zip cycle
    max_workers: int = 4        # thread-pool size for downloads
    min_image_bytes: int = 3000 # filters out tiny decorative images
    min_image_dim: int = 100    # minimum width/height in pixels
    max_page_objects: int = 500 # guard against pathological PDF pages

    def __post_init__(self):
        # Paths setup: <root>/processed/<source>/... on the (persistent) drive.
        self.base_processed_dir = os.path.join(self.root_dir, 'processed')
        self.source_dir = os.path.join(self.base_processed_dir, self.source_name)
        self.drive_zip_dir = os.path.join(self.source_dir, "zipped_batches")
        self.meta_dir = os.path.join(self.source_dir, f"{self.source_name}_index_metadata")

        # Log Files (append-only JSONL indexes, one per artifact kind)
        self.logs = {
            'docs': os.path.join(self.meta_dir, f'{self.source_name}_docs_metadata.jsonl'),
            'images': os.path.join(self.meta_dir, f'{self.source_name}_images_index.jsonl'),
            'tables': os.path.join(self.meta_dir, f'{self.source_name}_tables_index.jsonl')
        }

        # Local Temp Paths: fast scratch space, wiped each batch by setup().
        self.local_work_dir = Path(f"/tmp/temp_work_{self.source_name}")
        self.local_dirs = {
            'texts': self.local_work_dir / "texts",
            'images': self.local_work_dir / "images",
            'tables': self.local_work_dir / "tables",
            'pdfs': self.local_work_dir / "pdfs"
        }

    def setup(self):
        # Ensure drive-side dirs exist, then wipe and recreate the local
        # workspace so each batch starts from a clean slate.
        os.makedirs(self.drive_zip_dir, exist_ok=True)
        os.makedirs(self.meta_dir, exist_ok=True)
        if self.local_work_dir.exists():
            shutil.rmtree(self.local_work_dir)
        for d in self.local_dirs.values():
            d.mkdir(parents=True, exist_ok=True)
        self.create_canary()

    def create_canary(self):
        # Write a small standalone script that fully parses a PDF with both
        # pymupdf and pdfplumber; run in a subprocess so a crashing/hanging
        # PDF kills the canary, not the pipeline. Prints "SAFE" and exits 0
        # on success, exits 1 otherwise.
        script_content = """
import sys, pymupdf, pdfplumber
if len(sys.argv) < 2: sys.exit(1)
try:
    doc = pymupdf.open(sys.argv[1])
    for p in doc: _, _ = p.get_text(), [doc.extract_image(i[0]) for i in p.get_images(full=True)]
    with pdfplumber.open(sys.argv[1]) as p: _ = [page.objects for page in p.pages]
    print("SAFE")
    sys.exit(0)
except: sys.exit(1)
"""
        # NOTE(review): written to the current working directory — callers
        # must run the pipeline from a writable CWD; confirm this is intended.
        with open("pdf_canary.py", "w") as f:
            f.write(script_content.strip())
|
src/transform/download_files.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import zipfile
|
| 4 |
+
import time
|
| 5 |
+
import gc
|
| 6 |
+
import threading
|
| 7 |
+
import shutil
|
| 8 |
+
import requests
|
| 9 |
+
import subprocess
|
| 10 |
+
import pymupdf
|
| 11 |
+
import pdfplumber
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import urllib3
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 17 |
+
from tqdm import tqdm
|
| 18 |
+
from requests.adapters import HTTPAdapter
|
| 19 |
+
from urllib3.util.retry import Retry
|
| 20 |
+
|
| 21 |
+
# Import shared config
|
| 22 |
+
from config import ProcessingConfig
|
| 23 |
+
|
| 24 |
+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
| 25 |
+
|
| 26 |
+
class UniversalProcessor:
    """Downloads a remote document and runs a quick safety/extraction pass on it."""

    def __init__(self, config: ProcessingConfig):
        self.config = config
        # Retry transient HTTP failures (rate limits / server errors) with backoff.
        retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 504])
        adapter = HTTPAdapter(max_retries=retry)
        self.session = requests.Session()
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

    def download(self, url, safe_title) -> Path:
        """Stream `url` into the local pdfs dir; return the path, or None on failure.

        The filename is derived from `safe_title` (truncated/sanitized) plus the
        URL's extension, defaulting to .pdf.
        """
        try:
            # verify=False: many gov sites in the source list have broken TLS chains.
            response = self.session.get(url, timeout=30, stream=True, verify=False)
            response.raise_for_status()
            ext = Path(url).suffix.lower() or '.pdf'
            safe_name = safe_title[:50].replace(' ', '_').replace('/', '_')
            filepath = self.config.local_dirs['pdfs'] / f"{safe_name}{ext}"
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return filepath
        except Exception:
            # Best-effort: a failed download is simply skipped by the caller.
            return None

    def process(self, filepath: Path, url: str, safe_title: str):
        """Validate the PDF via the canary subprocess, then extract its text.

        Returns {'text', 'tables', 'images', 'metadata'} or None when the file
        is unsafe or unreadable. Only basic text extraction is done here.
        """
        try:
            # Parse in a separate process first so a malformed PDF cannot
            # crash or hang this one (15s budget).
            res = subprocess.run(["python", "pdf_canary.py", str(filepath)], capture_output=True, timeout=15)
            if res.returncode != 0:
                return None

            # Basic PyMuPDF extraction for quick preview
            doc = pymupdf.open(filepath)
            text = "".join([page.get_text() for page in doc])
            # BUG FIX: read the page count BEFORE closing — len() on a closed
            # pymupdf Document raises, which previously sent every successful
            # parse into the except branch and returned None.
            page_count = len(doc)
            doc.close()

            return {
                'text': text,
                'tables': [],
                'images': [],
                'metadata': {'pages': page_count}
            }
        except Exception:
            return None
|
| 66 |
+
|
| 67 |
+
class BatchPipeline:
    """Drives download → validate → log → zip cycles over a DataFrame of file URLs.

    Downloads run in a thread pool into the /tmp workspace; after each batch the
    workspace is zipped to the (persistent) drive directory and wiped.
    """
    def __init__(self, config: ProcessingConfig, processor: UniversalProcessor):
        self.config = config
        self.processor = processor
        # Serializes appends to the shared JSONL logs across worker threads.
        self.lock = threading.Lock()
        self.config.setup()

    def _append_log(self, log_key, record):
        # Append one JSON record to the log named by `log_key`
        # ('docs'/'images'/'tables'); the lock keeps concurrent writes whole.
        with self.lock:
            with open(self.config.logs[log_key], 'a', encoding='utf-8') as f:
                f.write(json.dumps(record) + '\n')

    def _worker(self, item):
        # One unit of work: download a file, sanity-check it, save a text
        # preview, and log success. Returns True on success, None otherwise.
        row = item['row']
        title = str(row.get('text', 'untitled'))
        url = row['file_url']

        path = self.processor.download(url, title)
        # Anything under 500 bytes is assumed to be an error page, not a document.
        if not path or path.stat().st_size < 500: return None

        data = self.processor.process(path, url, title)
        if not data: return None

        # Save Preview Text
        with open(self.config.local_dirs['texts'] / f"{path.stem}.txt", 'w') as f:
            f.write(data['text'])

        self._append_log('docs', {'url': url, 'file': path.name, 'status': 'downloaded'})
        return True

    def _zip_and_ship(self, batch_id):
        # Zip the entire local work dir, copy the archive to the drive, then
        # reset local storage for the next batch.
        ts = datetime.now().strftime("%H%M%S")
        zname = f"{self.config.source_name}_{batch_id}_{ts}.zip"
        local_z = Path(f"/tmp/{zname}")
        drive_z = Path(self.config.drive_zip_dir) / zname

        with zipfile.ZipFile(local_z, 'w', zipfile.ZIP_DEFLATED) as z:
            for root, _, files in os.walk(self.config.local_work_dir):
                for f in files:
                    fp = os.path.join(root, f)
                    # Store paths relative to the work dir so the zip unpacks cleanly.
                    z.write(fp, os.path.relpath(fp, self.config.local_work_dir))

        shutil.copy(local_z, drive_z)
        self.config.setup() # Wipe local
        os.remove(local_z)

    def run(self, df, ignore_history=False):
        """Process every row of `df` (needs a 'file_url' column) in batches.

        URLs already present in the docs log are skipped unless
        `ignore_history` is True, making reruns resumable.
        """
        done = set()
        if not ignore_history and os.path.exists(self.config.logs['docs']):
            with open(self.config.logs['docs']) as f:
                for l in f:
                    # Tolerate truncated/corrupt log lines from interrupted runs.
                    try: done.add(json.loads(l)['url'])
                    except: continue

        queue = [r for _, r in df.iterrows() if r['file_url'] not in done]
        print(f" Queued: {len(queue)} files.")

        bs = self.config.batch_size
        for i in range(0, len(queue), bs):
            batch = queue[i:i+bs]
            bid = f"batch_{i//bs + 1}"
            print(f"Processing {bid}...")

            with ThreadPoolExecutor(max_workers=self.config.max_workers) as ex:
                futures = [ex.submit(self._worker, {'row': item}) for item in batch]
                # Drain completions purely to drive the progress bar.
                for _ in tqdm(as_completed(futures), total=len(batch)): pass

            self._zip_and_ship(bid)
            gc.collect()
|
| 136 |
+
|
| 137 |
+
if __name__ == "__main__":
|
| 138 |
+
# Example Usage
|
| 139 |
+
ROOT_DIR = "/scratch/user/mshauri_data"
|
| 140 |
+
conf = ProcessingConfig(root_dir=ROOT_DIR, source_name='cbk')
|
| 141 |
+
pipe = BatchPipeline(conf, UniversalProcessor(conf))
|
src/transform/extract.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import time
|
| 6 |
+
import queue
|
| 7 |
+
import logging
|
| 8 |
+
import gc
|
| 9 |
+
import multiprocessing as mp
|
| 10 |
+
import argparse
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
def configure_parallelism(workers_per_gpu=5):
    """Derive the worker layout for this node.

    Returns a tuple ``(total_slots, workers_per_gpu, num_gpus)``. Without
    CUDA the layout falls back to half the CPU cores (at least one worker),
    one slot per worker, zero GPUs. With CUDA, each GPU gets a fixed number
    of workers; capping the count avoids a 'thundering herd' of processes
    fighting over the device and spends the VRAM on batch throughput instead.

    Side effects (CUDA path only): exports OLLAMA_NUM_PARALLEL,
    OLLAMA_MAX_QUEUE and PYTORCH_CUDA_ALLOC_CONF for downstream tools.
    """
    if not torch.cuda.is_available():
        # CPU fallback: half the cores, minimum one worker, no GPUs.
        return max(1, mp.cpu_count() // 2), 1, 0

    gpu_count = torch.cuda.device_count()
    vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)

    # --- THE STABILITY STRATEGY ---
    # On 96GB cards, a handful of workers per GPU is the sweet spot; more
    # only adds context-switching overhead on the device.
    total_slots = gpu_count * workers_per_gpu

    print(f"🔍 GH200/A100 Detected: {gpu_count} GPUs | {vram_gb:.1f} GB VRAM")
    print(f"⚙️ Stability Config: {workers_per_gpu} workers/GPU | {total_slots} Total Slots")

    # --- SYSTEM TUNING ---
    # Advertise the slot budget to Ollama and relax the CUDA allocator.
    os.environ.update({
        "OLLAMA_NUM_PARALLEL": str(total_slots),
        "OLLAMA_MAX_QUEUE": "2048",  # Large buffer for requests
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    })

    return total_slots, workers_per_gpu, gpu_count
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# --- CRITICAL: MULTIPROCESSING SETUP ---
# CUDA state does not survive fork(); workers must be spawned so each child
# initialises its own CUDA context. force=True overrides any earlier choice;
# a RuntimeError here just means the start method was already locked in.
try:
    mp.set_start_method('spawn', force=True)
except RuntimeError:
    pass

# Marker (PDF -> markdown) imports, placed after the start-method setup so
# spawned children pick them up cleanly.
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser

# Configure Logger
# processName carries the "workerId:DevN" label assigned in worker_routine.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [GPU-%(processName)s] - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)
|
| 62 |
+
|
| 63 |
+
def worker_routine(worker_id, gpu_id, batch_queue, output_dir, ollama_config, marker_config):
    """
    Optimized Worker for GH200:
    1. Receives a BATCH of files (List) to reduce queue overhead.
    2. Uses torch.compile for architectural optimization.
    3. Skips image extraction for speed.
    4. Fails fast on tables (Triage) to prevent LLM stalls.

    Args:
        worker_id: Zero-based worker index; also used to stagger start-up.
        gpu_id: CUDA device index this worker binds to.
        batch_queue: Queue yielding batches, each a list of PDF path strings.
        output_dir: Root output folder; each PDF is written to
            <output_dir>/<stem>/<stem>.md.
        ollama_config: Marker config keys for the Ollama LLM service.
        marker_config: Extra marker options (e.g. batch_multiplier).

    The loop exits once the queue has been empty for 5 seconds.
    """

    # Stagger start-up (1.5 s per worker index) so all workers don't load
    # model weights at the same instant.
    time.sleep(worker_id * 1.5)

    mp.current_process().name = f"{worker_id}:Dev{gpu_id}"
    logger.info(f"Initializing Worker {worker_id}...")

    # Pin this process to a single device before any model is created.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    os.environ["TORCH_DEVICE"] = f"cuda:{gpu_id}"

    # 2. Model Initialization
    try:
        # Load Surya/OCR weights
        artifact_dict = create_model_dict(
            device=f"cuda:{gpu_id}",
            dtype=torch.bfloat16,
            attention_implementation="flash_attention_2"
        )

        # --- OPTIMIZATION 1: torch.compile ---
        logger.info("Compiling models with torch.compile... (One-time setup)")
        for key, model in artifact_dict.items():
            if hasattr(model, 'forward'):
                artifact_dict[key] = torch.compile(model, mode="max-autotune")

        # --- OPTIMIZATION 2: Config Tuning ---
        full_config = {
            "output_format": "markdown",
            "disable_multiprocessing": True,
            "extract_images": False, # Speed up: Skip image extraction
            "ocr_all_pages": False,
            "use_llm": True,
            "llm_service": "marker.services.ollama.OllamaService",

            # --- TRIAGE STRATEGY ---
            "max_table_retries": 0, # Fail fast if table extraction stalls
            "llm_service_timeout": 150, # Don't let a table hold a worker for more than x minutes

            **ollama_config,
            **marker_config
        }

        config_parser = ConfigParser(full_config)

        converter = PdfConverter(
            config=config_parser.generate_config_dict(),
            artifact_dict=artifact_dict,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service()
        )
        logger.info(f"Worker Ready. Waiting for batches...")

    except Exception as e:
        # Initialization failure is fatal for this worker only; siblings
        # keep draining the queue.
        logger.error(f"Initialization Failed: {e}")
        return

    # 3. Batch Work Loop
    batches_processed = 0
    while True:
        try:
            # Get a list of files (Batch)
            batch_files = batch_queue.get(timeout=5)
        except queue.Empty:
            # NOTE(review): an empty queue after 5 s is treated as "all work
            # done" -- assumes the queue was fully filled before workers start.
            logger.info(f"Queue empty. Worker shutting down. Processed {batches_processed} batches.")
            break

        # Process the batch locally
        for pdf_path_str in batch_files:
            try:
                pdf_path = Path(pdf_path_str)
                doc_out_dir = Path(output_dir) / pdf_path.stem
                md_file = doc_out_dir / f"{pdf_path.stem}.md"

                # Idempotency: skip PDFs whose markdown already exists.
                if md_file.exists():
                    continue

                # Heavy Compute (OCR + LLM)
                rendered = converter(str(pdf_path))

                if rendered is None:
                    logger.warning(f"Skipping {pdf_path.name}: Converter returned None")
                    continue

                text, meta, images = text_from_rendered(rendered)

                # Write Output
                doc_out_dir.mkdir(parents=True, exist_ok=True)
                with open(md_file, "w", encoding="utf-8") as f:
                    f.write(text)

            except Exception as e:
                # A single bad PDF must not kill the batch loop.
                logger.error(f"Failed on {pdf_path.name}: {e}")

        # Cleanup after batch to keep VRAM healthy
        batches_processed += 1

        # Aggressive GC after every batch prevents "memory creep" on long runs
        gc.collect()
        torch.cuda.empty_cache()
|
| 170 |
+
|
| 171 |
+
class MarkerFolderProcessor:
    """Fan a folder of PDFs out to worker processes running marker + Ollama.

    Builds the worker-facing config once, chunks the PDF list into batches
    on a managed queue, and launches one process per (GPU, worker) slot.
    """

    def __init__(self, output_dir, ollama_url, ollama_model, batch_multiplier, workers_per_gpu, num_gpus):
        """
        Args:
            output_dir: Root folder for per-PDF markdown output (created if missing).
            ollama_url: Base URL of the Ollama server workers should call.
            ollama_model: Model tag used by the marker LLM service.
            batch_multiplier: Forwarded to marker as "batch_multiplier".
            workers_per_gpu: Worker processes launched per GPU.
            num_gpus: GPU count detected by the caller; 0 means CPU mode.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.num_gpus = num_gpus

        # We now accept the dynamic num_gpus passed from __main__
        if self.num_gpus > 0:
            print(f" Detected {self.num_gpus} GPUs (Dynamic Mode)")
        else:
            print(" No GPUs detected. Running in CPU mode.")

        self.workers_per_gpu = workers_per_gpu

        # Configs passed to workers
        self.ollama_config = {
            "ollama_base_url": ollama_url,
            "ollama_model": ollama_model,
            "ollama_timeout": 600, # 10 mins max per request
            "ollama_options": {
                "num_ctx": 32768,
                "num_predict": 2048,
                "temperature": 0.0
            }
        }
        self.marker_config = {"batch_multiplier": batch_multiplier}

    def process_folder(self, source_folder, batch_size=10, subset=None):
        """Process PDFs from *source_folder* (or an explicit *subset* of paths).

        Args:
            source_folder: Folder scanned for "*.pdf" when subset is None.
            batch_size: Number of PDFs pushed per queue item.
            subset: Optional pre-partitioned list of PDF path strings
                (used by run_transform.py for multi-node splitting).
        """
        if subset is not None:
            # Use the partitioned list provided by run_transform.py
            pdfs = [Path(p) for p in subset]
        else:
            source_path = Path(source_folder)
            pdfs = sorted(list(source_path.glob("*.pdf")))

        if not pdfs:
            print("No PDFs to process.")
            return

        # Manager queue is picklable across spawned processes.
        manager = mp.Manager()
        batch_queue = manager.Queue()

        # --- BATCHING STRATEGY ---
        # Chunk the list of PDFs into batches
        chunks = [pdfs[i:i + batch_size] for i in range(0, len(pdfs), batch_size)]
        print(f" Created {len(chunks)} batches of {batch_size} files each.")

        for chunk in chunks:
            batch_queue.put([str(p) for p in chunk])

        total_workers = (self.num_gpus * self.workers_per_gpu) if self.num_gpus > 0 else 1
        print(f" Launching {total_workers} workers on {self.num_gpus} GPUs...")

        processes = []
        for i in range(total_workers):
            # Round-robin workers across GPUs.
            gpu_id = i % self.num_gpus if self.num_gpus > 0 else 0
            p = mp.Process(
                target=worker_routine,
                args=(i, gpu_id, batch_queue, self.output_dir, self.ollama_config, self.marker_config)
            )
            p.start()
            processes.append(p)

        # Block until every worker has drained the queue and exited.
        for p in processes:
            p.join()

        print(" Extraction Complete.")
|
| 238 |
+
|
| 239 |
+
# This block only runs if you execute 'python extract.py' directly.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Input folder of PDFs")
    parser.add_argument("--output", required=True, help="Output folder")
    parser.add_argument("--url", default="http://localhost:11434", help="Ollama URL")
    parser.add_argument("--model", default="llama3", help="Ollama Model Name")
    args = parser.parse_args()

    # --- DYNAMIC HARDWARE DETECTION ---
    # Unlike configure_parallelism(), this sizes workers from VRAM instead of
    # a fixed per-GPU count.
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
        gpu_properties = torch.cuda.get_device_properties(0)
        total_vram_gb = gpu_properties.total_memory / (1024**3)

        # Calculate optimal workers: (VRAM - 2GB overhead) / 5GB per worker
        workers_per_gpu = int((total_vram_gb - 2) // 5)
        workers_per_gpu = max(1, workers_per_gpu) # Minimum 1

        total_slots = num_gpus * workers_per_gpu
        print(f"⚙️ Dynamic Config: {num_gpus} GPUs | {workers_per_gpu} workers/GPU | {total_slots} Total Slots")

        # Set Env vars for external tools (optional, but good practice)
        os.environ["OLLAMA_NUM_PARALLEL"] = str(total_slots)

    else:
        num_gpus = 0
        workers_per_gpu = 1
        print(" No GPU detected. Defaulting to 1 worker.")

    # --- PROCESSOR INIT ---
    processor = MarkerFolderProcessor(
        output_dir=args.output,
        ollama_url=args.url,
        ollama_model=args.model,
        batch_multiplier=2,
        workers_per_gpu=workers_per_gpu, # Passed dynamically
        num_gpus=num_gpus # Passed dynamically
    )

    processor.process_folder(args.input, batch_size=10)
|
src/transform/get_csv_from_md.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import io
|
| 5 |
+
import time
|
| 6 |
+
import logging
|
| 7 |
+
import requests
|
| 8 |
+
import subprocess
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from sqlalchemy import create_engine
|
| 12 |
+
from ollama import Client
|
| 13 |
+
|
| 14 |
+
# --- LOGGING ---
|
| 15 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
|
| 16 |
+
logger = logging.getLogger("KNBS_Ingest")
|
| 17 |
+
|
| 18 |
+
# --- 1. INFRASTRUCTURE ---
|
| 19 |
+
|
| 20 |
+
def _manage_ollama_server(ollama_host, ollama_port, ollama_bin, model):
    """Ensure an Ollama server is reachable, starting one if necessary.

    Args:
        ollama_host: Base URL to probe (e.g. "http://127.0.0.1:25000").
        ollama_port: Port the spawned server should bind to.
        ollama_bin: Path to the ollama binary.
        model: Model tag to pull once the server is up.

    Returns:
        True when a server is reachable (pre-existing or freshly started),
        False when one could not be started.
    """
    try:
        # Probe for an already-running server; bounded timeout so a dead
        # host cannot hang the pipeline.
        if requests.get(ollama_host, timeout=5).status_code == 200:
            logger.info(" Ollama connected.")
            return True
    except requests.exceptions.RequestException:
        # No server listening yet -- fall through and start our own.
        pass

    logger.info(f"🚀 Starting Ollama ({model})...")
    scratch_env = os.environ.get("SCRATCH", "/tmp")
    models_dir = Path(scratch_env) / "ollama_core/models"
    # Create the models directory up-front (matches the sibling helper in
    # structure_data.py; the server expects OLLAMA_MODELS to exist).
    models_dir.mkdir(parents=True, exist_ok=True)

    server_env = os.environ.copy()
    server_env["OLLAMA_HOST"] = f"127.0.0.1:{ollama_port}"
    server_env["OLLAMA_MODELS"] = str(models_dir)

    try:
        subprocess.Popen([str(ollama_bin), "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=server_env)
        # Give the server a moment to bind before issuing the pull.
        time.sleep(5)
        subprocess.run([str(ollama_bin), "pull", model], env=server_env, check=True)
        return True
    except Exception as e:
        logger.error(f" Server Error: {e}")
        return False
|
| 44 |
+
|
| 45 |
+
# --- 2. MARKDOWN PARSING ENGINE ---
|
| 46 |
+
|
| 47 |
+
def extract_tables_from_markdown(md_content: str) -> list[pd.DataFrame]:
    """
    Scans markdown text for pipe-delimited tables (| col | col |)
    and converts them to Pandas DataFrames.

    Args:
        md_content: Raw markdown text.

    Returns:
        One DataFrame per successfully parsed table. Empty, single-column
        and unparseable blocks are dropped.

    Bugfix: a table that runs to the very end of the document (no trailing
    non-table line) was previously never flushed and therefore lost.
    """
    log = logging.getLogger("KNBS_Ingest")
    tables: list[pd.DataFrame] = []
    buffer: list[str] = []

    def _flush() -> None:
        # Parse the buffered pipe-lines into a DataFrame, if valid.
        if not buffer:
            return
        try:
            df = pd.read_csv(
                io.StringIO('\n'.join(buffer)),
                sep="|",
                skipinitialspace=True,
                engine='python'
            )
            # CLEANUP PANDAS ARTIFACTS
            # 1. Drop empty columns (pandas creates empty cols for leading/trailing pipes)
            df = df.dropna(axis=1, how='all')
            # 2. Filter out the markdown divider row (e.g. ---|---|---)
            if not df.empty:
                df = df[~df.iloc[:, 0].astype(str).str.contains('---', regex=False)]
            if not df.empty and len(df.columns) > 1:
                tables.append(df)
        except Exception as e:
            log.warning(f"Failed to parse a table block: {e}")
        buffer.clear()

    for line in md_content.split('\n'):
        stripped = line.strip()
        # Detect table lines (must start and end with |)
        if stripped.startswith('|') and stripped.endswith('|'):
            buffer.append(stripped)
        else:
            # Non-table line terminates any open table block.
            _flush()

    # BUGFIX: flush a table that terminates at end-of-document.
    _flush()

    return tables
|
| 95 |
+
|
| 96 |
+
# --- 3. LLM HEADER CLEANER (KNBS SPECIFIC) ---
|
| 97 |
+
|
| 98 |
+
def clean_knbs_headers(df: pd.DataFrame, filename: str, table_index: int, client: Client, model: str) -> pd.DataFrame:
    """
    Uses LLM to sanitize headers, handling split headers common in PDF-to-Markdown.

    Args:
        df: Table whose columns should be renamed (modified and returned).
        filename: Source markdown file name, given to the LLM as context.
        table_index: Position of the table within the source file.
        client: Connected Ollama client.
        model: Ollama model tag to query.

    Returns:
        The same DataFrame with cleaned column names. On LLM failure the
        original columns are kept; on a malformed LLM response the originals
        are snake_cased as a fallback.

    Fix: the ``filename`` parameter was previously unused -- the prompt now
    includes it so the model sees the real source file.
    """
    raw_headers = [str(c).strip() for c in df.columns]

    # Context: Provide first 2 rows to help identify if headers are split across rows
    data_preview = df.head(2).astype(str).values.tolist()

    prompt = f"""
You are a Data Engineer cleaning Kenya National Bureau of Statistics (KNBS) data.

Source File: "{filename}"
Table Index: {table_index}

Current Headers: {raw_headers}
Data Preview (First 2 Rows): {data_preview}

Task: Return a list of {len(raw_headers)} clean, snake_case SQL column names.

RULES:
1. INFER MEANING: If header is "Gross" and Row 1 is "Domestic Product", the column name is "gdp".
2. HANDLE YEARS: If headers are "2019", "2020", keep as "year_2019".
3. HANDLE GARBAGE: If header is "Unnamed: 1" look at Data Preview. If it contains items like "Agriculture", name it "sector".
4. KNBS reports often have a "Total" column. Ensure it is named "total".

Respond ONLY with a JSON list of strings.
"""

    try:
        res = client.chat(model=model, messages=[{'role': 'user', 'content': prompt}], format='json')
        new_headers = json.loads(res['message']['content'])

        # Handle dictionary wrapper if LLM returns {"headers": [...]}
        if isinstance(new_headers, dict):
            for val in new_headers.values():
                if isinstance(val, list):
                    new_headers = val
                    break

        # Validation: Length must match before trusting the LLM output.
        if isinstance(new_headers, list) and len(new_headers) == len(df.columns):
            df.columns = new_headers
        else:
            # Fallback: keep originals but snake_case them
            df.columns = [re.sub(r'[^a-zA-Z0-9]', '_', str(c).strip()).lower() for c in df.columns]

    except Exception as e:
        logger.warning(f"LLM Header clean failed (Table {table_index}): {e}")

    return df
|
| 149 |
+
|
| 150 |
+
# --- 4. MAIN PIPELINE EXPORT ---
|
| 151 |
+
|
| 152 |
+
def ingest_knbs_data(input_dir: str, db_name: str, model: str = "qwen2.5:14b"):
    """
    Main entry point to run the KNBS ingestion pipeline.

    Recursively scans input_dir for all .md files, extracts pipe tables,
    cleans their headers via the LLM, and loads each table into SQLite.

    Args:
        input_dir: Directory of markdown files; tried as-is, then relative
            to $SCRATCH.
        db_name: SQLite database file to write tables into.
        model: Ollama model tag used for header cleaning.
    """
    def _coerce_numeric(raw):
        # Strip thousands separators / percent signs, then attempt conversion.
        # On failure the cleaned string is kept -- same behaviour as the
        # deprecated pd.to_numeric(..., errors='ignore') this replaces.
        cleaned = str(raw).replace(',', '').replace('%', '')
        try:
            return pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            return cleaned

    # Paths
    SCRATCH = os.environ.get("SCRATCH", "/tmp")
    BASE_DIR = Path(SCRATCH)

    INPUT_PATH = Path(input_dir)
    if not INPUT_PATH.exists():
        # Fall back to interpreting input_dir relative to $SCRATCH.
        INPUT_PATH = BASE_DIR / input_dir

    if not INPUT_PATH.exists():
        logger.error(f" Input directory not found: {INPUT_PATH}")
        return

    OLLAMA_BIN = BASE_DIR / "ollama_core/bin/ollama"
    CUSTOM_PORT = "25000"
    OLLAMA_HOST = f"http://127.0.0.1:{CUSTOM_PORT}"

    # Infrastructure: bail out if no LLM server can be reached/started.
    if not _manage_ollama_server(OLLAMA_HOST, CUSTOM_PORT, OLLAMA_BIN, model):
        return

    engine = create_engine(f"sqlite:///{db_name}")
    client = Client(host=OLLAMA_HOST)

    # Process Files (RECURSIVE SEARCH using rglob)
    files = sorted(INPUT_PATH.rglob("*.md"))
    logger.info(f"🚀 Found {len(files)} KNBS markdown files (Recursive Scan). Starting ingestion...")

    for f in files:
        logger.info(f"📄 Processing {f.name}...")
        try:
            with open(f, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()

            # A. Extract Tables
            dfs = extract_tables_from_markdown(content)
            if not dfs:
                continue

            logger.info(f" found {len(dfs)} tables.")

            # B. Clean & Load Tables
            for i, df in enumerate(dfs):
                # Basic cleanup: drop fully-empty columns and rows.
                df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
                if df.empty or len(df) < 2:
                    continue  # Skip empty/tiny tables

                # LLM Semantic Cleaning
                df = clean_knbs_headers(df, f.name, i, client, model)

                # Sanitize columns whose names suggest numeric content.
                for c in df.columns:
                    if any(x in str(c).lower() for x in ['rate', 'value', 'amount', 'total', 'year', 'price']):
                        df[c] = df[c].apply(_coerce_numeric)

                # Naming: knbs_{filename_slug}_tab{index}
                slug = re.sub(r'[^a-zA-Z0-9]', '_', f.stem).lower()[:40].lstrip('_')
                table_name = f"{slug}_tab{i}"

                # Provenance column so rows can be traced back to their file.
                df['source_file'] = f.name

                df.to_sql(table_name, engine, if_exists='replace', index=False)
                logger.info(f" -> Saved table: {table_name} ({len(df)} rows)")

        except Exception as e:
            # One bad file must not stop the whole ingestion run.
            logger.error(f" Failed {f.name}: {e}")

    logger.info(" KNBS Ingestion Complete.")
|
src/transform/run_transform.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
import time
|
| 5 |
+
import logging
|
| 6 |
+
import requests
|
| 7 |
+
import torch
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from datetime import timedelta
|
| 10 |
+
|
| 11 |
+
# --- 1. LOGGING SETUP ---
# Identify Node Rank for logging clarity (SLURM sets SLURM_PROCID per task).
NODE_ID = os.environ.get("SLURM_PROCID", "0")

# Bugfix: logging.FileHandler raises FileNotFoundError when the target
# directory does not exist -- create "logs/" before attaching the handler.
Path("logs").mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format=f'%(asctime)s - [Node {NODE_ID}] - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(f"logs/node_{NODE_ID}_transform.log")
    ]
)
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
def main():
    """Drive one node's share of the PDF -> markdown transformation.

    Orchestration order: resolve paths, import extract.py, size the worker
    pool, (re)start a local Ollama server, pull/create the model, partition
    the PDF list across SLURM tasks, run the MarkerFolderProcessor, then
    shut the server down and clean up the temporary Modelfile.
    """
    t_start = time.perf_counter()
    logger.info(f"🚀 Starting Transformation Pipeline on Node {NODE_ID}")

    # --- 2. ENVIRONMENT & PATHS ---
    SCRATCH = Path(os.environ.get("SCRATCH", "/tmp"))
    INPUT_PDFS_DIR = SCRATCH / "mshauri-fedha/data/knbs/pdfs"
    OUTPUT_DIR = SCRATCH / "mshauri-fedha/data/knbs/marker-output"

    OLLAMA_HOME = SCRATCH / "ollama_core"
    OLLAMA_BIN = OLLAMA_HOME / "bin/ollama"
    OLLAMA_HOST = "http://localhost:11434"

    # Important: Ensure the current directory is in sys.path for 'extract' import
    if os.getcwd() not in sys.path:
        sys.path.append(os.getcwd())

    try:
        from extract import MarkerFolderProcessor, configure_parallelism
    except ImportError as e:
        logger.error(f"Could not import extract.py from {os.getcwd()}")
        raise e

    # --- 3. DYNAMIC PARALLELISM & OLLAMA CONFIG ---
    # Calculates workers based on node hardware (GH200 96GB)
    total_slots, workers_per_gpu, num_gpus = configure_parallelism()

    # Clean up any zombie servers on this node
    subprocess.run(["pkill", "-f", "ollama serve"], stderr=subprocess.DEVNULL)
    time.sleep(5)

    # Set server environment variables
    server_env = os.environ.copy()
    server_env["OLLAMA_NUM_PARALLEL"] = str(total_slots)
    server_env["OLLAMA_MAX_LOADED_MODELS"] = "1"
    server_env["OLLAMA_MAX_QUEUE"] = "2048"
    server_env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    logger.info(f"⏳ Launching Ollama Server with {total_slots} slots...")
    subprocess.Popen(
        [str(OLLAMA_BIN), "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        env=server_env
    )

    # Heartbeat check
    # NOTE(review): the sleep sits in the except branch only -- a reachable
    # server returning non-200 makes this loop spin without pausing; confirm
    # whether a sleep on the success-path miss is wanted.
    for i in range(60):
        try:
            if requests.get(OLLAMA_HOST).status_code == 200:
                logger.info(" Ollama Server is online.")
                break
        except:
            time.sleep(1)
    else:
        raise RuntimeError(" Ollama server heartbeat failed.")

    # --- 4. MODEL SETUP ---
    BASE_MODEL = "qwen2.5:7b"
    CUSTOM_MODEL_NAME = "qwen2.5-7b-16k"

    logger.info(f" Pulling {BASE_MODEL}...")
    subprocess.run([str(OLLAMA_BIN), "pull", BASE_MODEL], check=True, capture_output=True)

    logger.info(f" Creating custom 16k context model...")
    # Per-node Modelfile name avoids clashes on a shared filesystem.
    modelfile_path = Path(f"Modelfile_node_{NODE_ID}")
    modelfile_path.write_text(f"FROM {BASE_MODEL}\nPARAMETER num_ctx 16384")
    subprocess.run([str(OLLAMA_BIN), "create", CUSTOM_MODEL_NAME, "-f", str(modelfile_path)], check=True, capture_output=True)

    # --- 5. AUTOMATED DATA PARTITIONING ---
    # Get all PDFs and sort them for deterministic behavior
    all_pdfs = sorted(list(INPUT_PDFS_DIR.glob("*.pdf")))
    total_nodes = int(os.environ.get("SLURM_NTASKS", 1))
    node_rank = int(NODE_ID)

    # Each node takes every Nth file (Node 0 takes index 0, 2, 4... Node 1 takes 1, 3, 5...)
    my_pdfs = all_pdfs[node_rank::total_nodes]
    my_pdf_strs = [str(p) for p in my_pdfs]

    logger.info(f" Data Partitioning: Node {node_rank}/{total_nodes} handling {len(my_pdfs)} files.")

    # --- 6. EXECUTION ---
    os.chdir(SCRATCH)

    processor = MarkerFolderProcessor(
        output_dir=OUTPUT_DIR,
        ollama_url=OLLAMA_HOST,
        ollama_model=CUSTOM_MODEL_NAME,
        batch_multiplier=4,
        workers_per_gpu=workers_per_gpu,
        num_gpus=num_gpus
    )

    logger.info(f"🚀 Processing PDFs...")
    # Using the 'subset' parameter in process_folder (ensure extract.py supports this)
    processor.process_folder(INPUT_PDFS_DIR, batch_size=5, subset=my_pdf_strs)

    # --- 7. CLEANUP & TIMING ---
    t_end = time.perf_counter()
    duration = timedelta(seconds=t_end - t_start)
    logger.info(" Transformation process finished.")
    logger.info(f"⏱️ Total Duration for Node {NODE_ID}: {duration}")

    # Shutdown server
    subprocess.run(["pkill", "-f", "ollama serve"], stderr=subprocess.DEVNULL)
    if modelfile_path.exists(): modelfile_path.unlink()

if __name__ == "__main__":
    main()
|
src/transform/structure_data.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import logging
|
| 6 |
+
import requests
|
| 7 |
+
import subprocess
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
from typing import List, Tuple, Dict
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from sqlalchemy import create_engine, text
|
| 13 |
+
from ollama import Client
|
| 14 |
+
|
| 15 |
+
# --- LOGGING ---
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
|
| 17 |
+
logger = logging.getLogger("SmartIngestV6")
|
| 18 |
+
|
| 19 |
+
# --- 0. KNOWLEDGE BASE (THE RULES) ---

# Files to ignore completely
# (substring patterns -- presumably matched against file names; the
# consuming code is outside this chunk, so verify usage before editing)
SKIP_PATTERNS = [
    "december__2019_tap",
    "lcr_return",
    "lcr_sheet",
    "quarterly_gdp",
    "remittances",
    "depository_corporation_survey_(expanded)"
]

# Exact column expectations based on prior analysis [cite: 1, 2, 5, 6, 8, 14, 15, 16, 17]
# Maps a dataset family to its canonical snake_case column names; these serve
# as the expected keywords when scoring candidate header rows (see
# find_best_header_row).
SCHEMA_DEFINITIONS = {
    "annual_gdp": ["year", "month", "nominal_gdp_prices", "real_gdp_growth", "real_gdp_prices"],
    "bop_annual": ["bpm6_concept", "year_2019", "year_2020", "year_2021", "year_2022", "year_2023", "year_2024"],
    "indicative_rates": ["date", "currency", "mean_rate", "buy_rate", "sell_rate"],
    "exchange_rates": ["date", "currency", "mean_rate", "buy_rate", "sell_rate"], # Catch-all for historical/indicative
    "central_bank_rates": ["year", "month", "reverse_repo", "interbank_rate", "tbill_91_day", "tbill_182_day", "tbill_364_day", "reserve_requirement", "cbr"],
    "commercial_bank_rates": ["year", "month", "deposit_rate", "savings_rate", "lending_rate", "overdraft_rate"],
    "domestic_debt": ["fiscal_year", "treasury_bills", "treasury_bonds", "govt_stocks", "overdraft_cbk", "advances_commercial", "other_debt", "total_debt"],
    "forex_bureau": ["bureau_name", "usd_buy", "usd_sell", "usd_margin", "gbp_buy", "gbp_sell", "gbp_margin", "euro_buy", "euro_sell", "euro_margin"],
    "treasury_bills": ["issue_date", "amount_offered", "tenure", "amount_received", "amount_accepted", "yield_rate", "alloted", "rejected", "redeemed", "outstanding"],
    "treasury_bonds": ["issue_date", "bond_code", "amount_offered", "amount_received", "amount_accepted", "coupon_rate", "alloted", "rejected", "redeemed", "outstanding"],
    "exports": ["year", "month", "commodity", "value_millions", "total"],
    "imports": ["year", "month", "commodity", "value_millions", "total"],
    "revenue": ["year", "month", "tax_revenue", "non_tax_revenue", "total_revenue", "recurrent_expenditure", "development_expenditure"],
    "depository_corporation_survey": ["category", "data_values"] # Wide table handling triggered later
}
|
| 48 |
+
|
| 49 |
+
# --- 1. INFRASTRUCTURE ---
|
| 50 |
+
|
| 51 |
+
def _manage_ollama_server(ollama_host, ollama_port, ollama_bin, model):
    """Ensure a local Ollama server is reachable, starting one if needed.

    Args:
        ollama_host: Base URL of the server, e.g. "http://127.0.0.1:25000".
        ollama_port: Port the spawned server should bind to.
        ollama_bin: Path to the ollama executable.
        model: Model tag to pull once the server is up.

    Returns:
        True when the server is reachable (and the model pulled), else False.
    """
    # Fast path: a server may already be running on this host.
    # A timeout is essential here -- without one, a half-open socket would
    # hang the whole pipeline before it even starts.
    try:
        if requests.get(ollama_host, timeout=5).status_code == 200:
            logger.info(" Ollama connected.")
            return True
    except requests.RequestException:
        pass  # Not running yet -- fall through and start it ourselves.

    logger.info(f" Starting Ollama ({model})...")
    scratch_env = os.environ.get("SCRATCH", "/tmp")
    models_dir = Path(scratch_env) / "ollama_core/models"

    server_env = os.environ.copy()
    server_env["OLLAMA_HOST"] = f"127.0.0.1:{ollama_port}"
    server_env["OLLAMA_MODELS"] = str(models_dir)
    models_dir.mkdir(parents=True, exist_ok=True)

    try:
        subprocess.Popen(
            [str(ollama_bin), "serve"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            env=server_env,
        )
        time.sleep(5)  # Give the server a moment to bind before pulling.
        subprocess.run([str(ollama_bin), "pull", model], env=server_env, check=True)
        return True
    except Exception as e:
        logger.error(f" Server Error: {e}")
        return False
|
| 75 |
+
|
| 76 |
+
# --- 2. HEADER HUNTER (Geometric Scanner) ---
|
| 77 |
+
|
| 78 |
+
def read_csv_robust(file_path: Path) -> pd.DataFrame:
    """Read a headerless CSV as strings, trying several encodings.

    Returns a DataFrame of strings with NaN replaced by "", or an empty
    DataFrame when every encoding fails or the file has no parseable
    content (e.g. a zero-byte download).
    """
    encodings = ['utf-8', 'latin1', 'cp1252', 'ISO-8859-1']
    for enc in encodings:
        try:
            return pd.read_csv(file_path, header=None, dtype=str, encoding=enc).fillna("")
        except UnicodeDecodeError:
            continue  # Try the next candidate encoding.
        except pd.errors.EmptyDataError:
            break  # File has no columns at all -- no encoding will help.
    return pd.DataFrame()
|
| 86 |
+
|
| 87 |
+
def find_best_header_row(df_raw: pd.DataFrame, expected_keywords: List[str]) -> Tuple[int, int]:
    """Scores rows based on expected keywords for this specific file type.

    Scans up to the first 30 rows; each row earns +3 per expected keyword it
    contains and -10 when more than half its cells are numeric (i.e. it looks
    like data, not a header). Returns (header_row_idx, first_data_row_idx);
    falls back to the geometric scanner when no row scores positively.
    """
    # Guard: max() over an empty score dict would raise ValueError.
    if df_raw.empty:
        return 0, 1

    # If we have no expectations, use generic keywords
    if not expected_keywords:
        expected_keywords = ['year', 'month', 'date', 'rate', 'bank', 'shilling', 'total']

    scores = {}
    scan_depth = min(30, len(df_raw))

    for i in range(scan_depth):
        row_str = " ".join(df_raw.iloc[i].astype(str)).lower()
        score = 0

        # Reward: Matches expected schema
        for kw in expected_keywords:
            if kw.lower() in row_str:
                score += 3

        # Penalty: Looks like Data (Dense numbers)
        num_cells = sum(1 for c in df_raw.iloc[i].astype(str) if c.replace(',', '').replace('.', '').isdigit())
        if num_cells > len(df_raw.columns) * 0.5:
            score -= 10

        scores[i] = score

    best_header = max(scores, key=scores.get)
    if scores[best_header] <= 0:
        return _geometric_scan(df_raw)

    return best_header, best_header + 1
|
| 117 |
+
|
| 118 |
+
def _geometric_scan(df_raw):
|
| 119 |
+
"""Fallback: Find first dense block of numbers."""
|
| 120 |
+
def is_data(x):
|
| 121 |
+
try:
|
| 122 |
+
float(str(x).replace(',', ''))
|
| 123 |
+
return 1
|
| 124 |
+
except: return 0
|
| 125 |
+
scores = df_raw.map(is_data).sum(axis=1)
|
| 126 |
+
if scores.empty or scores.max() <= 1: return 0, 1
|
| 127 |
+
data_rows = scores[scores >= scores.max() * 0.5].index.tolist()
|
| 128 |
+
if not data_rows: return 0, 1
|
| 129 |
+
data_start = data_rows[0]
|
| 130 |
+
header_idx = max(0, data_start - 1)
|
| 131 |
+
# Search up for content
|
| 132 |
+
while header_idx > 0:
|
| 133 |
+
if df_raw.iloc[header_idx].str.join("").str.strip().any(): break
|
| 134 |
+
header_idx -= 1
|
| 135 |
+
return header_idx, data_start
|
| 136 |
+
|
| 137 |
+
# --- 3. HYBRID PROMPT STRATEGY ---
|
| 138 |
+
|
| 139 |
+
def get_clean_headers(raw_headers: List[str], first_row: List[str], filename: str, client: Client, model: str) -> List[str]:
    """Map raw CSV headers to clean snake_case names via schema + LLM.

    Matches *filename* against SCHEMA_DEFINITIONS to pick an expected schema,
    then asks the LLM to reconcile the raw headers with that schema. Falls
    back to the expected schema (padded/truncated to the column count) or to
    generic "col_i" names when the LLM call or its JSON response fails.
    """
    # 1. Identify File Type & Expectations
    expected_cols = []
    file_type = "generic"
    for key, cols in SCHEMA_DEFINITIONS.items():
        if key in filename.lower():
            file_type = key
            expected_cols = cols
            break

    # 2. Build Prompt
    valid_raw = [str(h).strip() for h in raw_headers]
    valid_data = [str(d).strip()[:15] for d in first_row]  # Truncate cells to keep the prompt small.

    prompt = f"""
You are a Financial Data Engineer.

File: "{filename}"
Detected Type: "{file_type}"
Expected Schema: {expected_cols}

Current Headers (Row N): {valid_raw}
First Data Row (Row N+1): {valid_data}

Task: Return a list of {len(raw_headers)} clean snake_case column names.

CRITICAL RULES:
1. PRIORITIZE THE EXPECTED SCHEMA. If the data looks like it matches the expectation, use those names.
2. If Expected Schema has 5 cols but file has 7, keep the 5 and name the others based on context (e.g., 'total').
3. If header is a Year ("1999"), keep it as "year_1999".
4. If header is empty/garbage, use the Data Row to guess (e.g. "Kenya Commercial Bank" -> "bank_name").

Respond ONLY with a JSON list of strings.
"""

    try:
        res = client.chat(model=model, messages=[{'role': 'user', 'content': prompt}], format='json')
        content = json.loads(res['message']['content'])

        # Some models wrap the list in an object, e.g. {"columns": [...]}.
        if isinstance(content, dict):
            for val in content.values():
                if isinstance(val, list):
                    return val
        return content if isinstance(content, list) else [f"col_{i}" for i in range(len(raw_headers))]
    except Exception as e:
        # FALLBACK: If LLM fails, return the Expected Schema (padded if needed).
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        logger.warning(f" LLM header mapping failed ({e}); using fallback schema.")
        if expected_cols:
            if len(expected_cols) < len(raw_headers):
                return expected_cols + [f"extra_{i}" for i in range(len(raw_headers) - len(expected_cols))]
            return expected_cols[:len(raw_headers)]
        return [f"col_{i}" for i in range(len(raw_headers))]
|
| 189 |
+
|
| 190 |
+
# --- 4. SPECIFIC TRANSFORMS ---
|
| 191 |
+
|
| 192 |
+
def apply_specific_transforms(df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """Apply dataset-specific reshaping rules keyed off the filename.

    Rules (each is best-effort; a failing rule leaves df unchanged):
      * "revenue" files: drop the 3 preamble rows above the data (Rule 20).
      * "depository_corporation": melt the wide date columns long (Rule 9).
      * any file with 'year' and 'month' columns: add a combined 'period'
        column (Rules 1/19/21/22).
    """
    fname = filename.lower()

    # Rule 20: Revenue & Expenditure - Remove top 3 rows
    if "revenue" in fname:
        if len(df) > 3:
            df = df.iloc[3:].reset_index(drop=True)

    # Rule 9: Depository Survey - Wide Table Logic
    if "depository_corporation" in fname:
        # This is a massive wide table. We usually want to melt it,
        # assuming col 0 is Category and the rest are dates.
        try:
            id_vars = [df.columns[0]]
            value_vars = [c for c in df.columns if c != df.columns[0]]
            df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name="date", value_name="amount_millions")
        except Exception:
            pass  # Best-effort: keep the wide shape if melting fails.

    # Rule 1/19/21/22: Year + Month merging
    cols = [str(c).lower() for c in df.columns]
    if 'year' in cols and 'month' in cols:
        try:
            y_idx = cols.index('year')
            m_idx = cols.index('month')
            df['period'] = df.iloc[:, y_idx].astype(str) + '-' + df.iloc[:, m_idx].astype(str)
        except Exception:
            pass  # Best-effort: skip the merged period column on failure.

    return df
|
| 221 |
+
|
| 222 |
+
# --- 5. PROCESSING CORE ---
|
| 223 |
+
|
| 224 |
+
def process_file_v6(file_path: Path, engine, client, model):
    """End-to-end pipeline for one CSV: detect header, rename, transform, save.

    Steps: blacklist check -> robust read -> schema-aware header hunt ->
    LLM/hybrid column renaming -> dataset-specific transforms -> numeric
    coercion on money/rate columns -> write to SQL (table name derived
    from the file stem). SQL failures are logged, not raised.
    """
    # 1. Skip Check
    if any(p in file_path.name.lower() for p in SKIP_PATTERNS):
        logger.warning(f" Skipping {file_path.name} (Blacklisted)")
        return

    logger.info(f"Processing {file_path.name}...")

    # 2. Read
    df_raw = read_csv_robust(file_path)
    if df_raw.empty:
        return

    # 3. Identify Expectations for Header Scanning
    expected_keys = []
    for key, cols in SCHEMA_DEFINITIONS.items():
        if key in file_path.name.lower():
            expected_keys = cols
            break

    # 4. Find Header
    header_idx, data_start = find_best_header_row(df_raw, expected_keys)

    # 5. Extract Headers
    raw_headers = df_raw.iloc[header_idx].tolist()

    # Double Header Check: merge a non-trivial row directly above the header
    # (multi-row headers are common in these exports).
    if header_idx > 0:
        row_above = df_raw.iloc[header_idx - 1].fillna("").astype(str).tolist()
        if sum(len(x) for x in row_above) > 10:
            raw_headers = [f"{p} {c}".strip() for p, c in zip(row_above, raw_headers)]

    if len(raw_headers) != len(df_raw.columns):
        raw_headers = [f"col_{i}" for i in range(len(df_raw.columns))]

    # 6. LLM / Hybrid Map
    first_row = df_raw.iloc[data_start].tolist() if data_start < len(df_raw) else [""] * len(raw_headers)
    clean_headers = get_clean_headers(raw_headers, first_row, file_path.name, client, model)

    # Align Lengths
    if len(clean_headers) < len(df_raw.columns):
        clean_headers += [f"extra_{i}" for i in range(len(df_raw.columns) - len(clean_headers))]
    clean_headers = clean_headers[:len(df_raw.columns)]

    # 7. Build DF
    df = df_raw.iloc[data_start:].copy()
    df.columns = clean_headers

    # 8. Transforms
    df = apply_specific_transforms(df, file_path.name)

    # 9. Clean & Save
    df = df.loc[:, ~df.columns.str.contains('^unnamed', case=False)]
    df.dropna(thresh=1, inplace=True)

    def _to_number(x):
        # Strip thousands separators and accounting-style parentheses
        # ("(123)" -> "-123)"-style cleanup), then coerce. Returns the
        # cleaned string when it is not numeric -- this mirrors the old
        # pd.to_numeric(errors='ignore'), which is deprecated in pandas 2.2
        # and removed in pandas 3.0.
        cleaned = str(x).replace(',', '').replace('(', '-').replace(')', '')
        try:
            return pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            return cleaned

    for c in df.columns:
        if any(x in str(c).lower() for x in ['rate', 'value', 'amount', 'mean', 'buy', 'sell']):
            df[c] = df[c].apply(_to_number)

    # Derive a SQL-safe table name from the stem, dropping batch prefixes.
    table_name = re.sub(r'cbk_batch_\d+_\d+_', '', file_path.stem)
    table_name = re.sub(r'[^a-zA-Z0-9]', '_', table_name).lower()[:60].lstrip('_')
    df['source_file'] = file_path.name

    try:
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        logger.info(f" Saved {len(df)} rows to '{table_name}'")
    except Exception as e:
        logger.error(f" SQL Error: {e}")
|
| 291 |
+
|
| 292 |
+
# --- MAIN ---
|
| 293 |
+
|
| 294 |
+
def process_cbk_files(input_dir: str, db_name="mshauri_fedha_v6.db", model="qwen2.5:14b"):
    """Process every CSV in input_dir into a SQLite database.

    Resolves input_dir relative to $SCRATCH when it is not an existing path,
    boots/validates the local Ollama server, runs the v6 pipeline per file,
    and reports the resulting table count.
    """
    SCRATCH = os.environ.get("SCRATCH", "/tmp")
    BASE_DIR = Path(SCRATCH)
    INPUT_PATH = Path(input_dir) if Path(input_dir).exists() else BASE_DIR / input_dir

    if not INPUT_PATH.exists():
        # Previously a silent return -- say why nothing happened.
        logger.error(f"Input path not found: {INPUT_PATH}")
        return

    OLLAMA_BIN = BASE_DIR / "ollama_core/bin/ollama"
    CUSTOM_PORT = "25000"
    OLLAMA_HOST = f"http://127.0.0.1:{CUSTOM_PORT}"

    if not _manage_ollama_server(OLLAMA_HOST, CUSTOM_PORT, OLLAMA_BIN, model):
        logger.error("Could not start or reach the Ollama server; aborting.")
        return

    engine = create_engine(f"sqlite:///{db_name}")
    client = Client(host=OLLAMA_HOST)

    files = sorted(INPUT_PATH.glob("*.csv"))
    print(f"🚀 Processing {len(files)} files...")

    for f in files:
        process_file_v6(f, engine, client, model)

    print("\n Done.")
    with engine.connect() as conn:
        tables = conn.execute(text("SELECT name FROM sqlite_master WHERE type='table'")).fetchall()
        print(f"📊 Created {len(tables)} tables.")
|
| 320 |
+
|
| 321 |
+
if __name__ == "__main__":
    # Intentionally a no-op at this prototype stage: no CLI wiring yet --
    # presumably a driver imports and calls process_cbk_files() directly.
    pass
|