Upload notebooks/00_setup/01_verify_environment.ipynb with huggingface_hub
Browse files
notebooks/00_setup/01_verify_environment.ipynb
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Environment Verification\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Verify that all system components are correctly configured:\n",
|
| 10 |
+
"database connectivity, NLP models, embedding models, and system resources."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"source": [
|
| 17 |
+
"# Test DB connection\n",
|
| 18 |
+
"from research_lib.db import fetch_df\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"result = fetch_df(\"SELECT 1 AS test\")\n",
|
| 21 |
+
"assert result.iloc[0][\"test\"] == 1, \"DB connection failed\"\n",
|
| 22 |
+
"print(\"Database connection: OK\")\n",
|
| 23 |
+
"print(result)"
|
| 24 |
+
],
|
| 25 |
+
"execution_count": null,
|
| 26 |
+
"outputs": []
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"source": [
|
| 32 |
+
"# Check table row counts\n",
|
| 33 |
+
"from research_lib.db import fetch_df\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"df_tables = fetch_df(\n",
|
| 36 |
+
" \"SELECT relname, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC\"\n",
|
| 37 |
+
")\n",
|
| 38 |
+
"print(\"Table row counts:\")\n",
|
| 39 |
+
"print(df_tables.to_string(index=False))"
|
| 40 |
+
],
|
| 41 |
+
"execution_count": null,
|
| 42 |
+
"outputs": []
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"cell_type": "code",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"source": [
|
| 48 |
+
"# Verify spaCy loads\n",
|
| 49 |
+
"from research_lib.nlp import get_nlp\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"nlp = get_nlp()\n",
|
| 52 |
+
"print(f\"spaCy model: {nlp.meta['name']}\")\n",
|
| 53 |
+
"print(f\"spaCy version: {nlp.meta.get('spacy_version', 'unknown')}\")\n",
|
| 54 |
+
"print(f\"Pipeline components: {nlp.pipe_names}\")\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"# Quick sanity check\n",
|
| 57 |
+
"doc = nlp(\"Jeffrey Epstein traveled to New York on January 5, 2005.\")\n",
|
| 58 |
+
"print(f\"\\nTest NER results:\")\n",
|
| 59 |
+
"for ent in doc.ents:\n",
|
| 60 |
+
" print(f\" {ent.text:30s} -> {ent.label_}\")"
|
| 61 |
+
],
|
| 62 |
+
"execution_count": null,
|
| 63 |
+
"outputs": []
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"cell_type": "code",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"source": [
|
| 69 |
+
"# Verify sentence-transformers\n",
|
| 70 |
+
"from research_lib.embeddings import get_embedder\n",
|
| 71 |
+
"\n",
|
| 72 |
+
"m = get_embedder()\n",
|
| 73 |
+
"print(f\"Embedding dim: {m.get_sentence_embedding_dimension()}\")\n",
|
| 74 |
+
"print(f\"Max seq length: {m.max_seq_length}\")\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"# Quick encode test\n",
|
| 77 |
+
"test_emb = m.encode([\"test sentence\"])\n",
|
| 78 |
+
"print(f\"Test embedding shape: {test_emb.shape}\")\n",
|
| 79 |
+
"print(\"Sentence-transformers: OK\")"
|
| 80 |
+
],
|
| 81 |
+
"execution_count": null,
|
| 82 |
+
"outputs": []
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"cell_type": "code",
|
| 86 |
+
"metadata": {},
|
| 87 |
+
"source": [
|
| 88 |
+
"# Print system info\n",
|
| 89 |
+
"import psutil\n",
|
| 90 |
+
"import shutil\n",
|
| 91 |
+
"import platform\n",
|
| 92 |
+
"import os\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"mem = psutil.virtual_memory()\n",
|
| 95 |
+
"disk = shutil.disk_usage(\"/\")\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"print(\"=== System Information ===\")\n",
|
| 98 |
+
"print(f\"Platform: {platform.platform()}\")\n",
|
| 99 |
+
"print(f\"Python: {platform.python_version()}\")\n",
|
| 100 |
+
"print(f\"CPU cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical\")\n",
|
| 101 |
+
"print(f\"CPU usage: {psutil.cpu_percent(interval=1):.1f}%\")\n",
|
| 102 |
+
"print(f\"RAM total: {mem.total / (1024**3):.1f} GB\")\n",
|
| 103 |
+
"print(f\"RAM used: {mem.used / (1024**3):.1f} GB ({mem.percent}%)\")\n",
|
| 104 |
+
"print(f\"RAM available:{mem.available / (1024**3):.1f} GB\")\n",
|
| 105 |
+
"print(f\"Disk total: {disk.total / (1024**3):.1f} GB\")\n",
|
| 106 |
+
"print(f\"Disk used: {disk.used / (1024**3):.1f} GB\")\n",
|
| 107 |
+
"print(f\"Disk free: {disk.free / (1024**3):.1f} GB\")"
|
| 108 |
+
],
|
| 109 |
+
"execution_count": null,
|
| 110 |
+
"outputs": []
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"metadata": {},
|
| 115 |
+
"source": [
|
| 116 |
+
"# Print collection file counts\n",
|
| 117 |
+
"import os\n",
|
| 118 |
+
"from pathlib import Path\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"raw_dir = Path(\"/data/raw\")\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"print(\"=== Collection File Counts ===\")\n",
|
| 123 |
+
"print(f\"{'Collection':<40} {'Files':>10}\")\n",
|
| 124 |
+
"print(\"-\" * 55)\n",
|
| 125 |
+
"\n",
|
| 126 |
+
"if raw_dir.exists():\n",
|
| 127 |
+
" total = 0\n",
|
| 128 |
+
" for d in sorted(raw_dir.iterdir()):\n",
|
| 129 |
+
" if d.is_dir():\n",
|
| 130 |
+
" count = sum(1 for f in d.rglob(\"*\") if f.is_file())\n",
|
| 131 |
+
" total += count\n",
|
| 132 |
+
" print(f\"{d.name:<40} {count:>10,}\")\n",
|
| 133 |
+
" print(\"-\" * 55)\n",
|
| 134 |
+
" print(f\"{'TOTAL':<40} {total:>10,}\")\n",
|
| 135 |
+
"else:\n",
|
| 136 |
+
" print(f\"Directory {raw_dir} does not exist.\")"
|
| 137 |
+
],
|
| 138 |
+
"execution_count": null,
|
| 139 |
+
"outputs": []
|
| 140 |
+
}
|
| 141 |
+
],
|
| 142 |
+
"metadata": {
|
| 143 |
+
"kernelspec": {
|
| 144 |
+
"display_name": "Python 3",
|
| 145 |
+
"language": "python",
|
| 146 |
+
"name": "python3"
|
| 147 |
+
},
|
| 148 |
+
"language_info": {
|
| 149 |
+
"name": "python",
|
| 150 |
+
"version": "3.10.0"
|
| 151 |
+
}
|
| 152 |
+
},
|
| 153 |
+
"nbformat": 4,
|
| 154 |
+
"nbformat_minor": 5
|
| 155 |
+
}
|