datamatters24 committed on
Commit
883836c
·
verified ·
1 Parent(s): 65e006b

Upload notebooks/00_setup/01_verify_environment.ipynb with huggingface_hub

Browse files
notebooks/00_setup/01_verify_environment.ipynb ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Environment Verification\n",
8
+ "\n",
9
+ "Verify that all system components are correctly configured:\n",
10
+ "database connectivity, NLP models, embedding models, and system resources."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "metadata": {},
16
+ "source": [
17
+ "# Test DB connection\n",
18
+ "from research_lib.db import fetch_df\n",
19
+ "\n",
20
+ "result = fetch_df(\"SELECT 1 AS test\")\n",
21
+ "assert result.iloc[0][\"test\"] == 1, \"DB connection failed\"\n",
22
+ "print(\"Database connection: OK\")\n",
23
+ "print(result)"
24
+ ],
25
+ "execution_count": null,
26
+ "outputs": []
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "metadata": {},
31
+ "source": [
32
+ "# Check table row counts\n",
33
+ "from research_lib.db import fetch_df\n",
34
+ "\n",
35
+ "df_tables = fetch_df(\n",
36
+ " \"SELECT relname, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC\"\n",
37
+ ")\n",
38
+ "print(\"Table row counts:\")\n",
39
+ "print(df_tables.to_string(index=False))"
40
+ ],
41
+ "execution_count": null,
42
+ "outputs": []
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "metadata": {},
47
+ "source": [
48
+ "# Verify spaCy loads\n",
49
+ "from research_lib.nlp import get_nlp\n",
50
+ "\n",
51
+ "nlp = get_nlp()\n",
52
+ "print(f\"spaCy model: {nlp.meta['name']}\")\n",
53
+ "print(f\"spaCy version: {nlp.meta.get('spacy_version', 'unknown')}\")\n",
54
+ "print(f\"Pipeline components: {nlp.pipe_names}\")\n",
55
+ "\n",
56
+ "# Quick sanity check\n",
57
+ "doc = nlp(\"Jeffrey Epstein traveled to New York on January 5, 2005.\")\n",
58
+ "print(f\"\\nTest NER results:\")\n",
59
+ "for ent in doc.ents:\n",
60
+ "    print(f\" {ent.text:30s} -> {ent.label_}\")"
61
+ ],
62
+ "execution_count": null,
63
+ "outputs": []
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "metadata": {},
68
+ "source": [
69
+ "# Verify sentence-transformers\n",
70
+ "from research_lib.embeddings import get_embedder\n",
71
+ "\n",
72
+ "m = get_embedder()\n",
73
+ "print(f\"Embedding dim: {m.get_sentence_embedding_dimension()}\")\n",
74
+ "print(f\"Max seq length: {m.max_seq_length}\")\n",
75
+ "\n",
76
+ "# Quick encode test\n",
77
+ "test_emb = m.encode([\"test sentence\"])\n",
78
+ "print(f\"Test embedding shape: {test_emb.shape}\")\n",
79
+ "print(\"Sentence-transformers: OK\")"
80
+ ],
81
+ "execution_count": null,
82
+ "outputs": []
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "metadata": {},
87
+ "source": [
88
+ "# Print system info\n",
89
+ "import psutil\n",
90
+ "import shutil\n",
91
+ "import platform\n",
92
+ "import os\n",
93
+ "\n",
94
+ "mem = psutil.virtual_memory()\n",
95
+ "disk = shutil.disk_usage(\"/\")\n",
96
+ "\n",
97
+ "print(\"=== System Information ===\")\n",
98
+ "print(f\"Platform: {platform.platform()}\")\n",
99
+ "print(f\"Python: {platform.python_version()}\")\n",
100
+ "print(f\"CPU cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical\")\n",
101
+ "print(f\"CPU usage: {psutil.cpu_percent(interval=1):.1f}%\")\n",
102
+ "print(f\"RAM total: {mem.total / (1024**3):.1f} GB\")\n",
103
+ "print(f\"RAM used: {mem.used / (1024**3):.1f} GB ({mem.percent}%)\")\n",
104
+ "print(f\"RAM available:{mem.available / (1024**3):.1f} GB\")\n",
105
+ "print(f\"Disk total: {disk.total / (1024**3):.1f} GB\")\n",
106
+ "print(f\"Disk used: {disk.used / (1024**3):.1f} GB\")\n",
107
+ "print(f\"Disk free: {disk.free / (1024**3):.1f} GB\")"
108
+ ],
109
+ "execution_count": null,
110
+ "outputs": []
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "metadata": {},
115
+ "source": [
116
+ "# Print collection file counts\n",
117
+ "import os\n",
118
+ "from pathlib import Path\n",
119
+ "\n",
120
+ "raw_dir = Path(\"/data/raw\")\n",
121
+ "\n",
122
+ "print(\"=== Collection File Counts ===\")\n",
123
+ "print(f\"{'Collection':<40} {'Files':>10}\")\n",
124
+ "print(\"-\" * 55)\n",
125
+ "\n",
126
+ "if raw_dir.exists():\n",
127
+ "    total = 0\n",
128
+ "    for d in sorted(raw_dir.iterdir()):\n",
129
+ "        if d.is_dir():\n",
130
+ "            count = sum(1 for f in d.rglob(\"*\") if f.is_file())\n",
131
+ "            total += count\n",
132
+ "            print(f\"{d.name:<40} {count:>10,}\")\n",
133
+ "    print(\"-\" * 55)\n",
134
+ "    print(f\"{'TOTAL':<40} {total:>10,}\")\n",
135
+ "else:\n",
136
+ "    print(f\"Directory {raw_dir} does not exist.\")"
137
+ ],
138
+ "execution_count": null,
139
+ "outputs": []
140
+ }
141
+ ],
142
+ "metadata": {
143
+ "kernelspec": {
144
+ "display_name": "Python 3",
145
+ "language": "python",
146
+ "name": "python3"
147
+ },
148
+ "language_info": {
149
+ "name": "python",
150
+ "version": "3.10.0"
151
+ }
152
+ },
153
+ "nbformat": 4,
154
+ "nbformat_minor": 5
155
+ }