datamatters24
/

research-document-archive

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Environment Verification\n",
+    "\n",
+    "Verify that all system components are correctly configured:\n",
+    "database connectivity, NLP models, embedding models, system resources, and raw collection file counts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Smoke-test the database: a trivial query must round-trip through fetch_df.\n",
+    "from research_lib.db import fetch_df\n",
+    "\n",
+    "result = fetch_df(\"SELECT 1 AS test\")\n",
+    "\n",
+    "# Look at the first returned row; its aliased column must carry the literal 1.\n",
+    "first_row = result.iloc[0]\n",
+    "assert first_row[\"test\"] == 1, \"DB connection failed\"\n",
+    "\n",
+    "print(\"Database connection: OK\")\n",
+    "print(result)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Check table row counts\n",
+    "#\n",
+    "# NOTE: n_live_tup is PostgreSQL's *estimated* number of live rows, not an exact\n",
+    "# COUNT(*). It can lag behind reality (or read 0) for tables that have not been\n",
+    "# analyzed or vacuumed yet, so treat these figures as approximate.\n",
+    "from research_lib.db import fetch_df\n",
+    "\n",
+    "# schemaname disambiguates same-named tables living in different schemas; the\n",
+    "# extra ORDER BY keys keep the listing stable when estimates tie.\n",
+    "df_tables = fetch_df(\n",
+    "    \"SELECT schemaname, relname, n_live_tup \"\n",
+    "    \"FROM pg_stat_user_tables \"\n",
+    "    \"ORDER BY n_live_tup DESC, schemaname, relname\"\n",
+    ")\n",
+    "\n",
+    "print(\"Table row counts (estimated, from n_live_tup):\")\n",
+    "if df_tables.empty:\n",
+    "    # Nothing to tabulate, e.g. the schema has not been created yet.\n",
+    "    print(\"  (no user tables found)\")\n",
+    "else:\n",
+    "    print(df_tables.to_string(index=False))"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Verify spaCy loads\n",
+    "from research_lib.nlp import get_nlp\n",
+    "\n",
+    "nlp = get_nlp()\n",
+    "\n",
+    "# Read both metadata fields defensively so a model with incomplete meta still\n",
+    "# produces a report instead of a KeyError.\n",
+    "print(f\"spaCy model: {nlp.meta.get('name', 'unknown')}\")\n",
+    "print(f\"spaCy version: {nlp.meta.get('spacy_version', 'unknown')}\")\n",
+    "print(f\"Pipeline components: {nlp.pipe_names}\")\n",
+    "\n",
+    "# Quick sanity check: run one sentence through the pipeline and list entities.\n",
+    "doc = nlp(\"Jeffrey Epstein traveled to New York on January 5, 2005.\")\n",
+    "print(\"\\nTest NER results:\")\n",
+    "if doc.ents:\n",
+    "    for ent in doc.ents:\n",
+    "        print(f\"  {ent.text:30s} -> {ent.label_}\")\n",
+    "else:\n",
+    "    # Make an empty result visible rather than printing a bare header; this\n",
+    "    # likely means the loaded pipeline has no (enabled) NER component.\n",
+    "    print(\"  (no entities detected - check that 'ner' is listed in the pipeline components above)\")"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Verify sentence-transformers\n",
+    "from research_lib.embeddings import get_embedder\n",
+    "\n",
+    "embedder = get_embedder()\n",
+    "embedding_dim = embedder.get_sentence_embedding_dimension()\n",
+    "print(f\"Embedding dim: {embedding_dim}\")\n",
+    "print(f\"Max seq length: {embedder.max_seq_length}\")\n",
+    "\n",
+    "# Quick encode test: one input sentence should yield exactly one vector whose\n",
+    "# width matches the dimension the model reports.\n",
+    "test_emb = embedder.encode([\"test sentence\"])\n",
+    "print(f\"Test embedding shape: {test_emb.shape}\")\n",
+    "\n",
+    "# Only report OK once the output has actually been checked, mirroring the\n",
+    "# assertion used by the database check.\n",
+    "assert tuple(test_emb.shape) == (1, embedding_dim), (\n",
+    "    f\"Unexpected embedding shape {tuple(test_emb.shape)}; expected (1, {embedding_dim})\"\n",
+    ")\n",
+    "print(\"Sentence-transformers: OK\")"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Report host details: OS/Python versions, CPU, memory and root-disk usage.\n",
+    "import psutil\n",
+    "import shutil\n",
+    "import platform\n",
+    "import os\n",
+    "\n",
+    "\n",
+    "def _gib(num_bytes):\n",
+    "    \"\"\"Convert a byte count to units of 1024**3 bytes.\"\"\"\n",
+    "    return num_bytes / (1024**3)\n",
+    "\n",
+    "\n",
+    "# Take the memory and disk snapshots once, before anything is printed.\n",
+    "mem = psutil.virtual_memory()\n",
+    "disk = shutil.disk_usage(\"/\")\n",
+    "\n",
+    "print(\"=== System Information ===\")\n",
+    "\n",
+    "# Software platform\n",
+    "print(f\"Platform:     {platform.platform()}\")\n",
+    "print(f\"Python:       {platform.python_version()}\")\n",
+    "\n",
+    "# CPU (cpu_percent samples over a one-second interval, so this line blocks briefly)\n",
+    "print(f\"CPU cores:    {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count(logical=True)} logical\")\n",
+    "print(f\"CPU usage:    {psutil.cpu_percent(interval=1):.1f}%\")\n",
+    "\n",
+    "# Memory\n",
+    "print(f\"RAM total:    {_gib(mem.total):.1f} GB\")\n",
+    "print(f\"RAM used:     {_gib(mem.used):.1f} GB ({mem.percent}%)\")\n",
+    "print(f\"RAM available:{_gib(mem.available):.1f} GB\")\n",
+    "\n",
+    "# Disk (filesystem mounted at \"/\")\n",
+    "print(f\"Disk total:   {_gib(disk.total):.1f} GB\")\n",
+    "print(f\"Disk used:    {_gib(disk.used):.1f} GB\")\n",
+    "print(f\"Disk free:    {_gib(disk.free):.1f} GB\")"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "# Print collection file counts\n",
+    "#\n",
+    "# Each immediate subdirectory of raw_dir is treated as one collection and its\n",
+    "# files are counted recursively. Files sitting directly in raw_dir (outside any\n",
+    "# subdirectory) are not counted and are not part of TOTAL.\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "raw_dir = Path(\"/data/raw\")\n",
+    "\n",
+    "print(\"=== Collection File Counts ===\")\n",
+    "print(f\"{'Collection':<40} {'Files':>10}\")\n",
+    "print(\"-\" * 55)\n",
+    "\n",
+    "# is_dir() rather than exists(): if the path exists but is a regular file,\n",
+    "# iterdir() would raise NotADirectoryError instead of reaching the message below.\n",
+    "if raw_dir.is_dir():\n",
+    "    total = 0\n",
+    "    for collection_dir in sorted(raw_dir.iterdir()):\n",
+    "        if not collection_dir.is_dir():\n",
+    "            continue  # skip loose files at the top level\n",
+    "        count = sum(1 for entry in collection_dir.rglob(\"*\") if entry.is_file())\n",
+    "        total += count\n",
+    "        print(f\"{collection_dir.name:<40} {count:>10,}\")\n",
+    "    print(\"-\" * 55)\n",
+    "    print(f\"{'TOTAL':<40} {total:>10,}\")\n",
+    "else:\n",
+    "    print(f\"Directory {raw_dir} does not exist.\")"
+   ],
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}