"""Create a clean isolated venv and install all chatbot dependencies into it.

Why this script:
    Up to now we used --user --break-system-packages installs into ~/.local.
    That works but is not reproducible. This script creates a proper isolated
    venv at <PROJECT_ROOT>/.venv so that requirements.txt reflects EXACTLY the
    environment used for training and inference.

Approach:
    1. Create .venv at PROJECT_ROOT/.venv (clean — no --system-site-packages).
    2. Upgrade pip and wheel inside the venv.
    3. Pin setuptools_scm<10 first (works around seqeval build bug we hit
       before).
    4. Install torch with CUDA. Try indexes in order:
       cu126 -> cu124 -> cu121 -> cpu.
       The GTX 1650 (compute 7.5) runs on any of these; cu130 wheels for
       Py3.13 are not yet on the official torch index, so we don't try it.
    5. Install all other dependency groups (mirrors install.py).
    6. Verify imports for every package.
    7. Run `pip freeze` inside the venv -> overwrite requirements.txt with
       pins.
    8. Print activation instructions.

Run once. Re-running will reuse an existing .venv (will not delete it).
"""

from __future__ import annotations

import os
import shutil
import subprocess
import sys
import venv
from pathlib import Path
from typing import Iterable

# This file lives one directory below the project root (e.g. <root>/setup/).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
VENV_DIR = PROJECT_ROOT / ".venv"

# The stdlib venv module puts executables in "Scripts" (with an .exe suffix)
# on Windows and in "bin" everywhere else. Deriving the layout here keeps
# every subprocess call working on both; POSIX paths are unchanged.
_ON_WINDOWS = os.name == "nt"
_VENV_BIN_DIR = VENV_DIR / ("Scripts" if _ON_WINDOWS else "bin")
VENV_PYTHON = _VENV_BIN_DIR / ("python.exe" if _ON_WINDOWS else "python")
VENV_PIP = _VENV_BIN_DIR / ("pip.exe" if _ON_WINDOWS else "pip")

REQ_FILE = PROJECT_ROOT / "requirements.txt"

# ---------------------------------------------------------------------------
# Dependency groups
# ---------------------------------------------------------------------------
# Each entry: (group_name, packages, optional_flag)
# NOTE: the FIRST entry must stay "build_tools" — main() installs GROUPS[0]
# before torch and everything else.
GROUPS: list[tuple[str, list[str], bool]] = [
    # Build tools — install FIRST. setuptools_scm pinned to avoid seqeval bug
    # we already hit on the user-site install.
    ("build_tools", [
        "pip>=24.0",
        "wheel>=0.42",
        "setuptools>=68",
        "setuptools_scm<10",
    ], False),

    # Core HuggingFace stack.
    ("core_ml", [
        "transformers>=4.40",
        "datasets>=2.18",
        "tokenizers>=0.15",
        "accelerate>=0.27",
        "evaluate>=0.4",
        "huggingface_hub>=0.23",
    ], False),

    # Data manipulation / ML utilities.
    ("data_ml", [
        "numpy>=1.26",
        "pandas>=2.1",
        "scikit-learn>=1.3",
        "scipy>=1.11",
    ], False),

    # Language detection.
    ("lang_detect", [
        "langdetect>=1.0.9",
        "lingua-language-detector>=2.0",
    ], False),

    # Arabic NLP.
    ("arabic_pyarabic", ["pyarabic>=0.6.15"], False),
    ("arabic_camel_optional", ["camel-tools"], True),

    # Embeddings + vector search.
    ("embeddings_rag", [
        "sentence-transformers>=2.7",
        "faiss-cpu>=1.7.4",
    ], False),

    # Web / scraping.
    ("web_core", [
        "requests>=2.31",
        "beautifulsoup4>=4.12",
        "lxml>=5.1",
    ], False),
    ("web_scrapy_optional", ["scrapy>=2.11"], True),

    # Sequence labelling metrics — needs setuptools_scm<10 (already pinned
    # above).
    ("seq_metrics", ["seqeval==1.2.2"], False),

    # UI / serving.
    ("ui_serving", [
        "gradio>=4.20",
        "fastapi>=0.110",
        "uvicorn>=0.27",
    ], False),

    # Utils.
    ("utils_required", ["tqdm>=4.66"], False),
    ("utils_optional", ["wandb>=0.16"], True),

    # Plotting.
    ("plotting", ["matplotlib>=3.8"], False),
]

# torch indexes in priority order. We stop at the first one that succeeds.
TORCH_INDEXES: list[tuple[str, str]] = [
    ("cu126", "https://download.pytorch.org/whl/cu126"),
    ("cu124", "https://download.pytorch.org/whl/cu124"),
    ("cu121", "https://download.pytorch.org/whl/cu121"),
    ("cpu", "https://download.pytorch.org/whl/cpu"),
]


# ---------------------------------------------------------------------------
# venv creation
# ---------------------------------------------------------------------------
def create_venv() -> None:
    """Create the venv if it doesn't already exist.

    A directory that already contains the venv interpreter is reused as-is.
    A directory without the interpreter is treated as a half-built venv and
    is removed before a fresh one is created.
    """
    if VENV_DIR.exists() and VENV_PYTHON.exists():
        print(f"venv already exists at {VENV_DIR} — reusing.")
        return

    if VENV_DIR.exists():
        # Half-built venv — remove and recreate
        print(f"venv directory exists but is incomplete — removing {VENV_DIR}")
        shutil.rmtree(VENV_DIR)

    print(f"Creating venv at {VENV_DIR} ...")
    builder = venv.EnvBuilder(
        system_site_packages=False,  # clean isolation
        clear=False,
        with_pip=True,
        upgrade_deps=False,
    )
    builder.create(str(VENV_DIR))
    print(" ✓ venv created.")


# ---------------------------------------------------------------------------
# pip helpers (use venv's pip, not system)
# ---------------------------------------------------------------------------
def venv_pip(args: list[str], stream: bool = False) -> tuple[bool, str]:
    """Run ``<venv python> -m pip <args>`` and return ``(ok, log)``.

    pip is invoked through the venv interpreter rather than the ``pip``
    console script: that is the form pip documents for upgrading itself, and
    it only relies on the interpreter that ``create_venv`` checks for.

    Args:
        args: Arguments passed to pip (e.g. ``["install", "tqdm"]``).
        stream: When true, pip's output goes straight to the terminal and the
            returned log is empty. When false, stdout and stderr are captured
            and returned concatenated.

    Returns:
        ``(ok, log)`` where ``ok`` is true iff pip exited with status 0.
    """
    cmd = [str(VENV_PYTHON), "-m", "pip", *args]
    if stream:
        # For long-running torch downloads, stream to terminal directly
        proc = subprocess.run(cmd, check=False)
        return proc.returncode == 0, ""
    proc = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        errors="replace",  # odd bytes in pip output must not crash the setup
        check=False,
    )
    return proc.returncode == 0, (proc.stdout or "") + (proc.stderr or "")


def venv_python(code: str) -> tuple[bool, str]:
    """Run a snippet of Python with the venv's interpreter.

    Returns:
        ``(ok, output)`` where ``ok`` is true iff the interpreter exited with
        status 0 and ``output`` is captured stdout followed by stderr.
    """
    proc = subprocess.run(
        [str(VENV_PYTHON), "-c", code],
        capture_output=True,
        text=True,
        errors="replace",
        check=False,
    )
    return proc.returncode == 0, (proc.stdout or "") + (proc.stderr or "")


def upgrade_pip_in_venv() -> None:
    """Make sure pip itself is current inside the venv.

    Raises:
        RuntimeError: If the pip upgrade command exits with a non-zero status.
    """
    print("\nUpgrading pip inside venv ...")
    ok, log = venv_pip(["install", "--upgrade", "pip", "wheel", "setuptools"])
    print("\n".join(log.strip().splitlines()[-8:]))
    if not ok:
        raise RuntimeError("Failed to upgrade pip in venv")


# ---------------------------------------------------------------------------
# torch install with fallback chain
# ---------------------------------------------------------------------------
def install_torch() -> tuple[bool, str]:
    """Try torch wheels from each index in ``TORCH_INDEXES`` order.

    An index counts as successful only when pip exits cleanly AND the venv
    interpreter can import torch afterwards.

    Returns:
        ``(True, label)`` for the first index that works, or ``(False, "")``
        when every index failed.
    """
    print("\n" + "=" * 72)
    print("Installing PyTorch (CUDA wheels — fallback chain)")
    print("=" * 72)

    # Once an installed build fails to import, a plain `pip install torch`
    # from the next index would be a no-op ("requirement already satisfied"),
    # so later attempts must force pip to replace the broken build.
    force_reinstall = False

    for label, url in TORCH_INDEXES:
        print(f"\n>>> Trying torch from index: {label} ({url})")
        pip_args = ["install", "--index-url", url]
        if force_reinstall:
            pip_args.append("--force-reinstall")
        pip_args += ["torch", "torchvision", "torchaudio"]

        ok, _ = venv_pip(pip_args, stream=True)
        if not ok:
            print(f" ✗ {label} index failed — trying next.")
            continue

        # Verify that import + CUDA reporting works
        ok2, out = venv_python(
            "import torch; "
            "print('TORCH', torch.__version__); "
            "print('CUDA', torch.cuda.is_available()); "
            "print('CUDA_V', torch.version.cuda); "
            "print('GPU', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'none')"
        )
        print(out.strip())
        if ok2:
            print(f" ✓ torch installed and importable ({label}).")
            return True, label

        print(" ✗ torch installed but failed to import — trying next index.")
        force_reinstall = True

    return False, ""


# ---------------------------------------------------------------------------
# group install
# ---------------------------------------------------------------------------
# Distributions whose importable module name is not simply the distribution
# name with "-" replaced by "_".
_IMPORT_NAME_OVERRIDES = {
    "beautifulsoup4": "bs4",
    "scikit-learn": "sklearn",
    "faiss-cpu": "faiss",
    "lingua-language-detector": "lingua",
}


def _requirement_name(spec: str) -> str:
    """Return the bare distribution name of a requirement specifier.

    Cuts at the first version operator, extras bracket, environment marker,
    direct-reference ``@`` or whitespace, so ``"uvicorn[standard]>=0.27"``
    yields ``"uvicorn"``.
    """
    import re  # local import: the module-level import block is outside this section

    return re.split(r"[<>=!~;@\[\s]", spec.strip(), maxsplit=1)[0]


def _import_name(base: str) -> str:
    """Return the module name to import for distribution ``base``."""
    return _IMPORT_NAME_OVERRIDES.get(base, base.replace("-", "_"))


def install_group(name: str, packages: list[str], optional: bool) -> dict:
    """Install a single dep group inside the venv and verify its imports.

    Args:
        name: Group name, used for reporting.
        packages: Requirement specifiers passed to ``pip install --upgrade``.
        optional: Whether a failure of this group is tolerable; only affects
            the printed marker here.

    Returns:
        A dict with keys ``group``, ``optional``, ``fully_ok`` (pip succeeded
        and every package imported) and ``verifies`` (a list of
        ``(package, ok, info)`` tuples, where ``info`` is the installed
        version or the last line of the error output).
    """
    label = f"[{name}]" + (" (optional)" if optional else "")
    print(f"\n{'=' * 72}")
    print(f"Installing group: {label}")
    print(f" Packages: {', '.join(packages)}")
    print("=" * 72)

    ok, log = venv_pip(["install", "--upgrade", *packages])
    tail = "\n".join(log.strip().splitlines()[-15:])
    print(tail)

    # Verify imports
    verifies: list[tuple[str, bool, str]] = []
    for spec in packages:
        base = _requirement_name(spec)
        import_name = _import_name(base)
        ok2, out = venv_python(
            f"import importlib, importlib.metadata as md;"
            f"m = importlib.import_module({import_name!r});"
            f"v = md.version({base!r});"
            f"print(v)"
        )
        if ok2:
            verifies.append((base, True, out.strip()))
        else:
            verifies.append((base, False, out.strip().splitlines()[-1] if out.strip() else "import failed"))

    all_ok = all(v[1] for v in verifies)
    # One flag drives both the marker printed here and the value the caller
    # uses in its final report, so the two can never disagree.
    fully_ok = ok and all_ok
    marker = "✓" if fully_ok else ("!" if optional else "✗")
    print(f"\n {marker} {label}")
    if not ok:
        # Explains a failure marker when every import below still verifies
        # (e.g. packages were already present but the upgrade failed).
        print(" [FAIL] pip install exited with a non-zero status")
    for pkg, vok, info in verifies:
        sub = "OK " if vok else "FAIL"
        print(f" [{sub}] {pkg:<30s} {info}")

    return {"group": name, "optional": optional, "fully_ok": fully_ok, "verifies": verifies}


# ---------------------------------------------------------------------------
# requirements.txt generation
# ---------------------------------------------------------------------------
def write_requirements(torch_index: str) -> None:
    """Run `pip freeze` inside the venv and write requirements.txt.

    Args:
        torch_index: Label of the PyTorch index torch was installed from;
            recorded in the file's header comments.

    Raises:
        RuntimeError: If ``pip freeze`` exits with a non-zero status.
    """
    print("\n" + "=" * 72)
    print("Writing requirements.txt from venv pip freeze")
    print("=" * 72)

    # Not routed through venv_pip(): that helper merges stderr into the log,
    # and pip warnings must not end up inside requirements.txt.
    proc = subprocess.run(
        [str(VENV_PYTHON), "-m", "pip", "freeze"],
        capture_output=True,
        text=True,
        errors="replace",
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError("pip freeze failed:\n" + proc.stderr)

    header = [
        "# Auto-generated by setup/setup_venv.py",
        "# Pinned versions captured by `pip freeze` inside .venv",
        f"# Torch installed from PyTorch index: {torch_index}",
        f"# -> https://download.pytorch.org/whl/{torch_index}",
        "# To recreate this environment:",
        "# python -m venv .venv",
        "# source .venv/bin/activate",
        f"# pip install --index-url https://download.pytorch.org/whl/{torch_index} torch torchvision torchaudio",
        "# pip install -r requirements.txt",
        "",  # makes the joined header end with a newline
    ]
    REQ_FILE.write_text("\n".join(header) + proc.stdout, encoding="utf-8")
    print(f" ✓ {REQ_FILE} ({len(proc.stdout.splitlines())} packages pinned)")


# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------
def main() -> int:
    """Orchestrate venv creation, package installation, and requirements export.

    Steps, in order: create/reuse the venv, sanity-check its interpreter,
    upgrade pip, install the build-tools group, install torch through the
    index fallback chain, install the remaining groups, print a report and,
    if no required group failed, write requirements.txt.

    Returns:
        Process exit code: 0 on success, 1 on any failure. Every failure path
        prints a message and returns 1 — including the RuntimeError raised by
        ``upgrade_pip_in_venv`` and ``write_requirements`` — so the script
        never dies with a traceback for an expected failure.
    """
    print("=" * 72)
    print("Multilingual Chatbot — venv setup")
    print("=" * 72)
    print(f"Project : {PROJECT_ROOT}")
    print(f"venv path : {VENV_DIR}")
    print(f"Host pyver : {sys.version.split()[0]}")

    create_venv()

    # Quick sanity: make sure venv python works
    ok, out = venv_python("import sys; print(sys.version.split()[0])")
    if not ok:
        print("venv Python does not work:")
        print(out)
        return 1
    print(f"venv pyver : {out.strip()}")

    try:
        upgrade_pip_in_venv()
    except RuntimeError as exc:
        print(f"\n{exc} — aborting.")
        return 1

    # build_tools FIRST (pins setuptools_scm<10 — required for seqeval later)
    bt = install_group(*GROUPS[0])
    if not bt["fully_ok"]:
        print("Build tools failed — aborting.")
        return 1

    # torch with fallback chain
    torch_ok, torch_index = install_torch()
    if not torch_ok:
        print("\nAll torch indexes failed. Aborting.")
        return 1

    # remaining groups — keep going after a failure so the final report
    # lists every broken group, not just the first one
    summaries = [bt]
    hard_failures: list[str] = []
    for name, packages, optional in GROUPS[1:]:
        s = install_group(name, packages, optional)
        summaries.append(s)
        if not s["fully_ok"] and not optional:
            hard_failures.append(name)

    # final report
    print("\n" + "=" * 72)
    print("FINAL REPORT")
    print("=" * 72)
    print(f"venv: {VENV_DIR}")
    print(f"torch index: {torch_index}\n")
    for s in summaries:
        tag = "OPT " if s["optional"] else "REQ "
        status = "✓" if s["fully_ok"] else ("!" if s["optional"] else "✗")
        print(f" {status} [{tag}] {s['group']}")

    # Pins are only written for a fully working environment.
    if hard_failures:
        print(f"\nFailed required groups: {hard_failures}")
        return 1

    try:
        write_requirements(torch_index)
    except RuntimeError as exc:
        print(f"\nCould not write requirements.txt: {exc}")
        return 1

    print("\n" + "=" * 72)
    print("ACTIVATION")
    print("=" * 72)
    print("To use the venv from now on:")
    print(" source .venv/bin/activate")
    print("Then run any subsequent script with plain `python ...`.")
    print("Or call the venv python directly without activating:")
    print(f" {VENV_PYTHON} ")
    return 0


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\nAborted by user.")
        sys.exit(130)  # 128 + SIGINT