Spaces:
Sleeping
Sleeping
| """Create a clean isolated venv and install all chatbot dependencies into it. | |
| Why this script: | |
| Up to now we used --user --break-system-packages installs into ~/.local. | |
| That works but is not reproducible. This script creates a proper isolated | |
| venv at <project>/.venv so that requirements.txt reflects EXACTLY the | |
| environment used for training and inference. | |
| Approach: | |
| 1. Create .venv at PROJECT_ROOT/.venv (clean — no --system-site-packages). | |
| 2. Upgrade pip, wheel and setuptools inside the venv. | |
| 3. Pin setuptools_scm<10 first (works around seqeval build bug we hit before). | |
| 4. Install torch with CUDA. Try indexes in order: cu126 -> cu124 -> cu121 -> cpu. | |
| The GTX 1650 (compute 7.5) runs on any of these; cu130 wheels for Py3.13 | |
| are not yet on the official torch index, so we don't try it. | |
| 5. Install all other dependency groups (mirrors install.py). | |
| 6. Verify imports for every package. | |
| 7. Run `pip freeze` inside the venv -> overwrite requirements.txt with pins. | |
| 8. Print activation instructions. | |
| Run once. Re-running reuses a complete existing .venv; a half-built one (directory present but bin/python missing) is removed and recreated. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import shutil | |
| import subprocess | |
| import sys | |
| import venv | |
| from pathlib import Path | |
| from typing import Iterable | |
# Filesystem layout. PROJECT_ROOT is the parent of the directory that holds
# this script; everything else is derived from it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
VENV_DIR = PROJECT_ROOT.joinpath(".venv")
VENV_PYTHON = VENV_DIR.joinpath("bin", "python")
VENV_PIP = VENV_DIR.joinpath("bin", "pip")
REQ_FILE = PROJECT_ROOT.joinpath("requirements.txt")
| # --------------------------------------------------------------------------- | |
| # Dependency groups | |
| # --------------------------------------------------------------------------- | |
# Dependency groups, installed in list order.
# Tuple layout: (group_name, pip requirement specifiers, is_optional).
# A failing optional group is reported but does not abort the setup.
GROUPS: list[tuple[str, list[str], bool]] = [
    # Build tooling goes first: setuptools_scm is capped below 10 to dodge
    # the seqeval build bug seen earlier on the user-site install.
    (
        "build_tools",
        ["pip>=24.0", "wheel>=0.42", "setuptools>=68", "setuptools_scm<10"],
        False,
    ),
    # HuggingFace core stack.
    (
        "core_ml",
        [
            "transformers>=4.40",
            "datasets>=2.18",
            "tokenizers>=0.15",
            "accelerate>=0.27",
            "evaluate>=0.4",
            "huggingface_hub>=0.23",
        ],
        False,
    ),
    # Numerical / classical-ML utilities.
    (
        "data_ml",
        ["numpy>=1.26", "pandas>=2.1", "scikit-learn>=1.3", "scipy>=1.11"],
        False,
    ),
    # Language identification.
    (
        "lang_detect",
        ["langdetect>=1.0.9", "lingua-language-detector>=2.0"],
        False,
    ),
    # Arabic NLP (camel-tools is optional).
    ("arabic_pyarabic", ["pyarabic>=0.6.15"], False),
    ("arabic_camel_optional", ["camel-tools"], True),
    # Sentence embeddings and vector search.
    (
        "embeddings_rag",
        ["sentence-transformers>=2.7", "faiss-cpu>=1.7.4"],
        False,
    ),
    # HTTP and HTML scraping (scrapy is optional).
    (
        "web_core",
        ["requests>=2.31", "beautifulsoup4>=4.12", "lxml>=5.1"],
        False,
    ),
    ("web_scrapy_optional", ["scrapy>=2.11"], True),
    # Sequence-labelling metrics; relies on the setuptools_scm cap above.
    ("seq_metrics", ["seqeval==1.2.2"], False),
    # UI and serving layer.
    (
        "ui_serving",
        ["gradio>=4.20", "fastapi>=0.110", "uvicorn>=0.27"],
        False,
    ),
    # Misc utilities (wandb is optional).
    ("utils_required", ["tqdm>=4.66"], False),
    ("utils_optional", ["wandb>=0.16"], True),
    # Plotting.
    ("plotting", ["matplotlib>=3.8"], False),
]
# PyTorch wheel indexes as (label, url), highest priority first. The installer
# walks the list and stops at the first index that yields a working torch.
TORCH_INDEXES: list[tuple[str, str]] = [
    (variant, f"https://download.pytorch.org/whl/{variant}")
    for variant in ("cu126", "cu124", "cu121", "cpu")
]
| # --------------------------------------------------------------------------- | |
| # venv creation | |
| # --------------------------------------------------------------------------- | |
def create_venv() -> None:
    """Create the isolated venv at ``VENV_DIR`` unless a complete one exists.

    A venv counts as complete only when both its interpreter and its ``pip``
    launcher are present. ``EnvBuilder.create`` sets up the interpreter
    before bootstrapping pip, so an aborted or failed pip bootstrap leaves a
    directory that has ``bin/python`` but no ``bin/pip``; checking the
    interpreter alone would reuse that broken environment on every later run.

    Any existing but incomplete directory is deleted and rebuilt from scratch.
    """
    if VENV_DIR.exists():
        if VENV_PYTHON.exists() and VENV_PIP.exists():
            print(f"venv already exists at {VENV_DIR} — reusing.")
            return
        # Half-built venv (interpreter or pip missing) — remove and recreate.
        print(f"venv directory exists but is incomplete — removing {VENV_DIR}")
        shutil.rmtree(VENV_DIR)

    print(f"Creating venv at {VENV_DIR} ...")
    builder = venv.EnvBuilder(
        system_site_packages=False,  # clean isolation from the host site-packages
        clear=False,
        with_pip=True,
        upgrade_deps=False,
    )
    builder.create(str(VENV_DIR))
    print(" ✓ venv created.")
| # --------------------------------------------------------------------------- | |
| # pip helpers (use venv's pip, not system) | |
| # --------------------------------------------------------------------------- | |
def venv_pip(args: list[str], stream: bool = False) -> tuple[bool, str]:
    """Run ``<venv>/bin/pip <args>`` and report the outcome.

    Args:
        args: Arguments passed to pip, e.g. ``["install", "requests"]``.
        stream: When true, pip's output goes straight to the terminal (useful
            for long downloads) and the returned log is empty on success.

    Returns:
        ``(ok, log)`` where ``ok`` is true when pip exited with status 0 and
        ``log`` is pip's combined stdout and stderr (captured mode only).
        If the pip executable cannot be launched at all, ``ok`` is false and
        ``log`` carries the OS error text instead of an exception escaping.
    """
    cmd = [str(VENV_PIP), *args]
    try:
        if stream:
            proc = subprocess.run(cmd, check=False)
            return proc.returncode == 0, ""
        proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
    except OSError as err:
        # Missing or non-executable pip: honour the (ok, log) contract so
        # callers can report the failure the same way as a non-zero exit.
        message = f"Could not launch {cmd[0]}: {err}"
        if stream:
            # Streaming callers discard the log, so surface the reason here.
            print(message, file=sys.stderr)
        return False, message
    return proc.returncode == 0, (proc.stdout or "") + (proc.stderr or "")
def venv_python(code: str) -> tuple[bool, str]:
    """Execute a Python snippet with the venv's interpreter (``python -c``).

    Args:
        code: Source text handed to the interpreter's ``-c`` option.

    Returns:
        ``(ok, output)`` where ``ok`` is true when the interpreter exited with
        status 0 and ``output`` is its combined stdout and stderr. If the
        interpreter cannot be launched at all, ``ok`` is false and ``output``
        carries the OS error text, so callers that test ``ok`` also see that
        failure instead of an uncaught exception.
    """
    try:
        proc = subprocess.run(
            [str(VENV_PYTHON), "-c", code],
            capture_output=True, text=True, check=False,
        )
    except OSError as err:
        return False, f"Could not launch {VENV_PYTHON}: {err}"
    return proc.returncode == 0, (proc.stdout or "") + (proc.stderr or "")
def upgrade_pip_in_venv() -> None:
    """Upgrade pip, wheel and setuptools inside the venv.

    The last eight lines of pip's output are echoed for context.

    Raises:
        RuntimeError: If the pip invocation reports failure.
    """
    print("\nUpgrading pip inside venv ...")
    succeeded, output = venv_pip(
        ["install", "--upgrade", "pip", "wheel", "setuptools"]
    )
    closing_lines = output.strip().splitlines()[-8:]
    print("\n".join(closing_lines))
    if succeeded:
        return
    raise RuntimeError("Failed to upgrade pip in venv")
| # --------------------------------------------------------------------------- | |
| # torch install with fallback chain | |
| # --------------------------------------------------------------------------- | |
def install_torch() -> tuple[bool, str]:
    """Install torch, torchvision and torchaudio, walking ``TORCH_INDEXES``.

    Each index is tried in order. After a successful pip install the venv
    interpreter imports torch and prints its version and CUDA status; the
    first index for which that import succeeds wins.

    Returns:
        ``(True, label)`` with the label of the index that worked, or
        ``(False, "")`` when every index failed.
    """
    torch_packages = ["torch", "torchvision", "torchaudio"]

    print("\n" + "=" * 72)
    print("Installing PyTorch (CUDA wheels — fallback chain)")
    print("=" * 72)
    for label, url in TORCH_INDEXES:
        print(f"\n>>> Trying torch from index: {label} ({url})")
        ok, _ = venv_pip(
            ["install", "--index-url", url, *torch_packages],
            stream=True,
        )
        if not ok:
            print(f" ✗ {label} index failed — trying next.")
            continue
        # Verify that import + CUDA reporting works
        ok2, out = venv_python(
            "import torch; "
            "print('TORCH', torch.__version__); "
            "print('CUDA', torch.cuda.is_available()); "
            "print('CUDA_V', torch.version.cuda); "
            "print('GPU', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'none')"
        )
        print(out.strip())
        if ok2:
            print(f" ✓ torch installed and importable ({label}).")
            return True, label
        print(" ✗ torch installed but failed to import — trying next index.")
        # Remove the broken build first. Without this, pip treats the
        # already-installed torch as satisfying the requirement and the next
        # index would install nothing, so the fallback could never recover.
        venv_pip(["uninstall", "--yes", *torch_packages], stream=True)
    return False, ""
| # --------------------------------------------------------------------------- | |
| # group install | |
| # --------------------------------------------------------------------------- | |
# Distribution names whose import name cannot be derived by swapping "-" for "_".
_IMPORT_NAME_OVERRIDES = {
    "beautifulsoup4": "bs4",
    "scikit-learn": "sklearn",
    "faiss-cpu": "faiss",
    "lingua-language-detector": "lingua",
}

# Characters that end the distribution name inside a requirement specifier:
# version operators, environment markers, extras, direct references, whitespace.
_SPEC_DELIMITERS = frozenset("<>=!~;[@( \t")


def _distribution_name(spec: str) -> str:
    """Return the bare distribution name from a pip requirement specifier."""
    spec = spec.strip()
    for position, char in enumerate(spec):
        if char in _SPEC_DELIMITERS:
            return spec[:position]
    return spec


def _verify_package(spec: str) -> tuple[str, bool, str]:
    """Import one package in the venv; return (name, ok, version-or-error)."""
    base = _distribution_name(spec)
    import_name = _IMPORT_NAME_OVERRIDES.get(base, base.replace("-", "_"))
    ok, out = venv_python(
        f"import importlib, importlib.metadata as md;"
        f"m = importlib.import_module({import_name!r});"
        f"v = md.version({base!r});"
        f"print(v)"
    )
    text = out.strip()
    if ok:
        return base, True, text
    # On failure keep only the last output line (normally the exception).
    return base, False, text.splitlines()[-1] if text else "import failed"


def install_group(name: str, packages: list[str], optional: bool) -> dict:
    """Install one dependency group inside the venv and verify its imports.

    Args:
        name: Group label used in the console output and the summary.
        packages: pip requirement specifiers to install with ``--upgrade``.
        optional: Whether a failure of this group is tolerable; only affects
            the marker printed here and the flag echoed in the summary.

    Returns:
        A dict with keys ``group``, ``optional``, ``fully_ok`` (pip succeeded
        and every package imported) and ``verifies`` (a list of
        ``(distribution, ok, version_or_error)`` tuples).
    """
    label = f"[{name}]" + (" (optional)" if optional else "")
    print(f"\n{'=' * 72}")
    print(f"Installing group: {label}")
    print(f" Packages: {', '.join(packages)}")
    print("=" * 72)

    ok, log = venv_pip(["install", "--upgrade", *packages])
    # Only the tail of pip's log is shown; it holds the outcome and any error.
    print("\n".join(log.strip().splitlines()[-15:]))

    verifies = [_verify_package(spec) for spec in packages]
    all_ok = all(passed for _, passed, _ in verifies)

    marker = "✓" if all_ok else ("!" if optional else "✗")
    print(f"\n {marker} {label}")
    for pkg, vok, info in verifies:
        sub = "OK " if vok else "FAIL"
        print(f" [{sub}] {pkg:<30s} {info}")
    return {"group": name, "optional": optional, "fully_ok": ok and all_ok,
            "verifies": verifies}
| # --------------------------------------------------------------------------- | |
| # requirements.txt generation | |
| # --------------------------------------------------------------------------- | |
def write_requirements(torch_index: str) -> None:
    """Freeze the venv and overwrite ``REQ_FILE`` with the pinned versions.

    The file starts with a comment header followed by an
    ``--extra-index-url`` option pointing at the PyTorch index that was used.
    Wheels from that index are pinned by ``pip freeze`` with a local version
    label (for example ``+cu126``) that the default package index does not
    serve, so without the option ``pip install -r requirements.txt`` cannot
    resolve the torch pins.

    Args:
        torch_index: Label of the PyTorch wheel index torch was installed
            from (``cu126``, ``cpu``, ...).

    Raises:
        RuntimeError: If ``pip freeze`` exits with a non-zero status.
    """
    print("\n" + "=" * 72)
    print("Writing requirements.txt from venv pip freeze")
    print("=" * 72)
    # Call pip directly and keep stdout separate from stderr so warnings
    # never leak into the requirements file.
    proc = subprocess.run(
        [str(VENV_PIP), "freeze"],
        capture_output=True, text=True, check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError("pip freeze failed:\n" + proc.stderr)

    index_url = f"https://download.pytorch.org/whl/{torch_index}"
    header = [
        "# Auto-generated by setup/setup_venv.py",
        "# Pinned versions captured by `pip freeze` inside .venv",
        f"# Torch installed from PyTorch index: {torch_index}",
        f"# -> {index_url}",
        "# To recreate this environment:",
        "# python -m venv .venv",
        "# source .venv/bin/activate",
        "# pip install -r requirements.txt",
        "# The option below lets pip resolve the pinned torch builds.",
        f"--extra-index-url {index_url}",
        "",
    ]
    # Explicit encoding keeps the file identical regardless of the host locale.
    REQ_FILE.write_text("\n".join(header) + proc.stdout, encoding="utf-8")
    print(f" ✓ {REQ_FILE} ({len(proc.stdout.splitlines())} packages pinned)")
| # --------------------------------------------------------------------------- | |
| # main | |
| # --------------------------------------------------------------------------- | |
def _print_final_report(summaries: list[dict], torch_index: str) -> None:
    """Print the venv location, torch index and one status line per group."""
    print("\n" + "=" * 72)
    print("FINAL REPORT")
    print("=" * 72)
    print(f"venv: {VENV_DIR}")
    print(f"torch index: {torch_index}\n")
    for summary in summaries:
        tag = "OPT " if summary["optional"] else "REQ "
        if summary["fully_ok"]:
            status = "✓"
        else:
            status = "!" if summary["optional"] else "✗"
        print(f" {status} [{tag}] {summary['group']}")


def _print_activation_help() -> None:
    """Print how to activate the venv or call its interpreter directly."""
    print("\n" + "=" * 72)
    print("ACTIVATION")
    print("=" * 72)
    print("To use the venv from now on:")
    print(" source .venv/bin/activate")
    print("Then run any subsequent script with plain `python ...`.")
    print("Or call the venv python directly without activating:")
    print(f" {VENV_PYTHON} <script.py>")


def main() -> int:
    """Orchestrate venv creation, package installation, and requirements export.

    Steps, in order: create or reuse the venv, check its interpreter, upgrade
    pip, install the first entry of ``GROUPS`` (build tools), install torch,
    install the remaining groups, print a report, write requirements.txt and
    print activation instructions.

    Returns:
        0 on success, 1 as soon as a required step fails. Every failure is
        reported with a short message rather than an uncaught traceback.
    """
    print("=" * 72)
    print("Multilingual Chatbot — venv setup")
    print("=" * 72)
    print(f"Project : {PROJECT_ROOT}")
    print(f"venv path : {VENV_DIR}")
    print(f"Host pyver : {sys.version.split()[0]}")

    try:
        create_venv()
    except (OSError, subprocess.CalledProcessError) as err:
        # Filesystem errors, or the pip bootstrap subprocess failing.
        print(f"Could not create venv: {err}")
        return 1

    # Quick sanity: make sure venv python works
    ok, out = venv_python("import sys; print(sys.version.split()[0])")
    if not ok:
        print("venv Python does not work:")
        print(out)
        return 1
    print(f"venv pyver : {out.strip()}")

    try:
        upgrade_pip_in_venv()
    except RuntimeError as err:
        print(f"{err} — aborting.")
        return 1

    # build_tools FIRST (pins setuptools_scm<10 — required for seqeval later)
    bt = install_group(*GROUPS[0])
    if not bt["fully_ok"]:
        print("Build tools failed — aborting.")
        return 1

    # torch with fallback chain
    torch_ok, torch_index = install_torch()
    if not torch_ok:
        print("\nAll torch indexes failed. Aborting.")
        return 1

    # Remaining groups: keep going after a failure so the report is complete.
    summaries = [bt]
    hard_failures: list[str] = []
    for name, packages, optional in GROUPS[1:]:
        summary = install_group(name, packages, optional)
        summaries.append(summary)
        if not summary["fully_ok"] and not optional:
            hard_failures.append(name)

    _print_final_report(summaries, torch_index)
    if hard_failures:
        # Do not export pins from an environment missing required packages.
        print(f"\nFailed required groups: {hard_failures}")
        return 1

    try:
        write_requirements(torch_index)
    except RuntimeError as err:
        print(f"\n{err}")
        return 1

    _print_activation_help()
    return 0
# Script entry point: exit with main()'s status, or 130 when interrupted (Ctrl-C).
if __name__ == "__main__":
    try:
        exit_status = main()
    except KeyboardInterrupt:
        print("\nAborted by user.")
        exit_status = 130
    sys.exit(exit_status)