Spaces:

momenalhamza
/

multilingual-chatbot

Sleeping

App Files Files Community

multilingual-chatbot / setup /setup_venv.py

momenalhamza

Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)

469ef7f verified 29 days ago

raw

history blame contribute delete

13.3 kB

	"""Create a clean isolated venv and install all chatbot dependencies into it.

	Why this script:
	Up to now we used --user --break-system-packages installs into ~/.local.
	That works but is not reproducible. This script creates a proper isolated
	venv at <project>/.venv so that requirements.txt reflects EXACTLY the
	environment used for training and inference.

	Approach:
	1. Create .venv at PROJECT_ROOT/.venv (clean — no --system-site-packages).
	2. Upgrade pip, wheel, and setuptools inside the venv.
	3. Pin setuptools_scm<10 first (works around seqeval build bug we hit before).
	4. Install torch with CUDA. Try indexes in order: cu126 -> cu124 -> cu121 -> cpu.
	The GTX 1650 (compute 7.5) runs on any of these; cu130 wheels for Py3.13
	are not yet on the official torch index, so we don't try it.
	5. Install all other dependency groups (mirrors install.py).
	6. Verify imports for every package.
	7. Run `pip freeze` inside the venv -> overwrite requirements.txt with pins.
	8. Print activation instructions.

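	Run once. Re-running reuses a complete existing .venv without deleting it; a
	.venv directory that lacks its Python executable is treated as half-built and
	is removed and recreated.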
	"""

	from __future__ import annotations

	import os
	import shutil
	import subprocess
	import sys
	import venv
	from pathlib import Path
	from typing import Iterable

	# Project root is two directory levels above this file.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	VENV_DIR = PROJECT_ROOT / ".venv"
	# The stdlib venv layout differs by platform: POSIX places executables in
	# bin/, Windows places them in Scripts/ with an .exe suffix. Resolving this
	# here keeps every subprocess call pointed at a file that actually exists.
	_ON_WINDOWS = os.name == "nt"
	_VENV_BIN = VENV_DIR / ("Scripts" if _ON_WINDOWS else "bin")
	VENV_PYTHON = _VENV_BIN / ("python.exe" if _ON_WINDOWS else "python")
	VENV_PIP = _VENV_BIN / ("pip.exe" if _ON_WINDOWS else "pip")
	# Pinned dependency list that gets overwritten from `pip freeze`.
	REQ_FILE = PROJECT_ROOT / "requirements.txt"


	# ---------------------------------------------------------------------------
	# Dependency groups
	# ---------------------------------------------------------------------------

	# Ordered install plan. Every entry is a (group_name, packages, optional)
	# triple; the groups are installed in the order listed here.
	GROUPS: list[tuple[str, list[str], bool]] = [
	    # Build tooling goes first. setuptools_scm is capped below 10 to avoid
	    # the seqeval build bug previously hit on the user-site install.
	    (
	        "build_tools",
	        [
	            "pip>=24.0",
	            "wheel>=0.42",
	            "setuptools>=68",
	            "setuptools_scm<10",
	        ],
	        False,
	    ),
	    # Core HuggingFace stack.
	    (
	        "core_ml",
	        [
	            "transformers>=4.40",
	            "datasets>=2.18",
	            "tokenizers>=0.15",
	            "accelerate>=0.27",
	            "evaluate>=0.4",
	            "huggingface_hub>=0.23",
	        ],
	        False,
	    ),
	    # Data manipulation and general ML utilities.
	    (
	        "data_ml",
	        [
	            "numpy>=1.26",
	            "pandas>=2.1",
	            "scikit-learn>=1.3",
	            "scipy>=1.11",
	        ],
	        False,
	    ),
	    # Language detection.
	    (
	        "lang_detect",
	        [
	            "langdetect>=1.0.9",
	            "lingua-language-detector>=2.0",
	        ],
	        False,
	    ),
	    # Arabic NLP: pyarabic is required, camel-tools is optional.
	    ("arabic_pyarabic", ["pyarabic>=0.6.15"], False),
	    ("arabic_camel_optional", ["camel-tools"], True),
	    # Embeddings and vector search.
	    (
	        "embeddings_rag",
	        [
	            "sentence-transformers>=2.7",
	            "faiss-cpu>=1.7.4",
	        ],
	        False,
	    ),
	    # Web access and scraping; scrapy is optional.
	    (
	        "web_core",
	        [
	            "requests>=2.31",
	            "beautifulsoup4>=4.12",
	            "lxml>=5.1",
	        ],
	        False,
	    ),
	    ("web_scrapy_optional", ["scrapy>=2.11"], True),
	    # Sequence-labelling metrics; relies on the setuptools_scm<10 pin above.
	    ("seq_metrics", ["seqeval==1.2.2"], False),
	    # UI and serving.
	    (
	        "ui_serving",
	        [
	            "gradio>=4.20",
	            "fastapi>=0.110",
	            "uvicorn>=0.27",
	        ],
	        False,
	    ),
	    # Utilities; wandb is optional.
	    ("utils_required", ["tqdm>=4.66"], False),
	    ("utils_optional", ["wandb>=0.16"], True),
	    # Plotting.
	    ("plotting", ["matplotlib>=3.8"], False),
	]

	# Candidate PyTorch wheel indexes as (label, url) pairs, most preferred
	# first. Installation stops at the first index that succeeds.
	TORCH_INDEXES: list[tuple[str, str]] = [
	    (tag, f"https://download.pytorch.org/whl/{tag}")
	    for tag in ("cu126", "cu124", "cu121", "cpu")
	]


	# ---------------------------------------------------------------------------
	# venv creation
	# ---------------------------------------------------------------------------

	def create_venv() -> None:
	    """Ensure an isolated virtual environment exists at VENV_DIR.

	    A directory that already contains the venv interpreter is reused as-is.
	    A directory without the interpreter is considered half-built: it is
	    deleted and a fresh environment (pip included, no system site-packages)
	    is created in its place.
	    """
	    if VENV_DIR.exists():
	        if VENV_PYTHON.exists():
	            print(f"venv already exists at {VENV_DIR} — reusing.")
	            return
	        # Directory is present but has no interpreter: start from scratch.
	        print(f"venv directory exists but is incomplete — removing {VENV_DIR}")
	        shutil.rmtree(VENV_DIR)
	    print(f"Creating venv at {VENV_DIR} ...")
	    venv.EnvBuilder(
	        system_site_packages=False,  # keep the environment fully isolated
	        clear=False,
	        with_pip=True,
	        upgrade_deps=False,
	    ).create(str(VENV_DIR))
	    print(" ✓ venv created.")


	# ---------------------------------------------------------------------------
	# pip helpers (use venv's pip, not system)
	# ---------------------------------------------------------------------------

	def venv_pip(args: list[str], stream: bool = False) -> tuple[bool, str]:
	    """Invoke the venv's pip executable with ``args``.

	    Args:
	        args: Arguments placed after the pip executable on the command line.
	        stream: When true, pip's output goes straight to the terminal (useful
	            for long downloads) and nothing is captured.

	    Returns:
	        ``(ok, log)`` where ``ok`` is true for exit code 0 and ``log`` is the
	        captured stdout followed by stderr, or ``""`` when streaming.
	    """
	    command = [str(VENV_PIP)] + list(args)
	    if not stream:
	        result = subprocess.run(command, capture_output=True, text=True, check=False)
	        return result.returncode == 0, (result.stdout or "") + (result.stderr or "")
	    # Streaming mode: the child inherits our stdout/stderr.
	    result = subprocess.run(command, check=False)
	    return result.returncode == 0, ""


	def venv_python(code: str) -> tuple[bool, str]:
	    """Execute ``code`` via ``-c`` with the venv's interpreter.

	    Returns:
	        ``(ok, output)`` where ``ok`` is true for exit code 0 and ``output``
	        is the captured stdout followed by stderr.
	    """
	    command = [str(VENV_PYTHON), "-c", code]
	    result = subprocess.run(command, capture_output=True, text=True, check=False)
	    combined = "".join(stream or "" for stream in (result.stdout, result.stderr))
	    return result.returncode == 0, combined


	def upgrade_pip_in_venv() -> None:
	    """Upgrade pip, wheel and setuptools inside the venv.

	    Prints the last eight lines of pip's output.

	    Raises:
	        RuntimeError: If the pip command exits with a non-zero status.
	    """
	    print("\nUpgrading pip inside venv ...")
	    succeeded, output = venv_pip(["install", "--upgrade", "pip", "wheel", "setuptools"])
	    last_lines = output.strip().splitlines()[-8:]
	    print("\n".join(last_lines))
	    if succeeded:
	        return
	    raise RuntimeError("Failed to upgrade pip in venv")


	# ---------------------------------------------------------------------------
	# torch install with fallback chain
	# ---------------------------------------------------------------------------

	def install_torch() -> tuple[bool, str]:
	    """Install torch, torchvision and torchaudio from the first working index.

	    Walks TORCH_INDEXES in order. For each index the packages are installed
	    with pip (output streamed to the terminal) and then torch is imported in
	    the venv to confirm the build actually loads; the probe also prints the
	    torch version, CUDA availability, CUDA version and GPU name.

	    Returns:
	        ``(True, label)`` for the first index whose install and import both
	        succeed, or ``(False, "")`` when every index fails.
	    """
	    packages = ["torch", "torchvision", "torchaudio"]
	    probe = (
	        "import torch; "
	        "print('TORCH', torch.__version__); "
	        "print('CUDA', torch.cuda.is_available()); "
	        "print('CUDA_V', torch.version.cuda); "
	        "print('GPU', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'none')"
	    )
	    print("\n" + "=" * 72)
	    print("Installing PyTorch (CUDA wheels — fallback chain)")
	    print("=" * 72)
	    for label, url in TORCH_INDEXES:
	        print(f"\n>>> Trying torch from index: {label} ({url})")
	        ok, _ = venv_pip(["install", "--index-url", url, *packages], stream=True)
	        if not ok:
	            print(f" ✗ {label} index failed — trying next.")
	            continue
	        # Verify that import + CUDA reporting works
	        ok2, out = venv_python(probe)
	        print(out.strip())
	        if ok2:
	            print(f" ✓ torch installed and importable ({label}).")
	            return True, label
	        print(" ✗ torch installed but failed to import — trying next index.")
	        # Without this, the next `pip install` sees the requirement as already
	        # satisfied by the broken build and does nothing, so the fallback
	        # would re-test the very same installation.
	        removed, _ = venv_pip(["uninstall", "-y", *packages])
	        if not removed:
	            print(" ! could not uninstall the broken torch build.")
	    return False, ""


	# ---------------------------------------------------------------------------
	# group install
	# ---------------------------------------------------------------------------

	# Distributions whose importable module name is NOT simply the distribution
	# name with hyphens replaced by underscores.
	_IMPORT_NAME_OVERRIDES = {
	    "beautifulsoup4": "bs4",
	    "scikit-learn": "sklearn",
	    "faiss-cpu": "faiss",
	    "lingua-language-detector": "lingua",
	}


	def _requirement_name(spec: str) -> str:
	    """Return the bare distribution name at the start of a requirement spec."""
	    spec = spec.strip()
	    for pos, char in enumerate(spec):
	        # Any version operator (>=, ==, <, !=, ~=), extras bracket, marker
	        # separator or whitespace ends the name.
	        if char in "<>=!~;[ \t":
	            return spec[:pos].strip()
	    return spec


	def install_group(name: str, packages: list[str], optional: bool) -> dict:
	    """Install one dependency group in the venv and verify each package.

	    Runs ``pip install --upgrade`` for the whole group, prints the last 15
	    lines of pip's output, then imports every package with the venv
	    interpreter and looks up its installed version.

	    Args:
	        name: Group name used in the printed report and the returned summary.
	        packages: Requirement specs, e.g. ``"numpy>=1.26"``.
	        optional: Whether a failure of this group is tolerated by the caller.

	    Returns:
	        A dict with keys ``group``, ``optional``, ``fully_ok`` (pip succeeded
	        and every import check passed) and ``verifies`` (a list of
	        ``(package, ok, version_or_error)`` tuples).
	    """
	    label = f"[{name}]" + (" (optional)" if optional else "")
	    print(f"\n{'=' * 72}")
	    print(f"Installing group: {label}")
	    print(f" Packages: {', '.join(packages)}")
	    print("=" * 72)
	    ok, log = venv_pip(["install", "--upgrade", *packages])
	    print("\n".join(log.strip().splitlines()[-15:]))
	    # Verify imports
	    verifies: list[tuple[str, bool, str]] = []
	    for spec in packages:
	        base = _requirement_name(spec)
	        import_name = _IMPORT_NAME_OVERRIDES.get(base, base.replace("-", "_"))
	        ok2, out = venv_python(
	            "import importlib, importlib.metadata as md;"
	            f"importlib.import_module({import_name!r});"
	            f"print(md.version({base!r}))"
	        )
	        text = out.strip()
	        if ok2:
	            verifies.append((base, True, text))
	        else:
	            # The last output line is normally the exception message.
	            verifies.append((base, False, text.splitlines()[-1] if text else "import failed"))
	    # A group only counts as healthy when pip itself succeeded too; using the
	    # same flag for the marker keeps this line consistent with the summary.
	    fully_ok = ok and all(passed for _, passed, _ in verifies)
	    marker = "✓" if fully_ok else ("!" if optional else "✗")
	    print(f"\n {marker} {label}")
	    for pkg, vok, info in verifies:
	        sub = "OK " if vok else "FAIL"
	        print(f" [{sub}] {pkg:<30s} {info}")
	    return {
	        "group": name,
	        "optional": optional,
	        "fully_ok": fully_ok,
	        "verifies": verifies,
	    }


	# ---------------------------------------------------------------------------
	# requirements.txt generation
	# ---------------------------------------------------------------------------

	def write_requirements(torch_index: str) -> None:
	    """Overwrite requirements.txt with the venv's ``pip freeze`` output.

	    The file starts with a comment header recording which PyTorch index was
	    used and how to recreate the environment, followed by the frozen pins.

	    Args:
	        torch_index: Label of the PyTorch wheel index that was installed from.

	    Raises:
	        RuntimeError: If ``pip freeze`` exits with a non-zero status.
	    """
	    rule = "=" * 72
	    print("\n" + rule)
	    print("Writing requirements.txt from venv pip freeze")
	    print(rule)
	    freeze = subprocess.run(
	        [str(VENV_PIP), "freeze"],
	        capture_output=True,
	        text=True,
	        check=False,
	    )
	    if freeze.returncode != 0:
	        raise RuntimeError("pip freeze failed:\n" + freeze.stderr)
	    index_url = f"https://download.pytorch.org/whl/{torch_index}"
	    header_lines = (
	        "# Auto-generated by setup/setup_venv.py",
	        "# Pinned versions captured by `pip freeze` inside .venv",
	        f"# Torch installed from PyTorch index: {torch_index}",
	        f"# -> {index_url}",
	        "# To recreate this environment:",
	        "# python -m venv .venv",
	        "# source .venv/bin/activate",
	        f"# pip install --index-url {index_url} torch torchvision torchaudio",
	        "# pip install -r requirements.txt",
	    )
	    # Every header line ends with a newline; the pins follow immediately.
	    header_text = "".join(line + "\n" for line in header_lines)
	    REQ_FILE.write_text(header_text + freeze.stdout)
	    pinned = len(freeze.stdout.splitlines())
	    print(f" ✓ {REQ_FILE} ({pinned} packages pinned)")


	# ---------------------------------------------------------------------------
	# main
	# ---------------------------------------------------------------------------

	def main() -> int:
	    """Run the full setup: venv, build tools, torch, groups, requirements.

	    Returns:
	        0 on success. 1 when the venv interpreter does not run, the build
	        tools group fails, no torch index works, or any required group fails
	        (in which case requirements.txt is not written).
	    """
	    rule = "=" * 72

	    def banner(title: str, lead: str = "") -> None:
	        # Section heading framed by two rules.
	        print(lead + rule)
	        print(title)
	        print(rule)

	    banner("Multilingual Chatbot — venv setup")
	    print(f"Project : {PROJECT_ROOT}")
	    print(f"venv path : {VENV_DIR}")
	    print(f"Host pyver : {sys.version.split()[0]}")

	    create_venv()

	    # Sanity check: the venv interpreter must be able to run at all.
	    interpreter_ok, version_out = venv_python("import sys; print(sys.version.split()[0])")
	    if not interpreter_ok:
	        print("venv Python does not work:")
	        print(version_out)
	        return 1
	    print(f"venv pyver : {version_out.strip()}")

	    upgrade_pip_in_venv()

	    # Build tools come first: they pin setuptools_scm<10, needed by seqeval.
	    build_group, *other_groups = GROUPS
	    build_summary = install_group(*build_group)
	    if not build_summary["fully_ok"]:
	        print("Build tools failed — aborting.")
	        return 1

	    # torch via the index fallback chain.
	    torch_ok, torch_index = install_torch()
	    if not torch_ok:
	        print("\nAll torch indexes failed. Aborting.")
	        return 1

	    # Everything else; only failures of required groups are fatal.
	    summaries = [build_summary]
	    hard_failures: list[str] = []
	    for group in other_groups:
	        group_name, group_packages, is_optional = group
	        summary = install_group(group_name, group_packages, is_optional)
	        summaries.append(summary)
	        if not (is_optional or summary["fully_ok"]):
	            hard_failures.append(group_name)

	    banner("FINAL REPORT", lead="\n")
	    print(f"venv: {VENV_DIR}")
	    print(f"torch index: {torch_index}\n")
	    for summary in summaries:
	        if summary["optional"]:
	            tag, bad_mark = "OPT ", "!"
	        else:
	            tag, bad_mark = "REQ ", "✗"
	        status = "✓" if summary["fully_ok"] else bad_mark
	        print(f" {status} [{tag}] {summary['group']}")

	    if hard_failures:
	        print(f"\nFailed required groups: {hard_failures}")
	        return 1

	    write_requirements(torch_index)

	    banner("ACTIVATION", lead="\n")
	    for line in (
	        "To use the venv from now on:",
	        " source .venv/bin/activate",
	        "Then run any subsequent script with plain `python ...`.",
	        "Or call the venv python directly without activating:",
	        f" {VENV_PYTHON} <script.py>",
	    ):
	        print(line)
	    return 0


	if __name__ == "__main__":
	    # Script entry point: exit with main()'s status, or 130 when the user
	    # interrupts the run with Ctrl-C.
	    try:
	        exit_code = main()
	    except KeyboardInterrupt:
	        print("\nAborted by user.")
	        exit_code = 130
	    sys.exit(exit_code)