Spaces:

momenalhamza
/

multilingual-chatbot

Sleeping

App Files Files Community

multilingual-chatbot / setup /install.py

momenalhamza

Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)

469ef7f verified 19 days ago

raw

history blame contribute delete

9.21 kB

	"""Install all dependencies for the Multilingual Code-Switching Chatbot.

	Detected environment from check_env.py:
	- Python 3.13 on Linux (system Python at /usr/bin/python3)
	- PyTorch 2.11.0+cu130 already installed and working with CUDA 13.0
	- GPU: GTX 1650 (3.6 GB VRAM) -> faiss-cpu (faiss-gpu unnecessary at this scale)

	Strategy:
	- DO NOT touch torch (already installed and working with CUDA).
	- Use --user --break-system-packages (PEP 668 systems like newer Ubuntu).
	- Install in logical groups; report per-group success.
	- Some packages may not have Python 3.13 wheels yet; mark those as optional.
	- Write requirements.txt with pinned versions of what actually installed.
	"""

	from __future__ import annotations

	import importlib
	import importlib.metadata as md
	import subprocess
	import sys
	from pathlib import Path
	from typing import Iterable

	# Directory two levels above this file (the parent of the folder holding it).
	PROJECT_ROOT = Path(__file__).resolve().parents[1]
	# Location of the requirements file written by write_requirements().
	REQ_FILE = PROJECT_ROOT.joinpath("requirements.txt")


	# ---------------------------------------------------------------------------
	# Package groups
	# ---------------------------------------------------------------------------
	# Each entry: (group_name, packages, optional_flag)
	# - packages: list of pip specs (name or name>=ver)
	# - optional: if True, group is allowed to fail without aborting

	GROUPS: list[tuple[str, list[str], bool]] = [
	    # Hugging Face core stack (required).
	    # torch is deliberately absent: it is already installed with CUDA 13.
	    (
	        "core_ml",
	        [
	            "transformers>=4.40", "datasets>=2.18", "tokenizers>=0.15",
	            "accelerate>=0.27", "evaluate>=0.4", "huggingface_hub>=0.23",
	        ],
	        False,
	    ),

	    # Numeric / tabular / classic-ML utilities (required).
	    ("data_ml", ["numpy>=1.26", "pandas>=2.1", "scikit-learn>=1.3", "scipy>=1.11"], False),

	    # Language identification (required).
	    ("lang_detect", ["langdetect>=1.0.9", "lingua-language-detector>=2.0"], False),

	    # Arabic NLP: pyarabic is pure Python, so it is treated as required.
	    ("arabic_pyarabic", ["pyarabic>=0.6.15"], False),

	    # camel-tools has heavy dependencies and may lack Python 3.13 wheels, so
	    # a failure here is tolerated; the preprocessor loads it lazily and can
	    # fall back to pyarabic-only normalization.
	    ("arabic_camel_optional", ["camel-tools"], True),

	    # Sentence embeddings and vector search for RAG (required).
	    ("embeddings_rag", ["sentence-transformers>=2.7", "faiss-cpu>=1.7.4"], False),

	    # HTTP + HTML parsing for data collection (required).
	    ("web_core", ["requests>=2.31", "beautifulsoup4>=4.12", "lxml>=5.1"], False),
	    # scrapy is heavier and its wheels may lag, so it is optional.
	    ("web_scrapy_optional", ["scrapy>=2.11"], True),

	    # Sequence-labelling metrics for NER evaluation (required).
	    ("seq_metrics", ["seqeval>=1.2.2"], False),

	    # UI and serving layer used by app.py (required).
	    ("ui_serving", ["gradio>=4.20", "fastapi>=0.110", "uvicorn>=0.27"], False),

	    # General utilities: tqdm is required, wandb tracking is optional.
	    ("utils_required", ["tqdm>=4.66"], False),
	    ("utils_optional", ["wandb>=0.16"], True),

	    # Plotting, used by evaluate.py for confusion matrices (required).
	    ("plotting", ["matplotlib>=3.8"], False),
	]


	# ---------------------------------------------------------------------------
	# pip helpers
	# ---------------------------------------------------------------------------

	def _pip_install(packages: Iterable[str]) -> tuple[bool, str]:
	    """Install *packages* with pip, running it under the current interpreter.

	    Returns a ``(ok, log)`` pair: ``ok`` is True when pip exits with status 0,
	    and ``log`` is pip's stdout followed by its stderr. If the subprocess
	    cannot be run at all, the pair is ``(False, "EXCEPTION: ...")``.
	    """
	    pip_flags = ["--user", "--break-system-packages", "--upgrade"]
	    argv = [sys.executable, "-m", "pip", "install", *pip_flags, *packages]
	    try:
	        # check=False: a non-zero exit is reported via the return value,
	        # not raised, so the caller can decide what a failure means.
	        completed = subprocess.run(
	            argv, capture_output=True, text=True, check=False
	        )
	        combined = (completed.stdout or "") + (completed.stderr or "")
	        succeeded = completed.returncode == 0
	        return succeeded, combined
	    except Exception as exc:  # noqa: BLE001
	        return False, f"EXCEPTION: {exc}"


	def _verify_imports(packages: Iterable[str]) -> list[tuple[str, bool, str]]:
	"""Try importing each package by its likely import name; return list of (name, ok, version)."""
	name_map = {
	"beautifulsoup4": "bs4",
	"scikit-learn": "sklearn",
	"faiss-cpu": "faiss",
	"lingua-language-detector": "lingua",
	"sentence-transformers": "sentence_transformers",
	"huggingface_hub": "huggingface_hub",
	"pyarabic": "pyarabic",
	"camel-tools": "camel_tools",
	}
	results: list[tuple[str, bool, str]] = []
	for spec in packages:
	# strip version qualifiers like 'pkg>=1.0'
	base = spec.split(">")[0].split("=")[0].split("<")[0].strip()
	import_name = name_map.get(base, base.replace("-", "_"))
	try:
	mod = importlib.import_module(import_name)
	try:
	ver = md.version(base)
	except md.PackageNotFoundError:
	ver = getattr(mod, "__version__", "?")
	results.append((base, True, ver))
	except Exception as exc: # noqa: BLE001
	results.append((base, False, f"import error: {exc.__class__.__name__}"))
	return results


	# ---------------------------------------------------------------------------
	# Main install loop
	# ---------------------------------------------------------------------------

	def install_group(name: str, packages: list[str], optional: bool) -> dict:
	    """Install a single package group, print the outcome, and summarise it.

	    The group counts as fully OK only when pip exited successfully AND every
	    package in the group could be imported afterwards.

	    Returns a dict with the keys ``group``, ``optional``, ``pip_ok``,
	    ``import_results`` and ``fully_ok``.
	    """
	    suffix = " (optional)" if optional else ""
	    label = f"[{name}]{suffix}"
	    rule = "=" * 72

	    print(f"\n{rule}")
	    print(f"Installing group: {label}")
	    print(f" Packages: {', '.join(packages)}")
	    print(rule)

	    pip_ok, pip_log = _pip_install(packages)
	    # Only the last 25 lines of pip output are shown, to keep the console readable.
	    print("\n".join(pip_log.strip().splitlines()[-25:]))

	    import_results = _verify_imports(packages)
	    fully_ok = pip_ok and all(result[1] for result in import_results)

	    if fully_ok:
	        print(f"\n ✓ {label} installed and importable.")
	    else:
	        # Optional groups get a softer marker than required ones.
	        marker = "!" if optional else "✗"
	        print(f"\n {marker} {label} had problems:")
	        for pkg, importable, detail in import_results:
	            status = "OK " if importable else "FAIL"
	            print(f" [{status}] {pkg} ({detail})")

	    return {
	        "group": name,
	        "optional": optional,
	        "pip_ok": pip_ok,
	        "import_results": import_results,
	        "fully_ok": fully_ok,
	    }


	def write_requirements(summaries: list[dict]) -> None:
	    """Write requirements.txt from the per-group install summaries.

	    Args:
	        summaries: dicts as returned by ``install_group``; only the ``group``
	            and ``import_results`` keys are read here.

	    For each ``(pkg, ok, info)`` import result:
	    - importable with a known version -> ``pkg==<version>``
	    - importable but version unknown  -> ``pkg`` left unpinned, with a comment
	      (a line such as ``pkg==?`` is not a valid requirement and would make
	      ``pip install -r`` fail)
	    - not importable                  -> a commented-out line with the reason

	    The file is written to ``REQ_FILE`` as UTF-8 and the path is printed.
	    """
	    lines = [
	        "# Auto-generated by setup/install.py",
	        "# Includes only packages that were successfully installed AND imported.",
	        "# torch is intentionally omitted: pre-installed with CUDA support.",
	        "",
	    ]
	    for s in summaries:
	        lines.append(f"# --- {s['group']} ---")
	        for pkg, ok, info in s["import_results"]:
	            version = str(info).strip()
	            if not ok or version.startswith("import error"):
	                lines.append(f"# {pkg} (NOT INSTALLED: {info})")
	            elif version in ("", "?"):
	                # "?" is the placeholder used when no version could be found.
	                lines.append(f"{pkg}  # version unknown; left unpinned")
	            else:
	                lines.append(f"{pkg}=={version}")
	        lines.append("")
	    # Explicit encoding so the result does not depend on the platform locale.
	    REQ_FILE.write_text("\n".join(lines), encoding="utf-8")
	    print(f"\nrequirements.txt written -> {REQ_FILE}")


	def main() -> int:
	    """Run every install group, print a final report, write requirements.txt.

	    Returns 1 if any required (non-optional) group was not fully OK,
	    otherwise 0.
	    """
	    rule = "=" * 72

	    print(rule)
	    print("Multilingual Chatbot — Dependency Installer")
	    print(rule)
	    print(f"Python : {sys.version.split()[0]} ({sys.executable})")
	    print("Strategy : pip install --user --break-system-packages")
	    print("Note : torch is intentionally skipped (already installed with CUDA).")

	    summaries: list[dict] = []
	    hard_failures: list[str] = []

	    for group_name, group_packages, is_optional in GROUPS:
	        summary = install_group(group_name, group_packages, is_optional)
	        summaries.append(summary)
	        # Only a failing required group blocks the overall run.
	        if is_optional or summary["fully_ok"]:
	            continue
	        hard_failures.append(group_name)

	    # Final report
	    print(f"\n{rule}")
	    print("FINAL REPORT")
	    print(rule)
	    for summary in summaries:
	        if summary["fully_ok"]:
	            status = "✓"
	        elif summary["optional"]:
	            status = "!"
	        else:
	            status = "✗"
	        tag = "OPT " if summary["optional"] else "REQ "
	        print(f" {status} [{tag}] {summary['group']}")
	        for pkg, importable, detail in summary["import_results"]:
	            sub = "OK " if importable else "FAIL"
	            print(f" [{sub}] {pkg:<35s} {detail}")

	    write_requirements(summaries)

	    if not hard_failures:
	        print("\nAll required groups installed. Optional groups noted above.")
	        print("Safe to continue to Phase 2 (data collection).")
	        return 0

	    print("\nRequired groups that FAILED:")
	    for failed_group in hard_failures:
	        print(f" - {failed_group}")
	    print("\nFix these before continuing to Phase 2.")
	    return 1


	if __name__ == "__main__":
	    # Script entry point: main()'s return value becomes the process exit
	    # status; an interrupt from the keyboard exits with status 130.
	    try:
	        exit_status = main()
	    except KeyboardInterrupt:
	        print("\nAborted by user.")
	        exit_status = 130
	    sys.exit(exit_status)