momenalhamza's picture
Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)
469ef7f verified
"""Install all dependencies for the Multilingual Code-Switching Chatbot.
Detected environment from check_env.py:
- Python 3.13 on Linux (system Python at /usr/bin/python3)
- PyTorch 2.11.0+cu130 already installed and working with CUDA 13.0
- GPU: GTX 1650 (3.6 GB VRAM) -> faiss-cpu (faiss-gpu unnecessary at this scale)
Strategy:
- DO NOT touch torch (already installed and working with CUDA).
- Use --user --break-system-packages (PEP 668 systems like newer Ubuntu).
- Install in logical groups; report per-group success.
- Some packages may not have Python 3.13 wheels yet; mark those as optional.
- Write requirements.txt with pinned versions of what actually installed.
"""
from __future__ import annotations
import importlib
import importlib.metadata as md
import subprocess
import sys
from pathlib import Path
from typing import Iterable
# Directory holding this script; the project root is its parent directory.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent
# Destination of the generated requirements file (written by write_requirements).
REQ_FILE = PROJECT_ROOT.joinpath("requirements.txt")
# ---------------------------------------------------------------------------
# Package groups
# ---------------------------------------------------------------------------
# Each entry: (group_name, packages, optional_flag)
# - packages: list of pip specs (name or name>=ver)
# - optional: if True, group is allowed to fail without aborting
GROUPS: list[tuple[str, list[str], bool]] = [
# Core HuggingFace stack — REQUIRED.
# NOTE: torch is intentionally NOT listed (already installed with CUDA 13).
("core_ml", [
"transformers>=4.40",
"datasets>=2.18",
"tokenizers>=0.15",
"accelerate>=0.27",
"evaluate>=0.4",
"huggingface_hub>=0.23",
], False),
# Data manipulation / ML utilities — REQUIRED.
("data_ml", [
"numpy>=1.26",
"pandas>=2.1",
"scikit-learn>=1.3",
"scipy>=1.11",
], False),
# Language detection — REQUIRED.
("lang_detect", [
"langdetect>=1.0.9",
"lingua-language-detector>=2.0",
], False),
# Arabic NLP — pyarabic is required (pure-python, always works).
("arabic_pyarabic", [
"pyarabic>=0.6.15",
], False),
# camel-tools is OPTIONAL: it pulls heavy deps and may not have Python 3.13
# wheels yet. We try, but fall back to pyarabic-only normalization if it
# fails. Loaded lazily in the preprocessor.
("arabic_camel_optional", [
"camel-tools",
], True),
# Embeddings + vector search — REQUIRED for RAG.
("embeddings_rag", [
"sentence-transformers>=2.7",
"faiss-cpu>=1.7.4",
], False),
# Web/scraping — REQUIRED for data collection.
# scrapy can be heavy; we keep it but mark optional in case wheels lag.
("web_core", [
"requests>=2.31",
"beautifulsoup4>=4.12",
"lxml>=5.1",
], False),
("web_scrapy_optional", [
"scrapy>=2.11",
], True),
# Sequence labelling metrics — REQUIRED for NER eval.
("seq_metrics", [
"seqeval>=1.2.2",
], False),
# UI / serving — REQUIRED for app.py.
("ui_serving", [
"gradio>=4.20",
"fastapi>=0.110",
"uvicorn>=0.27",
], False),
# Experiment tracking + utilities.
# wandb is optional; tqdm is required.
("utils_required", [
"tqdm>=4.66",
], False),
("utils_optional", [
"wandb>=0.16",
], True),
# Plotting (used in evaluate.py for confusion matrices).
("plotting", [
"matplotlib>=3.8",
], False),
]
# ---------------------------------------------------------------------------
# pip helpers
# ---------------------------------------------------------------------------
def _pip_install(packages: Iterable[str]) -> tuple[bool, str]:
"""Run `pip install` for a group of packages. Returns (ok, combined_log)."""
cmd = [
sys.executable, "-m", "pip", "install",
"--user", "--break-system-packages",
"--upgrade",
*packages,
]
try:
proc = subprocess.run(cmd, capture_output=True, text=True, check=False)
log = (proc.stdout or "") + (proc.stderr or "")
return proc.returncode == 0, log
except Exception as exc: # noqa: BLE001
return False, f"EXCEPTION: {exc}"
def _verify_imports(packages: Iterable[str]) -> list[tuple[str, bool, str]]:
"""Try importing each package by its likely import name; return list of (name, ok, version)."""
name_map = {
"beautifulsoup4": "bs4",
"scikit-learn": "sklearn",
"faiss-cpu": "faiss",
"lingua-language-detector": "lingua",
"sentence-transformers": "sentence_transformers",
"huggingface_hub": "huggingface_hub",
"pyarabic": "pyarabic",
"camel-tools": "camel_tools",
}
results: list[tuple[str, bool, str]] = []
for spec in packages:
# strip version qualifiers like 'pkg>=1.0'
base = spec.split(">")[0].split("=")[0].split("<")[0].strip()
import_name = name_map.get(base, base.replace("-", "_"))
try:
mod = importlib.import_module(import_name)
try:
ver = md.version(base)
except md.PackageNotFoundError:
ver = getattr(mod, "__version__", "?")
results.append((base, True, ver))
except Exception as exc: # noqa: BLE001
results.append((base, False, f"import error: {exc.__class__.__name__}"))
return results
# ---------------------------------------------------------------------------
# Main install loop
# ---------------------------------------------------------------------------
def install_group(name: str, packages: list[str], optional: bool) -> dict:
    """Install a single package group, print progress, and summarize it.

    Args:
        name: Group label shown in the output.
        packages: pip requirement specs belonging to the group.
        optional: Whether the group is allowed to fail; only changes the
            label and the problem marker printed here.

    Returns:
        A dict with keys ``group``, ``optional``, ``pip_ok``,
        ``import_results`` (the list from ``_verify_imports``) and
        ``fully_ok`` (pip succeeded and every package imported).
    """
    suffix = " (optional)" if optional else ""
    label = f"[{name}]{suffix}"
    rule = "=" * 72

    print(f"\n{rule}")
    print(f"Installing group: {label}")
    print(f" Packages: {', '.join(packages)}")
    print(rule)

    pip_ok, pip_log = _pip_install(packages)
    # Keep the console readable: show only the last 25 lines of pip output.
    print("\n".join(pip_log.strip().splitlines()[-25:]))

    import_results = _verify_imports(packages)
    fully_ok = pip_ok and all(imported for _, imported, _ in import_results)

    if fully_ok:
        print(f"\n ✓ {label} installed and importable.")
    else:
        marker = "!" if optional else "✗"
        print(f"\n {marker} {label} had problems:")
        for pkg, imported, detail in import_results:
            status = "OK " if imported else "FAIL"
            print(f" [{status}] {pkg} ({detail})")

    return dict(
        group=name,
        optional=optional,
        pip_ok=pip_ok,
        import_results=import_results,
        fully_ok=fully_ok,
    )
def _requirement_line(pkg: str, ok: bool, info: str) -> str:
    """Render one package's requirements.txt line from its import result."""
    if not ok or info.startswith("import error"):
        return f"# {pkg} (NOT INSTALLED: {info})"
    if not info.strip() or info == "?":
        # "pkg==?" is not a valid requirement and would make the whole file
        # unusable with `pip install -r`; leave the package unpinned instead.
        return f"{pkg}  # version unknown; left unpinned"
    return f"{pkg}=={info}"


def write_requirements(summaries: list[dict]) -> None:
    """Write requirements.txt from the per-group install summaries.

    Each group becomes a ``# --- <group> ---`` section. Packages that
    imported successfully are pinned to the detected version (or left
    unpinned when the version is unknown); the others are recorded as
    comments so the file stays installable.

    Args:
        summaries: Dicts as returned by ``install_group``; the ``group`` and
            ``import_results`` keys are read.
    """
    lines = [
        "# Auto-generated by setup/install.py",
        "# Includes only packages that were successfully installed AND imported.",
        "# torch is intentionally omitted: pre-installed with CUDA support.",
        "",
    ]
    for s in summaries:
        lines.append(f"# --- {s['group']} ---")
        lines.extend(
            _requirement_line(pkg, ok, str(info))
            for pkg, ok, info in s["import_results"]
        )
        lines.append("")
    # Explicit encoding so the output does not depend on the locale.
    REQ_FILE.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nrequirements.txt written -> {REQ_FILE}")
def main() -> int:
    """Install every group in GROUPS, print a report, write requirements.txt.

    Returns:
        0 when every required group installed and imported, 1 when at least
        one required group failed. Optional failures never change the result.
    """
    rule = "=" * 72

    print(rule)
    print("Multilingual Chatbot — Dependency Installer")
    print(rule)
    print(f"Python : {sys.version.split()[0]} ({sys.executable})")
    print("Strategy : pip install --user --break-system-packages")
    print("Note : torch is intentionally skipped (already installed with CUDA).")

    # Install all groups in declaration order, keeping one summary per group.
    summaries: list[dict] = [
        install_group(name, packages, optional)
        for name, packages, optional in GROUPS
    ]
    # A failure is "hard" only for groups that are not optional.
    hard_failures: list[str] = [
        name
        for (name, _, optional), summary in zip(GROUPS, summaries)
        if not summary["fully_ok"] and not optional
    ]

    # Final report
    print(f"\n{rule}")
    print("FINAL REPORT")
    print(rule)
    for summary in summaries:
        tag = "OPT " if summary["optional"] else "REQ "
        if summary["fully_ok"]:
            status = "✓"
        elif summary["optional"]:
            status = "!"
        else:
            status = "✗"
        print(f" {status} [{tag}] {summary['group']}")
        for pkg, imported, info in summary["import_results"]:
            sub = "OK " if imported else "FAIL"
            print(f" [{sub}] {pkg:<35s} {info}")

    write_requirements(summaries)

    if not hard_failures:
        print("\nAll required groups installed. Optional groups noted above.")
        print("Safe to continue to Phase 2 (data collection).")
        return 0

    print("\nRequired groups that FAILED:")
    for group in hard_failures:
        print(f" - {group}")
    print("\nFix these before continuing to Phase 2.")
    return 1
if __name__ == "__main__":
    # Script entry point: exit with main()'s status, or 130 on Ctrl-C.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nAborted by user.")
        exit_code = 130
    sys.exit(exit_code)