"""Install all dependencies for the Multilingual Code-Switching Chatbot. Detected environment from check_env.py: - Python 3.13 on Linux (system Python at /usr/bin/python3) - PyTorch 2.11.0+cu130 already installed and working with CUDA 13.0 - GPU: GTX 1650 (3.6 GB VRAM) -> faiss-cpu (faiss-gpu unnecessary at this scale) Strategy: - DO NOT touch torch (already installed and working with CUDA). - Use --user --break-system-packages (PEP 668 systems like newer Ubuntu). - Install in logical groups; report per-group success. - Some packages may not have Python 3.13 wheels yet; mark those as optional. - Write requirements.txt with pinned versions of what actually installed. """ from __future__ import annotations import importlib import importlib.metadata as md import subprocess import sys from pathlib import Path from typing import Iterable PROJECT_ROOT = Path(__file__).resolve().parent.parent REQ_FILE = PROJECT_ROOT / "requirements.txt" # --------------------------------------------------------------------------- # Package groups # --------------------------------------------------------------------------- # Each entry: (group_name, packages, optional_flag) # - packages: list of pip specs (name or name>=ver) # - optional: if True, group is allowed to fail without aborting GROUPS: list[tuple[str, list[str], bool]] = [ # Core HuggingFace stack — REQUIRED. # NOTE: torch is intentionally NOT listed (already installed with CUDA 13). ("core_ml", [ "transformers>=4.40", "datasets>=2.18", "tokenizers>=0.15", "accelerate>=0.27", "evaluate>=0.4", "huggingface_hub>=0.23", ], False), # Data manipulation / ML utilities — REQUIRED. ("data_ml", [ "numpy>=1.26", "pandas>=2.1", "scikit-learn>=1.3", "scipy>=1.11", ], False), # Language detection — REQUIRED. ("lang_detect", [ "langdetect>=1.0.9", "lingua-language-detector>=2.0", ], False), # Arabic NLP — pyarabic is required (pure-python, always works). ("arabic_pyarabic", [ "pyarabic>=0.6.15", ], False), # camel-tools is OPTIONAL: it pulls heavy deps and may not have Python 3.13 # wheels yet. We try, but fall back to pyarabic-only normalization if it # fails. Loaded lazily in the preprocessor. ("arabic_camel_optional", [ "camel-tools", ], True), # Embeddings + vector search — REQUIRED for RAG. ("embeddings_rag", [ "sentence-transformers>=2.7", "faiss-cpu>=1.7.4", ], False), # Web/scraping — REQUIRED for data collection. # scrapy can be heavy; we keep it but mark optional in case wheels lag. ("web_core", [ "requests>=2.31", "beautifulsoup4>=4.12", "lxml>=5.1", ], False), ("web_scrapy_optional", [ "scrapy>=2.11", ], True), # Sequence labelling metrics — REQUIRED for NER eval. ("seq_metrics", [ "seqeval>=1.2.2", ], False), # UI / serving — REQUIRED for app.py. ("ui_serving", [ "gradio>=4.20", "fastapi>=0.110", "uvicorn>=0.27", ], False), # Experiment tracking + utilities. # wandb is optional; tqdm is required. ("utils_required", [ "tqdm>=4.66", ], False), ("utils_optional", [ "wandb>=0.16", ], True), # Plotting (used in evaluate.py for confusion matrices). ("plotting", [ "matplotlib>=3.8", ], False), ] # --------------------------------------------------------------------------- # pip helpers # --------------------------------------------------------------------------- def _pip_install(packages: Iterable[str]) -> tuple[bool, str]: """Run `pip install` for a group of packages. Returns (ok, combined_log).""" cmd = [ sys.executable, "-m", "pip", "install", "--user", "--break-system-packages", "--upgrade", *packages, ] try: proc = subprocess.run(cmd, capture_output=True, text=True, check=False) log = (proc.stdout or "") + (proc.stderr or "") return proc.returncode == 0, log except Exception as exc: # noqa: BLE001 return False, f"EXCEPTION: {exc}" def _verify_imports(packages: Iterable[str]) -> list[tuple[str, bool, str]]: """Try importing each package by its likely import name; return list of (name, ok, version).""" name_map = { "beautifulsoup4": "bs4", "scikit-learn": "sklearn", "faiss-cpu": "faiss", "lingua-language-detector": "lingua", "sentence-transformers": "sentence_transformers", "huggingface_hub": "huggingface_hub", "pyarabic": "pyarabic", "camel-tools": "camel_tools", } results: list[tuple[str, bool, str]] = [] for spec in packages: # strip version qualifiers like 'pkg>=1.0' base = spec.split(">")[0].split("=")[0].split("<")[0].strip() import_name = name_map.get(base, base.replace("-", "_")) try: mod = importlib.import_module(import_name) try: ver = md.version(base) except md.PackageNotFoundError: ver = getattr(mod, "__version__", "?") results.append((base, True, ver)) except Exception as exc: # noqa: BLE001 results.append((base, False, f"import error: {exc.__class__.__name__}")) return results # --------------------------------------------------------------------------- # Main install loop # --------------------------------------------------------------------------- def install_group(name: str, packages: list[str], optional: bool) -> dict: """Install one group; print result; return summary dict.""" label = f"[{name}]" + (" (optional)" if optional else "") print(f"\n{'=' * 72}") print(f"Installing group: {label}") print(f" Packages: {', '.join(packages)}") print("=" * 72) ok, log = _pip_install(packages) # Print only last ~25 lines to avoid noise tail = "\n".join(log.strip().splitlines()[-25:]) print(tail) verifies = _verify_imports(packages) all_imports_ok = all(v[1] for v in verifies) if ok and all_imports_ok: print(f"\n ✓ {label} installed and importable.") else: marker = "✗" if not optional else "!" print(f"\n {marker} {label} had problems:") for pkg, vok, info in verifies: status = "OK " if vok else "FAIL" print(f" [{status}] {pkg} ({info})") return { "group": name, "optional": optional, "pip_ok": ok, "import_results": verifies, "fully_ok": ok and all_imports_ok, } def write_requirements(summaries: list[dict]) -> None: """Write requirements.txt from packages that successfully imported.""" lines = [ "# Auto-generated by setup/install.py", "# Includes only packages that were successfully installed AND imported.", "# torch is intentionally omitted: pre-installed with CUDA support.", "", ] for s in summaries: lines.append(f"# --- {s['group']} ---") for pkg, ok, info in s["import_results"]: if ok and not info.startswith("import error"): lines.append(f"{pkg}=={info}") else: lines.append(f"# {pkg} (NOT INSTALLED: {info})") lines.append("") REQ_FILE.write_text("\n".join(lines)) print(f"\nrequirements.txt written -> {REQ_FILE}") def main() -> int: """Install all groups and write requirements.txt.""" print("=" * 72) print("Multilingual Chatbot — Dependency Installer") print("=" * 72) print(f"Python : {sys.version.split()[0]} ({sys.executable})") print("Strategy : pip install --user --break-system-packages") print("Note : torch is intentionally skipped (already installed with CUDA).") summaries: list[dict] = [] hard_failures: list[str] = [] for name, packages, optional in GROUPS: s = install_group(name, packages, optional) summaries.append(s) if not s["fully_ok"] and not optional: hard_failures.append(name) # Final report print("\n" + "=" * 72) print("FINAL REPORT") print("=" * 72) for s in summaries: tag = "OPT " if s["optional"] else "REQ " status = "✓" if s["fully_ok"] else ("!" if s["optional"] else "✗") print(f" {status} [{tag}] {s['group']}") for pkg, ok, info in s["import_results"]: sub = "OK " if ok else "FAIL" print(f" [{sub}] {pkg:<35s} {info}") write_requirements(summaries) if hard_failures: print("\nRequired groups that FAILED:") for g in hard_failures: print(f" - {g}") print("\nFix these before continuing to Phase 2.") return 1 print("\nAll required groups installed. Optional groups noted above.") print("Safe to continue to Phase 2 (data collection).") return 0 if __name__ == "__main__": try: sys.exit(main()) except KeyboardInterrupt: print("\nAborted by user.") sys.exit(130)