# Spaces: Sleeping
"""Install all dependencies for the Multilingual Code-Switching Chatbot.

Detected environment from check_env.py:
  - Python 3.13 on Linux (system Python at /usr/bin/python3)
  - PyTorch 2.11.0+cu130 already installed and working with CUDA 13.0
  - GPU: GTX 1650 (3.6 GB VRAM) -> faiss-cpu (faiss-gpu unnecessary at this scale)

Strategy:
  - DO NOT touch torch (already installed and working with CUDA).
  - Use --user --break-system-packages (PEP 668 systems like newer Ubuntu).
  - Install in logical groups; report per-group success.
  - Some packages may not have Python 3.13 wheels yet; mark those as optional.
  - Write requirements.txt with pinned versions of what actually installed.
"""
from __future__ import annotations

import importlib
import importlib.metadata as md
import re
import site
import subprocess
import sys
from pathlib import Path
from typing import Iterable
# Directory that holds this script, and the project root one level above it.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent

# Where the generated, version-pinned requirements list is written.
REQ_FILE = PROJECT_ROOT.joinpath("requirements.txt")
# ---------------------------------------------------------------------------
# Package groups
# ---------------------------------------------------------------------------
# GROUPS is an ordered list of (group_name, pip_specs, optional) tuples:
#   - pip_specs: requirement strings handed to pip (bare name or name>=ver)
#   - optional:  True when the group is allowed to fail without aborting
# torch appears in no group on purpose: it is already installed with CUDA 13.
_REQUIRED = False
_OPTIONAL = True

GROUPS: list[tuple[str, list[str], bool]] = [
    # Core HuggingFace stack.
    (
        "core_ml",
        [
            "transformers>=4.40",
            "datasets>=2.18",
            "tokenizers>=0.15",
            "accelerate>=0.27",
            "evaluate>=0.4",
            "huggingface_hub>=0.23",
        ],
        _REQUIRED,
    ),
    # Data manipulation / ML utilities.
    (
        "data_ml",
        [
            "numpy>=1.26",
            "pandas>=2.1",
            "scikit-learn>=1.3",
            "scipy>=1.11",
        ],
        _REQUIRED,
    ),
    # Language detection.
    (
        "lang_detect",
        [
            "langdetect>=1.0.9",
            "lingua-language-detector>=2.0",
        ],
        _REQUIRED,
    ),
    # Arabic NLP: pyarabic is pure Python, so it is treated as always
    # installable and therefore required.
    (
        "arabic_pyarabic",
        ["pyarabic>=0.6.15"],
        _REQUIRED,
    ),
    # camel-tools pulls heavy dependencies and may lack Python 3.13 wheels.
    # If it fails, normalization falls back to pyarabic only; the
    # preprocessor loads it lazily.
    (
        "arabic_camel_optional",
        ["camel-tools"],
        _OPTIONAL,
    ),
    # Embeddings + vector search, needed for RAG.
    (
        "embeddings_rag",
        [
            "sentence-transformers>=2.7",
            "faiss-cpu>=1.7.4",
        ],
        _REQUIRED,
    ),
    # Web access / HTML parsing, needed for data collection.
    (
        "web_core",
        [
            "requests>=2.31",
            "beautifulsoup4>=4.12",
            "lxml>=5.1",
        ],
        _REQUIRED,
    ),
    # scrapy is heavy and its wheels may lag behind, so it lives in its own
    # optional group.
    (
        "web_scrapy_optional",
        ["scrapy>=2.11"],
        _OPTIONAL,
    ),
    # Sequence-labelling metrics, needed for NER evaluation.
    (
        "seq_metrics",
        ["seqeval>=1.2.2"],
        _REQUIRED,
    ),
    # UI / serving, needed by app.py.
    (
        "ui_serving",
        [
            "gradio>=4.20",
            "fastapi>=0.110",
            "uvicorn>=0.27",
        ],
        _REQUIRED,
    ),
    # Utilities: tqdm is required ...
    (
        "utils_required",
        ["tqdm>=4.66"],
        _REQUIRED,
    ),
    # ... while wandb (experiment tracking) is optional.
    (
        "utils_optional",
        ["wandb>=0.16"],
        _OPTIONAL,
    ),
    # Plotting (used in evaluate.py for confusion matrices).
    (
        "plotting",
        ["matplotlib>=3.8"],
        _REQUIRED,
    ),
]
| # --------------------------------------------------------------------------- | |
| # pip helpers | |
| # --------------------------------------------------------------------------- | |
| def _pip_install(packages: Iterable[str]) -> tuple[bool, str]: | |
| """Run `pip install` for a group of packages. Returns (ok, combined_log).""" | |
| cmd = [ | |
| sys.executable, "-m", "pip", "install", | |
| "--user", "--break-system-packages", | |
| "--upgrade", | |
| *packages, | |
| ] | |
| try: | |
| proc = subprocess.run(cmd, capture_output=True, text=True, check=False) | |
| log = (proc.stdout or "") + (proc.stderr or "") | |
| return proc.returncode == 0, log | |
| except Exception as exc: # noqa: BLE001 | |
| return False, f"EXCEPTION: {exc}" | |
| def _verify_imports(packages: Iterable[str]) -> list[tuple[str, bool, str]]: | |
| """Try importing each package by its likely import name; return list of (name, ok, version).""" | |
| name_map = { | |
| "beautifulsoup4": "bs4", | |
| "scikit-learn": "sklearn", | |
| "faiss-cpu": "faiss", | |
| "lingua-language-detector": "lingua", | |
| "sentence-transformers": "sentence_transformers", | |
| "huggingface_hub": "huggingface_hub", | |
| "pyarabic": "pyarabic", | |
| "camel-tools": "camel_tools", | |
| } | |
| results: list[tuple[str, bool, str]] = [] | |
| for spec in packages: | |
| # strip version qualifiers like 'pkg>=1.0' | |
| base = spec.split(">")[0].split("=")[0].split("<")[0].strip() | |
| import_name = name_map.get(base, base.replace("-", "_")) | |
| try: | |
| mod = importlib.import_module(import_name) | |
| try: | |
| ver = md.version(base) | |
| except md.PackageNotFoundError: | |
| ver = getattr(mod, "__version__", "?") | |
| results.append((base, True, ver)) | |
| except Exception as exc: # noqa: BLE001 | |
| results.append((base, False, f"import error: {exc.__class__.__name__}")) | |
| return results | |
| # --------------------------------------------------------------------------- | |
| # Main install loop | |
| # --------------------------------------------------------------------------- | |
def install_group(name: str, packages: list[str], optional: bool) -> dict:
    """Install a single package group, report the outcome, and summarize it.

    Args:
        name: Group identifier shown in the output.
        packages: pip requirement strings belonging to the group.
        optional: Whether the group is allowed to fail.

    Returns:
        A dict with keys ``group``, ``optional``, ``pip_ok``,
        ``import_results`` (the per-package import checks) and ``fully_ok``
        (pip succeeded and every package imported).
    """
    banner = "=" * 72
    label = f"[{name}] (optional)" if optional else f"[{name}]"

    print("\n" + banner)
    print("Installing group: " + label)
    print(" Packages: " + ", ".join(packages))
    print(banner)

    pip_ok, pip_log = _pip_install(packages)

    # Show just the last 25 lines of pip output to keep the console readable.
    log_lines = pip_log.strip().splitlines()
    print("\n".join(log_lines[-25:]))

    import_results = _verify_imports(packages)
    fully_ok = pip_ok and all(entry[1] for entry in import_results)

    if fully_ok:
        print(f"\n ✓ {label} installed and importable.")
    else:
        marker = "!" if optional else "✗"
        print(f"\n {marker} {label} had problems:")
        for pkg, imported, info in import_results:
            status = "OK " if imported else "FAIL"
            print(f" [{status}] {pkg} ({info})")

    summary = {
        "group": name,
        "optional": optional,
        "pip_ok": pip_ok,
        "import_results": import_results,
        "fully_ok": fully_ok,
    }
    return summary
def write_requirements(summaries: list[dict]) -> None:
    """Write requirements.txt from the packages that imported successfully.

    Args:
        summaries: Group summaries; each needs a ``"group"`` name and an
            ``"import_results"`` list of ``(package, ok, info)`` tuples.

    Each importable package is pinned as ``pkg==version``. If its version is
    unknown (``"?"`` or empty) it is written unpinned, because ``pkg==?`` is
    not a valid requirement. Packages that failed to import are kept as
    comments so the file records what is missing. The file is written to
    ``REQ_FILE`` as UTF-8.
    """
    lines = [
        "# Auto-generated by setup/install.py",
        "# Includes only packages that were successfully installed AND imported.",
        "# torch is intentionally omitted: pre-installed with CUDA support.",
        "",
    ]
    for summary in summaries:
        lines.append(f"# --- {summary['group']} ---")
        for pkg, ok, info in summary["import_results"]:
            version = str(info)
            if not ok or version.startswith("import error"):
                lines.append(f"# {pkg} (NOT INSTALLED: {info})")
            elif version and version != "?":
                lines.append(f"{pkg}=={version}")
            else:
                lines.append(f"{pkg}  # version could not be determined; left unpinned")
        lines.append("")

    # Explicit encoding: do not depend on the platform's locale default.
    REQ_FILE.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nrequirements.txt written -> {REQ_FILE}")
def main() -> int:
    """Install every group in GROUPS, print a report, write requirements.txt.

    Returns:
        0 when every required group installed and imported cleanly, 1 when at
        least one required group did not.
    """
    rule = "=" * 72
    python_version = sys.version.split()[0]
    header = (
        rule,
        "Multilingual Chatbot — Dependency Installer",
        rule,
        f"Python : {python_version} ({sys.executable})",
        "Strategy : pip install --user --break-system-packages",
        "Note : torch is intentionally skipped (already installed with CUDA).",
    )
    for line in header:
        print(line)

    summaries: list[dict] = []
    hard_failures: list[str] = []
    for group_name, specs, is_optional in GROUPS:
        summary = install_group(group_name, specs, is_optional)
        summaries.append(summary)
        # Only a failing *required* group counts as a hard failure.
        if not (is_optional or summary["fully_ok"]):
            hard_failures.append(group_name)

    # Final report
    print("\n" + rule)
    print("FINAL REPORT")
    print(rule)
    for summary in summaries:
        tag = "OPT " if summary["optional"] else "REQ "
        if summary["fully_ok"]:
            status = "✓"
        elif summary["optional"]:
            status = "!"
        else:
            status = "✗"
        print(f" {status} [{tag}] {summary['group']}")
        for pkg, imported, info in summary["import_results"]:
            sub = "OK " if imported else "FAIL"
            print(f" [{sub}] {pkg:<35s} {info}")

    write_requirements(summaries)

    if not hard_failures:
        print("\nAll required groups installed. Optional groups noted above.")
        print("Safe to continue to Phase 2 (data collection).")
        return 0

    print("\nRequired groups that FAILED:")
    for failed in hard_failures:
        print(f" - {failed}")
    print("\nFix these before continuing to Phase 2.")
    return 1
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status;
    # Ctrl-C exits with status 130.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nAborted by user.")
        exit_code = 130
    sys.exit(exit_code)