imtrt004 committed
Commit 2aa0b72 · 1 parent: 7997082

fix: update backend lib with log

Files changed:
- app.py            +46 -9
- model/loader.py   +70 -31
- model/log.py      +154 -0
- requirements.txt  +1 -0
app.py
CHANGED

@@ -7,9 +7,12 @@ from supabase import create_client
 import uuid
 import os
 import json
+import time
+from datetime import datetime, timezone
 from typing import Optional
 
 from model.loader import get_llm, get_model_name, is_llm_ready, switch_model, is_loading, get_loading_status
+from model.log import banner, section, step, ok, warn, error
 from retrieval.embedder import get_model, embed_chunks, embed_query
 from retrieval.vectorstore import (
     store_chunks, similarity_search, similarity_search_multi,

@@ -50,17 +53,28 @@ def _supa():
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     import asyncio
-    … (4 removed lines, content not captured)
+    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
+    banner(f"LUMINARY BACKEND · {ts}")
+
+    section("STARTUP", "Embedding model")
+    step("STARTUP", "Loading BAAI/bge-small-en-v1.5…")
+    try:
+        t0 = time.perf_counter()
+        get_model()
+        ok("STARTUP", f"Embedding model ready ({time.perf_counter() - t0:.1f}s)")
+    except Exception as exc:
+        error("STARTUP", f"Embedding model failed: {exc}")
+
+    section("STARTUP", "LLM")
+    step("STARTUP", f"Loading {get_model_name()} in background thread…")
     loop = asyncio.get_event_loop()
     try:
         await loop.run_in_executor(None, get_llm)
-    … (1 removed line, content not captured)
+        ok("STARTUP", f"LLM ready — {get_model_name()}")
     except Exception as exc:
-    … (2 removed lines, content not captured)
+        error("STARTUP", f"LLM load failed: {exc}")
+
+    section("STARTUP", "All systems go")
     yield

@@ -83,8 +97,10 @@ async def upload(
     user_id: str,
     bg: BackgroundTasks,
 ):
-    content …
+    content = await file.read()
     file_size = len(content)
+    size_kb = file_size / 1024
+    step("UPLOAD", f"{file.filename} · {size_kb:.0f} KB · user={user_id[:8]}")
 
     ok, msg = can_upload(user_id, file_size)
     if not ok:

@@ -93,6 +109,7 @@ async def upload(
     # ── Storage capacity gate ─────────────────────────────────────────────
     if is_storage_near_full(file_size):
         # Queue the upload; it will be processed once expired docs are purged
+        warn("UPLOAD", "Storage near full — queueing upload")
         result = enqueue_upload(
             user_id=user_id,
             filename=file.filename or "upload",

@@ -127,7 +144,7 @@ async def upload(
 
     # Process in background (parse → chunk → embed → store)
     bg.add_task(_process_doc, content, doc_id, user_id, expires, file.filename)
-    … (1 removed line, content not captured)
+    ok("UPLOAD", f"Accepted · doc={doc_id[:8]} · expires={expires.date()}")
     return {"doc_id": doc_id, "status": "processing", "expires_at": expires.isoformat()}

@@ -200,14 +217,33 @@ async def process_from_storage(
 
 async def _process_doc(content, doc_id, user_id, expires, filename):
     supa = _supa()
+    t0 = time.perf_counter()
+    short_id = doc_id[:8]
+    section("PROCESS", f"{filename} [{short_id}]")
     try:
+        step("PROCESS", f"Parsing {filename}")
         pages = parse_file_pages(content, filename)
+        ok("PROCESS", f"Parsed → {len(pages)} page(s)")
+
+        step("PROCESS", "Chunking pages…")
         chunks = smart_chunk_pages(pages, filename=filename)
+        ok("PROCESS", f"Chunked → {len(chunks)} chunk(s)")
+
+        step("PROCESS", f"Embedding {len(chunks)} chunks…")
         embeds = embed_chunks([c.text for c in chunks])
+        ok("PROCESS", f"Embedded ({len(embeds)} vectors)")
+
+        step("PROCESS", "Storing vectors in Supabase…")
         store_chunks(doc_id, user_id, chunks, embeds, expires)
+
         supa.table("documents").update({"status": "ready", "chunk_count": len(chunks)}) \
             .eq("id", doc_id).execute()
+
+        elapsed = time.perf_counter() - t0
+        ok("PROCESS", f"Document ready · {len(chunks)} chunks · {elapsed:.2f}s [{short_id}]")
+
     except Exception as e:
+        error("PROCESS", f"{filename} [{short_id}] — {e}")
         supa.table("documents").update({"status": "error", "error": str(e)}) \
             .eq("id", doc_id).execute()

@@ -465,6 +501,7 @@ async def llm_switch(req: LLMSwitchRequest, bg: BackgroundTasks):
     if get_model_name() == req.model and is_llm_ready():
        return {"ok": True, "switching": False, "model": req.model, "msg": "Already active"}
 
+    step("SWITCH", f"Admin requested {get_model_name()} → {req.model}")
     bg.add_task(_do_switch_model, req.model)
     return {"ok": True, "switching": True, "model": req.model}
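With the ANSI colour codes stripped, the new lifespan hook should emit a boot sequence roughly like the sketch below. Timestamps, durations, and box/status glyphs are illustrative (the exact formats come from model/log.py later in this commit), and the model names are the defaults referenced above:

    ╔══════════════════════════════════════════════════════════╗
    ║           LUMINARY BACKEND · 2025-06-01 12:00:00 UTC     ║
    ╚══════════════════════════════════════════════════════════╝

    12:00:00 [STARTUP] Embedding model ──────────────────────────
    12:00:00 [STARTUP] → Loading BAAI/bge-small-en-v1.5…
    12:00:04 [STARTUP] ✓ Embedding model ready (3.8s)
    12:00:04 [STARTUP] LLM ──────────────────────────────────────
    12:00:04 [STARTUP] → Loading HuggingFaceTB/SmolLM2-360M-Instruct in background thread…
    12:00:27 [STARTUP] ✓ LLM ready — HuggingFaceTB/SmolLM2-360M-Instruct
    12:00:27 [STARTUP] All systems go ───────────────────────────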
model/loader.py
CHANGED

@@ -16,6 +16,7 @@ Model options (set LLM_MODEL env var in HF Space to switch, no redeploy needed):
 
 Note:
 - EXAONE requires trust_remote_code=True (LG AI custom architecture).
+  Requires transformers>=4.46.0 for RopeParameters support.
 - Llama 3.2 and Gemma 3 may require a HF_TOKEN env var (gated models).
 - Qwen3 supports /think and /no_think prefixes for reasoning depth control.
 """

@@ -25,6 +26,7 @@ import time
 import threading
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from model.log import section, step, ok, warn, error
 
 MODEL_ID = os.environ.get("LLM_MODEL", "HuggingFaceTB/SmolLM2-360M-Instruct")
 

@@ -49,35 +51,63 @@ _switch_lock = threading.Lock()
 
 def _load() -> None:
     global _tokenizer, _llm, _llm_ready, _loading_msg
+
     if _llm is not None:
         return
 
-    t0 …
-    sep = "-" * 60
-    print(f"\n{sep}", flush=True)
-    print(f"  Loading {MODEL_ID}", flush=True)
-    print(f"  First boot downloads model weights then caches to disk.", flush=True)
-    print(f"{sep}\n", flush=True)
-    … (3 removed lines, content not captured)
-
+    t0 = time.perf_counter()
     _trc = _needs_trust_remote_code(MODEL_ID)
 
+    section("MODEL", f"Loading {MODEL_ID}")
+
+    if _trc:
+        step("MODEL", "trust_remote_code=True (custom architecture)")
+
+    # ── Tokenizer ──────────────────────────────────────────────────────────
+    _loading_msg = f"Loading tokenizer…"
+    step("MODEL", f"Fetching tokenizer…")
+    try:
+        _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=_trc)
+        ok("MODEL", "Tokenizer loaded")
+    except Exception as exc:
+        error("MODEL", f"Tokenizer load failed — {exc}")
+        raise
+
-    _loading_msg = f"Loading model weights for {MODEL_ID}… (may download on first run)"
-    _llm = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        torch_dtype=torch.bfloat16,
-        trust_remote_code=_trc,
-    )
+    # ── Weights ────────────────────────────────────────────────────────────
+    _loading_msg = "Loading model weights… (downloads on first run, then cached)"
+    step("MODEL", "Loading weights (first run will download → subsequent boots use cache)")
+
+    device_info = "CUDA" if torch.cuda.is_available() else "CPU"
+    step("MODEL", f"Device: {device_info} · dtype: bfloat16")
+
+    try:
+        _llm = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=_trc,
+        )
+    except ImportError as exc:
+        _hint = ""
+        if "RopeParameters" in str(exc):
+            _hint = (
+                "\n Hint: EXAONE-3.5 requires transformers>=4.46.0.\n"
+                " Ensure requirements.txt contains transformers>=4.46.0\n"
+                " and rebuild/restart the Space."
+            )
+        error("MODEL", f"{exc}{_hint}")
+        raise
+    except Exception as exc:
+        error("MODEL", str(exc))
+        raise
 
     _llm.eval()
     _llm_ready = True
     _loading_msg = ""
-    … (4 removed lines, content not captured)
+
+    elapsed = time.perf_counter() - t0
+    params = sum(p.numel() for p in _llm.parameters()) / 1e6
+    ok("MODEL", f"Ready · {params:.0f}M params · {elapsed:.1f}s")
+    section("MODEL", "Model online")
 
 
 def get_tokenizer() -> AutoTokenizer:

@@ -116,32 +146,41 @@ def switch_model(new_model_id: str) -> None:
     global _loading, _loading_msg, _loading_error
 
     with _switch_lock:
-        … (1 removed line, content not captured)
+        prev = MODEL_ID
+        section("SWITCH", f"{prev} → {new_model_id}")
+
+        _loading = True
         _loading_error = None
-        _loading_msg …
-        _llm_ready …
+        _loading_msg = f"Unloading {prev}…"
+        _llm_ready = False
 
-        # Release model from memory
+        # ── Release current model from memory ──────────────────────────────
+        step("SWITCH", f"Unloading {prev}")
         try:
             import gc
-            _llm …
+            _llm = None
             _tokenizer = None
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-            … (2 removed lines, content not captured)
+                step("SWITCH", "CUDA cache cleared")
+            ok("SWITCH", "Memory freed")
+        except Exception as exc:
+            warn("SWITCH", f"Cleanup warning: {exc}")
 
         MODEL_ID = new_model_id
+        step("SWITCH", f"Starting load of {new_model_id}")
 
         try:
-            _load()  # uses updated …
+            _load()  # uses updated MODEL_ID; sets _llm_ready = True
             _loading = False
+            ok("SWITCH", f"Switch complete — {new_model_id}")
         except Exception as exc:
             _loading_error = str(exc)
-            _loading …
-            _loading_msg …
+            _loading = False
+            _loading_msg = ""
+            error("SWITCH", f"Failed to load {new_model_id}\n {exc}")
 
 
 def is_llm_ready() -> bool:
-    return _llm_ready
+    return _llm_ready
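The new ImportError branch above turns an opaque RopeParameters failure into an actionable hint at load time. If you would rather fail fast before any model code runs at all, a minimal boot-time guard could enforce the same version floor up front. This sketch is not part of the commit; it assumes the `packaging` package is available (transformers already depends on it):

    # Hypothetical boot-time guard, not in this commit: enforce the
    # transformers>=4.46.0 floor that EXAONE's RopeParameters needs.
    from importlib.metadata import version   # stdlib, Python 3.8+
    from packaging.version import Version    # installed alongside transformers

    if Version(version("transformers")) < Version("4.46.0"):
        raise RuntimeError(
            "transformers>=4.46.0 is required (RopeParameters support); "
            "update requirements.txt and rebuild the Space."
        )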
model/log.py
ADDED

@@ -0,0 +1,154 @@
+"""
+Shared structured logger for the Luminary HF backend.
+Outputs readable, sectioned logs that are easy to scan in the HF Space container view.
+"""
+
+from __future__ import annotations
+import logging
+import sys
+import time
+from datetime import datetime, timezone
+
+# ── ANSI colour palette ──────────────────────────────────────────────────────
+_R   = "\033[0m"   # reset
+_B   = "\033[1m"   # bold
+_DIM = "\033[2m"   # dim
+_GRN = "\033[32m"  # green
+_CYN = "\033[36m"  # cyan
+_YLW = "\033[33m"  # yellow
+_RED = "\033[31m"  # red
+_MAG = "\033[35m"  # magenta
+_BLU = "\033[34m"  # blue
+_WHT = "\033[97m"  # bright white
+
+_TAG_COLORS: dict[str, str] = {
+    "STARTUP": _CYN,
+    "MODEL": _MAG,
+    "UPLOAD": _BLU,
+    "PROCESS": _BLU,
+    "CHAT": _GRN,
+    "QUIZ": _GRN,
+    "SWITCH": _YLW,
+    "ERROR": _RED,
+    "HEALTH": _DIM,
+}
+
+
+class _FmtHandler(logging.StreamHandler):
+    """Formatter that wraps log records into readable tag-prefixed lines."""
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            tag = getattr(record, "tag", record.levelname)
+            msg = record.getMessage()
+            color = _TAG_COLORS.get(tag, _WHT)
+            ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
+            prefix = f"{_DIM}{ts}{_R} {color}{_B}[{tag}]{_R}"
+            # indent continuation lines
+            lines = msg.splitlines()
+            out = prefix + " " + lines[0]
+            for line in lines[1:]:
+                out += "\n" + (" " * (len(ts) + len(tag) + 5)) + line
+            sys.stdout.write(out + "\n")
+            sys.stdout.flush()
+        except Exception:
+            self.handleError(record)
+
+
+# ── Module-level logger setup ─────────────────────────────────────────────────
+_handler = _FmtHandler()
+_handler.setFormatter(logging.Formatter("%(message)s"))
+
+log = logging.getLogger("luminary")
+log.setLevel(logging.DEBUG)
+if not log.handlers:
+    log.addHandler(_handler)
+log.propagate = False
+
+
+# ── Convenience helpers ───────────────────────────────────────────────────────
+
+def _tag(tag: str) -> dict:
+    return {"extra": {"tag": tag}}
+
+
+def banner(title: str, width: int = 58) -> None:
+    """Print a prominent box banner (e.g. at startup)."""
+    bar = "═" * width
+    inner = title.center(width)
+    sys.stdout.write(
+        f"\n{_B}{_CYN}╔{bar}╗\n"
+        f"║{_WHT}{_B}{inner}{_CYN}║\n"
+        f"╚{bar}╝{_R}\n\n"
+    )
+    sys.stdout.flush()
+
+
+def section(tag: str, msg: str) -> None:
+    """Print a thin divider line with an annotation."""
+    color = _TAG_COLORS.get(tag, _WHT)
+    ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
+    width = max(0, 58 - len(tag) - len(msg) - 4)
+    bar = "─" * width
+    sys.stdout.write(f"{_DIM}{ts}{_R} {color}{_B}[{tag}]{_R} {_DIM}{msg} {bar}{_R}\n")
+    sys.stdout.flush()
+
+
+def ok(tag: str, msg: str) -> None:
+    color = _TAG_COLORS.get(tag, _WHT)
+    ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
+    sys.stdout.write(f"{_DIM}{ts}{_R} {color}{_B}[{tag}]{_R} {_GRN}✓{_R} {msg}\n")
+    sys.stdout.flush()
+
+
+def step(tag: str, msg: str) -> None:
+    color = _TAG_COLORS.get(tag, _WHT)
+    ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
+    sys.stdout.write(f"{_DIM}{ts}{_R} {color}{_B}[{tag}]{_R} {_DIM}→{_R} {msg}\n")
+    sys.stdout.flush()
+
+
+def warn(tag: str, msg: str) -> None:
+    color = _TAG_COLORS.get(tag, _WHT)
+    ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
+    sys.stdout.write(f"{_DIM}{ts}{_R} {color}{_B}[{tag}]{_R} {_YLW}⚠ {_R} {_YLW}{msg}{_R}\n")
+    sys.stdout.flush()
+
+
+def error(tag: str, msg: str) -> None:
+    ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
+    sys.stdout.write(
+        f"{_DIM}{ts}{_R} {_RED}{_B}[{tag}]{_R} "
+        f"{_RED}✗ Error{_R}\n"
+        f"{' ' * (len(ts) + len(tag) + 5)}{_RED}{msg}{_R}\n"
+    )
+    sys.stdout.flush()
+
+
+class Timer:
+    """Context manager / manual stopwatch with labelled output."""
+    def __init__(self, tag: str, label: str) -> None:
+        self.tag = tag
+        self.label = label
+        self._t0: float = 0.0
+
+    def start(self) -> "Timer":
+        self._t0 = time.perf_counter()
+        return self
+
+    def elapsed(self) -> float:
+        return time.perf_counter() - self._t0
+
+    def done(self, extra: str = "") -> float:
+        secs = self.elapsed()
+        msg = f"{self.label} {_DIM}({secs:.2f}s){_R}"
+        if extra:
+            msg += f" {_DIM}{extra}{_R}"
+        ok(self.tag, msg)
+        return secs
+
+    def __enter__(self) -> "Timer":
+        return self.start()
+
+    def __exit__(self, *_) -> None:
+        self.done()
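Taken together, the helpers are used like this (a usage sketch with hypothetical tags and labels, mirroring the call sites in app.py and model/loader.py above). Note the helpers write straight to stdout; the module-level `luminary` logger with `_FmtHandler` is available separately for ordinary logging calls:

    from model.log import banner, section, step, ok, warn, error, Timer

    banner("LUMINARY BACKEND")                    # boxed startup banner
    section("PROCESS", "demo.pdf [a1b2c3d4]")     # thin annotated divider
    step("PROCESS", "Embedding 12 chunks…")       # in-progress marker
    with Timer("PROCESS", "Embedded 12 chunks"):  # prints an ok() line with elapsed time on exit
        pass                                      # ...work happens here...
    warn("PROCESS", "Storage near full")          # yellow warning line
    error("PROCESS", "demo.pdf [a1b2c3d4] — parse failed")  # red two-line error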
requirements.txt
CHANGED

@@ -1,6 +1,7 @@
 fastapi
 uvicorn[standard]==0.34.0
 sentence-transformers==4.1.0
+transformers>=4.46.0
 huggingface-hub>=0.31.0
 supabase==2.13.0
 pymupdf==1.25.3