Flight-Transit-Agent

Running on Zero

App Files Files Community

Quazim0t0 committed 11 days ago

Commit

25223be

verified ·

1 Parent(s): c4fa320

Upload 4 files

Browse files

Files changed (4) hide show

agent.py +1 -1
app.py +9 -9
liquid.py +80 -59
requirements.txt +6 -12

agent.py CHANGED Viewed

@@ -24,7 +24,7 @@ TRACES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "traces")
 os.makedirs(TRACES_DIR, exist_ok=True)
 JSONL_LOG = os.path.join(TRACES_DIR, "agent_log.jsonl")
-MODEL_NAME = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M-GGUF") + "/Q4_K_M"
 # Best-effort city/keyword -> IATA so users can type "London to Dubai".
 CITY_TO_IATA = {

 os.makedirs(TRACES_DIR, exist_ok=True)
 JSONL_LOG = os.path.join(TRACES_DIR, "agent_log.jsonl")
+MODEL_NAME = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M")
 # Best-effort city/keyword -> IATA so users can type "London to Dubai".
 CITY_TO_IATA = {

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
-"""FLIGHTDECK — live flights on a transparent 3D globe, with an LLM flight agent.
-Data:  FlightRadar24 API   (https://fr24api.flightradar24.com/docs/getting-started)
-Globe: Globe.gl / Three.js (3D, transparent, neon glow)
-LLM:   GGUF model via llama-cpp-python (default: LiquidAI/LFM2.5-350M)
-Set FR24_API_TOKEN in your environment (see .env.example), then `python app.py`.
-"""
-from __future__ import annotations
 import datetime as dt
 import os

+"""FLIGHTDECK — live flights on a transparent 3D globe, with an LLM flight agent.
+Data:  FlightRadar24 API   (https://fr24api.flightradar24.com/docs/getting-started)
+Globe: Globe.gl / Three.js (3D, transparent, neon glow)
+LLM:   LiquidAI LFM2.5-350M via transformers (default safetensors model)
+Set FR24_API_TOKEN in your environment (see .env.example), then `python app.py`.
+"""
+from __future__ import annotations
 import datetime as dt
 import os

liquid.py CHANGED Viewed

@@ -1,15 +1,16 @@
-"""LiquidAI LFM2.5-350M (GGUF, Q4_K_M) wrapper via llama-cpp-python.
-The model (set by LLM_REPO, default LiquidAI/LFM2.5-350M-GGUF) is downloaded from
-HuggingFace on first use and cached. If anything is unavailable (no llama-cpp, no
-model, no network) the app keeps working and just shows a deterministic fallback.
 """
 from __future__ import annotations
 import os
 import threading
-_LLM = None
 _LOAD_LOCK = threading.Lock()
 _LOAD_ERROR = None
@@ -26,64 +27,80 @@ def llm_disabled() -> bool:
     return os.environ.get("DISABLE_LLM", "0").strip() in {"1", "true", "yes"}
 def _load():
-    """Load the model once. Returns the Llama instance or None on failure."""
-    global _LLM, _LOAD_ERROR
-    if _LLM is not None or _LOAD_ERROR is not None:
-        return _LLM
     with _LOAD_LOCK:
-        if _LLM is not None or _LOAD_ERROR is not None:
-            return _LLM
         try:
-            import fnmatch
-            from huggingface_hub import hf_hub_download, list_repo_files
-            from llama_cpp import Llama
-            repo = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M-GGUF")
-            pattern = os.environ.get("LLM_FILE", "*Q4_K_M.gguf")
-            # Resolve a glob (or exact name) against the repo's real file list.
-            if any(ch in pattern for ch in "*?["):
-                candidates = [f for f in list_repo_files(repo)
-                              if f.endswith(".gguf") and fnmatch.fnmatch(f, pattern)
-                              or fnmatch.fnmatch(os.path.basename(f), pattern)]
-                if not candidates:
-                    raise FileNotFoundError(
-                        f"No GGUF matching {pattern!r} in {repo}")
-                filename = sorted(candidates, key=len)[0]
-            else:
-                filename = pattern
-            path = hf_hub_download(repo_id=repo, filename=filename)
-            _LLM = Llama(
-                model_path=path,
-                n_ctx=int(os.environ.get("LLM_CTX", "8192")),
-                n_gpu_layers=int(os.environ.get("N_GPU_LAYERS", "0")),
-                verbose=False,
             )
         except Exception as e:  # noqa: BLE001
             _LOAD_ERROR = e
-            _LLM = None
-    return _LLM
 def status() -> str:
-    label = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M-GGUF").split("/")[-1]
     if llm_disabled():
         return "LLM disabled (DISABLE_LLM=1)."
     if _LOAD_ERROR is not None:
         return f"{label} unavailable: {type(_LOAD_ERROR).__name__}: {_LOAD_ERROR}"
-    if _LLM is None:
         return f"{label} not loaded yet (loads on first query)."
-    return f"{label} Q4_K_M online."
 def available() -> bool:
     """True if the model can actually run (not disabled and loadable)."""
     if llm_disabled():
         return False
-    return _load() is not None
 def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
@@ -91,24 +108,33 @@ def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
     Raises RuntimeError if the model is unavailable so the caller can fall back.
     """
-    llm = _load()
-    if llm is None:
         raise RuntimeError(status())
     import time
     t0 = time.time()
-    out = llm.create_chat_completion(
-        messages=messages, max_tokens=max_tokens,
-        temperature=temperature, top_p=top_p,
     )
     latency = int((time.time() - t0) * 1000)
-    return out["choices"][0]["message"]["content"].strip(), latency
 def _fallback(question: str, context: str) -> str:
     return (
         "[AI offline — raw readout]\n"
         f"Q: {question}\n\n{context}\n\n"
-        "(Install llama-cpp-python and allow the model to download to enable "
         "LLM natural-language briefings.)"
     )
@@ -117,8 +143,8 @@ def briefing(question: str, context: str, max_tokens: int = 512) -> str:
     """Generate an answer about the current flights."""
     if llm_disabled():
         return _fallback(question, context)
-    llm = _load()
-    if llm is None:
         return _fallback(question, context)
     messages = [
@@ -127,12 +153,7 @@ def briefing(question: str, context: str, max_tokens: int = 512) -> str:
          "content": f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"},
     ]
     try:
-        out = llm.create_chat_completion(
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=0.4,
-            top_p=0.9,
-        )
-        return out["choices"][0]["message"]["content"].strip()
     except Exception as e:  # noqa: BLE001
         return _fallback(question, f"{context}\n\n(LLM error: {e})")

+"""LiquidAI LFM2.5-350M (safetensors) wrapper via transformers.
+The model (set by LLM_REPO, default LiquidAI/LFM2.5-350M) is downloaded from
+HuggingFace on first use and cached. If anything is unavailable (no transformers,
+no model, no network) the app keeps working and just shows a deterministic fallback.
 """
 from __future__ import annotations
 import os
 import threading
+_PIPELINE = None
+_TOKENIZER = None
 _LOAD_LOCK = threading.Lock()
 _LOAD_ERROR = None
     return os.environ.get("DISABLE_LLM", "0").strip() in {"1", "true", "yes"}
+def _model_id() -> str:
+    # The GGUF-only repo and the safetensors repo have different names.
+    # Default to the safetensors model. Allow override via LLM_REPO.
+    return os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M")
+def _apply_chat_template(messages, tokenizer):
+    """Convert [{"role":..., "content":...}, ...] to a single prompt string
+    using the tokenizer's chat template. Falls back to manual concatenation
+    if the tokenizer has no usable chat template (missing, None, or empty)."""
+    if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
+        return tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+    # Manual fallback: bracketed role headers ("[SYSTEM]", "[USER]", ...) followed by an open "[ASSISTANT]" turn.
+    parts = []
+    for m in messages:
+        role = m.get("role", "user")
+        parts.append(f"[{role.upper()}]\n{m.get('content', '')}\n")
+    parts.append("[ASSISTANT]\n")
+    return "\n".join(parts)
 def _load():
+    """Load the model + tokenizer once. Returns (pipeline, tokenizer) or (None, None)."""
+    global _PIPELINE, _TOKENIZER, _LOAD_ERROR
+    if _PIPELINE is not None or _LOAD_ERROR is not None:
+        return _PIPELINE, _TOKENIZER
     with _LOAD_LOCK:
+        if _PIPELINE is not None or _LOAD_ERROR is not None:
+            return _PIPELINE, _TOKENIZER
         try:
+            import torch
+            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+            model_id = _model_id()
+            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float32,
+                device_map="auto",
+                trust_remote_code=True,
+            )
+            _PIPELINE = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                return_full_text=False,
             )
+            _TOKENIZER = tokenizer
         except Exception as e:  # noqa: BLE001
             _LOAD_ERROR = e
+            _PIPELINE = None
+            _TOKENIZER = None
+    return _PIPELINE, _TOKENIZER
 def status() -> str:
+    label = _model_id().split("/")[-1]
     if llm_disabled():
         return "LLM disabled (DISABLE_LLM=1)."
     if _LOAD_ERROR is not None:
         return f"{label} unavailable: {type(_LOAD_ERROR).__name__}: {_LOAD_ERROR}"
+    if _PIPELINE is None:
         return f"{label} not loaded yet (loads on first query)."
+    return f"{label} online (transformers, CPU/GPU auto)."
 def available() -> bool:
     """True if the model can actually run (not disabled and loadable)."""
     if llm_disabled():
         return False
+    pipe, _ = _load()
+    return pipe is not None
 def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
     Raises RuntimeError if the model is unavailable so the caller can fall back.
     """
+    pipe, tokenizer = _load()
+    if pipe is None:
         raise RuntimeError(status())
     import time
+    prompt = _apply_chat_template(messages, tokenizer)
     t0 = time.time()
+    out = pipe(
+        prompt,
+        max_new_tokens=max_tokens,
+        do_sample=temperature > 0,
+        temperature=max(temperature, 1e-5),
+        top_p=top_p,
+        return_full_text=False,
     )
     latency = int((time.time() - t0) * 1000)
+    # transformers pipeline returns a list of dicts with "generated_text"
+    text = out[0]["generated_text"] if isinstance(out, list) else str(out)
+    if isinstance(text, list):
+        text = text[0].get("generated_text", "") if text else ""
+    return str(text).strip(), latency
 def _fallback(question: str, context: str) -> str:
     return (
         "[AI offline — raw readout]\n"
         f"Q: {question}\n\n{context}\n\n"
+        "(Install transformers + torch and allow the model to download to enable "
         "LLM natural-language briefings.)"
     )
     """Generate an answer about the current flights."""
     if llm_disabled():
         return _fallback(question, context)
+    pipe, _ = _load()
+    if pipe is None:
         return _fallback(question, context)
     messages = [
          "content": f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"},
     ]
     try:
+        text, _latency = complete(messages, max_tokens=max_tokens, temperature=0.4)
+        return text
     except Exception as e:  # noqa: BLE001
         return _fallback(question, f"{context}\n\n(LLM error: {e})")

requirements.txt CHANGED Viewed

@@ -3,15 +3,9 @@ requests>=2.31.0
 python-dotenv>=1.0.0
 numpy>=1.26.0
 huggingface_hub>=0.24.0
-# GGUF runtime for the LLM agent (default model: LiquidAI/LFM2.5-350M-GGUF).
-# Easy path (prebuilt CPU wheel):
-#   pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-# NOTE: that prebuilt wheel is compiled with AVX-512. On CPUs without AVX-512
-# (e.g. Intel Core i5/i7/i9 9th-gen) it crashes with 0xC000001D (illegal
-# instruction). Build from source for your CPU instead (needs a C compiler):
-#   set CMAKE_ARGS=-DGGML_AVX512=OFF -DGGML_AVX2=ON -DGGML_FMA=ON -DGGML_F16C=ON
-#   set FORCE_CMAKE=1
-#   pip install --no-binary llama-cpp-python llama-cpp-python
-llama-cpp-python>=0.3.2
-# The agent LLM runs as GGUF via llama-cpp-python above; no torch needed.

 python-dotenv>=1.0.0
 numpy>=1.26.0
 huggingface_hub>=0.24.0
+# LLM agent runtime: LiquidAI LFM2.5-350M via transformers.
+# The model (default LiquidAI/LFM2.5-350M) is downloaded from HuggingFace on
+# first use and cached. Prebuilt binary wheels — no local C++ build step.
+transformers>=4.44.0
+torch>=2.2.0
+accelerate>=0.33.0