Quazim0t0 committed on
Commit
25223be
·
verified ·
1 Parent(s): c4fa320

Upload 4 files

Browse files
Files changed (4) hide show
  1. agent.py +1 -1
  2. app.py +9 -9
  3. liquid.py +80 -59
  4. requirements.txt +6 -12
agent.py CHANGED
@@ -24,7 +24,7 @@ TRACES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "traces")
24
  os.makedirs(TRACES_DIR, exist_ok=True)
25
  JSONL_LOG = os.path.join(TRACES_DIR, "agent_log.jsonl")
26
 
27
- MODEL_NAME = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M-GGUF") + "/Q4_K_M"
28
 
29
  # Best-effort city/keyword -> IATA so users can type "London to Dubai".
30
  CITY_TO_IATA = {
 
24
  os.makedirs(TRACES_DIR, exist_ok=True)
25
  JSONL_LOG = os.path.join(TRACES_DIR, "agent_log.jsonl")
26
 
27
+ MODEL_NAME = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M")
28
 
29
  # Best-effort city/keyword -> IATA so users can type "London to Dubai".
30
  CITY_TO_IATA = {
app.py CHANGED
@@ -1,12 +1,12 @@
1
- """FLIGHTDECK — live flights on a transparent 3D globe, with an LLM flight agent.
2
-
3
- Data: FlightRadar24 API (https://fr24api.flightradar24.com/docs/getting-started)
4
- Globe: Globe.gl / Three.js (3D, transparent, neon glow)
5
- LLM: GGUF model via llama-cpp-python (default: LiquidAI/LFM2.5-350M)
6
-
7
- Set FR24_API_TOKEN in your environment (see .env.example), then `python app.py`.
8
- """
9
- from __future__ import annotations
10
 
11
  import datetime as dt
12
  import os
 
1
+ """FLIGHTDECK — live flights on a transparent 3D globe, with an LLM flight agent.
2
+
3
+ Data: FlightRadar24 API (https://fr24api.flightradar24.com/docs/getting-started)
4
+ Globe: Globe.gl / Three.js (3D, transparent, neon glow)
5
+ LLM: LiquidAI LFM2.5-350M via transformers (default safetensors model)
6
+
7
+ Set FR24_API_TOKEN in your environment (see .env.example), then `python app.py`.
8
+ """
9
+ from __future__ import annotations
10
 
11
  import datetime as dt
12
  import os
liquid.py CHANGED
@@ -1,15 +1,16 @@
1
- """LiquidAI LFM2.5-350M (GGUF, Q4_K_M) wrapper via llama-cpp-python.
2
 
3
- The model (set by LLM_REPO, default LiquidAI/LFM2.5-350M-GGUF) is downloaded from
4
- HuggingFace on first use and cached. If anything is unavailable (no llama-cpp, no
5
- model, no network) the app keeps working and just shows a deterministic fallback.
6
  """
7
  from __future__ import annotations
8
 
9
  import os
10
  import threading
11
 
12
- _LLM = None
 
13
  _LOAD_LOCK = threading.Lock()
14
  _LOAD_ERROR = None
15
 
@@ -26,64 +27,80 @@ def llm_disabled() -> bool:
26
  return os.environ.get("DISABLE_LLM", "0").strip() in {"1", "true", "yes"}
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def _load():
30
- """Load the model once. Returns the Llama instance or None on failure."""
31
- global _LLM, _LOAD_ERROR
32
- if _LLM is not None or _LOAD_ERROR is not None:
33
- return _LLM
34
  with _LOAD_LOCK:
35
- if _LLM is not None or _LOAD_ERROR is not None:
36
- return _LLM
37
  try:
38
- import fnmatch
39
-
40
- from huggingface_hub import hf_hub_download, list_repo_files
41
- from llama_cpp import Llama
42
-
43
- repo = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M-GGUF")
44
- pattern = os.environ.get("LLM_FILE", "*Q4_K_M.gguf")
45
-
46
- # Resolve a glob (or exact name) against the repo's real file list.
47
- if any(ch in pattern for ch in "*?["):
48
- candidates = [f for f in list_repo_files(repo)
49
- if f.endswith(".gguf") and fnmatch.fnmatch(f, pattern)
50
- or fnmatch.fnmatch(os.path.basename(f), pattern)]
51
- if not candidates:
52
- raise FileNotFoundError(
53
- f"No GGUF matching {pattern!r} in {repo}")
54
- filename = sorted(candidates, key=len)[0]
55
- else:
56
- filename = pattern
57
-
58
- path = hf_hub_download(repo_id=repo, filename=filename)
59
- _LLM = Llama(
60
- model_path=path,
61
- n_ctx=int(os.environ.get("LLM_CTX", "8192")),
62
- n_gpu_layers=int(os.environ.get("N_GPU_LAYERS", "0")),
63
- verbose=False,
64
  )
 
65
  except Exception as e: # noqa: BLE001
66
  _LOAD_ERROR = e
67
- _LLM = None
68
- return _LLM
 
69
 
70
 
71
  def status() -> str:
72
- label = os.environ.get("LLM_REPO", "LiquidAI/LFM2.5-350M-GGUF").split("/")[-1]
73
  if llm_disabled():
74
  return "LLM disabled (DISABLE_LLM=1)."
75
  if _LOAD_ERROR is not None:
76
  return f"{label} unavailable: {type(_LOAD_ERROR).__name__}: {_LOAD_ERROR}"
77
- if _LLM is None:
78
  return f"{label} not loaded yet (loads on first query)."
79
- return f"{label} Q4_K_M online."
80
 
81
 
82
  def available() -> bool:
83
  """True if the model can actually run (not disabled and loadable)."""
84
  if llm_disabled():
85
  return False
86
- return _load() is not None
 
87
 
88
 
89
  def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
@@ -91,24 +108,33 @@ def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
91
 
92
  Raises RuntimeError if the model is unavailable so the caller can fall back.
93
  """
94
- llm = _load()
95
- if llm is None:
96
  raise RuntimeError(status())
97
  import time
 
98
  t0 = time.time()
99
- out = llm.create_chat_completion(
100
- messages=messages, max_tokens=max_tokens,
101
- temperature=temperature, top_p=top_p,
 
 
 
 
102
  )
103
  latency = int((time.time() - t0) * 1000)
104
- return out["choices"][0]["message"]["content"].strip(), latency
 
 
 
 
105
 
106
 
107
  def _fallback(question: str, context: str) -> str:
108
  return (
109
  "[AI offline — raw readout]\n"
110
  f"Q: {question}\n\n{context}\n\n"
111
- "(Install llama-cpp-python and allow the model to download to enable "
112
  "LLM natural-language briefings.)"
113
  )
114
 
@@ -117,8 +143,8 @@ def briefing(question: str, context: str, max_tokens: int = 512) -> str:
117
  """Generate an answer about the current flights."""
118
  if llm_disabled():
119
  return _fallback(question, context)
120
- llm = _load()
121
- if llm is None:
122
  return _fallback(question, context)
123
 
124
  messages = [
@@ -127,12 +153,7 @@ def briefing(question: str, context: str, max_tokens: int = 512) -> str:
127
  "content": f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"},
128
  ]
129
  try:
130
- out = llm.create_chat_completion(
131
- messages=messages,
132
- max_tokens=max_tokens,
133
- temperature=0.4,
134
- top_p=0.9,
135
- )
136
- return out["choices"][0]["message"]["content"].strip()
137
  except Exception as e: # noqa: BLE001
138
  return _fallback(question, f"{context}\n\n(LLM error: {e})")
 
1
+ """LiquidAI LFM2.5-350M (safetensors) wrapper via transformers.
2
 
3
+ The model (set by LLM_REPO, default LiquidAI/LFM2.5-350M) is downloaded from
4
+ HuggingFace on first use and cached. If anything is unavailable (no transformers,
5
+ no model, no network) the app keeps working and just shows a deterministic fallback.
6
  """
7
  from __future__ import annotations
8
 
9
  import os
10
  import threading
11
 
12
+ _PIPELINE = None
13
+ _TOKENIZER = None
14
  _LOAD_LOCK = threading.Lock()
15
  _LOAD_ERROR = None
16
 
 
27
  return os.environ.get("DISABLE_LLM", "0").strip() in {"1", "true", "yes"}
28
 
29
 
30
def _model_id() -> str:
    """Return the HuggingFace repo id of the model to load.

    The value comes from the ``LLM_REPO`` environment variable. The GGUF-only
    repo and the safetensors repo have different names; the default is the
    safetensors one.

    Returns:
        The stripped ``LLM_REPO`` value, or ``"LiquidAI/LFM2.5-350M"`` when the
        variable is unset, empty, or whitespace-only.
    """
    # An exported-but-blank variable (e.g. `LLM_REPO=` in a .env file) must not
    # be handed on as an empty model id, so it falls back to the default too.
    repo = os.environ.get("LLM_REPO", "").strip()
    return repo or "LiquidAI/LFM2.5-350M"
34
+
35
+
36
def _apply_chat_template(messages, tokenizer):
    """Render chat messages into a single prompt string.

    Args:
        messages: Iterable of ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object that may expose ``apply_chat_template`` and a
            non-empty ``chat_template``. Anything else (including ``None``)
            selects the manual format below.

    Returns:
        The tokenizer's own chat-template rendering (untokenized, with the
        generation prompt appended) when a template is available; otherwise a
        manual ``[ROLE]\\ncontent`` concatenation ending in ``[ASSISTANT]``.
    """
    template = getattr(tokenizer, "chat_template", None)
    if template and hasattr(tokenizer, "apply_chat_template"):
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    # Manual fallback: simple "[ROLE]\ncontent" sections.
    parts = []
    for message in messages:
        # A key that is present but None must behave like a missing key:
        # None has no .upper(), and a None content would otherwise be
        # rendered as the literal text "None" inside the prompt.
        role = message.get("role") or "user"
        content = message.get("content")
        if content is None:
            content = ""
        parts.append(f"[{str(role).upper()}]\n{content}\n")
    parts.append("[ASSISTANT]\n")
    return "\n".join(parts)
51
+
52
+
53
def _load():
    """Load the model + tokenizer once and cache them in module globals.

    Returns:
        ``(pipeline, tokenizer)`` on success, or ``(None, None)`` if loading
        failed. The first failure is stored in ``_LOAD_ERROR`` and the load is
        not attempted again on later calls.
    """
    global _PIPELINE, _TOKENIZER, _LOAD_ERROR
    # Lock-free fast path once a load has either succeeded or failed.
    if _PIPELINE is not None or _LOAD_ERROR is not None:
        return _PIPELINE, _TOKENIZER
    with _LOAD_LOCK:
        # Re-check under the lock: another thread may have finished the load
        # while this one was waiting.
        if _PIPELINE is not None or _LOAD_ERROR is not None:
            return _PIPELINE, _TOKENIZER
        try:
            # Imported here so that a missing torch/transformers install is
            # captured as a load error rather than failing at module import.
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

            model_id = _model_id()
            # NOTE(review): trust_remote_code=True allows the repo named by
            # LLM_REPO to run its own Python code at load time — confirm this
            # is intended for user-overridable repos.
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float32,
                device_map="auto",
                trust_remote_code=True,
            )
            generator = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                return_full_text=False,
            )
            # Publish the tokenizer BEFORE the pipeline. The fast path above
            # only tests _PIPELINE, so publishing it first would let a
            # concurrent caller observe (pipeline, None) and build its prompt
            # without the tokenizer's chat template.
            _TOKENIZER = tokenizer
            _PIPELINE = generator
        except Exception as e:  # noqa: BLE001
            _LOAD_ERROR = e
            _PIPELINE = None
            _TOKENIZER = None
    return _PIPELINE, _TOKENIZER
85
 
86
 
87
def status() -> str:
    """Return a one-line, human-readable description of the LLM state.

    Covers four cases, checked in this order: disabled via ``DISABLE_LLM``,
    load failed, loaded, and not loaded yet.
    """
    if llm_disabled():
        return "LLM disabled (DISABLE_LLM=1)."

    # Only the part of the repo id after the last "/" is shown.
    label = _model_id().rpartition("/")[2]

    if _LOAD_ERROR is not None:
        return f"{label} unavailable: {type(_LOAD_ERROR).__name__}: {_LOAD_ERROR}"
    if _PIPELINE is not None:
        return f"{label} online (transformers, CPU/GPU auto)."
    return f"{label} not loaded yet (loads on first query)."
96
 
97
 
98
def available() -> bool:
    """Report whether the model can actually run (not disabled and loadable).

    When the LLM is not disabled this calls ``_load()``, so the first call may
    trigger the one-time model load.
    """
    # Short-circuit keeps a disabled LLM from ever triggering the load.
    return not llm_disabled() and _load()[0] is not None
104
 
105
 
106
  def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
 
108
 
109
  Raises RuntimeError if the model is unavailable so the caller can fall back.
110
  """
111
+ pipe, tokenizer = _load()
112
+ if pipe is None:
113
  raise RuntimeError(status())
114
  import time
115
+ prompt = _apply_chat_template(messages, tokenizer)
116
  t0 = time.time()
117
+ out = pipe(
118
+ prompt,
119
+ max_new_tokens=max_tokens,
120
+ do_sample=temperature > 0,
121
+ temperature=max(temperature, 1e-5),
122
+ top_p=top_p,
123
+ return_full_text=False,
124
  )
125
  latency = int((time.time() - t0) * 1000)
126
+ # transformers pipeline returns a list of dicts with "generated_text"
127
+ text = out[0]["generated_text"] if isinstance(out, list) else str(out)
128
+ if isinstance(text, list):
129
+ text = text[0].get("generated_text", "") if text else ""
130
+ return str(text).strip(), latency
131
 
132
 
133
def _fallback(question: str, context: str) -> str:
    """Build the deterministic plain-text readout used when the LLM is offline.

    Args:
        question: The user's question, echoed back verbatim.
        context: The raw data text shown in place of a generated answer.

    Returns:
        An offline banner, the question, the context, and an install hint.
    """
    banner = "[AI offline — raw readout]\n"
    readout = f"Q: {question}\n\n{context}\n\n"
    hint = (
        "(Install transformers + torch and allow the model to download to enable "
        "LLM natural-language briefings.)"
    )
    return "".join((banner, readout, hint))
140
 
 
143
  """Generate an answer about the current flights."""
144
  if llm_disabled():
145
  return _fallback(question, context)
146
+ pipe, _ = _load()
147
+ if pipe is None:
148
  return _fallback(question, context)
149
 
150
  messages = [
 
153
  "content": f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"},
154
  ]
155
  try:
156
+ text, _latency = complete(messages, max_tokens=max_tokens, temperature=0.4)
157
+ return text
 
 
 
 
 
158
  except Exception as e: # noqa: BLE001
159
  return _fallback(question, f"{context}\n\n(LLM error: {e})")
requirements.txt CHANGED
@@ -3,15 +3,9 @@ requests>=2.31.0
3
  python-dotenv>=1.0.0
4
  numpy>=1.26.0
5
  huggingface_hub>=0.24.0
6
- # GGUF runtime for the LLM agent (default model: LiquidAI/LFM2.5-350M-GGUF).
7
- # Easy path (prebuilt CPU wheel):
8
- # pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
9
- # NOTE: that prebuilt wheel is compiled with AVX-512. On CPUs without AVX-512
10
- # (e.g. Intel Core i5/i7/i9 9th-gen) it crashes with 0xC000001D (illegal
11
- # instruction). Build from source for your CPU instead (needs a C compiler):
12
- # set CMAKE_ARGS=-DGGML_AVX512=OFF -DGGML_AVX2=ON -DGGML_FMA=ON -DGGML_F16C=ON
13
- # set FORCE_CMAKE=1
14
- # pip install --no-binary llama-cpp-python llama-cpp-python
15
- llama-cpp-python>=0.3.2
16
-
17
- # The agent LLM runs as GGUF via llama-cpp-python above; no torch needed.
 
3
  python-dotenv>=1.0.0
4
  numpy>=1.26.0
5
  huggingface_hub>=0.24.0
6
+ # LLM agent runtime: LiquidAI LFM2.5-350M via transformers.
7
+ # The model (default LiquidAI/LFM2.5-350M) is downloaded from HuggingFace on
8
+ # first use and cached. Installs from prebuilt wheels — no local C++ build step.
9
+ transformers>=4.44.0
10
+ torch>=2.2.0
11
+ accelerate>=0.33.0