CharlieBonito committed on
Commit
4521963
verified
1 Parent(s): ff04f16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -26,6 +26,8 @@ LOG_FILE = os.getenv("LOG_FILE", os.path.join(APP_DIR, "startup.log"))
26
  CPU_THREADS = int(os.getenv("CPU_THREADS", "8"))
27
  LLAMA_CTX = int(os.getenv("LLAMA_CTX", "12288"))
28
  LLAMA_MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "8192"))
 
 
29
  JINA_API_KEY = os.getenv("JINA_API_KEY", "")
30
  JINA_EMBED_MODEL = os.getenv("JINA_EMBED_MODEL", "jina-embeddings-v3")
31
  RAG_INDEX_FILE = os.getenv("RAG_INDEX_FILE", os.path.join(APP_DIR, "rag_index.json"))
@@ -225,7 +227,12 @@ def log(msg):
225
  def start_server():
226
  global multimodal_ready
227
  os.makedirs(MODEL_DIR, exist_ok=True)
228
- log(f"Configuración: CPU_THREADS={CPU_THREADS}, LLAMA_CTX={LLAMA_CTX}, LLAMA_MAX_TOKENS={LLAMA_MAX_TOKENS}")
 
 
 
 
 
229
  log("Descargando modelo para inferencia CPU-only...")
230
  try:
231
  m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
@@ -262,8 +269,8 @@ def start_server():
262
  "-t", str(CPU_THREADS),
263
  "-tb", str(CPU_THREADS),
264
  "-np", "1",
265
- "-b", "512",
266
- "-ub", "128",
267
  "--threads-http", "2",
268
  "--fit", "off",
269
  "--no-mmap",
 
26
  CPU_THREADS = int(os.getenv("CPU_THREADS", "8"))
27
  LLAMA_CTX = int(os.getenv("LLAMA_CTX", "12288"))
28
  LLAMA_MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "8192"))
29
+ LLAMA_BATCH = int(os.getenv("LLAMA_BATCH", "512"))
30
+ LLAMA_UBATCH = int(os.getenv("LLAMA_UBATCH", "512"))
31
  JINA_API_KEY = os.getenv("JINA_API_KEY", "")
32
  JINA_EMBED_MODEL = os.getenv("JINA_EMBED_MODEL", "jina-embeddings-v3")
33
  RAG_INDEX_FILE = os.getenv("RAG_INDEX_FILE", os.path.join(APP_DIR, "rag_index.json"))
 
227
  def start_server():
228
  global multimodal_ready
229
  os.makedirs(MODEL_DIR, exist_ok=True)
230
+ log(
231
+ "Configuración: "
232
+ f"CPU_THREADS={CPU_THREADS}, LLAMA_CTX={LLAMA_CTX}, "
233
+ f"LLAMA_MAX_TOKENS={LLAMA_MAX_TOKENS}, LLAMA_BATCH={LLAMA_BATCH}, "
234
+ f"LLAMA_UBATCH={LLAMA_UBATCH}"
235
+ )
236
  log("Descargando modelo para inferencia CPU-only...")
237
  try:
238
  m_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
 
269
  "-t", str(CPU_THREADS),
270
  "-tb", str(CPU_THREADS),
271
  "-np", "1",
272
+ "-b", str(LLAMA_BATCH),
273
+ "-ub", str(LLAMA_UBATCH),
274
  "--threads-http", "2",
275
  "--fit", "off",
276
  "--no-mmap",