Luigi committed on
Commit
395e2bc
·
1 Parent(s): 9af0518

Work on llama.cpp's huge model loading time

Browse files
Files changed (1) hide show
  1. src/summarization.py +8 -1
src/summarization.py CHANGED
@@ -3,6 +3,11 @@ from llama_cpp import Llama
3
  from utils import available_gguf_llms, s2tw_converter
4
  import time
5
  from functools import lru_cache
 
 
 
 
 
6
 
7
  @lru_cache(maxsize=1)
8
  def get_model(gguf_repo_id, gguf_filename):
@@ -11,13 +16,15 @@ def get_model(gguf_repo_id, gguf_filename):
11
  filename=gguf_filename,
12
  verbose=False,
13
  n_ctx=32768,
14
- n_threads=4,
15
  repeat_penalty=1.2,
16
  )
17
 
18
  def summarize_transcript(transcript, selected_gguf_model, prompt_input):
19
  repo_id, filename = available_gguf_llms[selected_gguf_model]
 
20
  llm = get_model(repo_id, filename)
 
21
  full_summary = []
22
  is_1st_token = True
23
  t1 = time.time()
 
3
  from utils import available_gguf_llms, s2tw_converter
4
  import time
5
  from functools import lru_cache
6
+ import multiprocessing
7
+
8
+ # Detect logical cores (vCPUs available to the container)
9
+ num_vcpus = multiprocessing.cpu_count()
10
+ print(f"Detected vCPUs: {num_vcpus}")
11
 
12
  @lru_cache(maxsize=1)
13
  def get_model(gguf_repo_id, gguf_filename):
 
16
  filename=gguf_filename,
17
  verbose=False,
18
  n_ctx=32768,
19
+ n_threads=num_vcpus,
20
  repeat_penalty=1.2,
21
  )
22
 
23
  def summarize_transcript(transcript, selected_gguf_model, prompt_input):
24
  repo_id, filename = available_gguf_llms[selected_gguf_model]
25
+ t0 = time.time()
26
  llm = get_model(repo_id, filename)
27
+ print(f"Model loading time: {time.time() - t0:.1f} sec")
28
  full_summary = []
29
  is_1st_token = True
30
  t1 = time.time()