CPU-LLM-Inference

Running

R-Kentaren committed 1 day ago

Commit

6861394

verified ·

1 Parent(s): ada8101

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,7 +11,6 @@ import torch
 from transformers import pipeline, TextIteratorStreamer, StoppingCriteria
 from transformers import AutoTokenizer
 from ddgs import DDGS
-import spaces  # Import spaces early to enable ZeroGPU support
 from torch.utils._pytree import tree_map
 from config import *
 # Global event to signal cancellation from the UI thread to the generation thread
@@ -19,9 +18,6 @@ cancel_event = threading.Event()
 access_token=os.environ['HF_TOKEN']
-# Optional: Disable GPU visibility if you wish to force CPU usage
-# os.environ["CUDA_VISIBLE_DEVICES"] = ""
 # Global cache for pipelines to avoid re-loading.
@@ -109,7 +105,7 @@ def get_duration(user_msg, chat_history, system_prompt, enable_search, max_resul
     return base_duration + token_duration + search_duration + aot_compilation_buffer
-@spaces.GPU(duration=get_duration)
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
                   model_name, max_tokens, temperature,

 from transformers import pipeline, TextIteratorStreamer, StoppingCriteria
 from transformers import AutoTokenizer
 from ddgs import DDGS
 from torch.utils._pytree import tree_map
 from config import *
 # Global event to signal cancellation from the UI thread to the generation thread
 access_token=os.environ['HF_TOKEN']
 # Global cache for pipelines to avoid re-loading.
     return base_duration + token_duration + search_duration + aot_compilation_buffer
 def chat_response(user_msg, chat_history, system_prompt,
                   enable_search, max_results, max_chars,
                   model_name, max_tokens, temperature,