Spaces:

ruhzi
/

SLM

Sleeping

App Files Files Community

ruhzi committed on Apr 13

Commit

9aed480

verified ·

1 Parent(s): 10f8f06

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -30

app.py CHANGED Viewed

@@ -1,10 +1,17 @@
 import gradio as gr
 import torch
-import gc
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from huggingface_hub import hf_hub_download
 from threading import Thread
 model_path = "ruhzi/Indian_History_SLM"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -13,22 +20,17 @@ template_file = hf_hub_download(repo_id=model_path, filename="chat_template.jinj
 with open(template_file, "r", encoding="utf-8") as f:
     tokenizer.chat_template = f.read()
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    torch_dtype=torch.float16,
-    device_map="auto"
 )
-class StopGeneration(StoppingCriteria):
-    def __init__(self):
-        self.stop_now = False
-    def __call__(self, input_ids, scores, **kwargs) -> bool:
-        return self.stop_now
 def chat_inference(message, history):
     messages = []
     recent_history = history[-3:] if len(history) > 3 else history
     for user_msg, assistant_msg in recent_history:
@@ -43,45 +45,37 @@ def chat_inference(message, history):
         enable_thinking=False
     )
-    inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
-    kill_switch = StopGeneration()
     generate_kwargs = dict(
         **inputs,
         streamer=streamer,
-        max_new_tokens=1024,
         do_sample=True,
         temperature=0.7,
         top_p=0.8,
-        stopping_criteria=StoppingCriteriaList([kill_switch])
     )
-    t = Thread(target=model.generate, kwargs=generate_kwargs, daemon=True)
     t.start()
     partial_message = ""
-    try:
-        for new_token in streamer:
-            partial_message += new_token
-            yield partial_message
-    # BaseException catches GeneratorExit and Gradio's internal Stop signals instantly
-    except BaseException:
-        pass
-    finally:
-        # Flip the switch to kill the model thread, then immediately free up the UI
-        kill_switch.stop_now = True
-        del inputs
-        gc.collect()
 demo = gr.ChatInterface(
     fn=chat_inference,
     title="Indian History SLM",
     description="Ask me anything about Indian History!",
     concurrency_limit=1
 )

+import os
+# SPEED FIX 1: Maximize CPU core usage for Hugging Face Free Tier (2 vCPUs)
+os.environ["OMP_NUM_THREADS"] = "2"
 import gradio as gr
 import torch
+import gc
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from huggingface_hub import hf_hub_download
 from threading import Thread
+# SPEED FIX 2: Explicitly tell PyTorch to use both CPU cores
+torch.set_num_threads(2)
 model_path = "ruhzi/Indian_History_SLM"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 with open(template_file, "r", encoding="utf-8") as f:
     tokenizer.chat_template = f.read()
+# SPEED FIX 3: Removed device_map and used float32 (Native CPU math is faster)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
+    torch_dtype=torch.float32,
+    low_cpu_mem_usage=True
 )
 def chat_inference(message, history):
     messages = []
+    # MEMORY PROTECTION: Only keep the last 3 conversational turns
     recent_history = history[-3:] if len(history) > 3 else history
     for user_msg, assistant_msg in recent_history:
         enable_thinking=False
     )
+    # Explicitly send to CPU
+    inputs = tokenizer([input_text], return_tensors="pt").to("cpu")
     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         **inputs,
         streamer=streamer,
+        max_new_tokens=512, # SPEED FIX 4: Kept at 512 for faster, punchier demo responses
         do_sample=True,
         temperature=0.7,
         top_p=0.8,
     )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     partial_message = ""
+    for new_token in streamer:
+        partial_message += new_token
+        yield partial_message
+    # MEMORY PROTECTION: Cleanup after generation finishes
+    del inputs
+    gc.collect()
 demo = gr.ChatInterface(
     fn=chat_inference,
     title="Indian History SLM",
     description="Ask me anything about Indian History!",
+    # CRASH PROTECTION: The strict queue. 1 user at a time.
     concurrency_limit=1
 )