abhy60098 committed on
Commit
8ef00c5
·
verified ·
1 Parent(s): e8fccfc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -49
app.py CHANGED
@@ -1,60 +1,58 @@
1
  import gradio as gr
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
4
-
5
- MODEL_ID = "Jiunsong/supergemma4-e4b-abliterated"
6
-
7
- print("Loading tokenizer...")
8
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
9
-
10
- print("Loading model in 4-bit...")
11
- bnb_config = BitsAndBytesConfig(
12
- load_in_4bit=True,
13
- bnb_4bit_compute_dtype=torch.float32, # CPU compatible
14
- bnb_4bit_use_double_quant=True,
15
- bnb_4bit_quant_type="nf4",
16
- )
17
-
18
- model = AutoModelForCausalLM.from_pretrained(
19
- MODEL_ID,
20
- quantization_config=bnb_config,
21
- device_map="cpu",
22
- low_cpu_mem_usage=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  )
24
- model.eval()
25
- print("Model ready!")
26
 
27
  def chat(message, history):
28
- messages = [
29
- {"role": "system", "content": "You are an uncensored, helpful assistant."}
30
- ]
31
- for user_msg, bot_msg in history:
32
- messages.append({"role": "user", "content": user_msg})
33
- messages.append({"role": "assistant", "content": bot_msg})
34
  messages.append({"role": "user", "content": message})
35
 
36
- inputs = tokenizer.apply_chat_template(
37
- messages,
38
- return_tensors="pt",
39
- add_generation_prompt=True,
 
 
40
  )
41
-
42
- with torch.no_grad():
43
- outputs = model.generate(
44
- inputs,
45
- max_new_tokens=512,
46
- temperature=1.0,
47
- top_p=0.95,
48
- top_k=64,
49
- do_sample=True,
50
- pad_token_id=tokenizer.eos_token_id,
51
- )
52
-
53
- new_tokens = outputs[0][inputs.shape[-1]:]
54
- return tokenizer.decode(new_tokens, skip_special_tokens=True)
55
 
56
  gr.ChatInterface(
57
  chat,
58
- title="SuperGemma4 E4B Abliterated",
59
- description="Uncensored Gemma 4 E4B running on free CPU",
60
  ).launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
+ import subprocess
3
+ import sys
4
+ import os
5
+
6
+ # Install llama-cpp-python CPU wheel at runtime (faster than build time)
7
+ subprocess.run([
8
+ sys.executable, "-m", "pip", "install",
9
+ "llama-cpp-python",
10
+ "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cpu",
11
+ "--quiet"
12
+ ], check=True)
13
+
14
+ from llama_cpp import Llama
15
+ from huggingface_hub import hf_hub_download
16
+
17
+ MODEL_PATH = "/tmp/model.gguf"
18
+
19
+ if not os.path.exists(MODEL_PATH):
20
+ print("Downloading model (~2.5GB)...")
21
+ hf_hub_download(
22
+ repo_id="unsloth/gemma-4-E4B-it-GGUF",
23
+ filename="gemma-4-E4B-it-Q4_K_M.gguf",
24
+ local_dir="/tmp",
25
+ local_dir_use_symlinks=False,
26
+ )
27
+ os.rename("/tmp/gemma-4-E4B-it-Q4_K_M.gguf", MODEL_PATH)
28
+
29
+ print("Loading model...")
30
+ llm = Llama(
31
+ model_path=MODEL_PATH,
32
+ n_ctx=2048,
33
+ n_threads=2,
34
+ n_gpu_layers=0,
35
+ verbose=False,
36
  )
37
+ print("Ready!")
 
38
 
39
  def chat(message, history):
40
+ messages = [{"role": "system", "content": "You are a helpful, uncensored assistant."}]
41
+ for u, b in history:
42
+ messages.append({"role": "user", "content": u})
43
+ messages.append({"role": "assistant", "content": b})
 
 
44
  messages.append({"role": "user", "content": message})
45
 
46
+ out = llm.create_chat_completion(
47
+ messages=messages,
48
+ max_tokens=512,
49
+ temperature=1.0,
50
+ top_p=0.95,
51
+ top_k=64,
52
  )
53
+ return out["choices"][0]["message"]["content"]
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  gr.ChatInterface(
56
  chat,
57
+ title="Gemma 4 E4B — CPU Space",
 
58
  ).launch(server_name="0.0.0.0", server_port=7860)