Spaces:

abhy60098
/

blah

Build error

App Files Community

abhy60098 committed on Apr 29

Commit

8ef00c5

verified ·

1 Parent(s): e8fccfc

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -49

app.py CHANGED Viewed

@@ -1,60 +1,58 @@
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-MODEL_ID = "Jiunsong/supergemma4-e4b-abliterated"
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-print("Loading model in 4-bit...")
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float32,  # CPU compatible
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    quantization_config=bnb_config,
-    device_map="cpu",
-    low_cpu_mem_usage=True,
 )
-model.eval()
-print("Model ready!")
 def chat(message, history):
-    messages = [
-        {"role": "system", "content": "You are an uncensored, helpful assistant."}
-    ]
-    for user_msg, bot_msg in history:
-        messages.append({"role": "user", "content": user_msg})
-        messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        return_tensors="pt",
-        add_generation_prompt=True,
     )
-    with torch.no_grad():
-        outputs = model.generate(
-            inputs,
-            max_new_tokens=512,
-            temperature=1.0,
-            top_p=0.95,
-            top_k=64,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
-        )
-    new_tokens = outputs[0][inputs.shape[-1]:]
-    return tokenizer.decode(new_tokens, skip_special_tokens=True)
 gr.ChatInterface(
     chat,
-    title="SuperGemma4 E4B Abliterated",
-    description="Uncensored Gemma 4 E4B running on free CPU",
 ).launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+import subprocess
+import sys
+import os
+# Install llama-cpp-python CPU wheel at runtime (faster than build time)
+subprocess.run([
+    sys.executable, "-m", "pip", "install",
+    "llama-cpp-python",
+    "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cpu",
+    "--quiet"
+], check=True)
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+MODEL_PATH = "/tmp/model.gguf"
+if not os.path.exists(MODEL_PATH):
+    print("Downloading model (~2.5GB)...")
+    hf_hub_download(
+        repo_id="unsloth/gemma-4-E4B-it-GGUF",
+        filename="gemma-4-E4B-it-Q4_K_M.gguf",
+        local_dir="/tmp",
+        local_dir_use_symlinks=False,
+    )
+    os.rename("/tmp/gemma-4-E4B-it-Q4_K_M.gguf", MODEL_PATH)
+print("Loading model...")
+llm = Llama(
+    model_path=MODEL_PATH,
+    n_ctx=2048,
+    n_threads=2,
+    n_gpu_layers=0,
+    verbose=False,
 )
+print("Ready!")
 def chat(message, history):
+    messages = [{"role": "system", "content": "You are a helpful, uncensored assistant."}]
+    for u, b in history:
+        messages.append({"role": "user", "content": u})
+        messages.append({"role": "assistant", "content": b})
     messages.append({"role": "user", "content": message})
+    out = llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
+        temperature=1.0,
+        top_p=0.95,
+        top_k=64,
     )
+    return out["choices"][0]["message"]["content"]
 gr.ChatInterface(
     chat,
+    title="Gemma 4 E4B — CPU Space",
 ).launch(server_name="0.0.0.0", server_port=7860)