Taylor committed
Commit 33ad9ed · 1 parent: 53ff270

fix: switch to transformers+torch(cpu) -- no C++ compilation needed


llama-cpp-python requires compiling llama.cpp from source, which
times out on HF Spaces (both the Docker and Gradio SDK builders).

transformers 4.46+ can load GGUF files natively, and the CPU-only
torch wheel is ~200 MB versus 2 GB+ for the full build. Everything
installs from pre-built wheels in seconds, with zero compilation.
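For reference, the native GGUF path this commit relies on boils down to the sketch below. The repo ID, filename, and dtype are taken from the app.py diff that follows; transformers (>=4.46) also needs the gguf package and dequantizes the Q4_K_M weights to the requested dtype on load, so the quantized file mainly saves download size.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # GGUF checkpoint loaded directly by transformers -- no llama.cpp build step
    model = AutoModelForCausalLM.from_pretrained(
        "forkjoin-ai/buleyean-smollm2-360m",
        gguf_file="buleyean-smollm2-360m-q4_k_m.gguf",
        torch_dtype=torch.float32,  # weights are dequantized to float32 for CPU inference
    )
    # Tokenizer comes from the original (non-GGUF) base model, as in app.py below
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")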

Files changed (2)
  1. app.py +49 -18
  2. requirements.txt +6 -2
app.py CHANGED
@@ -4,25 +4,56 @@ Live inference. Real outputs. Nothing hardcoded.
 """
 
 import gradio as gr
-from llama_cpp import Llama
-from huggingface_hub import hf_hub_download
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 print("Loading models...", flush=True)
-bule_path = hf_hub_download(repo_id="forkjoin-ai/buleyean-smollm2-360m", filename="buleyean-smollm2-360m-q4_k_m.gguf", cache_dir="/tmp/hf_cache")
-base_path = hf_hub_download(repo_id="bartowski/SmolLM2-360M-Instruct-GGUF", filename="SmolLM2-360M-Instruct-Q4_K_M.gguf", cache_dir="/tmp/hf_cache")
-bule_llm = Llama(model_path=bule_path, n_ctx=512, n_threads=4, verbose=False)
-base_llm = Llama(model_path=base_path, n_ctx=512, n_threads=4, verbose=False)
+
+# Base model -- load from safetensors (fast)
+base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
+base_model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceTB/SmolLM2-360M-Instruct",
+    torch_dtype=torch.float32,
+    device_map="cpu",
+)
+
+# Buleyean model -- load from GGUF via transformers
+bule_model = AutoModelForCausalLM.from_pretrained(
+    "forkjoin-ai/buleyean-smollm2-360m",
+    gguf_file="buleyean-smollm2-360m-q4_k_m.gguf",
+    torch_dtype=torch.float32,
+    device_map="cpu",
+)
+# Reuse the same tokenizer (same base architecture)
+bule_tokenizer = base_tokenizer
+
 print("Ready.", flush=True)
 
-def gen(prompt, model):
-    return model(f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n",
-                 max_tokens=300, temperature=0.7, top_p=0.9,
-                 stop=["<|im_end|>", "<|im_start|>"])["choices"][0]["text"].strip()
+
+def gen(prompt, model, tokenizer):
+    messages = [{"role": "user", "content": prompt}]
+    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    return response.strip()
+
 
 def compare(prompt):
     if not prompt or not prompt.strip():
         return "", ""
-    return gen(prompt, base_llm), gen(prompt, bule_llm)
+    base_out = gen(prompt, base_model, base_tokenizer)
+    bule_out = gen(prompt, bule_model, bule_tokenizer)
+    return base_out, bule_out
+
 
 CSS = """
 /* AeonOS Design System */
@@ -109,16 +140,16 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zi
     # Footer
     gr.HTML("""
     <div id="footer">
-        <p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">SmolLM2-360M-Instruct &nbsp;·&nbsp; Q4_K_M GGUF &nbsp;·&nbsp; Live inference on CPU</p>
+        <p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">SmolLM2-360M-Instruct &nbsp;&middot;&nbsp; Q4_K_M GGUF &nbsp;&middot;&nbsp; Live inference on CPU</p>
         <p>
-            <a href="https://forkracefold.com/">Whitepaper</a> &nbsp;·&nbsp;
-            <a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> &nbsp;·&nbsp;
-            <a href="https://huggingface.co/forkjoin-ai">Models</a> &nbsp;·&nbsp;
-            <a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a> &nbsp;·&nbsp;
-            <a href="https://huggingface.co/spaces/forkjoin-ai/void-attention">Void Attention</a> &nbsp;·&nbsp;
+            <a href="https://forkracefold.com/">Whitepaper</a> &nbsp;&middot;&nbsp;
+            <a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> &nbsp;&middot;&nbsp;
+            <a href="https://huggingface.co/forkjoin-ai">Models</a> &nbsp;&middot;&nbsp;
+            <a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a> &nbsp;&middot;&nbsp;
+            <a href="https://huggingface.co/spaces/forkjoin-ai/void-attention">Void Attention</a> &nbsp;&middot;&nbsp;
             <a href="https://huggingface.co/spaces/forkjoin-ai/metacog">METACOG</a>
         </p>
-        <p style="margin-top:1rem;">500+ Lean 4 theorems &nbsp;·&nbsp; Zero sorry &nbsp;·&nbsp; <a href="https://forkracefold.com/">φ² = φ + 1</a></p>
+        <p style="margin-top:1rem;">500+ Lean 4 theorems &nbsp;&middot;&nbsp; Zero sorry &nbsp;&middot;&nbsp; <a href="https://forkracefold.com/">&phi;&sup2; = &phi; + 1</a></p>
     </div>
     """)
 
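Not shown in this hunk: how compare() is attached to the UI. A purely hypothetical wiring sketch, assuming the gr.Blocks layout referenced in the hunk header above (component names and labels are made up for illustration, not part of this commit):

    # Hypothetical UI wiring -- the Space's actual components are outside this diff
    with gr.Blocks(css=CSS) as demo:
        prompt_box = gr.Textbox(label="Prompt")
        run_btn = gr.Button("Compare")
        base_box = gr.Textbox(label="Base SmolLM2-360M-Instruct")
        bule_box = gr.Textbox(label="Buleyean SmolLM2-360M")
        # compare() returns (base_out, bule_out): one string per output box
        run_btn.click(compare, inputs=prompt_box, outputs=[base_box, bule_box])

    demo.launch()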
 
requirements.txt CHANGED
@@ -1,3 +1,7 @@
---extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-llama-cpp-python>=0.3.0
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch>=2.1.0
+transformers>=4.46.0
 huggingface-hub>=0.26.0
+sentencepiece>=0.2.0
+accelerate>=1.0.0
+gguf>=0.10.0