shuarya2011 committed on
Commit
6adb545
·
verified ·
1 Parent(s): 884d48b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -21
app.py CHANGED
@@ -1,16 +1,19 @@
1
  from flask import Flask, request, Response, stream_with_context
2
- from llama_cpp import Llama
3
- import json
 
4
 
5
  app = Flask(__name__)
6
 
7
- # Model setup - Using the Gemma-3-1B Heretic GGUF
8
- # Ensure this file is uploaded to your Space as well
9
- llm = Llama(
10
- model_path="gemma-3-1b-it-heretic-extreme-uncensored-abliterated.Q4_K_S.gguf",
11
- n_ctx=1024,
12
- n_threads=2, # Optimized for HF Free CPU
13
- verbose=False
 
 
14
  )
15
 
16
  @app.route('/generate', methods=['POST'])
@@ -18,20 +21,26 @@ def generate():
18
  data = request.json
19
  prompt = data.get("prompt", "")
20
 
21
- # System instruction for Jarvis personality
22
- full_prompt = f"<|system|>You are Jarvis, a witty and concise AI assistant. Respond briefly.<|user|>{prompt}<|assistant|>"
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  @stream_with_context
25
- def generate_tokens():
26
- # stream=True is the key for word-by-word
27
- output = llm(full_prompt, stream=True, max_tokens=150)
28
- for chunk in output:
29
- token = chunk['choices'][0]['text']
30
- if token:
31
- # Yielding the token immediately sends it to your laptop
32
- yield token
33
-
34
- return Response(generate_tokens(), mimetype='text/plain')
35
 
36
  if __name__ == "__main__":
37
  app.run(host="0.0.0.0", port=7860)
 
1
  from flask import Flask, request, Response, stream_with_context
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
3
+ from threading import Thread
4
+ import torch
5
 
6
  app = Flask(__name__)
7
 
8
+ model_id = "google/gemma-3-1b-it" # Using the official IT model
9
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
10
+
11
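+ # Load in 4-bit to reduce memory use (bitsandbytes quantization; NOTE: may require a CUDA GPU and is not guaranteed to be faster on CPU - confirm on the target hardware)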
12
+ model = AutoModelForCausalLM.from_pretrained(
13
+ model_id,
14
+ device_map="auto",
15
+ low_cpu_mem_usage=True,
16
+ load_in_4bit=True
17
  )
18
 
19
  @app.route('/generate', methods=['POST'])
 
21
  data = request.json
22
  prompt = data.get("prompt", "")
23
 
24
+ # Format for Gemma 3
25
+ messages = [
26
+ {"role": "system", "content": "You are Jarvis. Be concise."},
27
+ {"role": "user", "content": prompt}
28
+ ]
29
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
30
+
31
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
32
+
33
+ # Run generation in a separate thread so we can yield tokens immediately
34
+ generation_kwargs = dict(input_ids=inputs, streamer=streamer, max_new_tokens=128)
35
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
36
+ thread.start()
37
 
38
  @stream_with_context
39
+ def stream_words():
40
+ for new_text in streamer:
41
+ yield new_text
42
+
43
+ return Response(stream_words(), mimetype='text/plain')
 
 
 
 
 
44
 
45
  if __name__ == "__main__":
46
  app.run(host="0.0.0.0", port=7860)