sourize committed on
Commit 17d9700 · 1 Parent(s): b4573da
Files changed (1):
  app.py  +24 −37
app.py CHANGED
@@ -25,7 +25,6 @@ SYSTEM = (
     "If you don't know, say \"I don't know.\"\n"
 )
 
-# ── Model + Pipeline Loader ─────────────────────────────────────────────────
 @st.cache_resource(show_spinner=False)
 def load_pipeline():
     # 1) Tokenizer
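A note on the decorator this hunk keeps: `st.cache_resource` is what stops Streamlit from reloading the model on every rerun of the script. A minimal standalone sketch of the pattern (the dict below is a stand-in for the real pipeline object):

```python
import streamlit as st

@st.cache_resource(show_spinner=False)
def load_expensive_object():
    # Stand-in for load_pipeline(): the return value is created once per
    # process and reused across reruns and sessions until the cache clears.
    return {"model": "loaded"}

obj = load_expensive_object()  # instant on every rerun after the first
```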
@@ -35,7 +34,7 @@ def load_pipeline():
     if tokenizer.pad_token_id is None:
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
 
-    # 2) Choose quantization config
+    # 2) Base model: 4-bit on CUDA, plain FP16/FP32 on CPU
     if torch.cuda.is_available():
         quant_config = BitsAndBytesConfig(
             load_in_4bit=True,
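Context for the pad-token branch above: many causal-LM tokenizers ship without a pad token, and adding one grows the vocabulary, which is why the code later calls `base.resize_token_embeddings(len(tokenizer))`. A hedged sketch using GPT-2 as a stand-in (not the app's `BASE_MODEL`):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in model
before = len(tokenizer)
if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
print(before, "->", len(tokenizer))  # vocab grew: embeddings must be resized to match
```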
@@ -43,43 +42,40 @@ def load_pipeline():
             bnb_4bit_compute_dtype="float16",
             low_cpu_mem_usage=True,
         )
+        base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL,
+            trust_remote_code=True,
+            quantization_config=quant_config,
+            device_map="auto",
+            offload_folder=OFFLOAD_DIR,
+            offload_state_dict=True,
+        )
     else:
-        quant_config = BitsAndBytesConfig(
-            load_in_8bit=True,
-            llm_int8_threshold=6.0,
-            llm_int8_has_fp16_weight=False,
+        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL,
+            trust_remote_code=True,
+            torch_dtype=dtype,
+            device_map="cpu",  # force CPU
         )
 
-    # 3) Load base model
-    base = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        trust_remote_code=True,
-        quantization_config=quant_config,
-        device_map="auto",
-        offload_folder=OFFLOAD_DIR,
-        offload_state_dict=True,
-        torch_dtype=None  # auto
-    )
-
-    # 4) Resize embeddings & overlay LoRA
+    # 3) Resize + LoRA overlay
     base.resize_token_embeddings(len(tokenizer))
     model = PeftModel.from_pretrained(
         base,
         ADAPTER_REPO,
         trust_remote_code=True,
-        device_map="auto",
-        offload_folder=OFFLOAD_DIR,
-        offload_state_dict=True,
-        torch_dtype="auto",
+        device_map="auto" if torch.cuda.is_available() else None,
+        torch_dtype=None,
     )
     model.eval()
 
-    # 5) Build sampler pipeline
+    # 4) Build generation pipeline
     gen = pipeline(
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        device_map="auto",
+        device_map="auto" if torch.cuda.is_available() else None,
         max_new_tokens=MAX_NEW_TOKENS,
         do_sample=True,
         temperature=0.7,
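The net effect of this hunk, sketched as standalone code: the 8-bit CPU path is dropped (bitsandbytes kernels require CUDA), so the CPU branch now loads unquantized weights instead. `BASE_MODEL` and `OFFLOAD_DIR` below are placeholders for the constants defined earlier in app.py:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

BASE_MODEL = "org/base-model"  # placeholder for the app's constant
OFFLOAD_DIR = "offload"        # placeholder

if torch.cuda.is_available():
    # 4-bit quantization only works where CUDA kernels exist
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype="float16",
    )
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=quant_config,
        device_map="auto",
        offload_folder=OFFLOAD_DIR,
    )
else:
    # CPU fallback: plain full-precision weights, no bitsandbytes
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        device_map="cpu",
    )
```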
@@ -98,26 +94,20 @@ st.set_page_config(layout="centered")
 st.title("🧠 DeepTalks")
 st.subheader("Your personal AI Companion", divider='grey')
 
-# initialize history
 if "history" not in st.session_state:
-    st.session_state.history = []  # list of (role, text)
+    st.session_state.history = []
 
-# render past messages
 for role, text in st.session_state.history:
-    st.chat_message("user" if role=="You" else "assistant").write(text)
+    st.chat_message("user" if role == "You" else "assistant").write(text)
 
-# user input
 user_input = st.chat_input("Your message…")
 if user_input:
-    # show & store user
     st.chat_message("user").write(user_input)
     st.session_state.history.append(("You", user_input))
 
-    # build clean context from last turns (texts only)
     recent = st.session_state.history[-CONTEXT_TURNS*2:]
-    context = "\n".join(text for _, text in recent)
-
-    prompt = f"""{SYSTEM}
+    context = "\n".join(t for _, t in recent)
+    prompt = f"""{SYSTEM}
 
 Context:
 {context}
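A quick check of the history slice kept by this hunk: with `CONTEXT_TURNS = 3` (an assumed value; the real constant is defined at the top of app.py), the window keeps at most the last three user/assistant pairs:

```python
CONTEXT_TURNS = 3  # assumed value for illustration
history = [
    ("You", "hi"), ("Bot", "hello"),
    ("You", "how are you?"), ("Bot", "fine, thanks"),
    ("You", "tell me a joke"), ("Bot", "why did..."),
    ("You", "another one"),
]
recent = history[-CONTEXT_TURNS * 2:]      # last 6 entries at most
context = "\n".join(t for _, t in recent)  # texts only, roles dropped
print(context)
```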
@@ -125,11 +115,9 @@ Context:
 User: {user_input}
 Assistant:"""
 
-    # generate with spinner
     with st.spinner("Thinking…"):
         try:
             reply = generator(prompt)[0]["generated_text"].strip()
-            # strip stray markers
             for marker in ["User:", "Assistant:"]:
                 if marker in reply:
                     reply = reply.split(marker)[0].strip()
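Why the marker loop survives this hunk: causal models often keep generating past their own turn, echoing the transcript format, so everything from the first `User:` or `Assistant:` onward is cut. For example:

```python
reply = "Sure, here's one. User: tell me another Assistant: okay"
for marker in ["User:", "Assistant:"]:
    if marker in reply:
        reply = reply.split(marker)[0].strip()
print(reply)  # -> "Sure, here's one."
```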
@@ -139,6 +127,5 @@ Assistant:"""
             reply = "I’m sorry, something went wrong."
             st.error(f"Error: {e}")
 
-    # show & store assistant
     st.chat_message("assistant").write(reply)
     st.session_state.history.append(("Bot", reply))
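For reference, the two-role history format the app settles on: `(role, text)` tuples where `"You"` maps to the user bubble and anything else to the assistant, matching the render loop earlier in the file:

```python
history = [("You", "hi"), ("Bot", "hello there")]
for role, text in history:
    bubble = "user" if role == "You" else "assistant"
    print(f"[{bubble}] {text}")
```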
 