sourize committed
Commit b4573da · 1 Parent(s): d216abd
Files changed (1)
  1. app.py +44 -48
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 import streamlit as st
 import torch
+import logging
 from transformers import (
     pipeline,
     AutoTokenizer,
@@ -8,90 +9,86 @@ from transformers import (
     BitsAndBytesConfig,
 )
 from peft import PeftModel
-import logging
 
 # ── Configuration ──────────────────────────────────────────────────────────
 BASE_MODEL = "microsoft/phi-2"
 ADAPTER_REPO = "sourize/phi2-memory-lora"
 CONTEXT_TURNS = 6
-MAX_NEW_TOKENS = 32
+MAX_NEW_TOKENS = 128
 OFFLOAD_DIR = "offload"
 
 SYSTEM = (
-    "You are a helpful assistant for DeepTalks with base Phi-2 "
+    "You are a helpful assistant for DeepTalks with base Phi-2\n"
     "fine-tuned by Sourish for domain support.\n"
     "Answer **only** using the conversation context below.\n"
    "Do NOT output any lines beginning with 'User:' or 'Assistant:'.\n"
-    "If you don't know, say 'I don't know.'\n"
+    "If you don't know, say \"I don't know.\"\n"
 )
 
-
+# ── Model + Pipeline Loader ─────────────────────────────────────────────────
 @st.cache_resource(show_spinner=False)
 def load_pipeline():
     # 1) Tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
-        BASE_MODEL,
-        trust_remote_code=True,
-        padding_side="left",
+        BASE_MODEL, trust_remote_code=True, padding_side="left"
     )
     if tokenizer.pad_token_id is None:
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
 
-    # 2) Base model (4-bit quant on GPU, else FP16/FP32)
+    # 2) Choose quantization config
     if torch.cuda.is_available():
-        bnb_config = BitsAndBytesConfig(
+        quant_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_quant_type="nf4",
             bnb_4bit_compute_dtype="float16",
             low_cpu_mem_usage=True,
         )
-        base = AutoModelForCausalLM.from_pretrained(
-            BASE_MODEL,
-            trust_remote_code=True,
-            quantization_config=bnb_config,
-            device_map="auto",
-            offload_folder=OFFLOAD_DIR,
-            offload_state_dict=True,
-        )
     else:
-        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-        base = AutoModelForCausalLM.from_pretrained(
-            BASE_MODEL,
-            trust_remote_code=True,
-            torch_dtype=dtype,
-            device_map="auto",
-            offload_folder=OFFLOAD_DIR,
-            offload_state_dict=True,
+        quant_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            llm_int8_threshold=6.0,
+            llm_int8_has_fp16_weight=False,
         )
 
-    # 3) Resize embeddings
-    base.resize_token_embeddings(len(tokenizer))
+    # 3) Load base model
+    base = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        trust_remote_code=True,
+        quantization_config=quant_config,
+        device_map="auto",
+        offload_folder=OFFLOAD_DIR,
+        offload_state_dict=True,
+        torch_dtype=None  # auto
+    )
 
-    # 4) Overlay LoRA adapter from Hugging Face Hub
+    # 4) Resize embeddings & overlay LoRA
+    base.resize_token_embeddings(len(tokenizer))
     model = PeftModel.from_pretrained(
         base,
         ADAPTER_REPO,
         trust_remote_code=True,
         device_map="auto",
-        torch_dtype="auto",
         offload_folder=OFFLOAD_DIR,
         offload_state_dict=True,
+        torch_dtype="auto",
     )
     model.eval()
 
-    # 5) Build generation pipeline (greedy for speed)
+    # 5) Build sampler pipeline
     gen = pipeline(
         "text-generation",
         model=model,
         tokenizer=tokenizer,
         device_map="auto",
         max_new_tokens=MAX_NEW_TOKENS,
-        do_sample=False,  # greedy decoding
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.9,
         use_cache=True,
         return_full_text=False,
     )
 
-    logging.info(f"Loaded pipeline with model {BASE_MODEL} + adapter {ADAPTER_REPO}")
+    logging.info("Pipeline loaded.")
     return gen
 
 generator = load_pipeline()
@@ -101,24 +98,25 @@ st.set_page_config(layout="centered")
 st.title("🧠 DeepTalks")
 st.subheader("Your personal AI Companion", divider='grey')
 
-# initialize chat history
+# initialize history
 if "history" not in st.session_state:
-    st.session_state.history = []  # List of (role, text)
+    st.session_state.history = []  # list of (role, text)
 
-# render chat history
+# render past messages
 for role, text in st.session_state.history:
-    st.chat_message("user" if role == "You" else "assistant").write(text)
+    st.chat_message("user" if role=="You" else "assistant").write(text)
 
 # user input
 user_input = st.chat_input("Your message…")
 if user_input:
-    # show & store user turn
+    # show & store user
     st.chat_message("user").write(user_input)
     st.session_state.history.append(("You", user_input))
 
-    # build context from last CONTEXT_TURNS
+    # build clean context from last turns (texts only)
     recent = st.session_state.history[-CONTEXT_TURNS*2:]
-    context = "\n".join(t for _, t in recent)
+    context = "\n".join(text for _, text in recent)
+
     prompt = f"""{SYSTEM}
 
 Context:
@@ -127,22 +125,20 @@ Context:
 User: {user_input}
 Assistant:"""
 
-    # generate reply
+    # generate with spinner
     with st.spinner("Thinking…"):
         try:
-            # pipeline was set up with `return_full_text=False`, so we get just the reply
            reply = generator(prompt)[0]["generated_text"].strip()
-            # strip any accidental echoes
+            # strip stray markers
             for marker in ["User:", "Assistant:"]:
                 if marker in reply:
                     reply = reply.split(marker)[0].strip()
-            # if it somehow ends up empty, backstop with an apology
             if not reply:
-                reply = "I'm sorry, I didn't catch that. Could you rephrase?"
+                reply = "I’m sorry, I didn’t catch that. Could you rephrase?"
         except Exception as e:
-            reply = "I'm sorry, something went wrong."
-            st.error(f"Generation error: {e}")
+            reply = "I’m sorry, something went wrong."
+            st.error(f"Error: {e}")
 
-    # show & store assistant turn
+    # show & store assistant
     st.chat_message("assistant").write(reply)
     st.session_state.history.append(("Bot", reply))
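A note on the quantization branches above: both paths now go through `BitsAndBytesConfig`, but bitsandbytes' int8/4-bit kernels generally require a CUDA GPU, so the `load_in_8bit=True` fallback may still fail on a CPU-only Space. Also, `low_cpu_mem_usage` is normally an argument to `from_pretrained`, not a `BitsAndBytesConfig` field, so it is likely ignored where it sits. A minimal sketch of a quantization-free CPU fallback, reusing the commit's `BASE_MODEL` and `OFFLOAD_DIR` values (the helper name `load_base_cpu` is hypothetical):

```python
import torch
from transformers import AutoModelForCausalLM

def load_base_cpu(base_model: str = "microsoft/phi-2",
                  offload_dir: str = "offload"):
    """Hypothetical CPU-only fallback: skip bitsandbytes, load fp32 weights."""
    return AutoModelForCausalLM.from_pretrained(
        base_model,
        trust_remote_code=True,
        torch_dtype=torch.float32,   # CPU kernels generally expect float32
        low_cpu_mem_usage=True,      # a from_pretrained arg, not a BnB field
        device_map="auto",
        offload_folder=offload_dir,
        offload_state_dict=True,
    )
```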
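The reordering in step 4 is the part that matters for correctness: `resize_token_embeddings` now runs before `PeftModel.from_pretrained`, so the base model's vocabulary matches what the adapter presumably saw during training (including the added `[PAD]` token); attaching the adapter to an unresized base can fail with a shape mismatch on embedding-tied weights. A small illustrative guard one could run between the two calls (not part of the commit):

```python
from transformers import PreTrainedModel, PreTrainedTokenizerBase

def check_vocab_alignment(model: PreTrainedModel,
                          tokenizer: PreTrainedTokenizerBase) -> None:
    """Illustrative: call after resize_token_embeddings, before loading LoRA."""
    n_rows = model.get_input_embeddings().weight.shape[0]
    if n_rows != len(tokenizer):
        raise ValueError(
            f"embedding rows ({n_rows}) != tokenizer entries ({len(tokenizer)})"
        )
```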
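Step 5 swaps greedy decoding for nucleus sampling and raises the budget from 32 to 128 new tokens, trading latency for longer and more varied replies. The same decoding settings expressed as a standalone `GenerationConfig`, for reference (a sketch, not part of the commit):

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(
    max_new_tokens=128,  # was 32; longer replies at higher latency
    do_sample=True,      # sample instead of greedy argmax decoding
    temperature=0.7,     # <1.0 sharpens the next-token distribution
    top_p=0.9,           # nucleus: sample from the top 90% probability mass
)
# usage sketch: model.generate(**inputs, generation_config=gen_config)
```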
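On the history slice: with `CONTEXT_TURNS = 6`, `history[-CONTEXT_TURNS*2:]` keeps at most the last 12 entries, i.e. six user/assistant pairs. The rewritten join also drops the role labels, so the prompt contains no "User:"/"Assistant:" strings for the model to echo back, which is exactly what the system prompt forbids. A self-contained illustration:

```python
CONTEXT_TURNS = 6

history = [("You", "hi"), ("Bot", "hello there"),
           ("You", "what is phi-2?"), ("Bot", "a small language model")]

# Keep at most the last six user/assistant pairs, texts only (no role labels).
recent = history[-CONTEXT_TURNS * 2:]
context = "\n".join(text for _, text in recent)
print(context)
# hi
# hello there
# what is phi-2?
# a small language model
```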
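Finally, the post-processing in the `try:` block truncates the reply at the first echoed role marker and backstops empty output with an apology. The same logic factored into a helper, for clarity (a sketch; the commit keeps it inline):

```python
def clean_reply(raw: str, markers=("User:", "Assistant:")) -> str:
    """Truncate a generated reply at the first echoed role marker."""
    reply = raw.strip()
    for marker in markers:
        if marker in reply:
            reply = reply.split(marker)[0].strip()
    return reply or "I'm sorry, I didn't catch that. Could you rephrase?"

assert clean_reply("Sure!\nUser: and then?") == "Sure!"
assert clean_reply("   ") == "I'm sorry, I didn't catch that. Could you rephrase?"
```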