sourize committed on
Commit
0d312d9
·
verified ·
1 Parent(s): 1e37776

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -27
app.py CHANGED
@@ -1,12 +1,19 @@
1
  import os
2
  import streamlit as st
3
  from transformers import (
4
- pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
5
  )
6
  from peft import PeftModel
7
  from supabase import create_client
8
  from sentence_transformers import SentenceTransformer
9
 
 
 
 
 
 
 
 
10
  # ── Supabase setup ─────────────────────────────────────────────────────────
11
  SUPA_URL = os.getenv("SUPABASE_URL")
12
  SUPA_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
@@ -43,25 +50,37 @@ def load_generator():
43
  )
44
  if tokenizer.pad_token_id is None:
45
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})
46
- # 2) Quantization config for 4-bit
47
- bnb_config = BitsAndBytesConfig(
48
- load_in_4bit=True,
49
- bnb_4bit_quant_type="nf4",
50
- bnb_4bit_compute_dtype="float16",
51
- low_cpu_mem_usage=True,
52
- )
53
- # 3) Load base model in 4-bit + resize embeddings
54
- base = AutoModelForCausalLM.from_pretrained(
55
- "microsoft/phi-2",
56
- trust_remote_code=True,
57
- quantization_config=bnb_config,
58
- device_map="auto"
59
- )
 
 
 
 
 
 
 
 
 
 
60
  base.resize_token_embeddings(len(tokenizer))
61
- # 4) Overlay LoRA adapter
62
- model = PeftModel.from_pretrained(base, REPO, device_map="auto", torch_dtype="auto")
 
63
  model.eval()
64
- # 5) Pipeline with greedy sampling + constraints
 
65
  gen = pipeline(
66
  "text-generation",
67
  model=model,
@@ -80,9 +99,9 @@ tokenizer, generator = load_generator()
80
 
81
  # ── System prompt to reduce hallucinations ──────────────────────────────────
82
  SYSTEM = (
83
- "You are a helpful assistant.\\n"
84
- "Answer **only** using the information in the memory below.\\n"
85
- "If the answer is not in memory, reply: \"I don't know.\"\\n"
86
  )
87
 
88
  # ── Streamlit UI ──────────────────────────────────────────────────────────
@@ -94,10 +113,7 @@ if "history" not in st.session_state:
94
 
95
  # Render existing chat history
96
  for role, msg in st.session_state.history:
97
- if role == "You":
98
- st.chat_message("user").write(msg)
99
- else:
100
- st.chat_message("assistant").write(msg)
101
 
102
  # Input box at the bottom
103
  user_input = st.chat_input("Type your message...")
@@ -120,7 +136,7 @@ Memory:
120
  User: {user_input}
121
  Assistant:"""
122
 
123
- # Generate reply synchronously with spinner
124
  with st.spinner("Thinking..."):
125
  try:
126
  out = generator(prompt)[0]["generated_text"].strip()
@@ -130,4 +146,4 @@ Assistant:"""
130
 
131
  # Append assistant reply
132
  st.session_state.history.append(("Bot", out))
133
- add_mem("assistant", out)
 
1
  import os
2
  import streamlit as st
3
  from transformers import (
4
+ pipeline, AutoTokenizer, AutoModelForCausalLM
5
  )
6
  from peft import PeftModel
7
  from supabase import create_client
8
  from sentence_transformers import SentenceTransformer
9
 
10
+ # Try to import bitsandbytes for 4-bit; fall back if missing
11
+ try:
12
+ from transformers import BitsAndBytesConfig
13
+ BNB_AVAILABLE = True
14
+ except ImportError:
15
+ BNB_AVAILABLE = False
16
+
17
  # ── Supabase setup ─────────────────────────────────────────────────────────
18
  SUPA_URL = os.getenv("SUPABASE_URL")
19
  SUPA_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
 
50
  )
51
  if tokenizer.pad_token_id is None:
52
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})
53
+
54
+ # 2) Base model load (with or without 4-bit)
55
+ if BNB_AVAILABLE:
56
+ bnb_config = BitsAndBytesConfig(
57
+ load_in_4bit=True,
58
+ bnb_4bit_quant_type="nf4",
59
+ bnb_4bit_compute_dtype="float16",
60
+ low_cpu_mem_usage=True,
61
+ )
62
+ base = AutoModelForCausalLM.from_pretrained(
63
+ "microsoft/phi-2",
64
+ trust_remote_code=True,
65
+ quantization_config=bnb_config,
66
+ device_map="auto"
67
+ )
68
+ else:
69
+ base = AutoModelForCausalLM.from_pretrained(
70
+ "microsoft/phi-2",
71
+ trust_remote_code=True,
72
+ torch_dtype="auto",
73
+ device_map="auto"
74
+ )
75
+
76
+ # 3) Resize embeddings & overlay LoRA
77
  base.resize_token_embeddings(len(tokenizer))
78
+ model = PeftModel.from_pretrained(
79
+ base, REPO, device_map="auto", torch_dtype="auto"
80
+ )
81
  model.eval()
82
+
83
+ # 4) Pipeline (greedy-ish sampling)
84
  gen = pipeline(
85
  "text-generation",
86
  model=model,
 
99
 
100
  # ── System prompt to reduce hallucinations ──────────────────────────────────
101
  SYSTEM = (
102
+ "You are a helpful assistant.\n"
103
+ "Answer **only** using the information in the memory below.\n"
104
+ "If the answer is not in memory, reply: \"I don't know.\"\n"
105
  )
106
 
107
  # ── Streamlit UI ──────────────────────────────────────────────────────────
 
113
 
114
  # Render existing chat history
115
  for role, msg in st.session_state.history:
116
+ st.chat_message("user" if role=="You" else "assistant").write(msg)
 
 
 
117
 
118
  # Input box at the bottom
119
  user_input = st.chat_input("Type your message...")
 
136
  User: {user_input}
137
  Assistant:"""
138
 
139
+ # Generate reply with spinner
140
  with st.spinner("Thinking..."):
141
  try:
142
  out = generator(prompt)[0]["generated_text"].strip()
 
146
 
147
  # Append assistant reply
148
  st.session_state.history.append(("Bot", out))
149
+ add_mem("assistant", out)