MyEnny committed
Commit f5cf06e · verified · 1 Parent(s): d4903a0

Update app.py

Files changed (1)
  1. app.py +51 -17
app.py CHANGED
@@ -1,8 +1,15 @@
 import os
 import zipfile
+import torch
 import gradio as gr

-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    BitsAndBytesConfig,
+    pipeline,
+)
+
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.llms import HuggingFacePipeline
@@ -16,30 +23,51 @@ if not os.path.exists("faiss_index") and os.path.exists("faiss_index.zip"):
         zip_ref.extractall(".")

 # --- Step 2: Load embedding and vectorstore ---
-embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
-vectordb = FAISS.load_local("faiss_index", embedding_model,allow_dangerous_deserialization=True)
+embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+vectordb = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

-# --- Step 3: Load the LLM ---
+# --- Step 3: Load the LLM (memory-efficient) ---
 model_id = "tiiuae/falcon3-1b-instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+
+# 4-bit quantisation to stay within L4 memory
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto",  # place layers on GPU/CPU automatically if needed
+    torch_dtype=torch.float16,  # keeps activation memory down
+    low_cpu_mem_usage=True,
+)
+model.eval()
+torch.set_grad_enabled(False)

 pipe = pipeline(
-    "text-generation",
+    task="text-generation",
     model=model,
     tokenizer=tokenizer,
     pad_token_id=tokenizer.eos_token_id,
-    max_new_tokens=200,
+    max_new_tokens=160,  # keep this modest to avoid spikes
     do_sample=True,
-    temperature=1.0,
+    temperature=0.7,
+    top_p=0.9,
 )
+
 llm = HuggingFacePipeline(pipeline=pipe)

 # --- Step 4: Setup memory and QA chain ---
 memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

-prompt = PromptTemplate.from_template("""
-You are a helpful assistant at the University of Hertfordshire. Use the context below to answer the question clearly and factually.
+prompt = PromptTemplate.from_template(
+    """
+You are a helpful assistant at the University of Hertfordshire. Use only the context below to answer clearly and factually.
 If the answer is not in the context, say you don't know.

 Context:
@@ -48,23 +76,28 @@ Context:
 Question:
 {question}

-Helpful Answer:
-""")
+Helpful answer:
+""".strip()
+)

 qa_chain = ConversationalRetrievalChain.from_llm(
     llm=llm,
     retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
     memory=memory,
     chain_type="stuff",
-    combine_docs_chain_kwargs={"prompt": prompt}
+    combine_docs_chain_kwargs={"prompt": prompt},
 )

 UH_LOGO = "images/UH.png"
+
 # --- Step 5: Define chatbot logic ---
 def chat(message, history):
     result = qa_chain.invoke({"question": message})
-    response = result.get("answer", "")
-    response = response.split("Answer:")[-1].replace("<|assistant|>", "").strip()
+    # ConversationalRetrievalChain returns {"answer": "...", "source_documents": ..., "chat_history": ...}
+    response = result.get("answer", "").replace("<|assistant|>", "").strip()
+    # keep GPU clean between turns (helps on Spaces)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     return response

 # --- Step 6: UI ---
@@ -96,4 +129,5 @@ with gr.Blocks() as demo:
     submit.click(respond, [txt, chatbot], [txt, chatbot])
     txt.submit(respond, [txt, chatbot], [txt, chatbot])

-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
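
Note: the 4-bit path in Step 3 depends on the bitsandbytes backend and a CUDA GPU being available at runtime. A minimal smoke test for the quantised load, assuming bitsandbytes is installed (the CPU fallback branch is an illustration for local testing, not part of this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_id = "tiiuae/falcon3-1b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

if torch.cuda.is_available():
    # same settings as the commit: nf4 weights, double quantisation, fp16 compute
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map="auto"
    )
else:
    # illustrative fallback: plain full-precision load on CPU
    model = AutoModelForCausalLM.from_pretrained(model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                pad_token_id=tokenizer.eos_token_id, max_new_tokens=32)
print(pipe("The University of Hertfordshire is")[0]["generated_text"])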
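
Because the chain is built with ConversationBufferMemory(memory_key="chat_history"), each qa_chain.invoke({"question": ...}) call reads and writes the history itself; chat() never passes it in. A quick way to inspect what the buffer has accumulated between turns, assuming the LangChain version pinned by this Space:

# after a couple of chat() calls:
turns = memory.load_memory_variables({})["chat_history"]
for msg in turns:
    print(type(msg).__name__, ":", msg.content)  # alternating HumanMessage / AIMessage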
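
The respond callback wired to submit.click and txt.submit lives in the unchanged part of Step 6 and does not appear in this diff. The body below is therefore an assumption, sketching the usual Gradio pattern that would match the [txt, chatbot] outputs:

def respond(message, chat_history):
    # assumed wiring: delegate to chat() from Step 5, then update the UI state
    answer = chat(message, chat_history)
    chat_history = chat_history + [(message, answer)]
    return "", chat_history  # clear the textbox, refresh the chatbot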