MyEnny committed
Commit f5cf06e · verified · 1 Parent(s): d4903a0

Update app.py

Files changed (1)
  1. app.py +51 -17
app.py CHANGED
@@ -1,8 +1,15 @@
 import os
 import zipfile
+import torch
 import gradio as gr

-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    BitsAndBytesConfig,
+    pipeline,
+)
+
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.llms import HuggingFacePipeline
@@ -16,30 +23,51 @@ if not os.path.exists("faiss_index") and os.path.exists("faiss_index.zip"):
         zip_ref.extractall(".")

 # --- Step 2: Load embedding and vectorstore ---
-embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
-vectordb = FAISS.load_local("faiss_index", embedding_model,allow_dangerous_deserialization=True)
+embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+vectordb = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

-# --- Step 3: Load the LLM ---
+# --- Step 3: Load the LLM (memory-efficient) ---
 model_id = "tiiuae/falcon3-1b-instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+
+# 4-bit quantisation to stay within L4 memory
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto",  # place layers on GPU/CPU automatically if needed
+    torch_dtype=torch.float16,  # keeps activation memory down
+    low_cpu_mem_usage=True,
+)
+model.eval()
+torch.set_grad_enabled(False)

 pipe = pipeline(
-    "text-generation",
+    task="text-generation",
     model=model,
     tokenizer=tokenizer,
     pad_token_id=tokenizer.eos_token_id,
-    max_new_tokens=200,
+    max_new_tokens=160,  # keep this modest to avoid spikes
     do_sample=True,
-    temperature=1.0,
+    temperature=0.7,
+    top_p=0.9,
 )
+
 llm = HuggingFacePipeline(pipeline=pipe)

 # --- Step 4: Setup memory and QA chain ---
 memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

-prompt = PromptTemplate.from_template("""
-You are a helpful assistant at the University of Hertfordshire. Use the context below to answer the question clearly and factually.
+prompt = PromptTemplate.from_template(
+    """
+You are a helpful assistant at the University of Hertfordshire. Use only the context below to answer clearly and factually.
 If the answer is not in the context, say you don't know.

 Context:
@@ -48,23 +76,28 @@ Context:
 Question:
 {question}

-Helpful Answer:
-""")
+Helpful answer:
+""".strip()
+)

 qa_chain = ConversationalRetrievalChain.from_llm(
     llm=llm,
     retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
     memory=memory,
     chain_type="stuff",
-    combine_docs_chain_kwargs={"prompt": prompt}
+    combine_docs_chain_kwargs={"prompt": prompt},
 )

 UH_LOGO = "images/UH.png"
+
 # --- Step 5: Define chatbot logic ---
 def chat(message, history):
     result = qa_chain.invoke({"question": message})
-    response = result.get("answer", "")
-    response = response.split("Answer:")[-1].replace("<|assistant|>", "").strip()
+    # ConversationalRetrievalChain returns {"answer": "...", "source_documents": ..., "chat_history": ...}
+    response = result.get("answer", "").replace("<|assistant|>", "").strip()
+    # keep GPU clean between turns (helps on Spaces)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     return response

 # --- Step 6: UI ---
@@ -96,4 +129,5 @@ with gr.Blocks() as demo:
     submit.click(respond, [txt, chatbot], [txt, chatbot])
     txt.submit(respond, [txt, chatbot], [txt, chatbot])

-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
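
Note: the 4-bit path in Step 3 depends on the bitsandbytes backend and a CUDA GPU being available at runtime. A minimal smoke test for the quantised load, assuming bitsandbytes is installed (the CPU fallback branch is an illustration for local testing, not part of this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_id = "tiiuae/falcon3-1b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

if torch.cuda.is_available():
    # same settings as the commit: nf4 weights, double quantisation, fp16 compute
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map="auto"
    )
else:
    # illustrative fallback: plain full-precision load on CPU
    model = AutoModelForCausalLM.from_pretrained(model_id)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                pad_token_id=tokenizer.eos_token_id, max_new_tokens=32)
print(pipe("The University of Hertfordshire is")[0]["generated_text"])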
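
Because the chain is built with ConversationBufferMemory(memory_key="chat_history"), each qa_chain.invoke({"question": ...}) call reads and writes the history itself; chat() never passes it in. A quick way to inspect what the buffer has accumulated between turns, assuming the LangChain version pinned by this Space:

# after a couple of chat() calls:
turns = memory.load_memory_variables({})["chat_history"]
for msg in turns:
    print(type(msg).__name__, ":", msg.content)  # alternating HumanMessage / AIMessage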
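
The respond callback wired to submit.click and txt.submit lives in the unchanged part of Step 6 and does not appear in this diff. The body below is therefore an assumption, sketching the usual Gradio pattern that would match the [txt, chatbot] outputs:

def respond(message, chat_history):
    # assumed wiring: delegate to chat() from Step 5, then update the UI state
    answer = chat(message, chat_history)
    chat_history = chat_history + [(message, answer)]
    return "", chat_history  # clear the textbox, refresh the chatbot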