MyEnny committed on
Commit
bf4e6af
·
verified ·
1 Parent(s): ad62d7b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -49
app.py CHANGED
@@ -1,10 +1,7 @@
1
  import os
2
- import re
3
  import zipfile
4
  import gradio as gr
5
- from langchain_community.vectorstores import FAISS
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- from langchain.docstore.document import Document
8
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
9
  from langchain.embeddings import HuggingFaceEmbeddings
10
  from langchain.vectorstores import FAISS
@@ -13,41 +10,16 @@ from langchain.chains import ConversationalRetrievalChain
13
  from langchain.memory import ConversationBufferMemory
14
  from langchain.prompts import PromptTemplate
15
 
16
- # --- Step 1: Clean .txt files ---
17
- DATA_DIR = "knowledge_base"
18
- docs = []
19
-
20
- for fname in os.listdir(DATA_DIR):
21
- file_path = os.path.join(DATA_DIR, fname)
22
- if os.path.isfile(file_path) and fname.endswith(".txt"):
23
- with open(file_path, "r", encoding="utf-8") as f:
24
- text = f.read()
25
- cleaned_text = text.replace('\xa0', ' ')
26
- cleaned_text = re.sub(r'\n+', '\n', cleaned_text)
27
- cleaned_text = re.sub(r' +', ' ', cleaned_text).strip()
28
- docs.append({"page": fname, "text": cleaned_text})
29
-
30
- # --- Step 2: Split text into chunks ---
31
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
32
- texts = []
33
- metadatas = []
34
-
35
- for doc in docs:
36
- chunks = splitter.split_text(doc["text"])
37
- for i, chunk in enumerate(chunks):
38
- texts.append(chunk)
39
- metadatas.append({"source": doc["page"], "chunk": i})
40
-
41
- # --- Step 3: Create Document objects ---
42
- documents = [Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts))]
43
-
44
- # --- Step 4: Load embedding model ---
45
- embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
46
-
47
- # --- Step 5: Build FAISS index ---
48
- vectordb = FAISS.from_documents(documents, embedding_model)
49
-
50
- # --- Step 6: Load the LLM ---
51
  model_id = "tiiuae/falcon3-1b-instruct"
52
  tokenizer = AutoTokenizer.from_pretrained(model_id)
53
  model = AutoModelForCausalLM.from_pretrained(model_id)
@@ -63,10 +35,10 @@ pipe = pipeline(
63
  )
64
  llm = HuggingFacePipeline(pipeline=pipe)
65
 
66
- # --- Step 7: Setup memory and QA chain ---
67
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
68
 
69
- custom_prompt = PromptTemplate.from_template("""
70
  You are a helpful assistant at the University of Hertfordshire. Use the context below to answer the question clearly and factually.
71
  If the answer is not in the context, say you don't know.
72
 
@@ -76,28 +48,26 @@ Context:
76
  Question:
77
  {question}
78
 
79
- Answer:
80
  """)
81
 
82
  qa_chain = ConversationalRetrievalChain.from_llm(
83
  llm=llm,
84
  retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
85
  memory=memory,
86
- chain_type="stuff",
87
- combine_docs_chain_kwargs={"prompt": custom_prompt}
88
  )
89
 
90
- # --- Step 8: Define chatbot logic ---
 
91
  def chat(message, history):
92
  result = qa_chain.invoke({"question": message})
93
  response = result.get("answer", "")
94
  response = response.split("Answer:")[-1].replace("<|assistant|>", "").strip()
95
  return response
96
 
97
- # UH logo
98
- UH_LOGO = "images/UH.png"
99
-
100
- # --- Step 9: UI ---
101
  sample_questions = [
102
  "How do I register as a new student?",
103
  "Where can I find accommodation?",
 
1
  import os
 
2
  import zipfile
3
  import gradio as gr
4
+
 
 
5
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
  from langchain.vectorstores import FAISS
 
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.prompts import PromptTemplate
12
 
13
+ # --- Step 1: Unzip FAISS index ---
14
+ if not os.path.exists("faiss_index") and os.path.exists("faiss_index.zip"):
15
+ with zipfile.ZipFile("faiss_index.zip", "r") as zip_ref:
16
+ zip_ref.extractall(".")
17
+
18
+ # --- Step 2: Load embedding and vectorstore ---
19
+ embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
20
+ vectordb = FAISS.load_local("faiss_index", embedding_model,allow_dangerous_deserialization=True)
21
+
22
+ # --- Step 3: Load the LLM ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  model_id = "tiiuae/falcon3-1b-instruct"
24
  tokenizer = AutoTokenizer.from_pretrained(model_id)
25
  model = AutoModelForCausalLM.from_pretrained(model_id)
 
35
  )
36
  llm = HuggingFacePipeline(pipeline=pipe)
37
 
38
+ # --- Step 4: Setup memory and QA chain ---
39
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
40
 
41
+ prompt = PromptTemplate.from_template("""
42
  You are a helpful assistant at the University of Hertfordshire. Use the context below to answer the question clearly and factually.
43
  If the answer is not in the context, say you don't know.
44
 
 
48
  Question:
49
  {question}
50
 
51
+ Helpful Answer:
52
  """)
53
 
54
  qa_chain = ConversationalRetrievalChain.from_llm(
55
  llm=llm,
56
  retriever=vectordb.as_retriever(search_kwargs={"k": 3}),
57
  memory=memory,
58
+ chain_type="map_reduce",
59
+ combine_docs_chain_kwargs={"prompt": prompt}
60
  )
61
 
62
+ UH_LOGO = "images/UH.png"
63
+ # --- Step 5: Define chatbot logic ---
64
  def chat(message, history):
65
  result = qa_chain.invoke({"question": message})
66
  response = result.get("answer", "")
67
  response = response.split("Answer:")[-1].replace("<|assistant|>", "").strip()
68
  return response
69
 
70
+ # --- Step 6: UI ---
 
 
 
71
  sample_questions = [
72
  "How do I register as a new student?",
73
  "Where can I find accommodation?",