zm-f21 committed on
Commit
9c66a72
·
verified ·
1 Parent(s): edf2f5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -93
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  from sentence_transformers import SentenceTransformer
@@ -7,11 +8,10 @@ import zipfile
7
  import os
8
  import re
9
  import torch
10
- import shutil
11
 
12
- # =======================================================
13
- # 1) Load Mistral LLM (FP16)
14
- # =======================================================
15
  llm = pipeline(
16
  "text-generation",
17
  model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -19,42 +19,46 @@ llm = pipeline(
19
  device_map="auto"
20
  )
21
 
22
- # =======================================================
23
- # 2) Load Embedding Model (Legal-BERT)
24
- # =======================================================
25
  embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
26
 
27
- # =======================================================
28
- # 3) Extract the ZIP dataset
29
- # =======================================================
30
- zip_path = "/app/provinces.zip" # Make sure this is uploaded in your HF Space
31
  extract_folder = "/app/provinces_texts"
32
 
 
33
  if os.path.exists(extract_folder):
 
34
  shutil.rmtree(extract_folder)
35
 
36
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
37
  zip_ref.extractall(extract_folder)
38
 
 
39
  date_pattern = re.compile(r"(\d{4}[-]\d{2}[_-]\d{2})")
40
 
41
- # =======================================================
42
- # 4) Parse TXT files into documents
43
- # =======================================================
44
  def parse_metadata_and_content(raw_text):
45
  if "CONTENT:" not in raw_text:
46
  raise ValueError("File missing CONTENT: separator.")
 
47
  header, content = raw_text.split("CONTENT:", 1)
48
  metadata = {}
 
49
  pdf_list = []
50
 
51
- for line in header.strip().split("\n"):
52
  if ":" in line and not line.strip().startswith("-"):
53
  key, value = line.split(":", 1)
54
  metadata[key.strip().upper()] = value.strip()
55
  elif line.strip().startswith("-"):
56
  pdf_list.append(line.strip())
57
-
58
  if pdf_list:
59
  metadata["PDF_LINKS"] = "\n".join(pdf_list)
60
  return metadata, content.strip()
@@ -65,15 +69,12 @@ for root, dirs, files in os.walk(extract_folder):
65
  for filename in files:
66
  if filename.startswith("._") or not filename.endswith(".txt"):
67
  continue
68
-
69
  filepath = os.path.join(root, filename)
70
  try:
71
  with open(filepath, "r", encoding="latin-1") as f:
72
  raw = f.read()
73
-
74
  metadata, content = parse_metadata_and_content(raw)
75
  paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
76
-
77
  for p in paragraphs:
78
  documents.append({
79
  "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
@@ -83,14 +84,15 @@ for root, dirs, files in os.walk(extract_folder):
83
  "pdf_links": metadata.get("PDF_LINKS", ""),
84
  "text": p
85
  })
86
- except Exception as e:
87
  print(f"Skipping {filepath}: {e}")
 
88
 
89
  print(f"Loaded {len(documents)} paragraphs from all provinces.")
90
 
91
- # =======================================================
92
- # 5) Build embeddings & dataframe
93
- # =======================================================
94
  texts = [d["text"] for d in documents]
95
  embeddings = embedding_model.encode(texts).astype("float16")
96
 
@@ -99,24 +101,23 @@ df["Embedding"] = list(embeddings)
99
 
100
  print("Indexing complete. Total:", len(df))
101
 
102
- # =======================================================
103
- # 6) Retrieval
104
- # =======================================================
105
  def retrieve_with_pandas(query, province=None, top_k=2):
106
  query_emb = embedding_model.encode([query])[0]
107
-
108
- filtered = df if province is None else df[df["province"] == province]
109
- filtered = filtered.copy()
110
-
111
- filtered["Similarity"] = filtered["Embedding"].apply(
112
  lambda x: np.dot(query_emb, x) / (np.linalg.norm(query_emb) * np.linalg.norm(x))
113
  )
 
114
 
115
- return filtered.sort_values("Similarity", ascending=False).head(top_k)
116
-
117
- # =======================================================
118
- # 7) Province detection
119
- # =======================================================
120
  def detect_province(query):
121
  provinces = {
122
  "yukon": "Yukon",
@@ -145,59 +146,62 @@ def detect_province(query):
145
  return prov
146
  return None
147
 
148
- # =======================================================
149
- # 8) Guardrails
150
- # =======================================================
151
  def is_disallowed(query):
152
- banned = ["suicide", "harm yourself", "bomb", "weapon"]
153
  return any(b in query.lower() for b in banned)
154
 
155
  def is_off_topic(query):
156
  tenancy_keywords = [
157
- "tenant", "landlord", "rent", "evict", "lease", "deposit",
158
- "tenancy", "rental", "apartment", "unit", "repair", "pets",
159
- "heating", "notice"
160
  ]
161
  q = query.lower()
162
  return not any(k in q for k in tenancy_keywords)
163
 
164
  INTRO_TEXT = (
165
  "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
166
- "and explain information from the Residential Tenancies Acts across all provinces.\n\n"
167
- "**Important:** I'm not a lawyer and this is **not legal advice**."
168
  )
169
 
170
- # =======================================================
171
- # 9) RAG Generation
172
- # =======================================================
173
  def generate_with_rag(query, province=None, top_k=2):
174
-
175
  if is_disallowed(query):
176
- return "Sorry — I can’t help with harmful or dangerous topics."
177
-
178
  if is_off_topic(query):
179
- return "Sorry — I can only answer questions about Canadian tenancy and housing law."
180
 
181
  if province is None:
182
  province = detect_province(query)
183
 
184
  top_docs = retrieve_with_pandas(query, province=province, top_k=top_k)
185
- if len(top_docs) == 0:
186
- return "Sorry — I couldn't find matching information."
187
 
188
  context = " ".join(top_docs["text"].tolist())
189
 
 
190
  qa_examples = """
191
- Q: My landlord took too long to install a safety item. Is that allowed?
192
- A: Landlords should respond promptly to reasonable accommodation requests.
193
- Q: I have kids making noise. Can I be evicted?
194
- A: Reasonable family noise is expected; eviction should not be based on discrimination.
 
195
  """
196
 
197
  prompt = f"""
198
- Use the examples ONLY AS A STYLE GUIDE.
199
- Do not repeat them and do not invent laws.
200
- If the context does not contain the answer, say so.
 
 
 
201
 
202
  Context:
203
  {context}
@@ -208,51 +212,36 @@ Question:
208
  Answer conversationally:
209
  """
210
 
211
- output = llm(prompt, max_new_tokens=150)[0]["generated_text"]
212
- answer = output.split("Answer conversationally:", 1)[-1].strip()
213
 
214
- metadata = ""
215
  for _, row in top_docs.iterrows():
216
- metadata += (
217
  f"- Province: {row['province']}\n"
218
  f" Source: {row['source_title']}\n"
219
  f" Updated: {row['last_updated']}\n"
220
  f" URL: {row['url']}\n"
221
  )
222
 
223
- return f"{answer}\n\nSources Used:\n{metadata}"
224
-
225
- # =======================================================
226
- # 10) Gradio Chat Interface (INTRO only once)
227
- # =======================================================
228
- INTRO_MESSAGE = {
229
- "role": "assistant",
230
- "content": INTRO_TEXT
231
- }
232
 
233
- def chat_api(message, history):
234
- history.append({"role": "user", "content": message})
235
- reply = generate_with_rag(message)
236
- history.append({"role": "assistant", "content": reply})
 
 
237
  return history, history
238
 
239
  with gr.Blocks() as demo:
240
- gr.Markdown("## Canada Residential Tenancy Assistant (RAG + Mistral 7B)")
241
-
242
- chatbot = gr.Chatbot(
243
- value=[(None, INTRO_MESSAGE["content"])],
244
- height=500
245
- )
246
-
247
- user_box = gr.Textbox(
248
- label="Your question",
249
- placeholder="Ask a question about rentals, repairs, evictions, deposits, etc..."
250
  )
251
 
252
- send_btn = gr.Button("Send")
253
-
254
- send_btn.click(chat_api, inputs=[user_box, chatbot], outputs=[chatbot, chatbot])
255
- user_box.submit(chat_api, inputs=[user_box, chatbot], outputs=[chatbot, chatbot])
256
-
257
  if __name__ == "__main__":
258
  demo.launch(share=True)
 
1
+
2
  import gradio as gr
3
  from transformers import pipeline
4
  from sentence_transformers import SentenceTransformer
 
8
  import os
9
  import re
10
  import torch
 
11
 
12
+ # -----------------------------
13
+ # Load Mistral pipeline
14
+ # -----------------------------
15
  llm = pipeline(
16
  "text-generation",
17
  model="mistralai/Mistral-7B-Instruct-v0.2",
 
19
  device_map="auto"
20
  )
21
 
22
+ # -----------------------------
23
+ # Load SentenceTransformer embeddings
24
+ # -----------------------------
25
  embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
26
 
27
# -----------------------------
# Extract Provinces ZIP
# -----------------------------
zip_path = "/app/provinces.zip"  # Make sure you upload this to your HF Space
extract_folder = "/app/provinces_texts"

# Remove any stale extraction so reruns start from a clean tree.
if os.path.exists(extract_folder):
    import shutil  # local import: only needed on the re-run path
    shutil.rmtree(extract_folder)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_folder)

# Regex to capture YYYY_MM_DD or YYYY-MM-DD anywhere in filename.
# FIX: the first separator previously only accepted "-" ([-]), so
# YYYY_MM_DD filenames were silently missed; accept "-" or "_" in both.
date_pattern = re.compile(r"(\d{4}[-_]\d{2}[-_]\d{2})")
43
 
44
+ # -----------------------------
45
+ # Parse TXT files and create documents
46
+ # -----------------------------
47
def parse_metadata_and_content(raw_text):
    """Split a province TXT file into (metadata dict, body text).

    The expected format is a header of ``KEY: value`` lines, optionally
    interleaved with ``-``-prefixed PDF link lines, followed by a
    ``CONTENT:`` separator and the document body.

    Returns:
        (metadata, content): dict of upper-cased header keys (plus a
        joined ``PDF_LINKS`` entry when bullet lines were present) and
        the stripped body text.

    Raises:
        ValueError: when the ``CONTENT:`` separator is absent.
    """
    if "CONTENT:" not in raw_text:
        raise ValueError("File missing CONTENT: separator.")

    header, content = raw_text.split("CONTENT:", 1)
    metadata = {}
    pdf_links = []

    for raw_line in header.strip().split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith("-"):
            # Bullet lines are PDF links, never key/value pairs —
            # even when they contain a colon (e.g. "http://...").
            pdf_links.append(stripped)
        elif ":" in raw_line:
            key, _, value = raw_line.partition(":")
            metadata[key.strip().upper()] = value.strip()

    if pdf_links:
        metadata["PDF_LINKS"] = "\n".join(pdf_links)
    return metadata, content.strip()
 
69
  for filename in files:
70
  if filename.startswith("._") or not filename.endswith(".txt"):
71
  continue
 
72
  filepath = os.path.join(root, filename)
73
  try:
74
  with open(filepath, "r", encoding="latin-1") as f:
75
  raw = f.read()
 
76
  metadata, content = parse_metadata_and_content(raw)
77
  paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
 
78
  for p in paragraphs:
79
  documents.append({
80
  "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
 
84
  "pdf_links": metadata.get("PDF_LINKS", ""),
85
  "text": p
86
  })
87
+ except ValueError as e:
88
  print(f"Skipping {filepath}: {e}")
89
+ continue
90
 
91
  print(f"Loaded {len(documents)} paragraphs from all provinces.")
92
 
93
+ # -----------------------------
94
+ # Create embeddings and dataframe
95
+ # -----------------------------
96
  texts = [d["text"] for d in documents]
97
  embeddings = embedding_model.encode(texts).astype("float16")
98
 
 
101
 
102
  print("Indexing complete. Total:", len(df))
103
 
104
+ # -----------------------------
105
+ # Retrieve with Pandas
106
+ # -----------------------------
107
def retrieve_with_pandas(query, province=None, top_k=2):
    """Return the ``top_k`` paragraphs most similar to ``query``.

    Embeds the query with the module-level ``embedding_model``, optionally
    restricts the global ``df`` to one province, then ranks rows by cosine
    similarity between the query embedding and each stored paragraph
    embedding.

    Args:
        query: Free-text user question.
        province: Optional province name to filter on; ``None`` searches all.
        top_k: Number of best-matching rows to return.

    Returns:
        A copy of the matching rows with an added ``Similarity`` column,
        sorted descending by similarity.
    """
    query_emb = embedding_model.encode([query])[0]
    # Hoist the query-vector norm out of the per-row lambda: it is
    # loop-invariant, so computing it once avoids redundant work per row.
    query_norm = np.linalg.norm(query_emb)

    filtered_df = (df if province is None else df[df["province"] == province]).copy()
    filtered_df["Similarity"] = filtered_df["Embedding"].apply(
        lambda emb: np.dot(query_emb, emb) / (query_norm * np.linalg.norm(emb))
    )
    return filtered_df.sort_values("Similarity", ascending=False).head(top_k)
117
 
118
+ # -----------------------------
119
+ # Province detection
120
+ # -----------------------------
 
 
121
  def detect_province(query):
122
  provinces = {
123
  "yukon": "Yukon",
 
146
  return prov
147
  return None
148
 
149
+ # -----------------------------
150
+ # Guardrails
151
+ # -----------------------------
152
def is_disallowed(query):
    """Return True when the query mentions a banned (harmful) topic."""
    # Simple substring guardrail; case-insensitive via one lower() call.
    lowered = query.lower()
    for term in ("kill", "suicide", "harm yourself", "bomb", "weapon"):
        if term in lowered:
            return True
    return False
155
 
156
def is_off_topic(query):
    """Return True when the query contains no tenancy-related keyword."""
    lowered = query.lower()
    keywords = (
        "tenant", "landlord", "rent", "evict", "lease",
        "deposit", "tenancy", "rental", "apartment",
        "unit", "heating", "notice", "repair", "pets",
    )
    # On-topic as soon as any keyword appears as a substring.
    return all(word not in lowered for word in keywords)
164
 
165
  INTRO_TEXT = (
166
  "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
167
+ "and explain information from the Residential Tenancies Acts across all provinces and territories.\n\n"
168
+ "**Important:** I'm not a lawyer and this is **not legal advice**. Use your own judgment.\n\n"
169
  )
170
 
171
+ # -----------------------------
172
+ # RAG generation function
173
+ # -----------------------------
174
  def generate_with_rag(query, province=None, top_k=2):
 
175
  if is_disallowed(query):
176
+ return INTRO_TEXT + "Sorry — I can’t help with harmful or dangerous topics."
 
177
  if is_off_topic(query):
178
+ return INTRO_TEXT + "Sorry — I can only answer questions about Canadian tenancy and housing law."
179
 
180
  if province is None:
181
  province = detect_province(query)
182
 
183
  top_docs = retrieve_with_pandas(query, province=province, top_k=top_k)
184
+ if top_docs is None or len(top_docs) == 0:
185
+ return INTRO_TEXT + "Sorry — I couldn't find any matching information in the tenancy database."
186
 
187
  context = " ".join(top_docs["text"].tolist())
188
 
189
+ # Few-shot style examples (style guide)
190
  qa_examples = """
191
+ Q: I asked my landlord three months ago to install handrails in my bathroom. Can the landlord take a long time to respond?
192
+ A: Landlords should respond promptly to reasonable accommodation requests. If they delay unreasonably, you can file a discrimination complaint.
193
+
194
+ Q: My building manager keeps complaining about my children’s noise. Can I be evicted?
195
+ A: Reasonable noise from children is expected. If you're treated differently because you have children, you may file a complaint based on family status.
196
  """
197
 
198
  prompt = f"""
199
+ Use the examples as a STYLE GUIDE ONLY.
200
+ DO NOT repeat the example questions.
201
+ DO NOT invent laws only use the context provided.
202
+ If the context does not contain the answer, say you cannot confidently answer.
203
+
204
+ {qa_examples}
205
 
206
  Context:
207
  {context}
 
212
  Answer conversationally:
213
  """
214
 
215
+ raw_output = llm(prompt, max_new_tokens=150)[0]["generated_text"]
216
+ answer = raw_output.split("Answer conversationally:", 1)[-1].strip() if "Answer conversationally:" in raw_output else raw_output.strip()
217
 
218
+ metadata_block = ""
219
  for _, row in top_docs.iterrows():
220
+ metadata_block += (
221
  f"- Province: {row['province']}\n"
222
  f" Source: {row['source_title']}\n"
223
  f" Updated: {row['last_updated']}\n"
224
  f" URL: {row['url']}\n"
225
  )
226
 
227
+ return INTRO_TEXT + f"{answer}\n\nSources Used:\n{metadata_block}"
 
 
 
 
 
 
 
 
228
 
229
+ # -----------------------------
230
+ # Gradio Chat
231
+ # -----------------------------
232
def respond(message, history):
    """Gradio callback: answer ``message`` and append the turn to ``history``.

    The history is returned twice because the UI wires this callback to
    two outputs, both pointing at the chatbot component.
    """
    reply = generate_with_rag(message)
    # Mutate the history in place so Gradio's state object stays current.
    history.append((message, reply))
    return history, history
236
 
237
# Build the Gradio UI: a chat transcript plus a single free-text input.
# Submitting the textbox routes through respond(), which appends a
# (user, assistant) turn and feeds the updated history back to the chatbot.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your question")
    msg.submit(respond, [msg, chatbot], [chatbot, chatbot])
    gr.Markdown(
        "Ask questions about Canadian tenancy and housing law.\n\n"
        "**Note:** I am not a lawyer. Responses are generated from official documents."
    )

if __name__ == "__main__":
    # share=True exposes a public Gradio link (useful on Hugging Face Spaces).
    demo.launch(share=True)