zm-f21 committed on
Commit
edf2f5e
·
verified ·
1 Parent(s): 4dd14a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -82
app.py CHANGED
@@ -7,10 +7,11 @@ import zipfile
7
  import os
8
  import re
9
  import torch
 
10
 
11
- # -----------------------------
12
- # Load Mistral pipeline
13
- # -----------------------------
14
  llm = pipeline(
15
  "text-generation",
16
  model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -18,46 +19,42 @@ llm = pipeline(
18
  device_map="auto"
19
  )
20
 
21
- # -----------------------------
22
- # Load SentenceTransformer embeddings
23
- # -----------------------------
24
  embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
25
 
26
- # -----------------------------
27
- # Extract Provinces ZIP
28
- # -----------------------------
29
- zip_path = "/app/provinces.zip" # Make sure you upload this to your HF Space
30
  extract_folder = "/app/provinces_texts"
31
 
32
- # Remove old folder if exists
33
  if os.path.exists(extract_folder):
34
- import shutil
35
  shutil.rmtree(extract_folder)
36
 
37
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
38
  zip_ref.extractall(extract_folder)
39
 
40
- # Regex to capture YYYY_MM_DD or YYYY-MM-DD anywhere in filename
41
  date_pattern = re.compile(r"(\d{4}[-]\d{2}[_-]\d{2})")
42
 
43
- # -----------------------------
44
- # Parse TXT files and create documents
45
- # -----------------------------
46
  def parse_metadata_and_content(raw_text):
47
  if "CONTENT:" not in raw_text:
48
  raise ValueError("File missing CONTENT: separator.")
49
-
50
  header, content = raw_text.split("CONTENT:", 1)
51
  metadata = {}
52
- lines = header.strip().split("\n")
53
  pdf_list = []
54
 
55
- for line in lines:
56
  if ":" in line and not line.strip().startswith("-"):
57
  key, value = line.split(":", 1)
58
  metadata[key.strip().upper()] = value.strip()
59
  elif line.strip().startswith("-"):
60
  pdf_list.append(line.strip())
 
61
  if pdf_list:
62
  metadata["PDF_LINKS"] = "\n".join(pdf_list)
63
  return metadata, content.strip()
@@ -68,12 +65,15 @@ for root, dirs, files in os.walk(extract_folder):
68
  for filename in files:
69
  if filename.startswith("._") or not filename.endswith(".txt"):
70
  continue
 
71
  filepath = os.path.join(root, filename)
72
  try:
73
  with open(filepath, "r", encoding="latin-1") as f:
74
  raw = f.read()
 
75
  metadata, content = parse_metadata_and_content(raw)
76
  paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
 
77
  for p in paragraphs:
78
  documents.append({
79
  "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
@@ -83,15 +83,14 @@ for root, dirs, files in os.walk(extract_folder):
83
  "pdf_links": metadata.get("PDF_LINKS", ""),
84
  "text": p
85
  })
86
- except ValueError as e:
87
  print(f"Skipping {filepath}: {e}")
88
- continue
89
 
90
  print(f"Loaded {len(documents)} paragraphs from all provinces.")
91
 
92
- # -----------------------------
93
- # Create embeddings and dataframe
94
- # -----------------------------
95
  texts = [d["text"] for d in documents]
96
  embeddings = embedding_model.encode(texts).astype("float16")
97
 
@@ -100,23 +99,24 @@ df["Embedding"] = list(embeddings)
100
 
101
  print("Indexing complete. Total:", len(df))
102
 
103
- # -----------------------------
104
- # Retrieve with Pandas
105
- # -----------------------------
106
  def retrieve_with_pandas(query, province=None, top_k=2):
107
  query_emb = embedding_model.encode([query])[0]
108
- if province is not None:
109
- filtered_df = df[df['province'] == province].copy()
110
- else:
111
- filtered_df = df.copy()
112
- filtered_df['Similarity'] = filtered_df['Embedding'].apply(
113
  lambda x: np.dot(query_emb, x) / (np.linalg.norm(query_emb) * np.linalg.norm(x))
114
  )
115
- return filtered_df.sort_values("Similarity", ascending=False).head(top_k)
116
 
117
- # -----------------------------
118
- # Province detection
119
- # -----------------------------
 
 
120
  def detect_province(query):
121
  provinces = {
122
  "yukon": "Yukon",
@@ -145,62 +145,59 @@ def detect_province(query):
145
  return prov
146
  return None
147
 
148
- # -----------------------------
149
- # Guardrails
150
- # -----------------------------
151
  def is_disallowed(query):
152
- banned = ["kill", "suicide", "harm yourself", "bomb", "weapon"]
153
  return any(b in query.lower() for b in banned)
154
 
155
  def is_off_topic(query):
156
  tenancy_keywords = [
157
- "tenant", "landlord", "rent", "evict", "lease",
158
- "deposit", "tenancy", "rental", "apartment",
159
- "unit", "heating", "notice", "repair", "pets"
160
  ]
161
  q = query.lower()
162
  return not any(k in q for k in tenancy_keywords)
163
 
164
  INTRO_TEXT = (
165
  "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
166
- "and explain information from the Residential Tenancies Acts across all provinces and territories.\n\n"
167
- "**Important:** I'm not a lawyer and this is **not legal advice**. Use your own judgment.\n\n"
168
  )
169
 
170
- # -----------------------------
171
- # RAG generation function
172
- # -----------------------------
173
  def generate_with_rag(query, province=None, top_k=2):
 
174
  if is_disallowed(query):
175
- return INTRO_TEXT + "Sorry — I can’t help with harmful or dangerous topics."
 
176
  if is_off_topic(query):
177
- return INTRO_TEXT + "Sorry — I can only answer questions about Canadian tenancy and housing law."
178
 
179
  if province is None:
180
  province = detect_province(query)
181
 
182
  top_docs = retrieve_with_pandas(query, province=province, top_k=top_k)
183
- if top_docs is None or len(top_docs) == 0:
184
- return INTRO_TEXT + "Sorry — I couldn't find any matching information in the tenancy database."
185
 
186
  context = " ".join(top_docs["text"].tolist())
187
 
188
- # Few-shot style examples (style guide)
189
  qa_examples = """
190
- Q: I asked my landlord three months ago to install handrails in my bathroom. Can the landlord take a long time to respond?
191
- A: Landlords should respond promptly to reasonable accommodation requests. If they delay unreasonably, you can file a discrimination complaint.
192
-
193
- Q: My building manager keeps complaining about my children’s noise. Can I be evicted?
194
- A: Reasonable noise from children is expected. If you're treated differently because you have children, you may file a complaint based on family status.
195
  """
196
 
197
  prompt = f"""
198
- Use the examples as a STYLE GUIDE ONLY.
199
- DO NOT repeat the example questions.
200
- DO NOT invent laws only use the context provided.
201
- If the context does not contain the answer, say you cannot confidently answer.
202
-
203
- {qa_examples}
204
 
205
  Context:
206
  {context}
@@ -211,36 +208,51 @@ Question:
211
  Answer conversationally:
212
  """
213
 
214
- raw_output = llm(prompt, max_new_tokens=150)[0]["generated_text"]
215
- answer = raw_output.split("Answer conversationally:", 1)[-1].strip() if "Answer conversationally:" in raw_output else raw_output.strip()
216
 
217
- metadata_block = ""
218
  for _, row in top_docs.iterrows():
219
- metadata_block += (
220
  f"- Province: {row['province']}\n"
221
  f" Source: {row['source_title']}\n"
222
  f" Updated: {row['last_updated']}\n"
223
  f" URL: {row['url']}\n"
224
  )
225
 
226
- return INTRO_TEXT + f"{answer}\n\nSources Used:\n{metadata_block}"
227
 
228
- # -----------------------------
229
- # Gradio Chat
230
- # -----------------------------
231
- def respond(message, history):
232
- answer = generate_with_rag(message)
233
- history.append((message, answer))
 
 
 
 
 
 
234
  return history, history
235
 
236
  with gr.Blocks() as demo:
237
- chatbot = gr.Chatbot()
238
- msg = gr.Textbox(label="Your question")
239
- msg.submit(respond, [msg, chatbot], [chatbot, chatbot])
240
- gr.Markdown(
241
- "Ask questions about Canadian tenancy and housing law.\n\n"
242
- "**Note:** I am not a lawyer. Responses are generated from official documents."
243
  )
244
 
 
 
 
 
 
 
 
 
 
 
245
  if __name__ == "__main__":
246
- demo.launch(share=True)
 
7
  import os
8
  import re
9
  import torch
10
+ import shutil
11
 
12
+ # =======================================================
13
+ # 1) Load Mistral LLM (FP16)
14
+ # =======================================================
15
  llm = pipeline(
16
  "text-generation",
17
  model="mistralai/Mistral-7B-Instruct-v0.2",
 
19
  device_map="auto"
20
  )
21
 
22
+ # =======================================================
23
+ # 2) Load Embedding Model (Legal-BERT)
24
+ # =======================================================
25
  embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
26
 
27
+ # =======================================================
28
+ # 3) Extract the ZIP dataset
29
+ # =======================================================
30
+ zip_path = "/app/provinces.zip" # Make sure this is uploaded in your HF Space
31
  extract_folder = "/app/provinces_texts"
32
 
 
33
  if os.path.exists(extract_folder):
 
34
  shutil.rmtree(extract_folder)
35
 
36
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
37
  zip_ref.extractall(extract_folder)
38
 
 
39
  date_pattern = re.compile(r"(\d{4}[-]\d{2}[_-]\d{2})")
40
 
41
+ # =======================================================
42
+ # 4) Parse TXT files into documents
43
+ # =======================================================
44
  def parse_metadata_and_content(raw_text):
45
  if "CONTENT:" not in raw_text:
46
  raise ValueError("File missing CONTENT: separator.")
 
47
  header, content = raw_text.split("CONTENT:", 1)
48
  metadata = {}
 
49
  pdf_list = []
50
 
51
+ for line in header.strip().split("\n"):
52
  if ":" in line and not line.strip().startswith("-"):
53
  key, value = line.split(":", 1)
54
  metadata[key.strip().upper()] = value.strip()
55
  elif line.strip().startswith("-"):
56
  pdf_list.append(line.strip())
57
+
58
  if pdf_list:
59
  metadata["PDF_LINKS"] = "\n".join(pdf_list)
60
  return metadata, content.strip()
 
65
  for filename in files:
66
  if filename.startswith("._") or not filename.endswith(".txt"):
67
  continue
68
+
69
  filepath = os.path.join(root, filename)
70
  try:
71
  with open(filepath, "r", encoding="latin-1") as f:
72
  raw = f.read()
73
+
74
  metadata, content = parse_metadata_and_content(raw)
75
  paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
76
+
77
  for p in paragraphs:
78
  documents.append({
79
  "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
 
83
  "pdf_links": metadata.get("PDF_LINKS", ""),
84
  "text": p
85
  })
86
+ except Exception as e:
87
  print(f"Skipping {filepath}: {e}")
 
88
 
89
  print(f"Loaded {len(documents)} paragraphs from all provinces.")
90
 
91
+ # =======================================================
92
+ # 5) Build embeddings & dataframe
93
+ # =======================================================
94
  texts = [d["text"] for d in documents]
95
  embeddings = embedding_model.encode(texts).astype("float16")
96
 
 
99
 
100
  print("Indexing complete. Total:", len(df))
101
 
102
+ # =======================================================
103
+ # 6) Retrieval
104
+ # =======================================================
105
  def retrieve_with_pandas(query, province=None, top_k=2):
106
  query_emb = embedding_model.encode([query])[0]
107
+
108
+ filtered = df if province is None else df[df["province"] == province]
109
+ filtered = filtered.copy()
110
+
111
+ filtered["Similarity"] = filtered["Embedding"].apply(
112
  lambda x: np.dot(query_emb, x) / (np.linalg.norm(query_emb) * np.linalg.norm(x))
113
  )
 
114
 
115
+ return filtered.sort_values("Similarity", ascending=False).head(top_k)
116
+
117
+ # =======================================================
118
+ # 7) Province detection
119
+ # =======================================================
120
  def detect_province(query):
121
  provinces = {
122
  "yukon": "Yukon",
 
145
  return prov
146
  return None
147
 
148
+ # =======================================================
149
+ # 8) Guardrails
150
+ # =======================================================
151
  def is_disallowed(query):
152
+ banned = ["suicide", "harm yourself", "bomb", "weapon"]
153
  return any(b in query.lower() for b in banned)
154
 
155
  def is_off_topic(query):
156
  tenancy_keywords = [
157
+ "tenant", "landlord", "rent", "evict", "lease", "deposit",
158
+ "tenancy", "rental", "apartment", "unit", "repair", "pets",
159
+ "heating", "notice"
160
  ]
161
  q = query.lower()
162
  return not any(k in q for k in tenancy_keywords)
163
 
164
  INTRO_TEXT = (
165
  "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
166
+ "and explain information from the Residential Tenancies Acts across all provinces.\n\n"
167
+ "**Important:** I'm not a lawyer and this is **not legal advice**."
168
  )
169
 
170
+ # =======================================================
171
+ # 9) RAG Generation
172
+ # =======================================================
173
  def generate_with_rag(query, province=None, top_k=2):
174
+
175
  if is_disallowed(query):
176
+ return "Sorry — I can’t help with harmful or dangerous topics."
177
+
178
  if is_off_topic(query):
179
+ return "Sorry — I can only answer questions about Canadian tenancy and housing law."
180
 
181
  if province is None:
182
  province = detect_province(query)
183
 
184
  top_docs = retrieve_with_pandas(query, province=province, top_k=top_k)
185
+ if len(top_docs) == 0:
186
+ return "Sorry — I couldn't find matching information."
187
 
188
  context = " ".join(top_docs["text"].tolist())
189
 
 
190
  qa_examples = """
191
+ Q: My landlord took too long to install a safety item. Is that allowed?
192
+ A: Landlords should respond promptly to reasonable accommodation requests.
193
+ Q: I have kids making noise. Can I be evicted?
194
+ A: Reasonable family noise is expected; eviction should not be based on discrimination.
 
195
  """
196
 
197
  prompt = f"""
198
+ Use the examples ONLY AS A STYLE GUIDE.
199
+ Do not repeat them and do not invent laws.
200
+ If the context does not contain the answer, say so.
 
 
 
201
 
202
  Context:
203
  {context}
 
208
  Answer conversationally:
209
  """
210
 
211
+ output = llm(prompt, max_new_tokens=150)[0]["generated_text"]
212
+ answer = output.split("Answer conversationally:", 1)[-1].strip()
213
 
214
+ metadata = ""
215
  for _, row in top_docs.iterrows():
216
+ metadata += (
217
  f"- Province: {row['province']}\n"
218
  f" Source: {row['source_title']}\n"
219
  f" Updated: {row['last_updated']}\n"
220
  f" URL: {row['url']}\n"
221
  )
222
 
223
+ return f"{answer}\n\nSources Used:\n{metadata}"
224
 
225
+ # =======================================================
226
+ # 10) Gradio Chat Interface (INTRO only once)
227
+ # =======================================================
228
+ INTRO_MESSAGE = {
229
+ "role": "assistant",
230
+ "content": INTRO_TEXT
231
+ }
232
+
233
def chat_api(message, history):
    """Gradio callback: answer one user message and extend the chat history.

    The Chatbot component in this file is seeded with ``(user, assistant)``
    pairs, so each turn is appended in that same pair form. Appending
    role/content dicts would mix two incompatible history formats in one
    list.

    Args:
        message: Text the user submitted.
        history: Existing list of ``(user, assistant)`` pairs, or ``None``
            when there is no conversation yet.

    Returns:
        The updated history twice, one value per declared output.
    """
    if history is None:
        history = []
    reply = generate_with_rag(message)
    history.append((message, reply))
    return history, history
238
 
239
  with gr.Blocks() as demo:
240
+ gr.Markdown("## Canada Residential Tenancy Assistant (RAG + Mistral 7B)")
241
+
242
+ chatbot = gr.Chatbot(
243
+ value=[(None, INTRO_MESSAGE["content"])],
244
+ height=500
 
245
  )
246
 
247
+ user_box = gr.Textbox(
248
+ label="Your question",
249
+ placeholder="Ask a question about rentals, repairs, evictions, deposits, etc..."
250
+ )
251
+
252
+ send_btn = gr.Button("Send")
253
+
254
+ send_btn.click(chat_api, inputs=[user_box, chatbot], outputs=[chatbot, chatbot])
255
+ user_box.submit(chat_api, inputs=[user_box, chatbot], outputs=[chatbot, chatbot])
256
+
257
  if __name__ == "__main__":
258
+ demo.launch(share=True)