Spaces:

zm-f21
/

IAT360-Final-Project

Sleeping

App Files Files Community

zm-f21 committed on Dec 6, 2025

Commit

edf2f5e

verified ·

1 Parent(s): 4dd14a9

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -82

app.py CHANGED Viewed

@@ -7,10 +7,11 @@ import zipfile
 import os
 import re
 import torch
-# -----------------------------
-# Load Mistral pipeline
-# -----------------------------
 llm = pipeline(
     "text-generation",
     model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -18,46 +19,42 @@ llm = pipeline(
     device_map="auto"
 )
-# -----------------------------
-# Load SentenceTransformer embeddings
-# -----------------------------
 embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
-# -----------------------------
-# Extract Provinces ZIP
-# -----------------------------
-zip_path = "/app/provinces.zip"  # Make sure you upload this to your HF Space
 extract_folder = "/app/provinces_texts"
-# Remove old folder if exists
 if os.path.exists(extract_folder):
-    import shutil
     shutil.rmtree(extract_folder)
 with zipfile.ZipFile(zip_path, "r") as zip_ref:
     zip_ref.extractall(extract_folder)
-# Regex to capture YYYY_MM_DD or YYYY-MM-DD anywhere in filename
 date_pattern = re.compile(r"(\d{4}[-]\d{2}[_-]\d{2})")
-# -----------------------------
-# Parse TXT files and create documents
-# -----------------------------
 def parse_metadata_and_content(raw_text):
     if "CONTENT:" not in raw_text:
         raise ValueError("File missing CONTENT: separator.")
     header, content = raw_text.split("CONTENT:", 1)
     metadata = {}
-    lines = header.strip().split("\n")
     pdf_list = []
-    for line in lines:
         if ":" in line and not line.strip().startswith("-"):
             key, value = line.split(":", 1)
             metadata[key.strip().upper()] = value.strip()
         elif line.strip().startswith("-"):
             pdf_list.append(line.strip())
     if pdf_list:
         metadata["PDF_LINKS"] = "\n".join(pdf_list)
     return metadata, content.strip()
@@ -68,12 +65,15 @@ for root, dirs, files in os.walk(extract_folder):
     for filename in files:
         if filename.startswith("._") or not filename.endswith(".txt"):
             continue
         filepath = os.path.join(root, filename)
         try:
             with open(filepath, "r", encoding="latin-1") as f:
                 raw = f.read()
             metadata, content = parse_metadata_and_content(raw)
             paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
             for p in paragraphs:
                 documents.append({
                     "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
@@ -83,15 +83,14 @@ for root, dirs, files in os.walk(extract_folder):
                     "pdf_links": metadata.get("PDF_LINKS", ""),
                     "text": p
                 })
-        except ValueError as e:
             print(f"Skipping {filepath}: {e}")
-            continue
 print(f"Loaded {len(documents)} paragraphs from all provinces.")
-# -----------------------------
-# Create embeddings and dataframe
-# -----------------------------
 texts = [d["text"] for d in documents]
 embeddings = embedding_model.encode(texts).astype("float16")
@@ -100,23 +99,24 @@ df["Embedding"] = list(embeddings)
 print("Indexing complete. Total:", len(df))
-# -----------------------------
-# Retrieve with Pandas
-# -----------------------------
 def retrieve_with_pandas(query, province=None, top_k=2):
     query_emb = embedding_model.encode([query])[0]
-    if province is not None:
-        filtered_df = df[df['province'] == province].copy()
-    else:
-        filtered_df = df.copy()
-    filtered_df['Similarity'] = filtered_df['Embedding'].apply(
         lambda x: np.dot(query_emb, x) / (np.linalg.norm(query_emb) * np.linalg.norm(x))
     )
-    return filtered_df.sort_values("Similarity", ascending=False).head(top_k)
-# -----------------------------
-# Province detection
-# -----------------------------
 def detect_province(query):
     provinces = {
         "yukon": "Yukon",
@@ -145,62 +145,59 @@ def detect_province(query):
             return prov
     return None
-# -----------------------------
-# Guardrails
-# -----------------------------
 def is_disallowed(query):
-    banned = ["kill", "suicide", "harm yourself", "bomb", "weapon"]
     return any(b in query.lower() for b in banned)
 def is_off_topic(query):
     tenancy_keywords = [
-        "tenant", "landlord", "rent", "evict", "lease",
-        "deposit", "tenancy", "rental", "apartment",
-        "unit", "heating", "notice", "repair", "pets"
     ]
     q = query.lower()
     return not any(k in q for k in tenancy_keywords)
 INTRO_TEXT = (
     "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
-    "and explain information from the Residential Tenancies Acts across all provinces and territories.\n\n"
-    "**Important:** I'm not a lawyer and this is **not legal advice**. Use your own judgment.\n\n"
 )
-# -----------------------------
-# RAG generation function
-# -----------------------------
 def generate_with_rag(query, province=None, top_k=2):
     if is_disallowed(query):
-        return INTRO_TEXT + "Sorry — I can’t help with harmful or dangerous topics."
     if is_off_topic(query):
-        return INTRO_TEXT + "Sorry — I can only answer questions about Canadian tenancy and housing law."
     if province is None:
         province = detect_province(query)
     top_docs = retrieve_with_pandas(query, province=province, top_k=top_k)
-    if top_docs is None or len(top_docs) == 0:
-        return INTRO_TEXT + "Sorry — I couldn't find any matching information in the tenancy database."
     context = " ".join(top_docs["text"].tolist())
-    # Few-shot style examples (style guide)
     qa_examples = """
-Q: I asked my landlord three months ago to install handrails in my bathroom. Can the landlord take a long time to respond?
-A: Landlords should respond promptly to reasonable accommodation requests. If they delay unreasonably, you can file a discrimination complaint.
-Q: My building manager keeps complaining about my children’s noise. Can I be evicted?
-A: Reasonable noise from children is expected. If you're treated differently because you have children, you may file a complaint based on family status.
 """
     prompt = f"""
-Use the examples as a STYLE GUIDE ONLY.
-DO NOT repeat the example questions.
-DO NOT invent laws — only use the context provided.
-If the context does not contain the answer, say you cannot confidently answer.
-{qa_examples}
 Context:
 {context}
@@ -211,36 +208,51 @@ Question:
 Answer conversationally:
 """
-    raw_output = llm(prompt, max_new_tokens=150)[0]["generated_text"]
-    answer = raw_output.split("Answer conversationally:", 1)[-1].strip() if "Answer conversationally:" in raw_output else raw_output.strip()
-    metadata_block = ""
     for _, row in top_docs.iterrows():
-        metadata_block += (
             f"- Province: {row['province']}\n"
             f"  Source: {row['source_title']}\n"
             f"  Updated: {row['last_updated']}\n"
             f"  URL: {row['url']}\n"
         )
-    return INTRO_TEXT + f"{answer}\n\nSources Used:\n{metadata_block}"
-# -----------------------------
-# Gradio Chat
-# -----------------------------
-def respond(message, history):
-    answer = generate_with_rag(message)
-    history.append((message, answer))
     return history, history
 with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(label="Your question")
-    msg.submit(respond, [msg, chatbot], [chatbot, chatbot])
-    gr.Markdown(
-        "Ask questions about Canadian tenancy and housing law.\n\n"
-        "**Note:** I am not a lawyer. Responses are generated from official documents."
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import os
 import re
 import torch
+import shutil
+# =======================================================
+# 1) Load Mistral LLM (FP16)
+# =======================================================
 llm = pipeline(
     "text-generation",
     model="mistralai/Mistral-7B-Instruct-v0.2",
     device_map="auto"
 )
+# =======================================================
+# 2) Load Embedding Model (Legal-BERT)
+# =======================================================
 embedding_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
+# =======================================================
+# 3) Extract the ZIP dataset
+# =======================================================
+zip_path = "/app/provinces.zip"   # Make sure this is uploaded in your HF Space
 extract_folder = "/app/provinces_texts"
 if os.path.exists(extract_folder):
     shutil.rmtree(extract_folder)
 with zipfile.ZipFile(zip_path, "r") as zip_ref:
     zip_ref.extractall(extract_folder)
 date_pattern = re.compile(r"(\d{4}[-]\d{2}[_-]\d{2})")
+# =======================================================
+# 4) Parse TXT files into documents
+# =======================================================
 def parse_metadata_and_content(raw_text):
     if "CONTENT:" not in raw_text:
         raise ValueError("File missing CONTENT: separator.")
     header, content = raw_text.split("CONTENT:", 1)
     metadata = {}
     pdf_list = []
+    for line in header.strip().split("\n"):
         if ":" in line and not line.strip().startswith("-"):
             key, value = line.split(":", 1)
             metadata[key.strip().upper()] = value.strip()
         elif line.strip().startswith("-"):
             pdf_list.append(line.strip())
     if pdf_list:
         metadata["PDF_LINKS"] = "\n".join(pdf_list)
     return metadata, content.strip()
     for filename in files:
         if filename.startswith("._") or not filename.endswith(".txt"):
             continue
         filepath = os.path.join(root, filename)
         try:
             with open(filepath, "r", encoding="latin-1") as f:
                 raw = f.read()
             metadata, content = parse_metadata_and_content(raw)
             paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
             for p in paragraphs:
                 documents.append({
                     "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
                     "pdf_links": metadata.get("PDF_LINKS", ""),
                     "text": p
                 })
+        except Exception as e:
             print(f"Skipping {filepath}: {e}")
 print(f"Loaded {len(documents)} paragraphs from all provinces.")
+# =======================================================
+# 5) Build embeddings & dataframe
+# =======================================================
 texts = [d["text"] for d in documents]
 embeddings = embedding_model.encode(texts).astype("float16")
 print("Indexing complete. Total:", len(df))
+# =======================================================
+# 6) Retrieval
+# =======================================================
 def retrieve_with_pandas(query, province=None, top_k=2):
     query_emb = embedding_model.encode([query])[0]
+    filtered = df if province is None else df[df["province"] == province]
+    filtered = filtered.copy()
+    filtered["Similarity"] = filtered["Embedding"].apply(
         lambda x: np.dot(query_emb, x) / (np.linalg.norm(query_emb) * np.linalg.norm(x))
     )
+    return filtered.sort_values("Similarity", ascending=False).head(top_k)
+# =======================================================
+# 7) Province detection
+# =======================================================
 def detect_province(query):
     provinces = {
         "yukon": "Yukon",
             return prov
     return None
+# =======================================================
+# 8) Guardrails
+# =======================================================
 def is_disallowed(query):
+    banned = ["suicide", "harm yourself", "bomb", "weapon"]
     return any(b in query.lower() for b in banned)
 def is_off_topic(query):
     tenancy_keywords = [
+        "tenant", "landlord", "rent", "evict", "lease", "deposit",
+        "tenancy", "rental", "apartment", "unit", "repair", "pets",
+        "heating", "notice"
     ]
     q = query.lower()
     return not any(k in q for k in tenancy_keywords)
 INTRO_TEXT = (
     "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
+    "and explain information from the Residential Tenancies Acts across all provinces.\n\n"
+    "**Important:** I'm not a lawyer and this is **not legal advice**."
 )
+# =======================================================
+# 9) RAG Generation
+# =======================================================
 def generate_with_rag(query, province=None, top_k=2):
     if is_disallowed(query):
+        return "Sorry — I can’t help with harmful or dangerous topics."
     if is_off_topic(query):
+        return "Sorry — I can only answer questions about Canadian tenancy and housing law."
     if province is None:
         province = detect_province(query)
     top_docs = retrieve_with_pandas(query, province=province, top_k=top_k)
+    if len(top_docs) == 0:
+        return "Sorry — I couldn't find matching information."
     context = " ".join(top_docs["text"].tolist())
     qa_examples = """
+Q: My landlord took too long to install a safety item. Is that allowed?
+A: Landlords should respond promptly to reasonable accommodation requests.
+Q: I have kids making noise. Can I be evicted?
+A: Reasonable family noise is expected; eviction should not be based on discrimination.
 """
     prompt = f"""
+Use the examples ONLY AS A STYLE GUIDE.
+Do not repeat them and do not invent laws.
+If the context does not contain the answer, say so.
 Context:
 {context}
 Answer conversationally:
 """
+    output = llm(prompt, max_new_tokens=150)[0]["generated_text"]
+    answer = output.split("Answer conversationally:", 1)[-1].strip()
+    metadata = ""
     for _, row in top_docs.iterrows():
+        metadata += (
             f"- Province: {row['province']}\n"
             f"  Source: {row['source_title']}\n"
             f"  Updated: {row['last_updated']}\n"
             f"  URL: {row['url']}\n"
         )
+    return f"{answer}\n\nSources Used:\n{metadata}"
+# =======================================================
+# 10) Gradio Chat Interface (INTRO only once)
+# =======================================================
+INTRO_MESSAGE = {
+    "role": "assistant",
+    "content": INTRO_TEXT
+}
+def chat_api(message, history):
+    history.append({"role": "user", "content": message})
+    reply = generate_with_rag(message)
+    history.append({"role": "assistant", "content": reply})
     return history, history
 with gr.Blocks() as demo:
+    gr.Markdown("## Canada Residential Tenancy Assistant (RAG + Mistral 7B)")
+    chatbot = gr.Chatbot(
+        value=[(None, INTRO_MESSAGE["content"])],
+        height=500
     )
+    user_box = gr.Textbox(
+        label="Your question",
+        placeholder="Ask a question about rentals, repairs, evictions, deposits, etc..."
+    )
+    send_btn = gr.Button("Send")
+    send_btn.click(chat_api, inputs=[user_box, chatbot], outputs=[chatbot, chatbot])
+    user_box.submit(chat_api, inputs=[user_box, chatbot], outputs=[chatbot, chatbot])
 if __name__ == "__main__":
+    demo.launch(share=True)