zm-f21 committed
Commit 9cfac5d · verified · 1 Parent(s): bc508c6

Update app.py

Files changed (1)
  1. app.py +323 -53
app.py CHANGED
@@ -1,70 +1,340 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
     """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
     """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
 
-    messages = [{"role": "system", "content": system_message}]
 
-    messages.extend(history)
 
-    messages.append({"role": "user", "content": message})
 
-    response = ""
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
 
-        response += token
-        yield response
 
 
 """
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
 
 
 if __name__ == "__main__":
-    demo.launch()
+# app.py (copy-paste ready)
+import os
+import zipfile
+import shutil
+import re
+import math
+import json
+import logging
+
+# silence transformers warnings
+import transformers
+transformers.logging.set_verbosity_error()
+logging.getLogger("transformers.generation.utils").setLevel(logging.ERROR)
+
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer
+import pandas as pd
+import numpy as np
 import gradio as gr
+
+# ----------------------------- #
+# Configuration - edit these if needed
+# ----------------------------- #
+MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"  # original model (kept as requested)
+ZIP_PATH = "/app/yukon.zip"  # where you uploaded the zip in the Space
+EXTRACT_FOLDER = "/app/yukon_texts"
+EMBEDDING_MODEL_ID = "nlpaueb/legal-bert-base-uncased"
+TOP_K = 2  # default number of retrieved docs
+
+# ----------------------------- #
+# Load LLM pipeline (try device_map first, fall back to CPU)
+# ----------------------------- #
+def create_llm_pipeline():
+    try:
+        # Try to load with device_map="auto" (requires accelerate)
+        llm = pipeline(
+            "text-generation",
+            model=MODEL_ID,
+            torch_dtype="auto",
+            device_map="auto",
+            max_new_tokens=150,
+        )
+        return llm
+    except Exception as e:
+        # Fall back to CPU (slower); keep the error in the logs for debugging.
+        print(f"[warning] device_map auto failed ({e}). Falling back to CPU pipeline (slower).")
+        llm = pipeline(
+            "text-generation",
+            model=MODEL_ID,
+            torch_dtype=None,  # let transformers choose
+            device_map=None,
+        )
+        return llm
+
+llm = create_llm_pipeline()
+
+# ----------------------------- #
+# Load embedding model
+# ----------------------------- #
+embedding_model = SentenceTransformer(EMBEDDING_MODEL_ID)
+
+# ----------------------------- #
+# Helpers: unzip dataset and normalize paths
+# ----------------------------- #
+def safe_extract_zip(zip_path, extract_to):
+    # remove the old extracted folder if it exists
+    if os.path.exists(extract_to):
+        try:
+            shutil.rmtree(extract_to)
+        except Exception:
+            pass
+    os.makedirs(extract_to, exist_ok=True)
+
+    with zipfile.ZipFile(zip_path, "r") as zf:
+        # Some zips contain a top-level folder; extract everything
+        zf.extractall(extract_to)
+
+# If the ZIP exists in the Space, extract it
+if os.path.exists(ZIP_PATH):
+    safe_extract_zip(ZIP_PATH, EXTRACT_FOLDER)
+else:
+    print(f"[warning] ZIP file not found at {ZIP_PATH}. Make sure you uploaded your dataset zip to this path.")
+
+# ----------------------------- #
+# Parse metadata/content from files (existing format with a "CONTENT:" separator)
+# ----------------------------- #
+def parse_metadata_and_content(raw_text):
     """
+    Splits header metadata and content on the 'CONTENT:' marker.
+    Returns (metadata_dict, content_str).
     """
+    if "CONTENT:" not in raw_text:
+        # If the file doesn't follow the exact format, attempt a graceful fallback:
+        # extract simple "Key: Value" lines at the top and treat the rest as content.
+        metadata = {}
+        lines = raw_text.split("\n")
+        content_lines = []
+        for line in lines:
+            if ":" in line and len(line.split(":", 1)[0].strip()) <= 30 and len(metadata) < 12:
+                key, value = line.split(":", 1)
+                metadata[key.strip().upper()] = value.strip()
+            else:
+                content_lines.append(line)
+        content = "\n".join(content_lines).strip()
+        return metadata, content
 
+    header, content = raw_text.split("CONTENT:", 1)
+    metadata = {}
+    pdf_list = []
+    for line in header.strip().split("\n"):
+        if ":" in line and not line.strip().startswith("-"):
+            key, value = line.split(":", 1)
+            metadata[key.strip().upper()] = value.strip()
+        elif line.strip().startswith("-"):
+            pdf_list.append(line.strip())
+    if pdf_list:
+        metadata["PDF_LINKS"] = "\n".join(pdf_list)
+    return metadata, content.strip()
 
+# ----------------------------- #
+# Build documents list (paragraph-level)
+# ----------------------------- #
+documents = []
+
+# Walk extracted folder for .txt files
+for root, dirs, files in os.walk(EXTRACT_FOLDER):
+    for filename in files:
+        if filename.startswith("._"):  # skip mac metadata
+            continue
+        if not filename.lower().endswith(".txt"):
+            continue
+        filepath = os.path.join(root, filename)
+        try:
+            # Try UTF-8 first and fall back to latin-1; latin-1 decodes any byte
+            # sequence, so it must come last or the fallback branch is dead code.
+            with open(filepath, "r", encoding="utf-8") as f:
+                raw = f.read()
+        except Exception:
+            try:
+                with open(filepath, "r", encoding="latin-1") as f:
+                    raw = f.read()
+            except Exception as e:
+                print(f"[warning] failed reading {filepath}: {e}")
+                continue
+
+        # parse metadata + content
+        metadata, content = parse_metadata_and_content(raw)
+        paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
+        for p in paragraphs:
+            documents.append({
+                "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
+                "province": metadata.get("PROVINCE", "Unknown"),
+                "last_updated": metadata.get("LAST_UPDATED", "Unknown"),
+                "url": metadata.get("URL", "N/A"),
+                "pdf_links": metadata.get("PDF_LINKS", ""),
+                "text": p,
+            })
+
+print(f"[info] Loaded {len(documents)} document paragraphs.")
 
+# ----------------------------- #
+# Create embeddings and dataframe
+# ----------------------------- #
+texts = [d["text"] for d in documents]
+if len(texts) == 0:
+    df = pd.DataFrame(columns=["source_title", "province", "last_updated", "url", "pdf_links", "text", "Embedding"])
+else:
+    # create embeddings (this is potentially slow for many docs)
+    embeddings = embedding_model.encode(texts, show_progress_bar=True)
+    df = pd.DataFrame(documents)
+    df["Embedding"] = list(np.asarray(embeddings, dtype="float32"))
+    print("[info] Embeddings indexed. Total:", len(df))
 
+# ----------------------------- #
+# Retrieval function (with optional province filter)
+# ----------------------------- #
+def retrieve_with_pandas(query, province=None, top_k=TOP_K):
+    if df is None or len(df) == 0:
+        return pd.DataFrame()  # empty
+
+    query_emb = embedding_model.encode([query])[0].astype("float32")
+
+    if province is not None:
+        filtered = df[df["province"].str.lower() == str(province).lower()].copy()
+    else:
+        filtered = df.copy()
+
+    if filtered.empty:
+        return pd.DataFrame()
+
+    # cosine similarity
+    def cos_sim(a, b):
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
+
+    filtered["Similarity"] = filtered["Embedding"].apply(lambda x: cos_sim(query_emb, np.asarray(x)))
+    results = filtered.sort_values("Similarity", ascending=False).head(top_k)
+    return results[["text", "last_updated", "Similarity", "province", "source_title", "url"]]
+
+# ----------------------------- #
+# Utilities: province detection, guardrails, intros
+# ----------------------------- #
+def detect_province(query):
+    provinces = {
+        "yukon": "Yukon",
+        "alberta": "Alberta",
+        "bc": "British Columbia",
+        "british columbia": "British Columbia",
+        "manitoba": "Manitoba",
+        "nl": "Newfoundland and Labrador",
+        "newfoundland": "Newfoundland and Labrador",
+        "sask": "Saskatchewan",
+        "saskatchewan": "Saskatchewan",
+        "ontario": "Ontario",
+        "pei": "Prince Edward Island",
+        "prince edward island": "Prince Edward Island",
+        "quebec": "Quebec",
+        "nb": "New Brunswick",
+        "new brunswick": "New Brunswick",
+        "nova scotia": "Nova Scotia",
+        "nunavut": "Nunavut",
+        "nwt": "Northwest Territories",
+        "northwest territories": "Northwest Territories",
+    }
+    q = query.lower()
+    for key, prov in provinces.items():
+        if key in q:
+            return prov
+    return None
+
+def is_disallowed(query):
+    banned = ["kill", "suicide", "harm yourself", "bomb", "weapon"]
+    q = query.lower()
+    return any(b in q for b in banned)
+
+def is_off_topic(query):
+    tenancy_keywords = [
+        "tenant", "landlord", "rent", "evict", "lease",
+        "deposit", "tenancy", "rental", "apartment",
+        "unit", "heating", "notice", "repair", "pets",
+    ]
+    q = query.lower()
+    return not any(k in q for k in tenancy_keywords)
+
+INTRO_TEXT = (
+    "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
+    "and explain information from the Residential Tenancies Acts across provinces and territories.\n\n"
+    "**Important:** I'm not a lawyer and this is NOT legal advice. I may be wrong and laws change — "
+    "please verify with official sources or a legal professional when in doubt.\n\n"
+)
 
+# ----------------------------- #
+# The RAG generator function
+# ----------------------------- #
+def generate_with_rag(query, province=None, top_k=TOP_K):
+    # Guardrails
+    if is_disallowed(query):
+        return INTRO_TEXT + "Sorry — I can't help with harmful or dangerous topics. Try asking about tenancy/housing instead."
+
+    if is_off_topic(query):
+        return INTRO_TEXT + "Sorry — I can only answer questions about Canadian tenancy and housing law. Try rephrasing with tenancy keywords or mention a province."
+
+    if province is None:
+        province = detect_province(query)
+
+    top_docs_df = retrieve_with_pandas(query, province=province, top_k=top_k)
+    if top_docs_df is None or len(top_docs_df) == 0:
+        return INTRO_TEXT + "Sorry — I couldn't find matching info in the tenancy database. Try rephrasing or include a province."
+
+    context = " ".join(top_docs_df["text"].tolist())
+
+    # Few-shot examples (style guide only)
+    qa_examples = """
+    Q: I asked my landlord three months ago to install handrails in my bathroom. Can the landlord take a long time to respond?
+    A: Landlords should respond promptly to reasonable accommodation requests. If they delay unreasonably, you may be able to file a complaint.
+
+    Q: My building manager keeps complaining about my children’s noise. Can I be evicted?
+    A: Reasonable noise from children is expected. Differential treatment based on family status may violate housing protections.
     """
+
+    prompt = f"""
+    Use the examples as a STYLE GUIDE ONLY.
+    DO NOT repeat the example questions.
+    DO NOT invent laws — only use the context provided.
+    If the context does not contain the answer, say you cannot confidently answer.
+
+    {qa_examples}
+
+    Context:
+    {context}
+
+    Question:
+    {query}
+
+    Answer conversationally:
     """
 
+    # Call the model (the pipeline already sets a default max_new_tokens; pass extra args as needed)
+    try:
+        raw_output = llm(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"]
+    except Exception as e:
+        # If the pipeline fails (OOM or other), return a helpful message
+        print(f"[error] LLM generation failed: {e}")
+        return INTRO_TEXT + "Sorry — the language model failed to produce an answer. Try again or contact the maintainer."
+
+    # Clean the model output: extract only the part after the "Answer conversationally:" instruction
+    if "Answer conversationally:" in raw_output:
+        answer = raw_output.split("Answer conversationally:", 1)[-1].strip()
+    else:
+        answer = raw_output.strip()
+
+    # Metadata formatting
+    metadata_block = ""
+    for _, row in top_docs_df.iterrows():
+        metadata_block += (
+            f"- Province: {row.get('province', 'Unknown')}\n"
+            f"  Source: {row.get('source_title', 'Unknown')}\n"
+            f"  Updated: {row.get('last_updated', 'Unknown')}\n"
+            f"  URL: {row.get('url', 'N/A')}\n"
+        )
+
+    return INTRO_TEXT + f"{answer}\n\nSources Used:\n{metadata_block}"
+
+# ----------------------------- #
+# Gradio UI
+# ----------------------------- #
+def respond_gradio(message, chat_history):
+    answer = generate_with_rag(message)
+    chat_history = chat_history or []
+    chat_history.append((message, answer))
+    # Clear the textbox and return the updated history once
+    return "", chat_history
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Yukon / Canada Tenancy RAG Chatbot")
+    chatbot = gr.Chatbot()
+    msg = gr.Textbox(label="Your question", placeholder="e.g. Can my landlord increase rent in Yukon?")
+    msg.submit(respond_gradio, [msg, chatbot], [msg, chatbot])
+    gr.Markdown("**Note:** This assistant is informational only, not legal advice.")
+demo.queue(default_concurrency_limit=2)  # enable queueing so Spaces handles requests sequentially (concurrency_count was renamed in Gradio 4)
 
 if __name__ == "__main__":
+    demo.launch(share=True)
+
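
For a quick local check of the new RAG path, a minimal smoke test might look like the sketch below. It assumes the dataset zip is already at ZIP_PATH, since importing app runs the extraction, embedding, and model-loading steps at import time; the file name smoke_test.py and both example queries are illustrative, not part of the commit.

    # smoke_test.py: a minimal sketch, assuming app.py and the dataset zip are in place.
    # Importing app triggers extraction, embedding, and LLM loading as side effects,
    # so the import alone can take several minutes on a CPU Space.
    import app

    # Retrieval only: exercises the province filter and cosine-similarity ranking.
    hits = app.retrieve_with_pandas("Can my landlord increase rent?", province="Yukon", top_k=2)
    print(hits[["province", "Similarity", "source_title"]])

    # Full path: guardrails -> retrieval -> prompt assembly -> Mistral generation.
    print(app.generate_with_rag("How much notice does a landlord need to give before raising rent in Yukon?"))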