Spaces:

zm-f21
/

IAT360-Final-Project

Sleeping

App Files Files Community

zm-f21 committed on Dec 6, 2025

Commit

85551a0

verified ·

1 Parent(s): d5857d2

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -197

app.py CHANGED Viewed

@@ -1,204 +1,81 @@
-# ----------------------------- #
-#  Imports
-# ----------------------------- #
-import os
-import re
-import zipfile
-from pathlib import Path
-import pandas as pd
-import numpy as np
-from sentence_transformers import SentenceTransformer
-from ctransformers import AutoModelForCausalLM
 import gradio as gr
-# ----------------------------- #
-#  Load LLM (GGUF quantized Mistral)
-# ----------------------------- #
-# Make sure you have downloaded the model locally:
-# e.g., ./models/mistral-7B-v0.1.Q4_0.gguf
-# Load the Mistral model from a local GGUF file at import time.
-# The path is relative, so the file must be present under ./models in the
-# process working directory.
-llm = AutoModelForCausalLM.from_pretrained(
-    "./models/mistral-7B-v0.1.Q4_0.gguf",
-    model_type="mistral",
-)
-# ----------------------------- #
-#  Load Embedding Model
-# ----------------------------- #
-# Shared encoder: used below for the corpus paragraphs and, at query time,
-# for the user's question in retrieve_with_pandas.
-embedding_model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')
-# ----------------------------- #
-#  Extract ZIP of provincial texts
-# ----------------------------- #
-zip_path = "provinces.zip"
-extract_folder = "provinces_texts"
-# Extract only when the target folder does not exist yet; an existing folder
-# is taken as "already unpacked" and the archive is not re-read.
-if not os.path.exists(extract_folder):
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(extract_folder)
-# ----------------------------- #
-#  Parse Files
-# ----------------------------- #
-def parse_metadata_and_content(raw_text):
-    """Split one source file into its metadata header and its body text.
-
-    Everything before the first ``CONTENT:`` marker is the header. Header
-    lines of the form ``KEY: value`` become metadata entries; header lines
-    that start with ``-`` are collected and stored, newline-joined, under
-    the ``PDF_LINKS`` key.
-
-    Args:
-        raw_text: Full text of one source file.
-
-    Returns:
-        A ``(metadata, content)`` tuple: the metadata dict and the stripped
-        text that follows the ``CONTENT:`` marker.
-
-    Raises:
-        ValueError: If ``raw_text`` contains no ``CONTENT:`` marker.
-    """
-    if "CONTENT:" not in raw_text:
-        raise ValueError("File missing CONTENT: separator.")
-    # Split on the first marker only, so a later "CONTENT:" stays in the body.
-    header, content = raw_text.split("CONTENT:", 1)
-    metadata = {}
-    lines = header.strip().split("\n")
-    pdf_list = []
-    for line in lines:
-        if ":" in line and not line.strip().startswith("-"):
-            # Split on the first colon only, so the value keeps any colons of its own.
-            key, value = line.split(":", 1)
-            metadata[key.strip()] = value.strip()
-        elif line.strip().startswith("-"):
-            # Dash-prefixed lines are kept verbatim (dash included).
-            pdf_list.append(line.strip())
-    if pdf_list:
-        metadata["PDF_LINKS"] = "\n".join(pdf_list)
-    return metadata, content.strip()
-# Build the corpus: one record per non-empty paragraph of every .txt file
-# found anywhere under extract_folder.
-documents = []
-for root, dirs, files in os.walk(extract_folder):
-    for filename in files:
-        # Skip "._"-prefixed files (presumably macOS sidecar files left in the zip).
-        if filename.startswith("._"):
-            continue
-        if filename.endswith(".txt"):
-            filepath = os.path.join(root, filename)
-            try:
-                with open(filepath, "r", encoding="latin-1") as f:
-                    raw = f.read()
-                metadata, content = parse_metadata_and_content(raw)
-                # Paragraphs are separated by a blank line; empty ones are dropped.
-                paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
-                for p in paragraphs:
-                    # Every paragraph carries a copy of its file's metadata.
-                    documents.append({
-                        "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
-                        "province": metadata.get("PROVINCE", "Unknown"),
-                        "last_updated": metadata.get("LAST_UPDATED", "Unknown"),
-                        "url": metadata.get("URL", "N/A"),
-                        "pdf_links": metadata.get("PDF_LINKS", ""),
-                        "text": p
-                    })
-            # NOTE(review): any failure, including a missing CONTENT: marker,
-            # silently drops the whole file with no log entry.
-            except Exception:
-                continue
-# Build DataFrame and compute embeddings
-df = pd.DataFrame(documents)
-# One encode() call per paragraph; the vectors are stored in the "Embedding" column.
-df["Embedding"] = df["text"].apply(lambda x: embedding_model.encode(x))
-# ----------------------------- #
-#  Province Detection
-# ----------------------------- #
-def detect_province(query):
-    """Return the province or territory mentioned in ``query``, if any.
-
-    The lower-cased query is checked against each key of the alias table in
-    insertion order; the first key found as a substring wins.
-
-    Args:
-        query: The user's question.
-
-    Returns:
-        The canonical province/territory name, or ``None`` when no alias
-        occurs in the query.
-    """
-    # Alias (lower-case) -> canonical name.
-    provinces = {
-        "yukon": "Yukon",
-        "alberta": "Alberta",
-        "bc": "British Columbia",
-        "british columbia": "British Columbia",
-        "manitoba": "Manitoba",
-        "nl": "Newfoundland and Labrador",
-        "newfoundland": "Newfoundland and Labrador",
-        "sask": "Saskatchewan",
-        "saskatchewan": "Saskatchewan",
-        "ontario": "Ontario",
-        "pei": "Prince Edward Island",
-        "prince edward island": "Prince Edward Island",
-        "quebec": "Quebec",
-        "nb": "New Brunswick",
-        "new brunswick": "New Brunswick",
-        "nova scotia": "Nova Scotia",
-        "nunavut": "Nunavut",
-        "nwt": "Northwest Territories",
-        "northwest territories": "Northwest Territories"
-    }
-    q = query.lower()
-    for key, prov in provinces.items():
-        # NOTE(review): plain substring test, so short aliases such as "nl"
-        # or "nb" also match inside longer words (e.g. "nl" in "unless").
-        if key in q:
-            return prov
-    return None
-# ----------------------------- #
-#  Guardrails
-# ----------------------------- #
-def is_disallowed(query):
-    """Return True when the lower-cased query contains any banned term.
-
-    Matching is by substring, not by whole word.
-    """
-    banned = ["kill", "suicide", "harm yourself", "bomb", "weapon"]
-    return any(b in query.lower() for b in banned)
-def is_off_topic(query):
-    """Return True when the query contains none of the tenancy keywords.
-
-    Matching is by substring on the lower-cased query, so a keyword also
-    matches inside a longer word (e.g. "rent" in "parent").
-    """
-    tenancy_keywords = [
-        "tenant", "landlord", "rent", "evict", "lease",
-        "deposit", "tenancy", "rental", "apartment",
-        "unit", "heating", "notice", "repair", "pets"
-    ]
-    q = query.lower()
-    return not any(k in q for k in tenancy_keywords)
-# Preamble that generate_with_rag prepends to its refusal and
-# "couldn't find" replies (it is not added to generated answers).
-INTRO_TEXT = (
-    "Hi! I'm a Canadian rental housing assistant. I can help you find, summarize, "
-    "and explain information from the Residential Tenancies Acts across all provinces.\n\n"
-    "This is not legal advice — laws may vary and change.\n\n"
 )
-# ----------------------------- #
-#  Retrieval Function
-# ----------------------------- #
-def retrieve_with_pandas(query, province=None, top_k=2):
-    """Return the ``top_k`` corpus rows most similar to ``query``.
-
-    Args:
-        query: The user's question; encoded with ``embedding_model``.
-        province: When truthy, only rows whose ``province`` column equals
-            this value are considered; otherwise the whole corpus is used.
-        top_k: Maximum number of rows to return.
-
-    Returns:
-        A DataFrame (a copy of the matching rows of ``df``) with an added
-        ``Similarity`` column, sorted by descending similarity.
-    """
-    query_embedding = embedding_model.encode([query])[0]
-    # Work on a copy so the Similarity column is never written to the global df.
-    filtered_df = df[df['province'] == province].copy() if province else df.copy()
-    # Cosine similarity between the query vector and each stored embedding.
-    filtered_df["Similarity"] = filtered_df["Embedding"].apply(
-        lambda x: np.dot(query_embedding, x) /
-                  (np.linalg.norm(query_embedding) * np.linalg.norm(x))
-    )
-    results = filtered_df.sort_values("Similarity", ascending=False).head(top_k)
-    return results
-# ----------------------------- #
-#  Main RAG Generator
-# ----------------------------- #
-def generate_with_rag(query):
-    """Answer a tenancy question from retrieved context.
-
-    Order of checks: banned-term guardrail, off-topic guardrail, province
-    detection, retrieval of the two best paragraphs, then generation.
-
-    Args:
-        query: The user's question.
-
-    Returns:
-        The text after the last ``ANSWER:`` in the model output, or
-        ``INTRO_TEXT`` plus a fixed message when a guardrail fires or
-        nothing is retrieved.
-    """
-    if is_disallowed(query):
-        return INTRO_TEXT + "Sorry — I can’t help with harmful topics."
-    if is_off_topic(query):
-        return INTRO_TEXT + "Sorry — I can only answer questions about tenancy and housing law."
-    # province is None when no alias matched; retrieval then searches all rows.
-    province = detect_province(query)
-    top_docs_df = retrieve_with_pandas(query, province=province, top_k=2)
-    if len(top_docs_df) == 0:
-        return INTRO_TEXT + "I couldn't find relevant information."
-    context = " ".join(top_docs_df["text"].tolist())
-    prompt = f"""
-Use the context below to answer the question.
-CONTEXT:
-{context}
-QUESTION:
-{query}
-ANSWER:
-"""
-    # Generate response with ctransformers
-    response = llm(prompt, max_new_tokens=300, temperature=0.2)
-    # NOTE(review): this indexes the result as a list of dicts; confirm that
-    # is what the ctransformers model call returns (it may be a plain string,
-    # in which case this line raises).
-    return response[0]["generated_text"].split("ANSWER:")[-1].strip()
-# ----------------------------- #
-#  Gradio UI
-# ----------------------------- #
-def ui_fn(query):
-    """Gradio callback: forward the textbox value to generate_with_rag."""
-    return generate_with_rag(query)
-# Single-textbox UI: one question in, one answer out, via ui_fn.
-demo = gr.Interface(
-    fn=ui_fn,
-    inputs=gr.Textbox(lines=3, label="Ask a question"),
-    outputs=gr.Textbox(label="Answer"),
-    title="Canadian Tenancy RAG Assistant"
-)
 if __name__ == "__main__":
     demo.launch(share=True)

 import gradio as gr
+from huggingface_hub import InferenceClient
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+import os
+# -----------------------------
+#  Hugging Face token
+# -----------------------------
+os.environ["HF_TOKEN"] = "YOUR_HF_TOKEN"
+client = InferenceClient(token=os.environ["HF_TOKEN"], model="mistralai/Mistral-7B-Instruct-v0.2")
+# -----------------------------
+#  Example RAG documents
+# -----------------------------
+# Placeholder corpus: each string is one retrievable document. The list
+# order matters because retrieve() maps FAISS row indices back into it.
+documents = [
+    "Quantum computing uses quantum bits.",
+    "Transformers are a type of neural network architecture.",
+    "Python is a popular programming language.",
+    # Add more docs or load from your dataset
+]
+# -----------------------------
+#  Embeddings + FAISS
+# -----------------------------
+# Encode the whole corpus once at import time and load it into a FAISS index.
+embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = embed_model.encode(documents, convert_to_numpy=True)
+# Second axis of the embedding matrix is the vector size the index is built for.
+dimension = embeddings.shape[1]
+index = faiss.IndexFlatL2(dimension)
+# Row i of the index corresponds to documents[i].
+index.add(embeddings)
+def retrieve(query, top_k=2):
+    query_emb = embed_model.encode([query], convert_to_numpy=True)
+    distances, indices = index.search(query_emb, top_k)
+    return [documents[i] for i in indices[0]]
+# -----------------------------
+#  RAG answer function
+# -----------------------------
+def answer_with_rag(message, history, system_message, max_tokens, temperature, top_p):
+    context_docs = retrieve(message)
+    context = " ".join(context_docs)
+    prompt = f"Answer the question using the following context:\n{context}\n\nQuestion: {message}\nAnswer:"
+    response = ""
+    for msg in client.chat_completion(
+        prompt,
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p
+    ):
+        choices = msg.choices
+        if len(choices) and choices[0].delta.content:
+            response += choices[0].delta.content
+    return response
+# -----------------------------
+#  Gradio ChatInterface
+# -----------------------------
+# Chat UI around answer_with_rag. The additional inputs are passed to the
+# callback positionally after (message, history), so their order here must
+# match the callback's parameter order:
+# system_message, max_tokens, temperature, top_p.
+chatbot = gr.ChatInterface(
+    answer_with_rag,
+    type="messages",
+    additional_inputs=[
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+    ],
 )
+# Page layout: a sidebar with a login button, then the chat interface.
+# NOTE(review): answer_with_rag takes no OAuth token parameter and the
+# client is built from the HF_TOKEN environment variable, so logging in
+# does not appear to change which credentials are used for inference.
+with gr.Blocks() as demo:
+    with gr.Sidebar():
+        gr.LoginButton()
+    chatbot.render()
 if __name__ == "__main__":
     demo.launch(share=True)