Update app.py

app.py CHANGED
@@ -1,52 +1,37 @@
-# ----------------------------- #
-# Imports
-# ----------------------------- #
-import os
-import zipfile
-import re
 import pandas as pd
 import numpy as np
+import re
 from sentence_transformers import SentenceTransformer
-from ctransformers import AutoModelForCausalLM
 import gradio as gr
+from ctransformers import AutoModelForCausalLM
 
 # ----------------------------- #
-#
+# Load Hosted Mistral 7B Q4_0
 # ----------------------------- #
-# Make sure your GGUF file is in ./models/mistral.gguf
 llm = AutoModelForCausalLM.from_pretrained(
-    "
-    model_type="mistral",
-
+    "TheBloke/Mistral-7B-v0.1-Q4_0",  # hosted HF model
+    model_type="mistral",             # model type
+    gpu_layers=32                     # adjust based on GPU/VRAM
 )
 
-# ----------------------------- #
-# Load Embedding Model
-# ----------------------------- #
 embedding_model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')
 
 # ----------------------------- #
-#
+# Parse & Prepare Your Documents
 # ----------------------------- #
-
-
+# Example parsing function (from your previous code)
+date_pattern = re.compile(r"(\d{4}[-]\d{2}[-_]\d{2})")
 
-if not os.path.exists(extract_folder):
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(extract_folder)
-
-# ----------------------------- #
-# Parse Files
-# ----------------------------- #
 def parse_metadata_and_content(raw_text):
     if "CONTENT:" not in raw_text:
         raise ValueError("File missing CONTENT: separator.")
 
     header, content = raw_text.split("CONTENT:", 1)
     metadata = {}
+    lines = header.strip().split("\n")
     pdf_list = []
 
-    for line in
+    for line in lines:
         if ":" in line and not line.strip().startswith("-"):
             key, value = line.split(":", 1)
             metadata[key.strip()] = value.strip()
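For orientation, here is a minimal smoke test of the two models loaded above. It is a sketch, not part of the commit: the prompt and sentence are made up, and it assumes the hosted GGUF weights download successfully.

```python
# Illustrative only: ctransformers models are callable and return generated text as a string.
sample_completion = llm("In Ontario, a landlord must give written notice", max_new_tokens=32)
print(sample_completion)

# SentenceTransformer.encode returns a NumPy vector (768 dimensions for legal-bert-base-uncased).
sample_vector = embedding_model.encode("How much notice is required before an eviction?")
print(sample_vector.shape)
```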
@@ -58,37 +43,12 @@ def parse_metadata_and_content(raw_text):
 
     return metadata, content.strip()
 
-documents
-
-
-    for filename in files:
-        if filename.startswith("._"):
-            continue
-        if filename.endswith(".txt"):
-            filepath = os.path.join(root, filename)
-            try:
-                with open(filepath, "r", encoding="latin-1") as f:
-                    raw = f.read()
-                metadata, content = parse_metadata_and_content(raw)
-                paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
-                for p in paragraphs:
-                    documents.append({
-                        "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
-                        "province": metadata.get("PROVINCE", "Unknown"),
-                        "last_updated": metadata.get("LAST_UPDATED", "Unknown"),
-                        "url": metadata.get("URL", "N/A"),
-                        "pdf_links": metadata.get("PDF_LINKS", ""),
-                        "text": p
-                    })
-            except Exception:
-                continue
-
-# Build DataFrame and embeddings
-df = pd.DataFrame(documents)
-df["Embedding"] = df["text"].apply(lambda x: embedding_model.encode(x))
+# Load your text documents into df as before
+# df = pd.DataFrame(documents)
+# df["Embedding"] = df["text"].apply(lambda x: embedding_model.encode(x))
 
 # ----------------------------- #
-#
+# Province Detection
 # ----------------------------- #
 def detect_province(query):
     provinces = {
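The commented-out lines above assume a `documents` list and a `df` built elsewhere. Below is a minimal sketch of that step, reconstructed from the loop this commit removed; the `extract_folder` path is an assumption, not a value from the file.

```python
import os

documents = []
extract_folder = "./data"  # assumed location of the extracted .txt files

for root, dirs, files in os.walk(extract_folder):
    for filename in files:
        if filename.startswith("._") or not filename.endswith(".txt"):
            continue
        filepath = os.path.join(root, filename)
        try:
            with open(filepath, "r", encoding="latin-1") as f:
                raw = f.read()
            metadata, content = parse_metadata_and_content(raw)
            paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]
            # One row per paragraph so retrieval works at paragraph granularity.
            for p in paragraphs:
                documents.append({
                    "source_title": metadata.get("SOURCE_TITLE", "Unknown"),
                    "province": metadata.get("PROVINCE", "Unknown"),
                    "last_updated": metadata.get("LAST_UPDATED", "Unknown"),
                    "url": metadata.get("URL", "N/A"),
                    "pdf_links": metadata.get("PDF_LINKS", ""),
                    "text": p,
                })
        except Exception:
            continue

df = pd.DataFrame(documents)
df["Embedding"] = df["text"].apply(lambda x: embedding_model.encode(x))
```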
@@ -112,7 +72,6 @@ def detect_province(query):
         "nwt": "Northwest Territories",
         "northwest territories": "Northwest Territories"
     }
-
     q = query.lower()
     for key, prov in provinces.items():
         if key in q:
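As the lines above show, detection is a case-insensitive substring match of each `provinces` key against the query. A hypothetical call (the query text is made up, and it assumes no earlier key in the dictionary also matches):

```python
# "nwt" appears in the lowered query, so the mapped full name is returned.
print(detect_province("Rent increase rules in NWT?"))  # -> "Northwest Territories"
```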
@@ -120,7 +79,7 @@ def detect_province(query):
     return None
 
 # ----------------------------- #
-#
+# Guardrails
 # ----------------------------- #
 def is_disallowed(query):
     banned = ["kill", "suicide", "harm yourself", "bomb", "weapon"]
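Only the banned-terms list is visible in this hunk; the rest of is_disallowed sits in unchanged lines. A typical implementation consistent with that list might look like the sketch below (the name is hypothetical, not the function in the commit):

```python
def is_disallowed_sketch(query: str) -> bool:
    # Case-insensitive substring screen against the banned terms.
    banned = ["kill", "suicide", "harm yourself", "bomb", "weapon"]
    q = query.lower()
    return any(term in q for term in banned)
```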
@@ -142,30 +101,35 @@ INTRO_TEXT = (
 )
 
 # ----------------------------- #
-#
+# Retrieval
 # ----------------------------- #
 def retrieve_with_pandas(query, province=None, top_k=2):
     query_embedding = embedding_model.encode([query])[0]
 
-    filtered_df = df.copy()
     if province:
-        filtered_df =
+        filtered_df = df[df['province'] == province].copy()
+    else:
+        filtered_df = df.copy()
 
     filtered_df["Similarity"] = filtered_df["Embedding"].apply(
         lambda x: np.dot(query_embedding, x) /
                   (np.linalg.norm(query_embedding) * np.linalg.norm(x))
     )
 
-
+    results = filtered_df.sort_values("Similarity", ascending=False).head(top_k)
+    return results
 
 # ----------------------------- #
-#
+# RAG Generator
 # ----------------------------- #
 def generate_with_rag(query):
     if is_disallowed(query):
         return INTRO_TEXT + "Sorry — I can’t help with harmful topics."
+
     if is_off_topic(query):
-        return INTRO_TEXT +
+        return INTRO_TEXT + (
+            "Sorry — I can only answer questions about tenancy and housing law."
+        )
 
     province = detect_province(query)
     top_docs_df = retrieve_with_pandas(query, province=province, top_k=2)
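retrieve_with_pandas ranks every remaining row by cosine similarity between the query embedding and the stored paragraph embedding, then keeps the top_k rows. A hypothetical call, assuming `df` was built as sketched earlier and contains rows whose province is "Ontario":

```python
top = retrieve_with_pandas("How much notice before an eviction?", province="Ontario", top_k=2)
print(top[["province", "source_title", "Similarity"]])
```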
@@ -174,7 +138,6 @@ def generate_with_rag(query):
         return INTRO_TEXT + "I couldn't find relevant information."
 
     context = " ".join(top_docs_df["text"].tolist())
-
     prompt = f"""
 Use the context below to answer the question.
 CONTEXT:
@@ -184,23 +147,15 @@ QUESTION:
 ANSWER:
 """
 
-    response = llm(
-
-
-
-
-
-    metadata_block += (
-        f"- Province: {row['province']}\n"
-        f"  Source: {row['source_title']}\n"
-        f"  Updated: {row['last_updated']}\n"
-        f"  URL: {row['url']}\n"
-    )
-
-    return INTRO_TEXT + f"{answer}\n\nSources Used:\n{metadata_block}"
+    response = llm(
+        prompt,
+        max_new_tokens=300,
+        temperature=0.2
+    )
+    return response.split("ANSWER:")[-1].strip()  # llm() returns a plain string, not a list of dicts
 
 # ----------------------------- #
-#
+# Gradio UI
 # ----------------------------- #
 def ui_fn(query):
     return generate_with_rag(query)
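End to end, a single call runs the guardrails, province detection, retrieval, and generation steps above. A hypothetical query (the first run downloads both models):

```python
answer = generate_with_rag("How much notice does a landlord have to give before raising rent in nwt?")
print(answer)
```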
@@ -214,3 +169,4 @@ demo = gr.Interface(
 
 if __name__ == "__main__":
     demo.launch(share=True)
+
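The gr.Interface(...) arguments themselves sit in an unchanged region and are not shown in this diff. A minimal wiring that matches ui_fn would look like the sketch below; the labels are assumptions, not values from the file.

```python
demo = gr.Interface(
    fn=ui_fn,                                           # one text box in, one text box out
    inputs=gr.Textbox(label="Ask a tenancy question"),
    outputs=gr.Textbox(label="Answer"),
)
```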