jme-datasci committed
Commit 30d6e30 · 1 Parent(s): effe941

added examples, updated readme

Files changed (2)
  1. README.md +63 -68
  2. app.py +11 -2
README.md CHANGED
@@ -14,8 +14,6 @@ license: apache-2.0
 short_description: RAG Enabled ChatBot for Charlottesville Municipal Code
 ---
 
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
-
 # Charlottesville Local Ordinance Assistant
 
 ## 1. Introduction
@@ -77,117 +75,114 @@ import pandas as pd
 import random
 
 class MyRAGPipeline:
-    '''
-    Wrapper class for RAG pipeline.
-    '''
-    def __init__(self, model_name: str, embedding_model_name: str, vector_db_path: str, tokenizer_name=None, MAX_NEW_TOKENS=500, TEMPERATURE=0.7, DO_SAMPLE=True):
-        if tokenizer_name is None:
-            tokenizer_name = model_name
-
+    def __init__(self, model_name: str, embedding_model_name: str, vector_db_path: str):
         self.embedding_model_name = embedding_model_name
-        self.max_new_tokens = MAX_NEW_TOKENS
+        self.max_new_tokens = 500
 
         print(f"Loading Model: {model_name}...")
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=HF_TOKEN)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+
+        # --- CRITICAL: Load to CPU first ---
+        # ZeroGPU does not have a GPU available during global startup.
+        # We load the weights into system RAM now, and move them to the GPU later.
         self.model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            device_map="auto",
-            dtype=torch.bfloat16,
+            device_map="cpu",  # Force CPU loading
+            torch_dtype=torch.bfloat16,
             token=HF_TOKEN
         )
+
         self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
         self.tokenizer.padding_side = "left"
 
         print("Loading Embeddings...")
         self.embedding_model = HuggingFaceEmbeddings(
             model_name=self.embedding_model_name,
-            multi_process=False,  # Set to False for stability in Spaces
-            model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
+            model_kwargs={"device": "cpu"},  # Keep embeddings on CPU
             encode_kwargs={"normalize_embeddings": True},
         )
 
         print(f"Loading Vector DB from {vector_db_path}...")
-        # Check if index exists to prevent crash
         if not os.path.exists(vector_db_path):
             raise FileNotFoundError(f"Could not find vector DB at {vector_db_path}. Please upload your 'index' folder.")
 
         self.vector_db = FAISS.load_local(vector_db_path, self.embedding_model, allow_dangerous_deserialization=True)
-
-        # FAISS GPU optimization (If available)
-        if torch.cuda.is_available():
-            try:
-                res = faiss.StandardGpuResources()
-                co = faiss.GpuClonerOptions()
-                co.useFloat16 = True
-                self.vector_db.index = faiss.index_cpu_to_gpu(res, 0, self.vector_db.index, co)
-            except Exception as e:
-                print(f"Could not load FAISS to GPU, running on CPU: {e}")
-
-        # Initialize Pipeline
-        self.pipe = pipeline(
-            'text-generation',
-            model=self.model,
-            torch_dtype=torch.bfloat16,
-            device_map='auto',
-            tokenizer=self.tokenizer,
-            max_new_tokens=self.max_new_tokens,
-            temperature=TEMPERATURE,
-            do_sample=DO_SAMPLE,
-            pad_token_id=self.tokenizer.eos_token_id,
-            # return_full_text=False is CRITICAL for chatbots so it doesn't repeat the prompt
-            return_full_text=False
-        )
+        print("RAG Pipeline Initialized (CPU Mode)")
 
     def retrieve(self, query, num_docs=3):
-        '''
-        Returns the k most similar documents to the query
-        '''
-        retrieved_docs = self.vector_db.similarity_search(query, k=num_docs)
-        return retrieved_docs
+        return self.vector_db.similarity_search(query, k=num_docs)
 
     def _format_prompt(self, query, retrieved_docs):
-        context = "\nExtracted documents:\n"
-        # Adjusted extraction slightly to handle missing metadata keys gracefully
+        # 1. Build the context block
+        context = "Extracted documents:\n"
         for doc in retrieved_docs:
             section = doc.metadata.get('Section', 'N/A')
             subtitle = doc.metadata.get('Subtitle', 'Context')
             context += f"{section} - {subtitle}:::\n{doc.page_content}\n\n"
 
-        prompt = f'''
-        You are a helpful legal interpreter.
-        You are given the following context:
-        {context}\n\n
-        Using the information contained in the context,
-        give a comprehensive answer to the question.
-        Respond only to the question asked. Your response should be concise and relevant to the question.
-        Always provide the section number and title of the source document.
-        Also please use plain English when responding, not legal jargon.
-
-        Question: {query}"
-        '''
+        # 2. Universal chat template (works for Qwen, Llama, Mistral, etc.)
+        messages = [
+            {
+                "role": "system",
+                "content": f"You are a helpful legal interpreter. Use the following context to answer the user's question.\nContext:\n{context}"
+            },
+            {
+                "role": "system",
+                "content": "Using the information contained in the context, give a comprehensive answer to the question. Respond only to the question asked. Your response should be concise and relevant to the question. Always provide the section number and title of the source document. Also please use plain English when responding, not legal jargon.\nNow answer the following question."
+            },
+            {
+                "role": "user",
+                "content": query
+            }
+        ]
+
+        # This applies the correct prompt format for whatever model is loaded
+        prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
         return prompt
 
-    def easy_generate(self, query, num_docs=3):
-        retrieved_docs = self.retrieve(query, num_docs=num_docs)
-        prompt = self._format_prompt(query, retrieved_docs)
-
-        # Because we used return_full_text=False in the pipeline,
-        # this returns only the answer.
-        result = self.pipe(prompt)[0]['generated_text']
-        return result
+    def generate(self, query, num_docs=3):
+        # 1. Retrieve
+        retrieved_docs = self.retrieve(query, num_docs)
+
+        # 2. Format the prompt
+        prompt_str = self._format_prompt(query, retrieved_docs)
+
+        # 3. Tokenize
+        inputs = self.tokenizer(prompt_str, return_tensors="pt").to(self.model.device)
+
+        # 4. Generate (streaming is possible with direct model usage, but here we do a blocking call)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=self.max_new_tokens,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+
+        # 5. Decode
+        # Slicing [input_len:] ensures we return only the new text, not the prompt
+        input_len = inputs.input_ids.shape[1]
+        generated_text = self.tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
+
+        return generated_text
 
 # --- INITIALIZATION ---
 # Using standard paths and models
 MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
 EMBEDDING_NAME = 'Qwen/Qwen3-Embedding-0.6B'
-VECDB_PATH = './index/'
+VECDB_PATH = 'index/'
 
 rag = MyRAGPipeline(MODEL_NAME, EMBEDDING_NAME, VECDB_PATH)
 
 prompt = "My neighbor is playing loud music on their porch. What time does the 'quiet period' start, and what is the maximum decibel level allowed in a residential zone?"
 
-print(rag.easy_generate(prompt))
+print(rag.generate(prompt))
 ```
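
The new `__init__` intentionally finishes with everything on the CPU because, as its comments note, ZeroGPU Spaces have no GPU attached during global startup. The commit does not show the later hand-off to the GPU, so here is a minimal sketch of what that step could look like, assuming the `spaces` package that ZeroGPU Spaces provide; the `answer` wrapper and the reuse of the README's `rag` object are illustrative assumptions, not code from this repository.

```python
# Hypothetical sketch (not this Space's actual code): ZeroGPU attaches a GPU
# only while a @spaces.GPU-decorated function runs, so the weights that
# __init__ left in system RAM are moved to the GPU at request time.
import spaces  # available on ZeroGPU Spaces
import torch

rag = MyRAGPipeline(MODEL_NAME, EMBEDDING_NAME, VECDB_PATH)  # CPU-only startup

@spaces.GPU  # a GPU exists only inside this call
def answer(query: str) -> str:
    if torch.cuda.is_available():
        rag.model.to("cuda")  # move weights from system RAM onto the GPU
    return rag.generate(query)  # generate() already follows self.model.device
```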
app.py CHANGED
@@ -141,8 +141,17 @@ def chat_function(message, history):
 demo = gr.ChatInterface(
     fn=chat_function,
     type="messages",
-    title="Legal RAG Assistant (Qwen 2.5)",
-    description="Ask a question about the legal documents.",
+    title="Charlottesville Municipal RAG Assistant",
+    description="Ask a question about the City of Charlottesville municipal code. This app is intended to increase accessibility to the municipal code and is not a replacement for a legal professional. AI makes mistakes; check important information.",
+    examples=[
+        "My neighbor is playing loud music on their porch. What time does the 'quiet period' start, and what is the maximum decibel level allowed in a residential zone?",
+        "There is a massive oak tree on my property I want to cut down. Do I need permission from the city to remove it?",
+        "I got a parking ticket near the Downtown Mall. What is the deadline to pay the fine, and how do I contest it if I think it was issued in error?",
+        "I want to build a privacy fence in my backyard. How tall can it be before I need a permit, and are there different rules for the front yard versus the back yard?",
+        "I found a deer in my backyard. Can I keep it as a pet if I put a leash on it?",
+        "I'm having trouble catching fish in the Rivanna River. Is it legal to use explosives to help catch them?",
+        "Can I legally attach a flamethrower to my car to melt the snow on my driveway?",
+        "Is it legal for me to practice my bagpipes on the sidewalk at 2:00 AM if I'm technically walking and not 'loitering'?"]
 )
 
 if __name__ == "__main__":
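
The diff touches only the `gr.ChatInterface` configuration; `chat_function` itself is defined earlier in app.py and is not part of this commit. For orientation, a minimal handler compatible with `type="messages"` might look like the sketch below, whose body is a guess that reuses the pipeline object from the README example rather than the file's actual implementation.

```python
# Hypothetical sketch of a handler compatible with the ChatInterface above.
# With type="messages", history arrives as a list of {"role", "content"} dicts;
# this sketch ignores it and answers each turn from retrieval alone.
def chat_function(message, history):
    return rag.generate(message)
```

Returning a plain string is sufficient here: Gradio appends it to the conversation as the assistant's message.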