Commit · 3ceae4d
Parent(s): 0166d40
optimize for qwen
app.py CHANGED
@@ -1,156 +1,143 @@
import os
import torch
import gradio as gr
-import faiss
-import spaces
-from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import spaces
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
+from threading import Thread

-# Ensure an HF Token is present for gated models (like Llama 3)
HF_TOKEN = os.getenv("HF_TOKEN")

class MyRAGPipeline:
-    '''
-    Wrapper class for RAG pipeline.
-    '''
-    def __init__(self, model_name: str, embedding_model_name: str, vector_db_path: str, tokenizer_name=None, MAX_NEW_TOKENS=500, TEMPERATURE=0.7, DO_SAMPLE=True):
-        if tokenizer_name is None:
-            tokenizer_name = model_name
-
+    def __init__(self, model_name: str, embedding_model_name: str, vector_db_path: str):
        self.embedding_model_name = embedding_model_name
-        self.max_new_tokens = MAX_NEW_TOKENS
+        self.max_new_tokens = 500

        print(f"Loading Model: {model_name}...")
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=HF_TOKEN)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+
+        # --- CRITICAL: Load to CPU first ---
+        # ZeroGPU does not have a GPU available during global startup.
+        # We load the weights into System RAM now, and move them to GPU later.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
-            device_map="auto",
+            device_map="cpu",  # Force CPU loading
+            torch_dtype=torch.bfloat16,
            token=HF_TOKEN
        )
+
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.tokenizer.padding_side = "left"

        print("Loading Embeddings...")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.embedding_model_name,
-            model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
+            model_kwargs={"device": "cpu"},  # Keep embeddings on CPU
            encode_kwargs={"normalize_embeddings": True},
        )

        print(f"Loading Vector DB from {vector_db_path}...")
-        # Check if index exists to prevent crash
        if not os.path.exists(vector_db_path):
            raise FileNotFoundError(f"Could not find vector DB at {vector_db_path}. Please upload your 'index' folder.")

        self.vector_db = FAISS.load_local(vector_db_path, self.embedding_model, allow_dangerous_deserialization=True)
-
-        # FAISS GPU optimization (If available)
-        if torch.cuda.is_available():
-            try:
-                res = faiss.StandardGpuResources()
-                co = faiss.GpuClonerOptions()
-                co.useFloat16 = True
-                self.vector_db.index = faiss.index_cpu_to_gpu(res, 0, self.vector_db.index, co)
-            except Exception as e:
-                print(f"Could not load FAISS to GPU, running on CPU: {e}")
-
-        # Initialize Pipeline
-        self.pipe = pipeline(
-            'text-generation',
-            model=self.model,
-            torch_dtype=torch.bfloat16,
-            device_map='auto',
-            tokenizer=self.tokenizer,
-            max_new_tokens=self.max_new_tokens,
-            temperature=TEMPERATURE,
-            do_sample=DO_SAMPLE,
-            pad_token_id=self.tokenizer.eos_token_id,
-            # return_full_text=False is CRITICAL for chatbots so it doesn't repeat the prompt
-            return_full_text=False
-        )
+        print("RAG Pipeline Initialized (CPU Mode)")

    def retrieve(self, query, num_docs=3):
-        '''
-        Returns the k most similar documents to the query
-        '''
-        retrieved_docs = self.vector_db.similarity_search(query, k=num_docs)
-        return retrieved_docs
+        return self.vector_db.similarity_search(query, k=num_docs)

    def _format_prompt(self, query, retrieved_docs):
+        # 1. Build Context
+        context = "Extracted documents:\n"
        for doc in retrieved_docs:
            section = doc.metadata.get('Section', 'N/A')
            subtitle = doc.metadata.get('Subtitle', 'Context')
            context += f"{section} - {subtitle}:::\n{doc.page_content}\n\n"

+        # 2. Universal Chat Template (Works for Qwen, Llama, etc.)
+        messages = [
+            {
+                "role": "system",
+                "content": f"You are a helpful legal interpreter. Use the following context to answer the user's question.\nContext:\n{context}"
+            },
+            {
+                "role": "user",
+                "content": query
+            }
+        ]

+        prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
        return prompt

-    def …
-        # this returns only the answer.
-        result = self.pipe(prompt)[0]['generated_text']
-        return result
+    def generate(self, query, num_docs=3):
+        # 1. Retrieve
+        retrieved_docs = self.retrieve(query, num_docs)
+
+        # 2. Format Prompt
+        prompt_str = self._format_prompt(query, retrieved_docs)
+
+        # 3. Tokenize
+        inputs = self.tokenizer(prompt_str, return_tensors="pt").to(self.model.device)
+
+        # 4. Generate (Streaming is simpler for direct model usage, but here we do blocking)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=self.max_new_tokens,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+
+        # 5. Decode
+        # Slicing [input_len:] ensures we only return the new text, not the prompt
+        input_len = inputs.input_ids.shape[1]
+        generated_text = self.tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
+
+        return generated_text

-# ---
-MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
+# --- CONFIGURATION ---
+MODEL_NAME = 'Qwen/Qwen2.5-7B-Instruct'
EMBEDDING_NAME = 'Qwen/Qwen3-Embedding-0.6B'
-VECDB_PATH = '…
+VECDB_PATH = 'index/'

+# --- GLOBAL INSTANTIATION ---
+# This runs once when the container starts.
try:
    rag = MyRAGPipeline(MODEL_NAME, EMBEDDING_NAME, VECDB_PATH)
except Exception as e:
+    print(f"Initialization Error: {e}")
    rag = None
-    print(f"Error initializing RAG: {e}")

-# ---
+# --- ZERO-GPU INFERENCE FUNCTION ---
@spaces.GPU
def chat_function(message, history):
+    if rag is None:
+        return "System Error: RAG Pipeline failed to initialize."

+    # 1. Move Model to GPU (Fast operation on ZeroGPU)
+    print("Moving model to GPU...")
+    rag.model.to("cuda")
+
+    # 2. Generate
+    response = rag.generate(message)
+
+    # 3. (Optional) Move back to CPU to save VRAM?
+    # Usually not needed as ZeroGPU handles cleanup, but good practice if sharing resources.
+    # rag.model.to("cpu")
+
+    return response

demo = gr.ChatInterface(
    fn=chat_function,
    type="messages",
-    title="…
-    description="Ask a question about the …
-    examples=[
-        "My neighbor is playing loud music on their porch. What time does the 'quiet period' start, and what is the maximum decibel level allowed in a residential zone?",
-        "There is a massive oak tree on my property I want to cut down. Do I need permission from the city to remove it?",
-        "I got a parking ticket near the Downtown Mall. What is the deadline to pay the fine, and how do I contest it if I think it was issued in error?",
-        "I want to build a privacy fence in my backyard. How tall can it be before I need a permit, and are there different rules for the front yard versus the back yard?",
-        "I found a deer in my backyard. Can I keep it as a pet if I put a leash on it?",
-        "I'm having trouble catching fish in the Rivanna River. Is it legal to use explosives to help catch them?",
-        "Can I legally attach a flamethrower to my car to melt the snow on my driveway?",
-        "Is it legal for me to practice my bagpipes on the sidewalk at 2:00 AM if I'm technically walking and not 'loitering'?"]
+    title="Legal RAG Assistant (Qwen 2.5)",
+    description="Ask a question about the legal documents.",
)

if __name__ == "__main__":
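
Note: the new version imports TextIteratorStreamer and Thread but never uses them; the committed generate() is blocking, as its step-4 comment admits. A minimal sketch of what those two imports are typically wired up for, if streaming is added later; the method name stream_generate is an assumption, not part of this commit:

# Hypothetical streaming counterpart to MyRAGPipeline.generate(); not in the commit.
# Reuses the TextIteratorStreamer and Thread imports the new version already has.
def stream_generate(self, query, num_docs=3):
    prompt_str = self._format_prompt(query, self.retrieve(query, num_docs))
    inputs = self.tokenizer(prompt_str, return_tensors="pt").to(self.model.device)

    # skip_prompt=True plays the same role as the [input_len:] slice in
    # generate(): only newly produced tokens are emitted, never the prompt.
    streamer = TextIteratorStreamer(
        self.tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # model.generate() blocks, so run it on a worker thread and consume
    # the streamer on the caller's thread.
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=self.max_new_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=self.tokenizer.eos_token_id,
    )
    Thread(target=self.model.generate, kwargs=generation_kwargs).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial

chat_function could then yield from rag.stream_generate(message); gr.ChatInterface accepts generator callbacks and re-renders the partial reply as it grows.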
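Since the commit also drops the examples=[...] list from the UI, a quick local sanity check (outside the Space, so no @spaces.GPU hand-off) might look like the snippet below. This is a hypothetical test, not part of the commit: it assumes app.py is importable, which also runs its module-level try/except initialization, and that the FAISS index/ folder exists locally.

# Hypothetical local smoke test; not part of the commit.
# Importing app runs its module-level initialization, so reuse the global
# `rag` it builds instead of constructing a second pipeline.
import torch
import app

assert app.rag is not None, "RAG pipeline failed to initialize"

if torch.cuda.is_available():
    app.rag.model.to("cuda")  # mirrors what chat_function does on ZeroGPU

# One of the example questions removed from the Gradio UI in this commit:
print(app.rag.generate(
    "I want to build a privacy fence in my backyard. "
    "How tall can it be before I need a permit?"
))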