Paused
Wenye He
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -1,144 +1,201 @@
import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from langchain.chains import ConversationalRetrievalChain
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.memory import ConversationBufferMemory
-from langchain.llms import HuggingFacePipeline
import torch

-#
MODEL_CONFIG = {
-    "phi-3
-        "temperature": 0.8
},
}
}

-    cfg = MODEL_CONFIG[model_choice]
-        model=model,
-        tokenizer=tokenizer,
-        max_new_tokens=cfg["max_tokens"],
-        temperature=cfg["temperature"]
-    )
-    return model_pipeline_cache[model_choice]

-        return self.chain

-    def _create_new_chain(self, model_choice, vector_store_name):
-        """Create new chain with updated configuration"""
-        vector_store = load_vector_store(vector_store_name)
-        pipe = get_model_pipeline(model_choice)

-            memory=ConversationBufferMemory(),
-            verbose=False
        )
-        self.current_model = model_choice
-        self.current_vector_store = vector_store_name

-def respond(message, history, model_choice, vector_store, session_state):
-    """Handle message with cached resources and session chain"""
-    # Initialize session chain if not exists
-    if session_state is None:
-        session_state = SessionChain()

-    # Get the appropriate chain for this session
-    chain = session_state.get_chain(model_choice, vector_store)

-    try:
-        # Convert Gradio history to LangChain format
-        for human, ai in history[-5:]:  # Keep last 5 exchanges as memory
-            chain.memory.save_context({"input": human}, {"output": ai})

        # Generate response

    except Exception as e:
-        return

-with gr.Blocks() as demo:
-    gr.Markdown("# 🚀
    )

-        [],
-        [chatbot, session]
-    )

-demo.launch()

+# app.py
import gradio as gr
import torch
+import time
+import os
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS

+# Configuration
MODEL_CONFIG = {
+    "phi-3": {
+        "model_name": "microsoft/phi-3-mini-4k-instruct",
+        "template": "<|user|>\n{message}<|end|>\n<|assistant|>"
    },
+    "llama3-8b": {
+        "model_name": "NousResearch/Meta-Llama-3-8B-Instruct",
+        "template": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    }
}

+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True
+)
+
+class ChatModel:
+    def __init__(self):
+        self.models = {}
+        self.tokenizers = {}
+        self.vectorstore = {}  # maps store name -> loaded FAISS index
+
+    def load_model(self, model_name):
+        if model_name not in self.models:
+            config = MODEL_CONFIG[model_name]
+            tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
+            tokenizer.pad_token = tokenizer.eos_token
+            model = AutoModelForCausalLM.from_pretrained(
+                config["model_name"],
+                quantization_config=bnb_config,
+                device_map="auto",
+                torch_dtype=torch.float16,
+            )
+            self.models[model_name] = model
+            self.tokenizers[model_name] = tokenizer
+
+    def load_vector_store(self, store_name):
+        """Cache vector stores in memory"""
+        if store_name not in self.vectorstore:
+            embeddings = HuggingFaceEmbeddings(
+                model_name="BAAI/bge-small-en-v1.5"
+            )
+            self.vectorstore[store_name] = FAISS.load_local(
+                f"vector_stores/{store_name}",
+                embeddings
+            )
+        return self.vectorstore[store_name]
+
+    def process_documents(self, files, progress=gr.Progress()):
+        """Process uploaded documents into vector embeddings"""
+        try:
+            progress(0, desc="Starting document processing")
+            documents = []
+
+            # Load documents
+            for file_path in progress.tqdm(files, desc="Loading files"):
+                if file_path.endswith(".pdf"):
+                    loader = PyPDFLoader(file_path)
+                elif file_path.endswith(".txt"):
+                    loader = TextLoader(file_path)
+                else:
+                    continue
+                documents.extend(loader.load())
+
+            # Split documents
+            progress(0.3, desc="Processing documents")
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=512,
+                chunk_overlap=50
+            )
+            texts = text_splitter.split_documents(documents)
+
+            # Create embeddings
+            progress(0.6, desc="Generating embeddings")
+            embeddings = HuggingFaceEmbeddings(
+                model_name="BAAI/bge-small-en-v1.5"
+            )
+
+            # Create vector store
+            progress(0.8, desc="Building vector database")
+            self.vectorstore = FAISS.from_documents(texts, embeddings)
+
+            return "✅ Documents processed successfully! Ready for queries."
+
+        except Exception as e:
+            return f"❌ Error processing documents: {str(e)}"

+    def generate(self, message, model_name, vector_store_name, history):
+        start_time = time.time()
+        self.load_model(model_name)
+        self.load_vector_store(vector_store_name)
+        config = MODEL_CONFIG[model_name]
+
+        # Retrieve relevant context
+        context = ""
+        if self.vectorstore[vector_store_name]:
+            docs = self.vectorstore[vector_store_name].similarity_search(message, k=3)
+            context = "\n\n".join([d.page_content for d in docs])

+        # Format prompt with context
+        prompt = config["template"].format(
+            message=f"Context:\n{context}\n\nQuestion: {message}"
        )

        # Generate response
+        pipe = pipeline(
+            "text-generation",
+            model=self.models[model_name],
+            tokenizer=self.tokenizers[model_name],
+            max_new_tokens=384,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.1,
+            do_sample=True,
+            return_full_text=False
+        )

+        response = pipe(prompt)[0]['generated_text']
+
+        # Calculate metrics
+        elapsed_time = time.time() - start_time
+        tokens = len(self.tokenizers[model_name].encode(response))
+        tokens_per_sec = tokens / elapsed_time if elapsed_time > 0 else 0
+
+        return response, elapsed_time, tokens_per_sec
+
+# Initialize model handler
+model_handler = ChatModel()
+
+def chat(message, history, model_choice, vector_store_choice):
+    try:
+        response, response_time, token_speed = model_handler.generate(message, model_choice, vector_store_choice, history)
+        formatted_response = f"{response}\n\n⏱️ Response Time: {response_time:.2f}s | 🚀 Speed: {token_speed:.2f} tokens/s"
+        return [(message, formatted_response)]
    except Exception as e:
+        return [(message, f"Error: {str(e)}")]

+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🚀 LLM Chatbot with RAG & Performance Metrics")
+
+    with gr.Row():
+        model_choice = gr.Dropdown(
+            choices=["phi-3", "llama3-8b"],
+            label="Select Model",
+            value="phi-3"
+        )
+        vector_store_choice = gr.Dropdown(
+            ["llm", "scoliosis"],
+            value="scoliosis",
+            label="Knowledge Base"
+        )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_upload = gr.File(
+                label="Upload Documents (PDF/TXT)",
+                file_count="multiple",
+                file_types=[".pdf", ".txt"],
+                type="filepath"
+            )
+            status = gr.Textbox(label="Processing Status", interactive=False)
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(height=500)
+            msg = gr.Textbox(label="Message", placeholder="Type your question here...")
+
+    with gr.Row():
+        submit_btn = gr.Button("Send", variant="primary")
+        clear_btn = gr.ClearButton([msg, chatbot, file_upload])
+
+    # Event handlers
+    file_upload.upload(
+        fn=model_handler.process_documents,
+        inputs=file_upload,
+        outputs=status,
+        show_progress="full"
+    )

+    msg.submit(chat, [msg, chatbot, model_choice, vector_store_choice], chatbot)
+    submit_btn.click(chat, [msg, chatbot, model_choice, vector_store_choice], chatbot)

+demo.launch()
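For reference, a minimal sketch (not part of this commit) of the retrieval step that the new load_vector_store and generate methods perform against a saved index. It assumes a FAISS index already exists under vector_stores/scoliosis and was built with the same embedding model; recent langchain-community releases may additionally require allow_dangerous_deserialization=True when calling FAISS.load_local.

# Standalone retrieval check mirroring ChatModel.load_vector_store / generate.
# Assumes vector_stores/scoliosis exists on disk (assumption, not verified here).
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
store = FAISS.load_local("vector_stores/scoliosis", embeddings)
docs = store.similarity_search("What is adolescent idiopathic scoliosis?", k=3)  # example query
print("\n\n".join(d.page_content for d in docs))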