BeRU Deployer
Deploy BeRU Streamlit RAG System - Add app, models logic, configs, and optimizations for HF Spaces
dec533d | import streamlit as st | |
| import torch | |
| import os | |
| import pickle | |
| import faiss | |
| import numpy as np | |
| from transformers import AutoModel, AutoProcessor, AutoTokenizer | |
| from typing import List, Dict | |
| import time | |
| # ======================================== | |
| # π¨ STREAMLIT PAGE CONFIG | |
| # ======================================== | |
# Browser-tab title/icon and overall layout. In this script the call sits
# ahead of every other st.* command.
st.set_page_config(
    page_title="BeRU Chat - RAG Assistant",
    page_icon="π€",
    layout="wide",
    initial_sidebar_state="expanded",
)
| # ======================================== | |
| # π― CACHING FOR MODEL LOADING | |
| # ======================================== | |
@st.cache_resource
def load_embedding_model():
    """Load the VLM2Vec embedding model together with its processor and tokenizer.

    The result is cached with ``st.cache_resource`` so the weights are loaded
    once per server process instead of once per browser session / rerun.

    Returns:
        A ``(model, processor, tokenizer, device)`` tuple, where ``device`` is
        the string ``"cuda"`` when a GPU is available and ``"cpu"`` otherwise.
        The model is moved to that device and put in eval mode.
    """
    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"

    st.write("β³ Loading embedding model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Half precision only on GPU; CPU keeps full precision.
    dtype = torch.float16 if device == "cuda" else torch.float32

    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=dtype,
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    model.eval()

    st.success("β Embedding model loaded!")
    return model, processor, tokenizer, device
@st.cache_resource
def load_llm_model():
    """Load the Mistral 7B Instruct model (4-bit quantized) and its tokenizer.

    The result is cached with ``st.cache_resource`` so the 7B model is loaded
    once per server process rather than once per browser session / rerun.

    Returns:
        A ``(model, tokenizer, device)`` tuple, where ``device`` is the string
        ``"cuda"`` when a GPU is available and ``"cpu"`` otherwise. Placement
        of the model itself is delegated to ``device_map="auto"``.
    """
    # Imported lazily so the heavy quantization classes are only pulled in
    # when the LLM is actually loaded.
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_id = "mistralai/Mistral-7B-Instruct-v0.3"

    st.write("β³ Loading language model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # 4-bit quantization config for efficiency.
    # NOTE(review): bitsandbytes 4-bit loading typically needs a CUDA GPU;
    # confirm that CPU-only deployments are expected to work with this config.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    st.success("β Language model loaded!")
    return model, tokenizer, device
def load_faiss_index(index_path: str = "VLM2Vec-V2rag2/text_index.faiss"):
    """Load the FAISS index from disk if the file exists.

    Args:
        index_path: Location of the serialized index. Defaults to the path
            this app has always used, so existing zero-argument callers are
            unaffected.

    Returns:
        The index returned by ``faiss.read_index``, or ``None`` (after showing
        a warning in the UI) when no file exists at ``index_path``.
    """
    if not os.path.exists(index_path):
        st.warning("β οΈ FAISS index not found. Please build the index first.")
        return None

    st.write("β³ Loading FAISS index...")
    index = faiss.read_index(index_path)
    st.success("β FAISS index loaded!")
    return index
| # ======================================== | |
| # π¬ EMBEDDING & RETRIEVAL FUNCTIONS | |
| # ======================================== | |
def get_embeddings(texts: List[str], model, processor, tokenizer, device) -> np.ndarray:
    """Embed each text by mean-pooling the model's last hidden layer.

    Args:
        texts: Strings to embed. Each one is tokenized (truncated to 512
            tokens) and run through the model on its own.
        model: Called as ``model(**inputs, output_hidden_states=True)``; the
            result must expose ``hidden_states``.
        processor: Not used by this function; kept so the signature stays
            compatible with existing callers.
        tokenizer: Called on a single string; its result must support
            ``.to(device)`` and keyword-unpacking into the model.
        device: Device the tokenized inputs are moved to.

    Returns:
        A float32 array with one flattened embedding per input text (an empty
        1-D array when ``texts`` is empty).
    """
    rows = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        # Upcast before pooling: the embedding model is loaded in float16 on
        # GPU, and the vectors are later handed to a FAISS search, which works
        # on float32. Pooling in float32 also avoids half-precision rounding.
        pooled = outputs.hidden_states[-1].float().mean(dim=1)
        rows.append(pooled.cpu().numpy().reshape(-1))
    return np.array(rows, dtype=np.float32)
def retrieve_documents(query: str, model, processor, tokenizer, device, faiss_index, k: int = 5) -> List[Dict]:
    """Return the ids and distances of the ``k`` nearest index entries for a query.

    Args:
        query: Text to search for.
        model, processor, tokenizer, device: Forwarded unchanged to
            ``get_embeddings`` to embed the query.
        faiss_index: Object exposing ``search(vectors, k)``; ``None`` means no
            index is available.
        k: Number of neighbours to request.

    Returns:
        A list of ``{"index": int, "distance": float}`` dicts in the order the
        index returned them. Empty when ``faiss_index`` is ``None``. Negative
        ids are skipped (presumably the index's placeholder when fewer than
        ``k`` hits exist -- TODO confirm).
    """
    if faiss_index is None:
        return []

    query_embedding = get_embeddings([query], model, processor, tokenizer, device)
    # The search is given a contiguous float32 matrix regardless of what
    # dtype the embedding model produced.
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

    distances, indices = faiss_index.search(query_embedding, k)

    # Walk ids and distances in lockstep instead of looking each id up again
    # with list.index(), which is quadratic and pairs the wrong distance if an
    # id ever repeats. Only metadata-free hits are returned; document text is
    # not loaded here.
    return [
        {"index": int(idx), "distance": float(dist)}
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]
def generate_response(
    query: str,
    context: str,
    model,
    tokenizer,
    device,
    temperature: float = 0.7,
    max_new_tokens: int = 512,
) -> str:
    """Generate an answer to ``query`` with the LLM, grounded in ``context``.

    Args:
        query: The user's question.
        context: Retrieved text inserted into the prompt.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable that tokenizes the prompt (its result must support
            ``.to(device)`` and ``["input_ids"]``) and exposes ``decode``.
        device: Device the tokenized prompt is moved to.
        temperature: Sampling temperature; defaults to the previously
            hard-coded 0.7.
        max_new_tokens: Upper bound on generated tokens; defaults to the
            previously hard-coded 512.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt = f"""[INST] You are a helpful assistant answering questions about technical documentation.
Context:
{context}
Question: {query} [/INST]"""
    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
        )
    # The generated sequence starts with the prompt tokens. Slice them off by
    # length rather than splitting the decoded text on "[/INST]": that marker
    # may be dropped by skip_special_tokens or may also occur inside the
    # retrieved context, either of which makes the text split unreliable.
    prompt_length = inputs["input_ids"].shape[-1]
    answer_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
| # ======================================== | |
| # π¨ STREAMLIT UI | |
| # ======================================== | |
# Page heading shown above the chat.
st.title("π€ BeRU Chat Assistant")
st.markdown("*100% Offline RAG System with Mistral 7B & VLM2Vec*")

# Sidebar Configuration: device indicator, retrieval/generation sliders and a
# static description of the project.
with st.sidebar:
    st.header("βοΈ Configuration")

    if torch.cuda.is_available():
        device_info = "π’ GPU"
    else:
        device_info = "π΄ CPU"
    st.metric("Device", device_info)

    # Number of neighbours requested from the index by the chat handler.
    num_results = st.slider("Retrieve top K documents", 1, 10, 5)
    # NOTE(review): this value is not passed to generate_response by the chat
    # handler in this script, so moving the slider currently does not change
    # generation.
    temperature = st.slider("Response Temperature", 0.1, 1.0, 0.7)

    st.divider()
    st.markdown("### π Project Info")
    st.markdown("""
- **Model**: Mistral 7B Instruct v0.3
- **Embeddings**: VLM2Vec-Qwen2VL-2B
- **Vector Store**: FAISS with 10K+ documents
- **Retrieval**: Hybrid (Dense + BM25)
""")
# Main Chat Interface: a wide heading column next to a narrow column holding
# the "clear" button.
col1, col2 = st.columns([3, 1])
with col1:
    st.subheader("π¬ Ask a Question")
with col2:
    clear_clicked = st.button("π Clear Chat", use_container_width=True)
    if clear_clicked:
        # Drop the stored conversation and redraw the page immediately.
        st.session_state.messages = []
        st.rerun()

# Initialize session state: seed each key only when it is absent, so values
# already stored in the session are left alone.
for state_key, initial_value in (("messages", []), ("models_loaded", False)):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value
# Load models: runs only while the session's `models_loaded` flag is False,
# then stores every loaded object in session state for the chat handler.
if not st.session_state.models_loaded:
    st.info("π¦ Loading models on first run... This may take 2-3 minutes.")
    try:
        embed_model, processor, tokenizer_embed, embed_device = load_embedding_model()
        llm_model, tokenizer_llm, llm_device = load_llm_model()
        faiss_idx = load_faiss_index()

        loaded_objects = {
            "embed_model": embed_model,
            "processor": processor,
            "tokenizer_embed": tokenizer_embed,
            "embed_device": embed_device,
            "llm_model": llm_model,
            "tokenizer_llm": tokenizer_llm,
            "llm_device": llm_device,
            "faiss_idx": faiss_idx,
        }
        for state_key, loaded in loaded_objects.items():
            st.session_state[state_key] = loaded

        # Flag is flipped only after everything above succeeded.
        st.session_state.models_loaded = True
        st.success("β All models loaded successfully!")
    except Exception as e:
        # Surface the failure in the UI and halt this script run.
        st.error(f"β Error loading models: {e!s}")
        st.stop()
# Chat Interface
st.markdown("---")

# Display chat history: replay every stored message under its role.
for message in st.session_state.messages:
    role, content = message["role"], message["content"]
    with st.chat_message(role):
        st.markdown(content)

# User input
user_input = st.chat_input("Type your question here...", key="user_input")
if user_input:
    state = st.session_state

    # Record and echo the user's message.
    state.messages.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.markdown(user_input)

    # Retrieve, then generate, inside the assistant's chat bubble.
    with st.chat_message("assistant"):
        st.write("π Retrieving relevant documents...")
        retrieved = retrieve_documents(
            user_input,
            state.embed_model,
            state.processor,
            state.tokenizer_embed,
            state.embed_device,
            state.faiss_idx,
            k=num_results,
        )

        # The context is built from index ids only; no document text is
        # looked up here.
        context_parts = []
        for i, doc in enumerate(retrieved):
            context_parts.append(f"Document {i+1}: Context from index {doc['index']}")
        context = "\n\n".join(context_parts)

        st.write("π Generating response...")
        response = generate_response(
            user_input,
            context,
            state.llm_model,
            state.tokenizer_llm,
            state.llm_device,
        )
        st.markdown(response)

    # Add to chat history so the answer is replayed on the next rerun.
    state.messages.append({"role": "assistant", "content": response})
# Footer: a horizontal rule followed by a centered block of raw HTML with the
# project links (raw HTML requires unsafe_allow_html=True).
st.markdown("---")
footer_html = """
<div style='text-align: center; color: gray; font-size: 12px;'>
<p>BeRU Chat Assistant | Powered by Mistral 7B + VLM2Vec | 100% Offline</p>
<p><a href='https://github.com/AnwinJosy/BeRU'>GitHub</a> |
<a href='https://huggingface.co/AnwinJosy'>Hugging Face</a></p>
</div>
"""
st.markdown(footer_html, unsafe_allow_html=True)