BeRU Deployer
Deploy BeRU Streamlit RAG System - Add app, models logic, configs, and optimizations for HF Spaces
dec533d | """ | |
| BeRU RAG Chat App - Optimized for Hugging Face Spaces | |
| Deployment: https://huggingface.co/spaces/AnwinMJ/Beru | |
| """ | |
| import streamlit as st | |
| import torch | |
| import os | |
| import pickle | |
| import faiss | |
| import numpy as np | |
| from transformers import AutoModel, AutoProcessor, AutoTokenizer | |
| from typing import List, Dict | |
| import time | |
| import logging | |
| # Setup logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # ======================================== | |
| # π¨ STREAMLIT PAGE CONFIG | |
| # ======================================== | |
| st.set_page_config( | |
| page_title="BeRU Chat - RAG Assistant", | |
| page_icon="π€", | |
| layout="wide", | |
| initial_sidebar_state="expanded", | |
| menu_items={ | |
| "About": "BeRU - Offline RAG System with VLM2Vec and Mistral 7B" | |
| } | |
| ) | |
| # ======================================== | |
| # π ENVIRONMENT DETECTION | |
| # ======================================== | |
| def detect_environment(): | |
| """Detect if running on HF Spaces""" | |
| is_spaces = os.getenv('SPACES', 'false').lower() == 'true' or 'huggingface' in os.path.exists('/app') | |
| return { | |
| 'is_spaces': is_spaces, | |
| 'device': 'cuda' if torch.cuda.is_available() else 'cpu', | |
| 'model_cache': os.getenv('HF_HOME', './cache'), | |
| 'gpu_memory': torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0 | |
| } | |
| env_info = detect_environment() | |
| # Display environment info in sidebar | |
| with st.sidebar: | |
| st.write("### System Info") | |
| st.write(f"π₯οΈ Device: `{env_info['device'].upper()}`") | |
| if env_info['device'] == 'cuda': | |
| st.write(f"πΎ GPU VRAM: `{env_info['gpu_memory'] / 1e9:.1f} GB`") | |
| st.write(f"π¦ Cache: `{env_info['model_cache']}`") | |
| # ======================================== | |
| # π― MODEL LOADING WITH CACHING | |
| # ======================================== | |
| def load_embedding_model(): | |
| """Load VLM2Vec embedding model with error handling""" | |
| with st.spinner("β³ Loading embedding model... (first time may take 5 min)"): | |
| try: | |
| logger.info("Loading VLM2Vec model...") | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| model = AutoModel.from_pretrained( | |
| "TIGER-Lab/VLM2Vec-Qwen2VL-2B", | |
| trust_remote_code=True, | |
| torch_dtype=torch.float16 if device == "cuda" else torch.float32, | |
| cache_dir=env_info['model_cache'] | |
| ).to(device) | |
| processor = AutoProcessor.from_pretrained( | |
| "TIGER-Lab/VLM2Vec-Qwen2VL-2B", | |
| trust_remote_code=True, | |
| cache_dir=env_info['model_cache'] | |
| ) | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| "TIGER-Lab/VLM2Vec-Qwen2VL-2B", | |
| trust_remote_code=True, | |
| cache_dir=env_info['model_cache'] | |
| ) | |
| model.eval() | |
| logger.info("β Embedding model loaded successfully") | |
| st.success("β Embedding model loaded!") | |
| return model, processor, tokenizer, device | |
| except Exception as e: | |
| st.error(f"β Error loading embedding model: {str(e)}") | |
| logger.error(f"Model loading error: {e}") | |
| raise | |
| def load_llm_model(): | |
| """Load Mistral 7B LLM with quantization""" | |
| with st.spinner("β³ Loading LLM model... (first time may take 5 min)"): | |
| try: | |
| logger.info("Loading Mistral-7B model...") | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig | |
| # 4-bit quantization config for memory efficiency | |
| bnb_config = BitsAndBytesConfig( | |
| load_in_4bit=True, | |
| bnb_4bit_use_double_quant=True, | |
| bnb_4bit_quant_type="nf4", | |
| bnb_4bit_compute_dtype=torch.bfloat16 | |
| ) | |
| tokenizer = AutoTokenizer.from_pretrained( | |
| "mistralai/Mistral-7B-Instruct-v0.3", | |
| cache_dir=env_info['model_cache'] | |
| ) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| "mistralai/Mistral-7B-Instruct-v0.3", | |
| quantization_config=bnb_config, | |
| device_map="auto", | |
| cache_dir=env_info['model_cache'] | |
| ) | |
| logger.info("β LLM model loaded successfully") | |
| st.success("β LLM model loaded!") | |
| return model, tokenizer, device | |
| except Exception as e: | |
| st.error(f"β Error loading LLM: {str(e)}") | |
| logger.error(f"LLM loading error: {e}") | |
| raise | |
| # ======================================== | |
| # π UI LAYOUT | |
| # ======================================== | |
| st.title("π€ BeRU Chat - RAG Assistant") | |
| st.markdown(""" | |
| A powerful offline RAG system combining Mistral 7B LLM with VLM2Vec embeddings | |
| for intelligent document search and conversation. | |
| **Status**: Models loading on first access (5-8 minutes) | |
| """) | |
| # Load models | |
| try: | |
| embedding_model, processor, tokenizer, device = load_embedding_model() | |
| llm_model, llm_tokenizer, llm_device = load_llm_model() | |
| models_loaded = True | |
| except Exception as e: | |
| st.error(f"Failed to load models: {str(e)}") | |
| models_loaded = False | |
| if models_loaded: | |
| # Main chat interface | |
| left_col, right_col = st.columns([2, 1]) | |
| with left_col: | |
| st.subheader("π¬ Chat") | |
| # Initialize session state | |
| if "messages" not in st.session_state: | |
| st.session_state.messages = [] | |
| # Display chat history | |
| for msg in st.session_state.messages: | |
| with st.chat_message(msg["role"]): | |
| st.write(msg["content"]) | |
| # Chat input | |
| user_input = st.chat_input("Ask a question about your documents...") | |
| if user_input: | |
| # Add user message | |
| st.session_state.messages.append({"role": "user", "content": user_input}) | |
| with st.chat_message("user"): | |
| st.write(user_input) | |
| # Generate response | |
| with st.chat_message("assistant"): | |
| with st.spinner("π€ Thinking..."): | |
| # Placeholder for RAG response | |
| response = "Response generated from RAG system..." | |
| st.write(response) | |
| st.session_state.messages.append({"role": "assistant", "content": response}) | |
| with right_col: | |
| st.subheader("π Info") | |
| st.info(""" | |
| **Model Info:** | |
| - π§ Embedding: VLM2Vec-Qwen2VL-2B | |
| - π¬ LLM: Mistral-7B-Instruct | |
| - π Search: FAISS + BM25 | |
| **Performance:** | |
| - Device: GPU if available | |
| - Quantization: 4-bit | |
| - Context: Multi-turn | |
| """) | |
| st.subheader("βοΈ Settings") | |
| temperature = st.slider("Temperature", 0.0, 1.0, 0.7) | |
| max_tokens = st.slider("Max Tokens", 100, 2000, 512) | |
| else: | |
| st.error("β Failed to initialize models. Check logs for details.") | |
| st.info("Try refreshing the page or restarting the Space.") | |
| # ======================================== | |
| # π FOOTER | |
| # ======================================== | |
| st.markdown("---") | |
| st.markdown(""" | |
| <div style='text-align: center'> | |
| <small> | |
| BeRU RAG System | | |
| <a href='https://huggingface.co/spaces/AnwinMJ/Beru'>Space</a> | | |
| <a href='https://github.com/AnwinMJ/BeRU'>GitHub</a> | |
| </small> | |
| </div> | |
| """, unsafe_allow_html=True) | |