# BeRU/app.py: Streamlit RAG system (Mistral 7B + VLM2Vec) for HF Spaces
import streamlit as st
import torch
import os
import pickle
import faiss
import numpy as np
from transformers import AutoModel, AutoProcessor, AutoTokenizer
from typing import List, Dict
# ========================================
# 🎨 STREAMLIT PAGE CONFIG
# ========================================
st.set_page_config(
    page_title="BeRU Chat - RAG Assistant",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)
# ========================================
# 🎯 CACHING FOR MODEL LOADING
# ========================================
@st.cache_resource
def load_embedding_model():
    """Load the VLM2Vec embedding model, processor, and tokenizer."""
    st.write("⏳ Loading embedding model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(
        "TIGER-Lab/VLM2Vec-Qwen2VL-2B",
        trust_remote_code=True,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        "TIGER-Lab/VLM2Vec-Qwen2VL-2B",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "TIGER-Lab/VLM2Vec-Qwen2VL-2B",
        trust_remote_code=True
    )
    model.eval()
    st.success("✅ Embedding model loaded!")
    return model, processor, tokenizer, device
@st.cache_resource
def load_llm_model():
    """Load Mistral 7B Instruct, 4-bit quantized when a GPU is available."""
    st.write("⏳ Loading language model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig
    if device == "cuda":
        # 4-bit NF4 quantization keeps the 7B model within GPU memory limits
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
        model = AutoModelForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.3",
            quantization_config=quantization_config,
            device_map="auto"
        )
    else:
        # bitsandbytes 4-bit loading requires CUDA; fall back to full precision on CPU
        model = AutoModelForCausalLM.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.3",
            torch_dtype=torch.float32
        )
    tokenizer = AutoTokenizer.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.3"
    )
    st.success("✅ Language model loaded!")
    return model, tokenizer, device
@st.cache_resource
def load_faiss_index():
    """Load the FAISS index from disk if it exists."""
    if os.path.exists("VLM2Vec-V2rag2/text_index.faiss"):
        st.write("⏳ Loading FAISS index...")
        index = faiss.read_index("VLM2Vec-V2rag2/text_index.faiss")
        st.success("✅ FAISS index loaded!")
        return index
    else:
        st.warning("⚠️ FAISS index not found. Please build the index first.")
        return None
# ========================================
# 💬 EMBEDDING & RETRIEVAL FUNCTIONS
# ========================================
def get_embeddings(texts: List[str], model, processor, tokenizer, device) -> np.ndarray:
    """Generate embeddings for texts by mean-pooling the last hidden state."""
    embeddings_list = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
        # Mean-pool the final hidden states into one vector; cast to float32 since
        # FAISS cannot search float16 arrays
        embedding = outputs.hidden_states[-1].mean(dim=1).float().cpu().numpy()
        embeddings_list.append(embedding.flatten())
    return np.array(embeddings_list, dtype=np.float32)
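# ----------------------------------------------------------------------
# NOTE: a minimal sketch of how the index at VLM2Vec-V2rag2/text_index.faiss
# could be built offline with get_embeddings(). The chunk list and the exact
# L2 index type are assumptions, not part of the deployed app; adapt as needed.
# ----------------------------------------------------------------------
def build_faiss_index(chunks: List[str], model, processor, tokenizer, device,
                      out_path: str = "VLM2Vec-V2rag2/text_index.faiss"):
    """Embed text chunks and persist an exact L2 FAISS index (offline sketch)."""
    embeddings = get_embeddings(chunks, model, processor, tokenizer, device)
    index = faiss.IndexFlatL2(embeddings.shape[1])  # exact search over raw vectors
    index.add(embeddings)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    faiss.write_index(index, out_path)
    return index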
def retrieve_documents(query: str, model, processor, tokenizer, device, faiss_index, k: int = 5) -> List[Dict]:
    """Retrieve the k nearest documents from the FAISS index."""
    if faiss_index is None:
        return []
    # Embed the query
    query_embedding = get_embeddings([query], model, processor, tokenizer, device)
    # Search the FAISS index
    distances, indices = faiss_index.search(query_embedding, k)
    # Pair each hit with its own distance by rank; FAISS pads missing hits with -1
    results = []
    for rank, idx in enumerate(indices[0]):
        if idx >= 0:
            results.append({
                "index": int(idx),
                "distance": float(distances[0][rank])
            })
    return results
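# ----------------------------------------------------------------------
# NOTE: retrieve_documents() returns row ids only. A minimal sketch for
# mapping ids back to chunk text, assuming the chunks were pickled as a
# list aligned with the index rows; the path "VLM2Vec-V2rag2/chunks.pkl"
# is a hypothetical example, not a file this repo is known to ship.
# ----------------------------------------------------------------------
@st.cache_resource
def load_chunk_texts(path: str = "VLM2Vec-V2rag2/chunks.pkl"):
    """Load pickled chunk texts so retrieved ids can be resolved to passages."""
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)  # expected: List[str], one entry per index row
    return None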
def generate_response(query: str, context: str, model, tokenizer, device, temperature: float = 0.7) -> str:
    """Generate a response with Mistral using the retrieved context."""
    prompt = f"""[INST] You are a helpful assistant answering questions about technical documentation.

Context:
{context}

Question: {query} [/INST]"""
    # With device_map="auto" the model may be sharded; send inputs to its first device
    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=temperature,
            top_p=0.95,
            do_sample=True
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text echoes the prompt; keep only what follows the [/INST] tag
    return response.split("[/INST]")[1].strip() if "[/INST]" in response else response
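# ----------------------------------------------------------------------
# NOTE: an alternative sketch. Recent transformers tokenizers ship Mistral's
# chat template, so the [INST] tags can be applied automatically instead of
# hand-formatting the prompt string above; output assumed equivalent.
# ----------------------------------------------------------------------
def build_prompt_with_template(query: str, context: str, tokenizer) -> str:
    """Format the RAG prompt via the tokenizer's built-in chat template."""
    messages = [{
        "role": "user",
        "content": (
            "You are a helpful assistant answering questions about technical documentation.\n\n"
            f"Context:\n{context}\n\nQuestion: {query}"
        )
    }]
    # add_generation_prompt appends the tokens that cue the assistant's turn
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)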
# ========================================
# 🎨 STREAMLIT UI
# ========================================
st.title("🤖 BeRU Chat Assistant")
st.markdown("*100% Offline RAG System with Mistral 7B & VLM2Vec*")
# Sidebar Configuration
with st.sidebar:
    st.header("⚙️ Configuration")
    device_info = "🟢 GPU" if torch.cuda.is_available() else "🔴 CPU"
    st.metric("Device", device_info)
    num_results = st.slider("Retrieve top K documents", 1, 10, 5)
    temperature = st.slider("Response Temperature", 0.1, 1.0, 0.7)
    st.divider()
    st.markdown("### 📊 Project Info")
    st.markdown("""
    - **Model**: Mistral 7B Instruct v0.3
    - **Embeddings**: VLM2Vec-Qwen2VL-2B
    - **Vector Store**: FAISS with 10K+ documents
    - **Retrieval**: Hybrid (Dense + BM25)
    """)
# Main Chat Interface
col1, col2 = st.columns([3, 1])
with col1:
    st.subheader("💬 Ask a Question")
with col2:
    if st.button("🔄 Clear Chat", use_container_width=True):
        st.session_state.messages = []
        st.rerun()
# Initialize session state
if "messages" not in st.session_state:
st.session_state.messages = []
if "models_loaded" not in st.session_state:
st.session_state.models_loaded = False
# Load models
if not st.session_state.models_loaded:
    st.info("📦 Loading models on first run... This may take 2-3 minutes.")
    try:
        embed_model, processor, tokenizer_embed, embed_device = load_embedding_model()
        llm_model, tokenizer_llm, llm_device = load_llm_model()
        faiss_idx = load_faiss_index()
        st.session_state.embed_model = embed_model
        st.session_state.processor = processor
        st.session_state.tokenizer_embed = tokenizer_embed
        st.session_state.embed_device = embed_device
        st.session_state.llm_model = llm_model
        st.session_state.tokenizer_llm = tokenizer_llm
        st.session_state.llm_device = llm_device
        st.session_state.faiss_idx = faiss_idx
        st.session_state.models_loaded = True
        st.success("✅ All models loaded successfully!")
    except Exception as e:
        st.error(f"❌ Error loading models: {str(e)}")
        st.stop()
# Chat Interface
st.markdown("---")
# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# User input
user_input = st.chat_input("Type your question here...", key="user_input")
if user_input:
    # Add the user message to the chat history
    st.session_state.messages.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.markdown(user_input)
    # Generate the assistant response
    with st.chat_message("assistant"):
        st.write("🔍 Retrieving relevant documents...")
        # Retrieve documents
        retrieved = retrieve_documents(
            user_input,
            st.session_state.embed_model,
            st.session_state.processor,
            st.session_state.tokenizer_embed,
            st.session_state.embed_device,
            st.session_state.faiss_idx,
            k=num_results
        )
        # Placeholder context built from hit ids; swap in real passage text once
        # document metadata is wired up (see the load_chunk_texts sketch above)
        context = "\n\n".join([f"Document {i+1}: Context from index {doc['index']}"
                               for i, doc in enumerate(retrieved)])
        st.write("💭 Generating response...")
        # Generate the response with the sidebar temperature setting
        response = generate_response(
            user_input,
            context,
            st.session_state.llm_model,
            st.session_state.tokenizer_llm,
            st.session_state.llm_device,
            temperature=temperature
        )
        st.markdown(response)
        # Add to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})
# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: gray; font-size: 12px;'>
<p>BeRU Chat Assistant | Powered by Mistral 7B + VLM2Vec | 100% Offline</p>
<p><a href='https://github.com/AnwinJosy/BeRU'>GitHub</a> |
<a href='https://huggingface.co/AnwinJosy'>Hugging Face</a></p>
</div>
""", unsafe_allow_html=True)