# hybrid_rag.py — Hybrid RAG (SQL agent + FAISS retrieval) for HuggingFace Spaces.
# Origin: TestLLMGen / hybrid_rag.py, commit 5e7defa ("Starter LLM Inference Call").
from typing import Optional

# ✅ REPLACE OpenAI with HuggingFace models
# Alternative: Use transformers directly for more control
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.llms import HuggingFacePipeline
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    Combines two query paths over Jira data:
      * a LangChain SQL agent over a relational database, and
      * a RetrievalQA chain over a persisted FAISS vector store.

    Both paths share one LLM, loaded locally on the GPU via transformers.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: Optional[str] = None,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Wire up the LLM, SQL agent, embeddings, vector store, and RAG chain.

        Args:
            sql_db_uri: SQLAlchemy-style URI for the Jira SQL database.
            vector_store_path: Path to a previously saved FAISS index.
            hf_token: HuggingFace API token. Only needed if you switch to the
                commented-out Inference API option below; the local-GPU path
                does not use it.
            model_name: HF Hub repo id of the causal LM to load locally.
        """
        # Option 1: Use HF Inference API (doesn't use your GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: Load model locally on GPU (RECOMMENDED for HF Spaces)
        self.llm = self._load_local_llm(model_name)

        # SQL Agent over the Jira database.
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # More compatible
            verbose=True,
        )

        # Embeddings — local HuggingFace sentence-transformers model,
        # placed on GPU when one is available.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
        )

        # Vector store. allow_dangerous_deserialization opts in to loading
        # a pickled index — only safe for indexes you created yourself.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        # RAG chain: answer from the top-5 retrieved documents and return
        # the sources alongside the answer.
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True,
        )

    def _load_local_llm(self, model_name: str) -> HuggingFacePipeline:
        """Load *model_name* on the local GPU and wrap it for LangChain.

        Args:
            model_name: HF Hub repo id of a causal LM.

        Returns:
            A HuggingFacePipeline LLM usable by LangChain chains/agents.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Llama-family tokenizers ship without a pad token, which breaks
        # padded/batched generation — fall back to EOS as is conventional
        # for causal LMs.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # Use FP16 for GPU efficiency
            device_map="auto",          # Automatically place layers on GPU
            # NOTE(review): trust_remote_code executes code from the model
            # repo — keep only if the repo is trusted.
            trust_remote_code=True,
        )

        # Text-generation pipeline with conservative, low-temperature
        # sampling (values unchanged from the original configuration).
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        # Wrap in LangChain's LLM interface.
        return HuggingFacePipeline(pipeline=pipe)