# hybrid_rag.py — Hybrid RAG (SQL agent + FAISS retrieval) for HuggingFace Spaces.
# Origin: TestLLMGen / hybrid_rag.py, commit 5e7defa ("Starter LLM Inference Call").
from typing import Optional

# ✅ REPLACE OpenAI with HuggingFace models
# Alternative: Use transformers directly for more control
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.llms import HuggingFacePipeline
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    Combines two query paths over Jira data:
      * a LangChain SQL agent over a relational database, and
      * a RetrievalQA chain over a persisted FAISS vector store.

    Both paths share one LLM, loaded locally on the GPU via transformers.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: Optional[str] = None,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Wire up the LLM, SQL agent, embeddings, vector store, and RAG chain.

        Args:
            sql_db_uri: SQLAlchemy-style URI for the Jira SQL database.
            vector_store_path: Path to a previously saved FAISS index.
            hf_token: HuggingFace API token. Only needed if you switch to the
                commented-out Inference API option below; the local-GPU path
                does not use it.
            model_name: HF Hub repo id of the causal LM to load locally.
        """
        # Option 1: Use HF Inference API (doesn't use your GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: Load model locally on GPU (RECOMMENDED for HF Spaces)
        self.llm = self._load_local_llm(model_name)

        # SQL Agent over the Jira database.
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # More compatible
            verbose=True,
        )

        # Embeddings — local HuggingFace sentence-transformers model,
        # placed on GPU when one is available.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
        )

        # Vector store. allow_dangerous_deserialization opts in to loading
        # a pickled index — only safe for indexes you created yourself.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        # RAG chain: answer from the top-5 retrieved documents and return
        # the sources alongside the answer.
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True,
        )

    def _load_local_llm(self, model_name: str) -> HuggingFacePipeline:
        """Load *model_name* on the local GPU and wrap it for LangChain.

        Args:
            model_name: HF Hub repo id of a causal LM.

        Returns:
            A HuggingFacePipeline LLM usable by LangChain chains/agents.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Llama-family tokenizers ship without a pad token, which breaks
        # padded/batched generation — fall back to EOS as is conventional
        # for causal LMs.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # Use FP16 for GPU efficiency
            device_map="auto",          # Automatically place layers on GPU
            # NOTE(review): trust_remote_code executes code from the model
            # repo — keep only if the repo is trusted.
            trust_remote_code=True,
        )

        # Text-generation pipeline with conservative, low-temperature
        # sampling (values unchanged from the original configuration).
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        # Wrap in LangChain's LLM interface.
        return HuggingFacePipeline(pipeline=pipe)