"""Hybrid Jira RAG: a SQL agent plus FAISS retrieval over one local HF LLM."""

from langchain_community.utilities import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate  # noqa: F401 (kept for downstream use)

# HuggingFace wrappers: use the maintained ``langchain_huggingface`` package
# for all of them, including ``HuggingFacePipeline`` — the copy in
# ``langchain_community.llms`` is deprecated in favour of this one.
from langchain_huggingface import (
    HuggingFaceEmbeddings,
    HuggingFaceEndpoint,  # noqa: F401 (used by Option 1 below if enabled)
    HuggingFacePipeline,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch


class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    Combines two answer paths over Jira data that share a single locally
    loaded HuggingFace chat model:

    * a zero-shot ReAct SQL agent over a relational Jira export, and
    * a ``RetrievalQA`` chain over a pre-built FAISS vector store.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: str = None,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Build the SQL agent and the RAG chain.

        Args:
            sql_db_uri: SQLAlchemy-style URI of the Jira SQL database.
            vector_store_path: Directory of a previously saved FAISS index.
            hf_token: HuggingFace API token. Only needed if you switch to
                the Inference-API option below; unused for local loading.
            model_name: HF Hub repo id of the causal LM to load.
        """
        # Option 1: Use HF Inference API (doesn't use your GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: Load model locally on GPU (RECOMMENDED for HF Spaces)
        self.llm = self._load_local_llm(model_name)

        # SQL Agent
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # More compatible
            verbose=True,
        )

        # Embeddings — local sentence-transformers model, on GPU when present.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
        )

        # Vector store. NOTE(review): allow_dangerous_deserialization
        # unpickles the index file — only load indexes you created yourself.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        # RAG chain over the top-5 retrieved chunks.
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True,
        )

    def _load_local_llm(self, model_name: str) -> HuggingFacePipeline:
        """Load *model_name* locally and wrap it for LangChain.

        Returns:
            A ``HuggingFacePipeline`` LLM backed by a text-generation
            pipeline, placed on the GPU when one is available.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Llama-family tokenizers ship without a pad token, which makes the
        # generation pipeline warn/fail on padded inputs — reuse EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # FP16 halves GPU memory use, but many CPU kernels lack half
            # precision support — fall back to FP32 when no CUDA device exists.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",  # Automatically place layers on the GPU
            trust_remote_code=True,
        )

        # Create text generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
            # Return only the completion: echoing the prompt back confuses
            # the ReAct agent's output parser.
            return_full_text=False,
        )

        # Wrap in LangChain
        return HuggingFacePipeline(pipeline=pipe)