"""Hybrid RAG over Jira data: a SQL agent plus a FAISS-backed retrieval chain."""

from typing import Optional

import torch
from langchain_community.agent_toolkits import create_sql_agent
# Alternative: use transformers directly for more control
from langchain_community.llms import HuggingFacePipeline
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
# HuggingFace models are used in place of OpenAI
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    Combines two query paths that share one locally loaded LLM:

    * ``sql_agent`` -- a LangChain SQL agent over the database at
      ``sql_db_uri``.
    * ``rag_chain`` -- an LCEL chain that retrieves the 5 nearest documents
      from a FAISS index and asks the LLM to answer from that context.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: Optional[str] = None,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Build the LLM, the SQL agent, the vector store and the RAG chain.

        Args:
            sql_db_uri: SQLAlchemy-style URI handed to ``SQLDatabase.from_uri``.
            vector_store_path: Directory of a FAISS index saved by LangChain.
            hf_token: Optional HuggingFace access token, forwarded to the
                model/tokenizer download. ``None`` leaves authentication to
                whatever transformers picks up by default.
            model_name: Hub repo id of the causal LM to load.
        """
        # Option 1: use the HF Inference API (does not use the local GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: load the model locally on GPU (recommended for HF Spaces)
        self.llm = self._load_local_llm(model_name, hf_token)

        # SQL agent
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # More compatible
            verbose=True,
        )

        # Embeddings: local HuggingFace model, on GPU when one is available
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
        )

        # Vector store.
        # SECURITY NOTE: allow_dangerous_deserialization=True lets FAISS
        # unpickle the stored docstore. Only point vector_store_path at an
        # index this application created itself.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        prompt = PromptTemplate(
            template="Context: {context}\n\nQuestion: {question}\n\nAnswer:",
            input_variables=["context", "question"],
        )

        # LCEL chain: the incoming question is sent to the retriever (whose
        # documents are flattened to text) and passed through unchanged, then
        # both fill the prompt, which the LLM completes into a plain string.
        # This replaces the older RetrievalQA.from_chain_type construction.
        retriever = self.vector_store.as_retriever(search_kwargs={"k": 5})
        self.rag_chain = (
            {
                "context": retriever | self._format_docs,
                "question": RunnablePassthrough(),
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )

    @staticmethod
    def _format_docs(docs) -> str:
        """Join the ``page_content`` of each document with blank lines."""
        return "\n\n".join(doc.page_content for doc in docs)

    def _load_local_llm(self, model_name: str, hf_token: Optional[str] = None):
        """Load ``model_name`` locally and wrap it as a LangChain LLM.

        Args:
            model_name: Hub repo id of the causal LM.
            hf_token: Optional HuggingFace access token for the download.

        Returns:
            A ``HuggingFacePipeline`` around a ``text-generation`` pipeline.
        """
        cuda_available = torch.cuda.is_available()

        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # FP16 for GPU efficiency; fall back to FP32 when no GPU is
            # present, mirroring the device check used for the embeddings.
            torch_dtype=torch.float16 if cuda_available else torch.float32,
            device_map="auto",  # Let accelerate place weights on the GPU
            # SECURITY NOTE: trust_remote_code=True runs Python shipped in the
            # model repo. Only use it with repos you trust.
            trust_remote_code=True,
            token=hf_token,
        )

        # Text-generation pipeline with low-temperature sampling
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        # Wrap in LangChain
        return HuggingFacePipeline(pipeline=pipe)