# HuggingFace Spaces — hybrid Jira RAG (recovered from a Spaces "Runtime error" log excerpt)
from typing import Optional

import torch
# Use transformers directly for more control over the local model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.llms import HuggingFacePipeline
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
# HuggingFace models replace the OpenAI-backed LLM/embeddings
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
class HybridJiraRAG:
    """Hybrid RAG system combining a SQL agent with a FAISS retriever.

    Built for HuggingFace Spaces GPU hardware: the LLM is loaded locally via
    transformers (FP16, device_map="auto") and the embedding model runs on
    CUDA when available.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: Optional[str] = None,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Build the SQL agent, embeddings, vector store, and RAG chain.

        Args:
            sql_db_uri: SQLAlchemy-style URI of the Jira SQL database.
            hf_token: HF API token; only needed for the (commented-out)
                Inference API option below — unused when loading locally.
            vector_store_path: Directory of a previously saved FAISS index.
            model_name: HF hub id of the causal LM to load.
        """
        # Option 1: HF Inference API (does NOT use the Space's own GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: load the model locally on GPU (recommended for HF Spaces)
        self.llm = self._load_local_llm(model_name)

        # SQL agent over the Jira database
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # most broadly compatible agent type
            verbose=True,
        )

        # Embeddings — local sentence-transformers model, on GPU when available
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
        )

        # Vector store.
        # SECURITY: allow_dangerous_deserialization unpickles the saved index;
        # only load vector stores produced by this app / a trusted source.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        # RAG chain over the top-5 retrieved chunks, keeping sources for display
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True,
        )

    def _load_local_llm(self, model_name: str) -> HuggingFacePipeline:
        """Load *model_name* locally on GPU and wrap it for LangChain.

        Returns:
            A ``HuggingFacePipeline`` LLM usable by both the SQL agent and
            the RetrievalQA chain.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # FP16 halves GPU memory use
            device_map="auto",          # let accelerate place layers on the GPU
            trust_remote_code=True,
        )

        # Text-generation pipeline feeding the LangChain wrapper
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
            # FIX: without this the pipeline echoes the prompt back in its
            # output, which breaks the ReAct agent's output parsing.
            return_full_text=False,
        )
        return HuggingFacePipeline(pipeline=pipe)