Spaces:

VcRlAgent
/

TestLLMGen

Runtime error

File size: 3,069 Bytes

5e7defa

from langchain_community.utilities import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# ✅ REPLACE OpenAI with HuggingFace models
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings

# Alternative: Use transformers directly for more control
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    A single locally loaded causal language model backs two components:

    * ``sql_agent`` -- a LangChain SQL agent over the database at
      ``sql_db_uri``.
    * ``rag_chain`` -- a ``RetrievalQA`` chain over the FAISS index stored at
      ``vector_store_path`` (top 5 documents, sources returned).

    CUDA is used when available; otherwise the embeddings run on CPU and the
    model is loaded in FP32 instead of FP16.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: str = None,
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct"
    ):
        """Load the LLM and build the SQL agent, vector store and RAG chain.

        Args:
            sql_db_uri: Database URI handed to ``SQLDatabase.from_uri``.
            vector_store_path: Directory of a FAISS index previously written
                with ``save_local`` using the same embedding model.
            hf_token: HuggingFace access token used when downloading
                ``model_name``. Needed for gated or private repositories
                (the default Llama model is gated). ``None`` leaves
                credential lookup to the HuggingFace libraries.
            model_name: Hub repository id of the causal LM to load locally.
        """
        # The model is loaded locally so generation runs on the Space's own
        # hardware. (HuggingFaceEndpoint would call the hosted Inference API
        # instead and would not use the local GPU.)
        self.llm = self._load_local_llm(model_name, hf_token=hf_token)

        # SQL agent
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            # ReAct-style text agent: does not require tool/function calling
            # support from the LLM.
            agent_type="zero-shot-react-description",
            verbose=True
        )

        # Embeddings - local sentence-transformers model on the best device.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': self._select_device()}
        )

        # Vector store
        # SECURITY NOTE: allow_dangerous_deserialization=True makes FAISS
        # unpickle the stored docstore. Only point vector_store_path at an
        # index that you created yourself or otherwise fully trust.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True
        )

        # RAG chain
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True
        )

    @staticmethod
    def _select_device() -> str:
        """Return ``"cuda"`` when a CUDA device is available, else ``"cpu"``."""
        return "cuda" if torch.cuda.is_available() else "cpu"

    @staticmethod
    def _auth_kwargs(hf_token) -> dict:
        """Return the ``from_pretrained`` keyword arguments for Hub auth.

        An empty or ``None`` token yields an empty dict, so the call is made
        exactly as it would be without any token argument.
        """
        return {"token": hf_token} if hf_token else {}

    def _load_local_llm(self, model_name: str, hf_token: str = None):
        """Load ``model_name`` locally and wrap it for LangChain.

        Args:
            model_name: Hub repository id (or local path) of a causal LM.
            hf_token: Optional HuggingFace access token forwarded to both
                ``from_pretrained`` calls; required for gated repositories.

        Returns:
            A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline
            (512 new tokens max, sampling at temperature 0.1, top_p 0.95,
            repetition penalty 1.15).
        """
        auth = self._auth_kwargs(hf_token)
        # FP16 halves GPU memory use; on CPU fall back to FP32.
        dtype = (
            torch.float16 if self._select_device() == "cuda" else torch.float32
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name, **auth)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",  # Let accelerate place the weights
            # SECURITY NOTE: trust_remote_code=True executes Python code
            # shipped in the model repository; only use trusted repos.
            trust_remote_code=True,
            **auth
        )

        # Create text generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15
        )

        # Wrap in LangChain
        return HuggingFacePipeline(pipeline=pipe)