VcRlAgent committed on
Commit
5e7defa
·
1 Parent(s): d52e728

Starter LLM Inference Call

Browse files
Files changed (2) hide show
  1. __init__.py → app/__init__.py +0 -0
  2. hybrid_rag.py +89 -0
__init__.py → app/__init__.py RENAMED
File without changes
hybrid_rag.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Optional

import torch
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.agent_toolkits import create_sql_agent
# Alternative: Use transformers directly for more control
from langchain_community.llms import HuggingFacePipeline
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
# ✅ REPLACE OpenAI with HuggingFace models
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    Combines two retrieval paths over Jira data:
      * a LangChain SQL agent over a relational database, and
      * a RetrievalQA chain over a FAISS vector store,
    both driven by a causal LM loaded locally on the GPU via transformers.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: Optional[str] = None,  # FIX: default is None, so annotate Optional
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Build the SQL agent, embeddings, vector store, and RAG chain.

        Args:
            hf_token: HuggingFace API token. Only needed for Option 1
                (HF Inference API, commented out below); unused when the
                model is loaded locally.
        """
        # Option 1: Use HF Inference API (doesn't use your GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: Load model locally on GPU (RECOMMENDED for HF Spaces)
        self.llm = self._load_local_llm(model_name)

        # SQL agent over the Jira database.
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # More compatible
            verbose=True,
        )

        # Embeddings - local HuggingFace model, on GPU when available.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
        )

        # Vector store.
        # NOTE(security): allow_dangerous_deserialization unpickles the
        # index file — only load FAISS indexes you created yourself.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        # RAG chain over the top-5 retrieved documents.
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True,
        )

    def _load_local_llm(self, model_name: str) -> HuggingFacePipeline:
        """Load *model_name* locally (GPU if present) and wrap it for LangChain."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # FIX: fall back to float32 on CPU — float16 inference on CPU is
        # unsupported or very slow for many ops. Mirrors the device check
        # used for the embeddings model in __init__.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # NOTE(security): trust_remote_code executes Python shipped inside
        # the model repo — keep only for repos you trust.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",  # Automatically place layers on the GPU
            trust_remote_code=True,
        )

        # Create text generation pipeline.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        # Wrap in LangChain.
        return HuggingFacePipeline(pipeline=pipe)