VcRlAgent committed on
Commit
5e7defa
·
1 Parent(s): d52e728

Starter LLM Inference Call

Browse files
Files changed (2) hide show
  1. __init__.py → app/__init__.py +0 -0
  2. hybrid_rag.py +89 -0
__init__.py → app/__init__.py RENAMED
File without changes
hybrid_rag.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Optional

import torch
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.agent_toolkits import create_sql_agent
# Alternative: Use transformers directly for more control
from langchain_community.llms import HuggingFacePipeline
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
# ✅ REPLACE OpenAI with HuggingFace models
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

class HybridJiraRAG:
    """Hybrid RAG system for HuggingFace Spaces GPU.

    Combines two retrieval paths over Jira data:
      * a LangChain SQL agent over a relational database, and
      * a RetrievalQA chain over a FAISS vector store,
    both driven by a causal LM loaded locally on the GPU via transformers.
    """

    def __init__(
        self,
        sql_db_uri: str,
        vector_store_path: str,
        hf_token: Optional[str] = None,  # FIX: default is None, so annotate Optional
        model_name: str = "meta-llama/Llama-3.2-3B-Instruct",
    ):
        """Build the SQL agent, embeddings, vector store, and RAG chain.

        Args:
            hf_token: HuggingFace API token. Only needed for Option 1
                (HF Inference API, commented out below); unused when the
                model is loaded locally.
        """
        # Option 1: Use HF Inference API (doesn't use your GPU)
        # self.llm = HuggingFaceEndpoint(
        #     repo_id=model_name,
        #     huggingfacehub_api_token=hf_token,
        #     temperature=0.1,
        #     max_new_tokens=512,
        # )

        # Option 2: Load model locally on GPU (RECOMMENDED for HF Spaces)
        self.llm = self._load_local_llm(model_name)

        # SQL agent over the Jira database.
        self.sql_db = SQLDatabase.from_uri(sql_db_uri)
        self.sql_agent = create_sql_agent(
            self.llm,
            db=self.sql_db,
            agent_type="zero-shot-react-description",  # More compatible
            verbose=True,
        )

        # Embeddings - local HuggingFace model, on GPU when available.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
        )

        # Vector store.
        # NOTE(security): allow_dangerous_deserialization unpickles the
        # index file — only load FAISS indexes you created yourself.
        self.vector_store = FAISS.load_local(
            vector_store_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        # RAG chain over the top-5 retrieved documents.
        self.rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 5}),
            return_source_documents=True,
        )

    def _load_local_llm(self, model_name: str) -> HuggingFacePipeline:
        """Load *model_name* locally (GPU if present) and wrap it for LangChain."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # FIX: fall back to float32 on CPU — float16 inference on CPU is
        # unsupported or very slow for many ops. Mirrors the device check
        # used for the embeddings model in __init__.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # NOTE(security): trust_remote_code executes Python shipped inside
        # the model repo — keep only for repos you trust.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",  # Automatically place layers on the GPU
            trust_remote_code=True,
        )

        # Create text generation pipeline.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.15,
        )

        # Wrap in LangChain.
        return HuggingFacePipeline(pipeline=pipe)