from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
import torch

class Handler:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

| | print("Loading model and tokenizer...") |
| | self.model = AutoModelForCausalLM.from_pretrained("PranavKeshav/upf_code_generator").to(self.device) |
| | self.tokenizer = AutoTokenizer.from_pretrained("PranavKeshav/upf_code_generator").to(self.device) |
| |
|
| | |
| | print("Loading FAISS index and embeddings...") |
| | self.embeddings = HuggingFaceEmbeddings() |
| | self.vectorstore = FAISS.load_local("faiss_index", self.embeddings, allow_dangerous_deserialization=True) |
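        # NOTE: "faiss_index" must already exist on disk. A minimal sketch of how
        # such an index could be built, assuming a list of LangChain Document
        # objects named `docs` (hypothetical; not part of this handler):
        #   FAISS.from_documents(docs, HuggingFaceEmbeddings()).save_local("faiss_index")
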
| | print("Creating Hugging Face pipeline...") |
| |
|
| | def run_inference(prompt: str): |
| | |
| | return self.model.generate( |
| | prompt, temperature=0.7, max_length=2048, top_p=0.95, repetition_penalty=1.15 |
| | ) |
| |
|
| | self.hf_pipeline = pipeline( |
| | "text-generation", |
| | model=self.model, |
| | tokenizer=self.tokenizer, |
| | temperature=0.7, |
| | max_new_tokens=2048, |
| | top_p=0.95, |
| | repetition_penalty=1.15 |
| | ) |
| |
|
| | self.hf_pipeline.model.generate = run_inference |
| | |
| | self.llm = HuggingFacePipeline(pipeline=self.hf_pipeline) |
| |
|
| | |
        self.retriever = self.vectorstore.as_retriever()
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    def __call__(self, request):
        prompt = request.json.get("prompt")
        # Retrieve relevant context from the FAISS index and generate a response.
        response = self.qa_chain.run(prompt)
        return {"response": response}