"""LLM loading and initialization."""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
import streamlit as st

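# Cache the loaded model as a Streamlit resource so the weights are
# loaded only once per server process, not on every script rerun.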
@st.cache_resource
def load_qwen_llm(model_name, max_new_tokens=256, temperature=0.7, top_p=0.95):
""" |
|
|
Load Qwen LLM model. |
|
|
|
|
|
Args: |
|
|
model_name: HuggingFace model identifier |
|
|
max_new_tokens: Maximum tokens to generate |
|
|
temperature: Sampling temperature |
|
|
top_p: Nucleus sampling parameter |
|
|
|
|
|
Returns: |
|
|
HuggingFacePipeline: Wrapped LLM for LangChain |
|
|
""" |
|
|
|
|
|
|
|
|
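    # Qwen checkpoints ship custom code on the Hub, so trust_remote_code
    # must be enabled for both the tokenizer and the model.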
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )

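    # Load in half precision with automatic device placement when a GPU
    # is available; fall back to full precision on CPU.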
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True
    )

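    # return_full_text=False strips the prompt from the generated text,
    # so downstream LangChain chains receive only the completion.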
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        return_full_text=False
    )

    return HuggingFacePipeline(pipeline=pipe)
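
# Example usage, a minimal sketch: the returned HuggingFacePipeline is a
# LangChain Runnable, so invoke() takes a prompt string and returns the
# generated text. The model id below is an assumption; substitute whichever
# Qwen checkpoint your app actually uses.
#
#     llm = load_qwen_llm("Qwen/Qwen2-1.5B-Instruct")
#     print(llm.invoke("Explain nucleus sampling in one sentence."))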