"""LLM loading and initialization."""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
import streamlit as st


@st.cache_resource
def load_qwen_llm(
    model_name: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> HuggingFacePipeline:
    """
    Load a causal LM and wrap it as a LangChain-compatible text generator.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    can reuse the loaded object across reruns instead of reloading the
    weights for the same arguments.

    Args:
        model_name: HuggingFace model identifier (or local path) passed to
            ``from_pretrained``.
        max_new_tokens: Maximum number of tokens to generate per call.
        temperature: Sampling temperature. A value of ``0`` selects greedy
            (deterministic) decoding instead of sampling.
        top_p: Nucleus sampling parameter; only used when sampling is
            enabled (``temperature > 0``).

    Returns:
        HuggingFacePipeline: Wrapped text-generation pipeline for LangChain.
        Only newly generated text is returned (``return_full_text=False``).

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(
            f"temperature must be non-negative, got {temperature!r}"
        )

    use_cuda = torch.cuda.is_available()

    # SECURITY NOTE: trust_remote_code=True lets the model repository run
    # its own Python code during loading. Only pass model names from
    # sources you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # Half precision on GPU, full precision on CPU.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        # Automatic device placement only when a GPU is available.
        device_map="auto" if use_cuda else None,
        trust_remote_code=True,
    )

    # Sampling requires a strictly positive temperature; with
    # temperature == 0 fall back to greedy decoding rather than passing an
    # invalid sampling configuration to the pipeline.
    do_sample = temperature > 0
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "return_full_text": False,
    }
    if do_sample:
        # Sampling-only knobs; omitted for greedy decoding where they
        # have no effect.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_kwargs,
    )

    return HuggingFacePipeline(pipeline=pipe)