import warnings

# Installed before the third-party imports below, so the filter is already
# active while those packages are being imported.
warnings.filterwarnings(action='ignore')

from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from dotenv import load_dotenv

# Runs once at import time; python-dotenv reads a .env file (if one is found)
# into the process environment.
load_dotenv()


def load_huggingface_model(
    *,
    repo_id: str = "Qwen/Qwen2.5-7B-Instruct",
    temperature: float = 0.65,
    max_new_tokens: int = 1024,
    top_p: float = 0.92,
    repetition_penalty: float = 1.05,
) -> ChatHuggingFace:
    """
    Build a LangChain chat model backed by a Hugging Face endpoint.

    Constructs a ``HuggingFaceEndpoint`` for ``repo_id`` with the
    ``"text-generation"`` task and the given sampling settings, then wraps
    it in ``ChatHuggingFace``. Calling the function with no arguments
    yields the same configuration as before these parameters existed.

    Parameters
    ----------
    repo_id : str, optional
        Hugging Face Hub model identifier passed to the endpoint.
        Defaults to ``"Qwen/Qwen2.5-7B-Instruct"``.
    temperature : float, optional
        Sampling temperature forwarded to the endpoint. Defaults to 0.65.
    max_new_tokens : int, optional
        Generation length limit forwarded to the endpoint. Defaults to 1024.
    top_p : float, optional
        Nucleus-sampling threshold forwarded to the endpoint.
        Defaults to 0.92.
    repetition_penalty : float, optional
        Repetition penalty forwarded to the endpoint. Defaults to 1.05.

    Returns
    -------
    ChatHuggingFace
        The chat wrapper around the newly created endpoint.

    Notes
    -----
    - All parameters are keyword-only.
    - A new endpoint and a new wrapper are constructed on every call;
      nothing is cached here.
    - No credentials are passed explicitly. Presumably the endpoint picks
      up the API token from the environment (e.g.
      ``HUGGINGFACEHUB_API_TOKEN``, which ``load_dotenv()`` may have
      loaded at import time) -- confirm against the installed
      ``langchain_huggingface`` version.
    - This function performs no validation of its own; any exception
      originates from the ``HuggingFaceEndpoint`` or ``ChatHuggingFace``
      constructors.
    """
    endpoint = HuggingFaceEndpoint(
        repo_id=repo_id,
        task="text-generation",
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
    return ChatHuggingFace(llm=endpoint)