import os
from typing import List

import numpy as np
import onnxruntime as ort
import torch
from huggingface_hub import hf_hub_download
from langchain_core.embeddings import Embeddings
from PIL import Image
from torchvision.models import EfficientNet_V2_S_Weights, efficientnet_v2_s
from transformers import AutoTokenizer

# Requires `huggingface-cli login` or the HF_TOKEN environment variable.
hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")


class OnnxGemmaWrapper(Embeddings):
    def __init__(self, model_id, token=None):
        print(f"Loading ONNX model: {model_id}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

        # Download the ONNX graph and its external weights.
        model_path = hf_hub_download(model_id, subfolder="onnx", filename="model.onnx", token=token)
        try:
            hf_hub_download(model_id, subfolder="onnx", filename="model.onnx_data", token=token)
        except Exception:
            pass  # model.onnx_data may be absent (smaller models embed weights in model.onnx)

        # Create the inference session (prefer CUDA when available, otherwise fall back to CPU).
        available_providers = ort.get_available_providers()
        if "CUDAExecutionProvider" in available_providers:
            print("CUDA detected. Using GPU.")
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            print("CUDA not detected. Using CPU.")
            providers = ["CPUExecutionProvider"]

        self.session = ort.InferenceSession(model_path, providers=providers)

        # Task prefixes expected by EmbeddingGemma.
        self.prefixes = {
            "query": "task: search result | query: ",
            "document": "title: none | text: ",
        }
        print("ONNX model loaded successfully.")

    def _run_inference(self, texts: List[str]) -> np.ndarray:
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="np")
        # Feed only the inputs the graph declares, cast to int64 as ONNX Runtime expects.
        input_names = {i.name for i in self.session.get_inputs()}
        ort_inputs = {k: np.asarray(v, dtype=np.int64) for k, v in inputs.items() if k in input_names}
        # outputs[0] is the last_hidden_state; for the EmbeddingGemma ONNX export,
        # outputs[1] is the pooled sentence embedding of shape (batch, 768).
        outputs = self.session.run(None, ort_inputs)
        return outputs[1]

    def encode_document(self, documents: List[str]) -> np.ndarray:
        # Prepend the document prefix.
        prefixed_docs = [self.prefixes["document"] + doc for doc in documents]
        return self._run_inference(prefixed_docs)

    def encode_query(self, query: str) -> np.ndarray:
        # Prepend the query prefix (a single query is still batched as a list).
        prefixed_query = [self.prefixes["query"] + query]
        return self._run_inference(prefixed_query)[0]

    def similarity(self, query_emb: np.ndarray, doc_embs: np.ndarray) -> np.ndarray:
        if query_emb.ndim == 1:
            query_emb = query_emb.reshape(1, -1)
        scores = query_emb @ doc_embs.T
        return scores.flatten()

    # --- LangChain compatibility methods ---
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.encode_document(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.encode_query(text).tolist()


class EfficientNetV2Embedding:
    def __init__(self):
        print("Loading EfficientNetV2-S model...")
        self.weights = EfficientNet_V2_S_Weights.DEFAULT
        self.model = efficientnet_v2_s(weights=self.weights)
        self.model.eval()
        # Replace the classification head with an identity to expose the 1280-d features.
        self.model.classifier = torch.nn.Identity()
        self.preprocess = self.weights.transforms()
        print("EfficientNetV2-S model loaded successfully.")

    def embed_image(self, image: Image.Image) -> List[float]:
        # Preprocess and add a batch dimension.
        img_tensor = self.preprocess(image).unsqueeze(0)
        with torch.no_grad():
            embedding = self.model(img_tensor)
        return embedding.squeeze(0).tolist()


# Module-level singleton instances.
_embedding_model = None
_image_embedding_model = None


def get_embedding_model() -> OnnxGemmaWrapper:
    """Load the ONNX embedding model on first use and reuse it as a singleton."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = OnnxGemmaWrapper(
            model_id="onnx-community/embeddinggemma-300m-ONNX",
            token=hf_token,
        )
    return _embedding_model


def get_image_embedding_model() -> EfficientNetV2Embedding:
    """Load the EfficientNetV2-S model on first use and reuse it as a singleton."""
    global _image_embedding_model
    if _image_embedding_model is None:
        _image_embedding_model = EfficientNetV2Embedding()
    return _image_embedding_model