# letstalk/src/core/embedding.py
# Author: Adrian Cowham
# Last change: using finetuned mythomax embedding model (commit 1ccbd42)
from typing import List, Type
import torch
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS
from .debug import FakeEmbeddings, FakeVectorStore
from .parsing import File
class FolderIndex:
    """A vector index built over the documents of a group of files (a folder).

    Attributes are set in ``__init__``:
      * ``name``  - always the string "default".
      * ``files`` - the files the index was built from.
      * ``index`` - the vector store holding the embedded documents.
    """

    def __init__(self, files: List[File], index: VectorStore):
        """Wrap an already-built vector store together with its source files."""
        self.name: str = "default"
        self.files = files
        self.index: VectorStore = index

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Flatten the documents of every file into one list.

        Each document is tagged in place: its ``metadata`` gains the
        ``file_name`` and ``file_id`` of the file it belongs to. The returned
        list holds the same document objects, in file order and then in
        per-file document order.
        """

        def tagged(owner):
            # Stamp the owning file's identity on each document as it is emitted.
            for document in owner.docs:
                document.metadata["file_name"] = owner.name
                document.metadata["file_id"] = owner.id
                yield document

        merged: List[Document] = []
        for owner in files:
            merged.extend(tagged(owner))
        return merged

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Build a ``FolderIndex`` by embedding all documents of ``files``.

        The combined documents are handed to ``vector_store.from_documents``
        together with ``embeddings``; the resulting store becomes the index.
        """
        store = vector_store.from_documents(
            documents=cls._combine_files(files),
            embedding=embeddings,
        )
        return cls(files=files, index=store)
def _select_device() -> str:
    """Return the torch device name to run the embedding model on.

    Precedence is "mps", then "cuda", then "cpu".
    """
    # Some torch builds have no `torch.backends.mps`; treat that as unavailable
    # instead of failing with AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def embed_files(files: List[File]) -> FolderIndex:
    """Embed ``files`` with the fine-tuned embedding model and index them in FAISS.

    Args:
        files: The files whose documents should be embedded and indexed.

    Returns:
        A ``FolderIndex`` built from ``files`` using ``HuggingFaceEmbeddings``
        and the ``FAISS`` vector store.

    Raises:
        RuntimeError: If the embedding model cannot be loaded. The original
            error is attached as ``__cause__``.
    """
    model_name = "adriancowham/letstalk-mythomax-embed-gte-small"
    model_kwargs = {"device": _select_device()}
    encode_kwargs = {"normalize_embeddings": True}  # set True to compute cosine similarity

    print("Loading model...")
    try:
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
    except Exception as exception:
        # No fallback model exists here, so fail loudly with the real cause
        # rather than continuing with an unbound variable.
        raise RuntimeError(
            f"Failed to load embedding model {model_name!r}: {exception}"
        ) from exception
    print("Model loaded.")

    return FolderIndex.from_files(
        files=files, embeddings=embeddings, vector_store=FAISS
    )