File size: 2,299 Bytes
e71c4e6
 
c99fd41
e71c4e6
5d02356
e71c4e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d02356
1ccbd42
c99fd41
5d02356
c99fd41
 
 
 
 
5d02356
 
1e894a3
 
 
 
 
 
 
 
5d02356
 
e71c4e6
5d02356
e71c4e6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from typing import List, Type

import torch
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS

from .debug import FakeEmbeddings, FakeVectorStore
from .parsing import File


class FolderIndex:
    """A vector index built over the documents of a group of files (a folder)."""

    def __init__(self, files: List[File], index: VectorStore):
        # Keep the source files next to the vector store built from them.
        self.files = files
        self.index: VectorStore = index
        self.name: str = "default"

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Flatten the documents of every file into one list.

        Each document's metadata is updated in place with the ``file_name``
        and ``file_id`` of the file it belongs to.
        """
        combined = []
        for source_file in files:
            for document in source_file.docs:
                document.metadata.update(
                    file_name=source_file.name,
                    file_id=source_file.id,
                )
                combined.append(document)
        return combined

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Build a FolderIndex by indexing the files' documents in ``vector_store``."""
        documents = cls._combine_files(files)
        store = vector_store.from_documents(documents=documents, embedding=embeddings)
        return cls(files=files, index=store)


def embed_files(files: List[File]) -> FolderIndex:
    """Embed the given files with a HuggingFace model and index them in FAISS.

    The model runs on CPU unless torch reports an accelerator: CUDA is used
    when available, and MPS, when available, takes precedence over it.

    Args:
        files: Files to embed and index.

    Returns:
        A FolderIndex over ``files`` backed by a FAISS vector store.

    Raises:
        RuntimeError: If the embedding model cannot be loaded. The underlying
            error is chained as the cause.
    """
    model_name = "adriancowham/letstalk-mythomax-embed-gte-small"

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    # Older torch builds have no ``torch.backends.mps``; probe for it instead
    # of assuming the attribute exists.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device = "mps"
    model_kwargs = {"device": device}

    # Normalized embeddings make inner-product search equal cosine similarity.
    encode_kwargs = {"normalize_embeddings": True}

    print("Loading model...")
    try:
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
    except Exception as exception:
        # Previously the error was swallowed and the function then crashed
        # with UnboundLocalError; raise an explicit, chained error instead.
        raise RuntimeError(
            f"Failed to load embedding model {model_name!r}: {exception}"
        ) from exception
    print("Model loaded.")

    return FolderIndex.from_files(
        files=files, embeddings=embeddings, vector_store=FAISS
    )