Spaces:
Sleeping
Sleeping
File size: 1,791 Bytes
1d8ed3b cf868d7 1d8ed3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import chromadb
from chromadb.utils import embedding_functions
from .text_processing import text_chunking
def initialize_chromadb(EMBEDDING_MODEL, local_model_path=None):
    """
    Create a ChromaDB client and a SentenceTransformer embedding function.

    Args:
        EMBEDDING_MODEL: Model name used when no local path is supplied.
        local_model_path: Optional filesystem path to a local model; a
            truthy value takes precedence over EMBEDDING_MODEL.

    Returns:
        Tuple of (client, embedding_func).
    """
    client = chromadb.Client()
    # A truthy local path wins over the default model name; both branches
    # build the same embedding function, only the source differs.
    model_source = local_model_path if local_model_path else EMBEDDING_MODEL
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_source
    )
    return client, embedding_func
def initialize_collection(client, embedding_func, collection_name):
    """
    Fetch or create a named ChromaDB collection using cosine distance.

    Args:
        client: ChromaDB client object.
        embedding_func: Embedding function attached to the collection.
        collection_name: Name of the collection to get or create.

    Returns:
        The ChromaDB collection object.
    """
    # Cosine similarity is configured through the HNSW metadata key.
    return client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": "cosine"},
    )
def update_collection(
    collection,
    text,
    max_words=200,
    min_words=100,
    overlap_sentences=3,
):
    """
    Update the ChromaDB collection with text chunks.

    Args:
        collection: ChromaDB collection object.
        text (str): The text to be chunked and added.
        max_words (int): Maximum number of words per chunk.
        min_words (int): Minimum number of words per chunk.
        overlap_sentences (int): Number of sentences to overlap between chunks.

    Returns:
        None
    """
    chunks = text_chunking(
        text,
        max_words=max_words,
        min_words=min_words,
        overlap_sentences=overlap_sentences,
    )
    # Guard: ChromaDB's Collection.add rejects empty document/id lists, so
    # skip the call entirely when the text yields no chunks (e.g. empty text).
    if not chunks:
        return
    indices = range(len(chunks))
    # NOTE(review): ids restart at chunk_0000 on every call, so adding a second
    # document to the same collection collides with existing ids — confirm
    # callers use one collection per document or intend upsert-like behavior.
    collection.add(
        documents=chunks,
        ids=[f"chunk_{j:04d}" for j in indices],
        metadatas=[{"chunk_index": j} for j in indices],
    )
|