from typing import Dict, List
import os

import chromadb
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma


class FreeEmbeddingService:
    """Create, store, and retrieve vector embeddings for repository files.

    Uses free, locally-runnable embedding models (sentence-transformers or
    HuggingFace) with a Chroma vector store persisted on disk. One Chroma
    collection is created per repository (``repo_<repo_id>``).
    """

    def __init__(
        self,
        embedding_provider: str,
        vector_db_path: str,
        model_name: str = "all-MiniLM-L6-v2",
        *,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        batch_size: int = 100,
    ):
        """Initialize the embedding backend and text splitter.

        Args:
            embedding_provider: Either ``"sentence_transformers"`` or
                ``"huggingface"``.
            vector_db_path: Directory where Chroma persists its data.
            model_name: Embedding model identifier (default is a small,
                fast sentence-transformers model).
            chunk_size: Maximum characters per chunk (default 1000).
            chunk_overlap: Characters of overlap between adjacent chunks
                (default 200).
            batch_size: Number of chunks written to Chroma per call
                (default 100).

        Raises:
            ValueError: If ``embedding_provider`` is not supported.
        """
        self.vector_db_path = vector_db_path
        self.embedding_provider = embedding_provider
        self.batch_size = batch_size

        # Select the embedding backend; both cache model weights under
        # ./models so repeated runs need no network access.
        if embedding_provider == "sentence_transformers":
            self.embeddings = SentenceTransformerEmbeddings(
                model_name=model_name,
                cache_folder="./models",  # Cache models locally
            )
        elif embedding_provider == "huggingface":
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder="./models",
            )
        else:
            raise ValueError(f"Unsupported embedding provider: {embedding_provider}")

        # Prefer paragraph, then line, then word boundaries before falling
        # back to raw character splits.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )

    def create_documents(self, files: List[Dict], repo_id: str) -> List[Document]:
        """Create LangChain documents from file contents.

        Each entry in ``files`` must provide ``content``, ``path``,
        ``extension`` and ``size`` keys. ``repo_id`` is stamped into every
        document's metadata so chunks can be traced back to their repo.
        """
        # Comprehension instead of a manual append loop (same output order).
        return [
            Document(
                page_content=file_info['content'],
                metadata={
                    'path': file_info['path'],
                    'extension': file_info['extension'],
                    'repo_id': repo_id,
                    'size': file_info['size'],
                },
            )
            for file_info in files
        ]

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into overlapping chunks for embedding."""
        return self.text_splitter.split_documents(documents)

    async def create_embeddings(self, files: List[Dict], repo_id: str):
        """Create and store embeddings for repository files.

        Returns the populated Chroma vector store.

        NOTE(review): the body is entirely synchronous blocking work, so
        awaiting this coroutine will stall the event loop during embedding —
        consider ``loop.run_in_executor`` at the call site. Kept ``async``
        because existing callers ``await`` it.
        """
        documents = self.create_documents(files, repo_id)
        chunks = self.split_documents(documents)

        # One collection per repository keeps repositories isolated.
        vectorstore = Chroma(
            collection_name=f"repo_{repo_id}",
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path,
        )

        # Write in batches to bound memory use on large repositories.
        for start in range(0, len(chunks), self.batch_size):
            vectorstore.add_documents(chunks[start:start + self.batch_size])

        return vectorstore

    def get_vectorstore(self, repo_id: str):
        """Return the existing persisted vector store for ``repo_id``."""
        return Chroma(
            collection_name=f"repo_{repo_id}",
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path,
        )