| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain.schema import Document |
| from langchain_community.vectorstores import Chroma |
| from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings |
| import chromadb |
| from typing import List, Dict |
| import os |
|
|
class FreeEmbeddingService:
    """Create, store, and retrieve vector embeddings for repository files
    using free/local embedding models and a persistent Chroma store."""

    def __init__(self, embedding_provider: str, vector_db_path: str, model_name: str = "all-MiniLM-L6-v2"):
        """Initialise the embedding backend and the text splitter.

        Args:
            embedding_provider: "sentence_transformers" or "huggingface".
                (In recent langchain_community versions these resolve to the
                same HuggingFace-backed implementation.)
            vector_db_path: Directory where the Chroma store persists data.
            model_name: Sentence-transformers model name; downloaded into
                ./models on first use.

        Raises:
            ValueError: If embedding_provider is not a supported value.
        """
        self.vector_db_path = vector_db_path
        self.embedding_provider = embedding_provider

        if embedding_provider == "sentence_transformers":
            self.embeddings = SentenceTransformerEmbeddings(
                model_name=model_name,
                cache_folder="./models",
            )
        elif embedding_provider == "huggingface":
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder="./models",
            )
        else:
            raise ValueError(f"Unsupported embedding provider: {embedding_provider}")

        # Split on paragraph -> line -> word boundaries before falling back to
        # hard character cuts; the 200-char overlap preserves context between
        # adjacent chunks.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )

    def create_documents(self, files: List[Dict], repo_id: str) -> List[Document]:
        """Wrap raw file records in langchain Documents.

        Each element of *files* must carry 'content', 'path', 'extension',
        and 'size' keys; a missing key propagates as KeyError.
        """
        return [
            Document(
                page_content=file_info['content'],
                metadata={
                    'path': file_info['path'],
                    'extension': file_info['extension'],
                    'repo_id': repo_id,
                    'size': file_info['size'],
                },
            )
            for file_info in files
        ]

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into overlapping chunks via the configured splitter."""
        return self.text_splitter.split_documents(documents)

    async def create_embeddings(self, files: List[Dict], repo_id: str):
        """Create and store embeddings for repository files.

        Returns the Chroma vector store holding the embedded chunks.

        NOTE(review): this coroutine performs blocking embedding/DB work
        inline; callers on a busy event loop may want to offload it to a
        thread — confirm against usage.
        """
        documents = self.create_documents(files, repo_id)
        chunks = self.split_documents(documents)

        # Delegate store construction to get_vectorstore() so the per-repo
        # collection naming and Chroma configuration live in one place.
        vectorstore = self.get_vectorstore(repo_id)

        # Insert in batches to bound per-call memory and embedding latency.
        batch_size = 100
        for start in range(0, len(chunks), batch_size):
            vectorstore.add_documents(chunks[start:start + batch_size])

        return vectorstore

    def get_vectorstore(self, repo_id: str):
        """Return the (possibly empty) persistent Chroma store for *repo_id*."""
        return Chroma(
            collection_name=f"repo_{repo_id}",
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path
        )
|
|