# git-chat/services/embedding_service.py
# Author: Chintala Venkatesh
# Commit 688791e: Fix Space dependency mismatch and startup import
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
import chromadb
from typing import List, Dict
import os
class FreeEmbeddingService:
    """Build and retrieve Chroma vector stores backed by free, locally
    cached embedding models (sentence-transformers / HuggingFace).
    """

    def __init__(self, embedding_provider: str, vector_db_path: str,
                 model_name: str = "all-MiniLM-L6-v2"):
        """
        Args:
            embedding_provider: Either "sentence_transformers" or "huggingface".
            vector_db_path: Directory where Chroma persists its collections.
            model_name: HuggingFace / sentence-transformers model identifier.

        Raises:
            ValueError: If ``embedding_provider`` is not one of the supported values.
        """
        self.vector_db_path = vector_db_path
        self.embedding_provider = embedding_provider

        # Both wrappers download the model once and cache it under ./models,
        # so no API key or network access is required after the first run.
        if embedding_provider == "sentence_transformers":
            self.embeddings = SentenceTransformerEmbeddings(
                model_name=model_name,
                cache_folder="./models",
            )
        elif embedding_provider == "huggingface":
            self.embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder="./models",
            )
        else:
            raise ValueError(f"Unsupported embedding provider: {embedding_provider}")

        # Recursive splitter tries paragraph, then line, then word boundaries
        # before falling back to hard character cuts.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""],
        )

    def create_documents(self, files: List[Dict], repo_id: str) -> List[Document]:
        """Wrap raw file payloads in langchain Documents with repo metadata.

        Each entry in ``files`` is expected to carry 'content', 'path',
        'extension' and 'size' keys (KeyError if absent, as before).
        """
        return [
            Document(
                page_content=file_info['content'],
                metadata={
                    'path': file_info['path'],
                    'extension': file_info['extension'],
                    'repo_id': repo_id,
                    'size': file_info['size'],
                },
            )
            for file_info in files
        ]

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into overlapping ~1000-character chunks."""
        return self.text_splitter.split_documents(documents)

    async def create_embeddings(self, files: List[Dict], repo_id: str):
        """Create and store embeddings for repository files.

        Returns:
            The Chroma vector store for the repository (possibly empty
            when ``files`` is empty).
        """
        documents = self.create_documents(files, repo_id)
        chunks = self.split_documents(documents)

        # Delegate to get_vectorstore so collection naming and store
        # configuration stay consistent across the class.
        vectorstore = self.get_vectorstore(repo_id)

        # Add in batches to keep memory bounded on large repositories.
        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            vectorstore.add_documents(chunks[i:i + batch_size])

        # Older langchain/chromadb releases only flush to disk on persist();
        # newer Chroma clients persist automatically and drop the method.
        if hasattr(vectorstore, "persist"):
            vectorstore.persist()

        return vectorstore

    def get_vectorstore(self, repo_id: str):
        """Open (or lazily create) the Chroma collection for a repository."""
        return Chroma(
            collection_name=f"repo_{repo_id}",
            embedding_function=self.embeddings,
            persist_directory=self.vector_db_path,
        )