Spaces:
Sleeping
Sleeping
| import glob | |
| import logging | |
| from pathlib import Path | |
| from typing import List, Optional | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.document_loaders import CSVLoader | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.docstore.document import Document | |
# Root logger setup: records at INFO and above are timestamped and written
# both to the file 'vectorize.log' and to the console stream.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("vectorize.log"), logging.StreamHandler()],
)
class VectorizationError(Exception):
    """Signals a failure in the vectorization workflow (e.g. CSV loading)."""
def load_csv_documents(csv_file_path: str) -> List[Document]:
    """
    Load every CSV file matching a path or glob pattern into documents.

    Args:
        csv_file_path (str): Path or glob pattern used to locate CSV files.

    Returns:
        List[Document]: The documents produced by ``CSVLoader`` for all
            matching files, with files processed in sorted path order.

    Raises:
        VectorizationError: If no file matches the pattern, or if locating or
            loading a file fails (the underlying exception is chained as
            ``__cause__``).
    """
    try:
        documents: List[Document] = []
        # glob returns matches in arbitrary order; sorting keeps the document
        # order (and therefore the resulting index) reproducible across runs.
        csv_files = sorted(glob.glob(csv_file_path))
        if not csv_files:
            raise VectorizationError(f"No CSV files found at path: {csv_file_path}")
        for csv_file in csv_files:
            logging.info(f"Loading CSV file: {csv_file}")
            loader = CSVLoader(csv_file, encoding="utf-8")
            documents.extend(loader.load())
        logging.info(f"Successfully loaded {len(documents)} documents from {len(csv_files)} CSV files")
        return documents
    except VectorizationError:
        # Already a domain error with a precise message; do not re-wrap it
        # under the generic "Error loading CSV documents" prefix.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved for debugging.
        raise VectorizationError(f"Error loading CSV documents: {str(e)}") from e
def create_vector_store(
    documents: List[Document],
    embeddings_model: HuggingFaceEmbeddings,
    output_path: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50
) -> Optional[FAISS]:
    """
    Split documents into chunks, embed them into a FAISS index and save it.

    Args:
        documents (List[Document]): List of documents to vectorize
        embeddings_model (HuggingFaceEmbeddings): The embeddings model to use
        output_path (str): Path to save the FAISS index
        chunk_size (int, optional): Size of text chunks. Defaults to 500.
        chunk_overlap (int, optional): Overlap between chunks. Defaults to 50.

    Returns:
        Optional[FAISS]: The created FAISS index if successful. None when
            there is nothing to index or when any step fails; the failure is
            logged rather than raised.
    """
    # Nothing to index: report it clearly instead of letting the index
    # construction fail later with an unrelated-looking error.
    if not documents:
        logging.error("Error creating vector store: no documents to vectorize")
        return None
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunked_documents = text_splitter.split_documents(documents)
        logging.info(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")
        if not chunked_documents:
            logging.error("Error creating vector store: documents produced no text chunks")
            return None
        faiss_index = FAISS.from_documents(chunked_documents, embeddings_model)
        # Make sure the directory that will contain the index exists.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        faiss_index.save_local(output_path)
        logging.info(f"Successfully saved FAISS index to {output_path}")
        return faiss_index
    except Exception as e:
        # Best-effort contract: log (with traceback) and signal failure via None.
        logging.error(f"Error creating vector store: {str(e)}", exc_info=True)
        return None
def main():
    """
    Run the full vectorization pipeline for the MSD and medical CSV data.

    Loads both CSV sources, initializes the embeddings model, builds one
    FAISS vector store per source and logs whether both builds succeeded.
    Any exception is logged and then re-raised to the caller.
    """
    try:
        # Input locations, output locations and model name (relative paths)
        config = {
            'msd_data_path': "./processed_data/msd/msd_processed.csv",
            'medical_csv_path': "./processed_data/cbip/*.csv",
            'msd_vector_path': "./vectors_data/msd_data_vec",
            'medical_vector_path': "./vectors_data/med_data_vec",
            'model_name': "sentence-transformers/all-MiniLM-L12-v2"
        }

        # Ensure the output directory is present
        Path("./vectors_data").mkdir(exist_ok=True)
        logging.info("Starting vectorization process")

        # Both sources are loaded before the embeddings model is initialized;
        # each entry pairs the loaded documents with their output location.
        sources = [
            (load_csv_documents(config['msd_data_path']), config['msd_vector_path']),
            (load_csv_documents(config['medical_csv_path']), config['medical_vector_path']),
        ]

        logging.info(f"Initializing embeddings model: {config['model_name']}")
        embeddings_model = HuggingFaceEmbeddings(model_name=config['model_name'])

        # Build the MSD store first, then the medical store
        built_indexes = [
            create_vector_store(docs, embeddings_model, vector_path)
            for docs, vector_path in sources
        ]

        if all(built_indexes):
            logging.info("Vectorization process completed successfully")
        else:
            logging.error("Vectorization process completed with errors")
    except VectorizationError as ve:
        logging.error(f"Vectorization error: {str(ve)}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        raise
# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()