import os

import torch
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Use /tmp for cache directory which should be writable
CACHE_DIR = "/tmp/huggingface_cache"
FAISS_INDEX_PATH = "/tmp/faiss_index_scientific_papers"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Create cache directories
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(os.path.dirname(FAISS_INDEX_PATH), exist_ok=True)


def get_vector_store():
    """Creates or loads the FAISS vector store."""
    # Initialize embeddings with proper cache folder
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        cache_folder=CACHE_DIR,
        model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    )

    # Check if FAISS index already exists
    if os.path.exists(FAISS_INDEX_PATH):
        try:
            return FAISS.load_local(
                FAISS_INDEX_PATH,
                embeddings,
                allow_dangerous_deserialization=True,
            )
        except Exception as e:
            print(f"Failed to load existing index: {e}")
            # Fall through and rebuild the index from scratch

    # Create a new FAISS index
    print("Creating new FAISS index...")
    try:
        # Stream the dataset and take only the first 100 papers to keep indexing fast
        full_dataset = load_dataset("franz96521/scientific_papers", split="train", streaming=True)
        subset_dataset_iterable = full_dataset.take(100)
        papers_data = list(subset_dataset_iterable)

        # Split each paper's full text into overlapping chunks for embedding
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        all_chunks = []
        for paper in papers_data:
            chunks = text_splitter.split_text(paper["full_text"])
            for chunk in chunks:
                all_chunks.append(
                    Document(
                        page_content=chunk,
                        metadata={"paper_id": paper["id"]},
                    )
                )

        print(f"Created {len(all_chunks)} document chunks")

        # Embed the chunks and build the FAISS index
        vector_store = FAISS.from_documents(all_chunks, embeddings)

        # Save the index so later runs can load it instead of rebuilding
        vector_store.save_local(FAISS_INDEX_PATH)
        print("FAISS index saved successfully")
        return vector_store
    except Exception as e:
        print(f"Error creating vector store: {e}")
        raise
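

# --- Usage sketch (illustrative, not part of the original pipeline) ---
# A minimal example of querying the store returned by get_vector_store().
# The query string and k value are assumptions chosen purely for demonstration.
if __name__ == "__main__":
    store = get_vector_store()
    # similarity_search returns the k chunks whose embeddings are closest to the query
    results = store.similarity_search("methods for summarizing scientific papers", k=3)
    for doc in results:
        print(doc.metadata["paper_id"], doc.page_content[:200])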