Spaces:
Running
Running
| import os | |
| from langchain_community.document_loaders import PyPDFLoader, TextLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from dotenv import load_dotenv | |
# Pull settings from a local .env file (if any) into the process environment
# before anything below runs.
load_dotenv()

# Absolute directory containing this script; used to anchor the output path so
# the index location does not depend on where the script is launched from.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Folder scanned for source documents.
# NOTE(review): this is relative to the current working directory, unlike
# DB_FAISS_PATH which is anchored to SCRIPT_DIR -- confirm that is intended.
DATA_PATH = "data"

# Destination folder for the persisted FAISS index.
DB_FAISS_PATH = os.path.join(SCRIPT_DIR, "vectorstore", "db_faiss")
def _load_documents(data_path):
    """Load every supported file located directly inside *data_path*.

    Files ending in ``.pdf`` are read with ``PyPDFLoader`` and files ending in
    ``.txt`` with ``TextLoader`` (UTF-8). Extensions are matched
    case-insensitively; anything else, including sub-directories, is skipped.

    Args:
        data_path: Path of an existing directory to scan (not recursive).

    Returns:
        A list with the documents returned by the loaders, in file-name order.
    """
    documents = []
    # Sorted so the ingestion order does not depend on the arbitrary order in
    # which os.listdir() happens to return entries.
    for filename in sorted(os.listdir(data_path)):
        file_path = os.path.join(data_path, filename)
        # A sub-directory named e.g. "archive.pdf" must never reach a loader.
        if not os.path.isfile(file_path):
            continue
        # Compare in lower case so "REPORT.PDF" / "Notes.TXT" are not dropped.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(file_path, encoding='utf-8')
        else:
            continue
        documents.extend(loader.load())
        print(f"Loaded {filename}")
    return documents


def create_vector_db(*, data_path=None, db_path=None):
    """Build a FAISS vector store from the documents in a folder and save it.

    The pipeline is: load ``.pdf``/``.txt`` files, split them into chunks of
    500 characters with 50 characters of overlap, embed the chunks with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model on CPU, and persist the
    resulting index with ``FAISS.save_local``.

    Progress and problems are reported with ``print``; the function returns
    early (without creating an index) when the data folder is missing or
    contains no supported documents.

    Args:
        data_path: Folder to read documents from. Defaults to the module-level
            ``DATA_PATH`` when omitted.
        db_path: Folder to write the index to. Defaults to the module-level
            ``DB_FAISS_PATH`` when omitted.

    Returns:
        None.
    """
    if data_path is None:
        data_path = DATA_PATH

    # isdir (rather than exists) so that a regular file with this name is
    # reported here instead of blowing up inside os.listdir().
    if not os.path.isdir(data_path):
        print(f"Directory {data_path} not found.")
        return

    documents = _load_documents(data_path)
    if not documents:
        print("No documents found to ingest.")
        return

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    print(f"Split documents into {len(texts)} chunks.")

    # Create embeddings (using HuggingFace - FREE!)
    print("Generating embeddings locally with sentence-transformers...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
    )

    # Resolve the output location only now, so the early-return paths above
    # never depend on it.
    if db_path is None:
        db_path = DB_FAISS_PATH

    # Create vector store
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(db_path)
    print(f"Vector store saved to {db_path}")
# Script entry point: build the vector store only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_vector_db()