| import os |
| from src.utils.pdf_splitter import DataExtractor |
| from langchain_community.embeddings import HuggingFaceEmbeddings |
| from langchain_community.vectorstores import FAISS |
|
|
class VectorDatabase:
    """Build a FAISS vector store from documents and persist it to disk.

    The index is saved under the relative path ``vector_embedding/<db_name>``
    and embeddings are produced by the
    ``sentence-transformers/all-MiniLM-L6-v2`` model configured to run on CPU.
    """

    def __init__(self, db_name):
        """Set up the storage location and the embedding model.

        Args:
            db_name: Name of the database; used as the sub-directory of
                ``vector_embedding`` in which the index is saved.
        """
        self.db_name = db_name
        self.persist_directory = os.path.join("vector_embedding", self.db_name)

        # Filled in by create_db(); defined here so the attribute always
        # exists and can be checked against None before a store is built.
        self.vectDB = None

        # NOTE(review): LangChain forwards encode_kwargs to the model's
        # encode() call. "padding", "max_length" and "truncation" look like
        # tokenizer options rather than encode() parameters -- confirm they
        # are accepted (and actually take effect) with the installed
        # sentence-transformers version.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
            encode_kwargs={
                "padding": "max_length",
                "max_length": 512,
                "truncation": True,
                "normalize_embeddings": True
            }
        )

    def create_db(self, pdf_data):
        """Embed the documents, build the FAISS index and save it locally.

        Args:
            pdf_data: Iterable of documents in the form expected by
                ``FAISS.from_documents``. It is materialized into a list
                once, so one-shot iterables are accepted as well.

        Raises:
            ValueError: If ``pdf_data`` is ``None`` or yields no documents.
        """
        documents = [] if pdf_data is None else list(pdf_data)
        if not documents:
            # Fail early with a clear message instead of letting the index
            # construction fail on an empty embedding list.
            raise ValueError(
                "pdf_data must contain at least one document to build "
                "the vector database."
            )

        self.vectDB = FAISS.from_documents(
            documents=documents,
            embedding=self.embeddings
        )
        self.vectDB.save_local(self.persist_directory)
| |
|
|
def main():
    """Build and save the mental-health vector database.

    Runs ``DataExtractor`` over ``./data/mental_health``, passes the extracted
    text through ``clean_and_split_text`` and hands the result to
    ``VectorDatabase.create_db`` for the ``mental_health_vector_db`` store,
    then prints a confirmation message.
    """
    extractor = DataExtractor('./data/mental_health')
    documents = extractor.clean_and_split_text(extractor.extract_text())

    store = VectorDatabase(db_name="mental_health_vector_db")
    store.create_db(documents)
    print("Vector embeddings have been generated and loaded successfully.")


# Only build the database when this file is executed as a script.
if __name__ == "__main__":
    main()