import pickle
from pathlib import Path

import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
|
|
| |
# Build a FAISS vector index over a PDF's text chunks.
#
# Pipeline: load the PDF -> split it into overlapping chunks -> embed each
# chunk with a sentence-transformer -> persist the vectors in a FAISS L2
# index, plus the chunk text/metadata in a pickle for later retrieval.

# Portable path construction. The original used "data\Mental ..." which only
# worked because "\M" is not a recognized escape sequence, and is Windows-only.
pdf_path = Path("data") / "Mental Health Handbook English.pdf"
loader = PyPDFLoader(file_path=str(pdf_path))

# One Document per PDF page.
documents = loader.load()

# Overlap preserves context that straddles a chunk boundary.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
sections = text_splitter.split_documents(documents)

# Model weights are downloaded/cached on first use.
model = SentenceTransformer('all-MiniLM-L6-v2')

section_texts = [section.page_content for section in sections]
embeddings = model.encode(section_texts)

print(embeddings.shape)

# FAISS requires float32 input vectors.
embeddings_np = np.array(embeddings).astype('float32')

# Exact (brute-force) L2 index — fine at this scale (one handbook of chunks).
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)

index.add(embeddings_np)

# Make sure the output directory exists before writing; a fresh checkout
# without "database/" would otherwise fail here.
Path("database").mkdir(parents=True, exist_ok=True)
faiss.write_index(index, "database/pdf_sections_index.faiss")

# Persist chunk text + metadata in the SAME order as the vectors were added,
# so a FAISS hit at position i maps back to sections_data[i].
sections_data = [
    {
        'content': section.page_content,
        'metadata': section.metadata
    }
    for section in sections
]

with open('database/pdf_sections_data.pkl', 'wb') as f:
    pickle.dump(sections_data, f)

print("Embeddings stored in FAISS index and saved to file.")
|
|