Spaces:
Configuration error
Configuration error
# Build a FAISS similarity index over the sections of a PDF handbook and
# persist both the index and the matching section texts/metadata to disk.
#
# Outputs (written under ``database/``):
#   * pdf_sections_index.faiss - flat L2 index of the section embeddings
#   * pdf_sections_data.pkl    - pickled list of {'content', 'metadata'} dicts,
#                                in the same order the vectors were added
import pickle
from pathlib import Path

import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Build paths from components instead of a hard-coded "data\..." literal:
# "\M" is an invalid escape sequence (a warning today, an error in future
# Python versions) and a backslash separator only resolves on Windows.
pdf_path = str(Path("data") / "Mental Health Handbook English.pdf")
output_dir = Path("database")
index_path = output_dir / "pdf_sections_index.faiss"
sections_data_path = output_dir / "pdf_sections_data.pkl"

# Load the PDF content
loader = PyPDFLoader(file_path=pdf_path)
documents = loader.load()

# Split the document into overlapping sections
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
sections = text_splitter.split_documents(documents)

# Fail early with a clear message; otherwise the script would only crash
# later with an IndexError when reading the embedding dimension.
if not sections:
    raise ValueError(f"No text sections could be extracted from {pdf_path!r}")

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding per section
section_texts = [section.page_content for section in sections]
embeddings = model.encode(section_texts)
print(embeddings.shape)

# FAISS consumes float32 data; also make sure the array is C-contiguous.
embeddings_np = np.ascontiguousarray(embeddings, dtype=np.float32)

# Create a flat L2 FAISS index sized to the embedding dimension
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add vectors to the index
index.add(embeddings_np)

# Make sure the output directory exists before writing anything into it.
output_dir.mkdir(parents=True, exist_ok=True)

# Save the index to a file
faiss.write_index(index, str(index_path))

# Keep the section text and metadata alongside the index. The list is built
# from `sections` in the same order as the embeddings, so position i here
# corresponds to the i-th vector added to the index.
sections_data = [
    {
        'content': section.page_content,
        'metadata': section.metadata,
    }
    for section in sections
]

# Save sections data
# NOTE: pickle files must only be loaded back from a trusted location.
with open(sections_data_path, 'wb') as f:
    pickle.dump(sections_data, f)

print("Embeddings stored in FAISS index and saved to file.")