import os import pickle import time from langchain_community.vectorstores import Chroma from langchain_huggingface import HuggingFaceEmbeddings from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_loaders import PyPDFLoader from langchain.docstore.document import Document from typing import List import re from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer import nltk # Download NLTK stopwords (run once) nltk.download('stopwords') nltk.download('wordnet') # Specify the folder containing PDF documents folder_path = r'/mnt/e/ML/projects/my_own_projects/nutrition/documents' # Initialize stopwords stop_words = set(stopwords.words('english')) # Function to clean and preprocess text lemmatizer = WordNetLemmatizer() def clean_text(text: str) -> str: # Remove special characters (keep numbers) text = re.sub(r'[^\w\s\d]', ' ', text) # Convert to lowercase text = text.lower() # Remove stopwords text = ' '.join([word for word in text.split() if word not in stop_words]) # Lemmatize words text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) return text # Function to process PDFs and extract metadata def process_pdfs(folder_path: str) -> List[Document]: docs = [] pdf_count = 0 for filename in os.listdir(folder_path): if filename.endswith('.pdf'): pdf_count += 1 file_path = os.path.join(folder_path, filename) print(f"Processing PDF {pdf_count}: {filename}") loader = PyPDFLoader(file_path) pages = loader.load() for page in pages: # Clean the text page.page_content = clean_text(page.page_content) # Add metadata (e.g., filename) page.metadata['source'] = filename docs.extend(pages) print(f"Total number of PDFs processed: {pdf_count}") return docs # Function to split documents into chunks def split_documents(docs: List[Document]) -> List[Document]: text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) chunks = text_splitter.split_documents(docs) print(f"Total number of chunks generated for embeddings: {len(chunks)}") return chunks # Function to generate embeddings and create vectorstore def create_vectorstore(docs: List[Document], persist_directory: str = "./chroma_db_nccn") -> Chroma: # Initialize the HuggingFace embeddings function embedding_function = HuggingFaceEmbeddings( model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'} # Use 'cpu' if GPU is not available ) # Create Chroma vectorstore and persist it print("Creating vectorstore...") start_time = time.time() vectorstore = Chroma.from_documents(docs, embedding_function, persist_directory=persist_directory) end_time = time.time() print(f"Time taken to create vectorstore: {end_time - start_time} seconds") return vectorstore # Main function def main(): # Check if processed documents already exist if os.path.exists("processed_docs.pkl"): print("Loading processed documents from file...") with open("processed_docs.pkl", "rb") as f: docs = pickle.load(f) else: print("Processing PDFs...") docs = process_pdfs(folder_path) print("Splitting documents into chunks...") docs = split_documents(docs) # Save processed documents to file with open("processed_docs.pkl", "wb") as f: pickle.dump(docs, f) # Create vectorstore vectorstore = create_vectorstore(docs) # Debugging message: Number of documents stored in vectorstore print(f"Number of documents stored in the vectorstore: {vectorstore._collection.count()}") if __name__ == "__main__": main()