| import os |
| import pickle |
| import time |
| from langchain_community.vectorstores import Chroma |
| from langchain_huggingface import HuggingFaceEmbeddings |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.docstore.document import Document |
| from typing import List |
| import re |
| from nltk.corpus import stopwords |
| from nltk.stem import WordNetLemmatizer |
| import nltk |
|
|
| |
# Directory that is scanned for the PDF files to index.
folder_path = r'/mnt/e/ML/projects/my_own_projects/nutrition/documents'

# Fetch the NLTK corpora used below: the stop-word lists and WordNet
# (the latter backs the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Shared lemmatizer instance reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

# English stop words, held in a set so membership tests in clean_text are O(1).
stop_words = set(stopwords.words('english'))
|
|
def clean_text(text: str) -> str:
    """Return a normalized version of *text* for indexing.

    The text is processed in this order:

    1. every character that is neither a word character nor whitespace
       is replaced by a space;
    2. the result is lowercased;
    3. tokens found in the module-level ``stop_words`` set are dropped;
    4. each remaining token is lemmatized with the module-level
       ``lemmatizer``.

    Tokens are re-joined with single spaces, so the original whitespace
    layout (newlines, runs of spaces) is not preserved.
    """
    lowered = re.sub(r'[^\w\s\d]', ' ', text).lower()
    kept_tokens = (token for token in lowered.split() if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)
|
|
| |
def process_pdfs(folder_path: str) -> List[Document]:
    """Load and clean every PDF found directly inside *folder_path*.

    Each PDF is loaded with ``PyPDFLoader``; the text of every returned
    page is passed through ``clean_text`` and the page's ``source``
    metadata is set to the bare file name. Sub-directories are not
    searched.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        The cleaned page documents of all PDFs, in file-name order.

    Raises:
        OSError: If *folder_path* cannot be listed (propagated from
            ``os.listdir``).
    """
    docs = []
    pdf_count = 0
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # progress numbering and the resulting document order reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so files such as
        # "REPORT.PDF" are not silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_count += 1
        file_path = os.path.join(folder_path, filename)
        print(f"Processing PDF {pdf_count}: {filename}")
        pages = PyPDFLoader(file_path).load()
        for page in pages:
            page.page_content = clean_text(page.page_content)
            # Overwrite 'source' with the bare file name.
            page.metadata['source'] = filename
        docs.extend(pages)
    print(f"Total number of PDFs processed: {pdf_count}")
    return docs
|
|
| |
def split_documents(
    docs: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> List[Document]:
    """Split *docs* into overlapping chunks suitable for embedding.

    Args:
        docs: Documents to split.
        chunk_size: Maximum size of a chunk, as measured by the
            splitter's length function. Defaults to 1000, the value
            that was previously hard-coded.
        chunk_overlap: Amount of content shared between consecutive
            chunks. Defaults to 100, the value that was previously
            hard-coded.

    Returns:
        The chunk documents produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(docs)
    print(f"Total number of chunks generated for embeddings: {len(chunks)}")
    return chunks
|
|
| |
def create_vectorstore(docs: List[Document], persist_directory: str = "./chroma_db_nccn") -> Chroma:
    """Embed *docs* and store them in a Chroma vectorstore.

    Embeddings are computed on the CPU with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model. The time spent
    building the store is printed.

    Args:
        docs: Chunk documents to embed and store.
        persist_directory: Directory handed to Chroma as its
            persistence location.

    Returns:
        The Chroma vectorstore built from *docs*.
    """
    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )

    # NOTE(review): calling this again with the same persist_directory
    # presumably adds the documents to the existing collection a second
    # time -- confirm, and consider passing stable ids if re-runs are expected.
    print("Creating vectorstore...")
    # perf_counter is a monotonic clock intended for measuring intervals;
    # time.time() can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    vectorstore = Chroma.from_documents(docs, embedding_function, persist_directory=persist_directory)
    end_time = time.perf_counter()
    print(f"Time taken to create vectorstore: {end_time - start_time} seconds")
    return vectorstore
|
|
| |
def main():
    """Build the vectorstore from the PDFs, caching the chunked documents.

    If ``processed_docs.pkl`` exists in the working directory, the chunk
    documents are loaded from it. Otherwise the PDFs in ``folder_path``
    are processed and split, and the result is pickled for the next run.
    The chunks are then embedded into a vectorstore and the number of
    stored entries is printed.
    """
    cache_path = "processed_docs.pkl"

    if os.path.exists(cache_path):
        print("Loading processed documents from file...")
        # NOTE: unpickling executes arbitrary code. This is acceptable only
        # because the file is a local cache written by this script; never
        # point it at a file from an untrusted source.
        with open(cache_path, "rb") as f:
            docs = pickle.load(f)
    else:
        print("Processing PDFs...")
        docs = process_pdfs(folder_path)
        print("Splitting documents into chunks...")
        docs = split_documents(docs)
        # Do not cache an empty result: it would be reloaded on every later
        # run, even after PDFs have been added to the folder.
        if docs:
            with open(cache_path, "wb") as f:
                pickle.dump(docs, f)

    if not docs:
        # Nothing to embed; building a vectorstore from zero chunks is
        # pointless (and may raise inside the vectorstore library).
        print("No document chunks to index; skipping vectorstore creation.")
        return

    vectorstore = create_vectorstore(docs)

    # _collection is a private attribute of the Chroma wrapper; it is used
    # here only to report how many entries were stored.
    print(f"Number of documents stored in the vectorstore: {vectorstore._collection.count()}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()