Saraay's picture
Upload generate_embeddings.py
2892a63 verified
Raw
History Blame Contribute Delete
3.89 kB
import os
import pickle
import time
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from typing import List
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Make sure the NLTK corpora used below are available. These calls are issued
# every time this module is executed.
nltk.download('stopwords')
nltk.download('wordnet')

# Directory that is scanned for PDF documents
folder_path = r'/mnt/e/ML/projects/my_own_projects/nutrition/documents'

# Lemmatizer instance shared by every clean_text() call
lemmatizer = WordNetLemmatizer()

# English stopwords, held in a set so membership tests are cheap
stop_words = {*stopwords.words('english')}

# Function to clean and preprocess text
def clean_text(text: str) -> str:
    """Return a normalized version of *text*.

    The steps are applied in this order:

    1. Every character that is neither a word character (letters, digits,
       underscore) nor whitespace is replaced by a space.
    2. The text is lowercased.
    3. Whitespace-separated tokens found in the module-level ``stop_words``
       set are dropped.
    4. Each remaining token is passed through the module-level
       ``lemmatizer``.

    The surviving tokens are joined with single spaces, so any original
    line breaks or repeated whitespace are collapsed.
    """
    normalized = re.sub(r'[^\w\s\d]', ' ', text).lower()
    # Tokens never contain whitespace, so filtering and lemmatizing in one
    # pass yields the same result as two separate split/join rounds.
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)
# Function to process PDFs and extract metadata
def process_pdfs(folder_path: str) -> List[Document]:
    """Load every PDF directly inside *folder_path* and return its cleaned pages.

    Each PDF is loaded with ``PyPDFLoader``; the text of every document it
    returns is run through ``clean_text`` and its ``source`` metadata entry is
    overwritten with the bare filename. Progress is printed per file, followed
    by the total number of PDFs handled.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; sub-directories are not walked.

    Returns:
        The documents of all PDFs, in sorted filename order.
    """
    # os.listdir() makes no ordering guarantee, so sort to keep the page (and
    # therefore chunk) order reproducible between runs. The extension check is
    # case-insensitive so that files such as "REPORT.PDF" are not skipped.
    pdf_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
    )

    docs = []
    for pdf_count, filename in enumerate(pdf_names, start=1):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing PDF {pdf_count}: {filename}")
        pages = PyPDFLoader(file_path).load()
        for page in pages:
            # Clean the text in place
            page.page_content = clean_text(page.page_content)
            # Record only the filename as the source
            page.metadata['source'] = filename
        docs.extend(pages)

    print(f"Total number of PDFs processed: {len(pdf_names)}")
    return docs
# Function to split documents into chunks
def split_documents(docs: List[Document]) -> List[Document]:
    """Split *docs* into chunks and return the chunk list.

    A ``RecursiveCharacterTextSplitter`` configured with ``chunk_size=1000``
    and ``chunk_overlap=100`` does the splitting. The number of chunks
    produced is printed before returning.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    ).split_documents(docs)
    print(f"Total number of chunks generated for embeddings: {len(chunks)}")
    return chunks
# Function to generate embeddings and create vectorstore
def create_vectorstore(docs: List[Document], persist_directory: str = "./chroma_db_nccn") -> Chroma:
    """Embed *docs* and store them in a Chroma vectorstore.

    Embeddings are produced by the ``sentence-transformers/all-MiniLM-L6-v2``
    model through ``HuggingFaceEmbeddings``, configured to run on the CPU.
    The time spent building the store is printed.

    Args:
        docs: Documents (typically chunks) to embed and store.
        persist_directory: Directory handed to ``Chroma.from_documents`` as
            its ``persist_directory``.

    Returns:
        The ``Chroma`` instance returned by ``Chroma.from_documents``.
    """
    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},  # embeddings are always computed on the CPU
    )

    # NOTE(review): if persist_directory already holds a collection from an
    # earlier run, from_documents presumably adds to it rather than replacing
    # it, which would duplicate entries -- confirm against the Chroma docs.
    print("Creating vectorstore...")
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    vectorstore = Chroma.from_documents(docs, embedding_function, persist_directory=persist_directory)
    elapsed = time.perf_counter() - start_time
    print(f"Time taken to create vectorstore: {elapsed} seconds")
    return vectorstore
# Main function
def main():
    """Build the vectorstore, reusing cached chunks when they exist.

    If ``processed_docs.pkl`` is absent from the current working directory,
    the PDFs under the module-level ``folder_path`` are processed, split into
    chunks, and the chunks are pickled to that file. Otherwise the chunks are
    loaded from it. In both cases a vectorstore is then created from the
    chunks and the number of stored entries is printed.
    """
    cache_file = "processed_docs.pkl"

    if not os.path.exists(cache_file):
        print("Processing PDFs...")
        pages = process_pdfs(folder_path)
        print("Splitting documents into chunks...")
        docs = split_documents(pages)
        # Cache the chunks so later runs can skip PDF parsing
        with open(cache_file, "wb") as f:
            pickle.dump(docs, f)
    else:
        print("Loading processed documents from file...")
        # NOTE(review): unpickling can execute arbitrary code, so this cache
        # file must only ever come from a trusted earlier run of this script.
        # The cache is also reused as-is even if the PDF folder has changed.
        with open(cache_file, "rb") as f:
            docs = pickle.load(f)

    vectorstore = create_vectorstore(docs)

    # Debugging message: reads the count through the store's private _collection
    print(f"Number of documents stored in the vectorstore: {vectorstore._collection.count()}")
# Run the pipeline only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()