Spaces:

Saraay
/

Intelligent_Nutrition_assistant_Using_RAG

Running

App Files Files Community

Intelligent_Nutrition_assistant_Using_RAG / generate_embeddings.py

Saraay

Upload generate_embeddings.py

2892a63 verified over 1 year ago

Raw

History Blame Contribute Delete

3.89 kB

	import os
	import pickle
	import time
	from langchain_community.vectorstores import Chroma
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.docstore.document import Document
	from typing import List
	import re
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	import nltk

	# Fetch the NLTK corpora used below. These calls are unconditional, so they
	# execute every time this module is run or imported.
	nltk.download('stopwords')
	nltk.download('wordnet')
	# Folder containing the PDF documents to index (hard-coded absolute path).
	folder_path = r'/mnt/e/ML/projects/my_own_projects/nutrition/documents'

	# English stop words, held in a set for the membership test in clean_text.
	stop_words = set(stopwords.words('english'))

	# Lemmatizer instance shared by every clean_text call.
	lemmatizer = WordNetLemmatizer()

	def clean_text(text: str) -> str:
	    """Normalise raw text before it is chunked and embedded.

	    Steps, in order: replace every character that is neither a word
	    character nor whitespace with a space, lowercase the result, drop
	    English stop words, and lemmatize the tokens that remain.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        The surviving lemmatized tokens joined by single spaces.
	    """
	    # NOTE(review): the pattern also replaces punctuation inside numbers
	    # (the '.' in "2.5", the '%' in "10%") with a space - confirm that
	    # this is acceptable for the indexed content.
	    normalized = re.sub(r'[^\w\s\d]', ' ', text).lower()
	    kept_tokens = (
	        token for token in normalized.split() if token not in stop_words
	    )
	    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

	# Function to process PDFs and extract metadata
	def process_pdfs(folder_path: str) -> List[Document]:
	    """Load every PDF in a folder, clean each page, and tag it with its file name.

	    Files are visited in sorted name order and matched on a
	    case-insensitive ``.pdf`` extension, so the output order is
	    reproducible across runs and files such as ``REPORT.PDF`` are not
	    skipped.

	    Args:
	        folder_path: Directory whose direct entries are scanned for PDFs.

	    Returns:
	        The pages loaded from all matching files, each with its
	        ``page_content`` passed through ``clean_text`` and
	        ``metadata['source']`` set to the bare file name.

	    Raises:
	        OSError: Propagated from ``os.listdir`` when the folder cannot be
	            read (for example, it does not exist).
	    """
	    # os.listdir returns entries in arbitrary order; sort so that the
	    # resulting document order (and anything cached from it) is stable.
	    pdf_names = sorted(
	        name for name in os.listdir(folder_path)
	        if name.lower().endswith('.pdf')
	    )

	    docs = []
	    for pdf_count, filename in enumerate(pdf_names, start=1):
	        print(f"Processing PDF {pdf_count}: {filename}")
	        pages = PyPDFLoader(os.path.join(folder_path, filename)).load()
	        for page in pages:
	            page.page_content = clean_text(page.page_content)
	            # Store the bare file name under 'source', overwriting any
	            # value already present in the page metadata.
	            page.metadata['source'] = filename
	        docs.extend(pages)

	    print(f"Total number of PDFs processed: {len(pdf_names)}")
	    return docs

	# Function to split documents into chunks
	def split_documents(docs: List[Document]) -> List[Document]:
	    """Split documents into overlapping chunks and report how many resulted.

	    The splitter is configured with ``chunk_size=1000`` and
	    ``chunk_overlap=100``.

	    Args:
	        docs: Documents to split.

	    Returns:
	        The chunks produced by the splitter.
	    """
	    pieces = RecursiveCharacterTextSplitter(
	        chunk_size=1000,
	        chunk_overlap=100,
	    ).split_documents(docs)
	    print(f"Total number of chunks generated for embeddings: {len(pieces)}")
	    return pieces

	# Function to generate embeddings and create vectorstore
	def create_vectorstore(docs: List[Document], persist_directory: str = "./chroma_db_nccn") -> Chroma:
	    """Embed the documents and build a Chroma vectorstore from them.

	    Embeddings come from the ``sentence-transformers/all-MiniLM-L6-v2``
	    model, configured to run on the CPU. The wall-clock time spent building
	    the store is printed.

	    Args:
	        docs: Documents (chunks) to embed and store.
	        persist_directory: Directory handed to Chroma for persistence.

	    Returns:
	        The Chroma vectorstore built from ``docs``.
	    """
	    # NOTE(review): if persist_directory already holds a collection,
	    # from_documents presumably adds to it rather than replacing it -
	    # confirm before re-running against an existing directory.
	    embedder = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2",
	        model_kwargs={'device': 'cpu'},
	    )

	    print("Creating vectorstore...")
	    started = time.time()
	    store = Chroma.from_documents(
	        docs,
	        embedder,
	        persist_directory=persist_directory,
	    )
	    elapsed = time.time() - started
	    print(f"Time taken to create vectorstore: {elapsed} seconds")
	    return store

	# Main function
	def main():
	    """Build the vectorstore, reusing cached chunks when they exist.

	    If ``processed_docs.pkl`` is present in the working directory, the
	    chunks are unpickled from it. Otherwise the PDFs under ``folder_path``
	    are processed and split, and the chunks are pickled to that file.
	    Either way the chunks are then embedded into the vectorstore and the
	    stored document count is printed.
	    """
	    cache_file = "processed_docs.pkl"

	    if not os.path.exists(cache_file):
	        print("Processing PDFs...")
	        pages = process_pdfs(folder_path)
	        print("Splitting documents into chunks...")
	        docs = split_documents(pages)
	        # Cache the chunks so that later runs can skip PDF processing.
	        with open(cache_file, "wb") as sink:
	            pickle.dump(docs, sink)
	    else:
	        print("Loading processed documents from file...")
	        with open(cache_file, "rb") as source:
	            docs = pickle.load(source)

	    store = create_vectorstore(docs)

	    # Debugging aid: report how many documents the collection now holds.
	    print(f"Number of documents stored in the vectorstore: {store._collection.count()}")


	# Run the pipeline only when this file is executed as a script.
	if __name__ == "__main__":
	    main()