# LVEBOT / cDB.py
# middha's picture
# Upload 4 files
# 70c1fad verified
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil
from dotenv import load_dotenv
import docx # For .docx files
import pdfplumber # For .pdf files
# Load environment variables from .env file
load_dotenv()
# Retrieve the API key
openai_api_key = os.getenv('OPENAI_API_KEY')
# Ensure the API key is set for embedding function, if needed
CHROMA_PATH = "chroma"
DATA_PATH = "data/books"
def main():
    """Entry point: load all supported documents and build the vector store."""
    supported = ['.txt', '.docx', '.pdf']
    docs = load_documents(DATA_PATH, supported)
    print(f"Loaded {len(docs)} documents.")
    generate_data_store(docs)
def generate_data_store(documents):
    """Split *documents* into chunks and persist them to the Chroma store.

    Bails out early (with a console message) when there is nothing to
    process or the splitter produced no chunks.
    """
    print("Starting to generate the data store...")
    if not documents:
        print("No documents to process. Exiting.")
        return
    chunks = split_text(documents)
    if chunks:
        save_to_chroma(chunks)
    else:
        print("No chunks were created. Exiting.")
def load_documents(directory, file_types):
    """Load supported files from *directory* (non-recursive) into plain dicts.

    Args:
        directory: Path of the folder to scan.
        file_types: Iterable of filename suffixes to accept, e.g. ['.txt'].

    Returns:
        A list of ``{'content': <str>, 'metadata': {'source': <filename>}}``
        dicts, one per matching file. Unsupported extensions are skipped;
        matching but unhandled extensions yield an empty content string.
    """
    # Hoist the tuple conversion out of the loop (the original rebuilt it
    # for every file in the directory).
    suffixes = tuple(file_types)
    documents = []
    for filename in os.listdir(directory):
        if not filename.endswith(suffixes):
            continue  # skip files that do not match the specified types
        filepath = os.path.join(directory, filename)
        content = ""
        if filename.endswith('.txt'):
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        elif filename.endswith('.docx'):
            doc = docx.Document(filepath)
            content = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        elif filename.endswith('.pdf'):
            with pdfplumber.open(filepath) as pdf:
                # Call extract_text() exactly once per page — it re-parses the
                # page layout and is expensive; the original comprehension
                # invoked it twice (once in the filter, once in the yield).
                texts = (page.extract_text() for page in pdf.pages)
                content = '\n'.join(text for text in texts if text)
        documents.append({'content': content, 'metadata': {'source': filename}})
    return documents
from langchain.schema import Document # Make sure this import remains if you choose to convert dictionaries to Document objects
def split_text(documents):
    """Split *documents* into ~500-character chunks.

    Accepts either plain dicts (``{'content': ..., 'metadata': ...}``) or
    ready-made langchain ``Document`` objects, normalises everything to
    ``Document``, and returns the list of chunk Documents. Prints the first
    chunk so a run can be sanity-checked from the console.
    """
    print("Splitting text into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    # Normalise every entry to a Document before handing off to the splitter.
    normalised = [
        Document(page_content=entry.get('content', ''),
                 metadata=entry.get('metadata', {}))
        if isinstance(entry, dict) else entry
        for entry in documents
    ]
    chunks = splitter.split_documents(normalised)
    print(f"Split {len(normalised)} documents into {len(chunks)} chunks.")
    if not chunks:
        print("No chunks were created.")
        return chunks
    first = chunks[0]  # sample the first chunk for inspection
    print("Sample chunk content and metadata:")
    print(first.page_content)
    print(first.metadata)
    return chunks
def save_to_chroma(chunks: list[Document]):
    """Persist *chunks* into a freshly rebuilt Chroma database at CHROMA_PATH.

    Any existing database directory is removed first, so every call produces
    a store containing exactly the given chunks.
    """
    print("Saving chunks to Chroma...")
    # Rebuild from scratch: wipe any previous database directory.
    if os.path.exists(CHROMA_PATH):
        print(f"Clearing the existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)
    print("Creating new Chroma database from chunks...")
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)  # key loaded from .env
    db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
# Build the vector store only when executed as a script (not on import).
if __name__ == "__main__":
    main()