| from langchain.document_loaders import DirectoryLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain.schema import Document |
| from langchain.embeddings import OpenAIEmbeddings |
| from langchain.vectorstores.chroma import Chroma |
| import os |
| import shutil |
| from dotenv import load_dotenv |
| import docx |
| import pdfplumber |
|
|
| |
# Load environment variables from a local .env file (expects OPENAI_API_KEY).
load_dotenv()

# API key for the OpenAI embeddings client; None if the variable is unset
# (OpenAIEmbeddings would then fail at call time).
openai_api_key = os.getenv('OPENAI_API_KEY')

# Directory where the Chroma vector store is persisted.
CHROMA_PATH = "chroma"
# Directory scanned for input documents (.txt / .docx / .pdf).
DATA_PATH = "data/books"
|
|
|
|
def main():
    """Build the vector store: load source files, then chunk and index them."""
    supported_extensions = ['.txt', '.docx', '.pdf']
    loaded = load_documents(DATA_PATH, supported_extensions)
    print(f"Loaded {len(loaded)} documents.")
    generate_data_store(loaded)
|
|
|
|
def generate_data_store(documents):
    """Chunk *documents* and persist the chunks to the Chroma store.

    Bails out early (with a message) when there are no documents or when
    splitting produces no chunks.
    """
    print("Starting to generate the data store...")
    if documents:
        chunks = split_text(documents)
        if chunks:
            save_to_chroma(chunks)
        else:
            print("No chunks were created. Exiting.")
    else:
        print("No documents to process. Exiting.")
|
|
|
|
|
|
def load_documents(directory, file_types):
    """Load every file in *directory* whose name ends with one of *file_types*.

    Supported formats: ``.txt`` (read as UTF-8 text), ``.docx`` (paragraph
    text joined with newlines), ``.pdf`` (per-page text joined with
    newlines). A matching file with an unsupported suffix yields an entry
    with empty content.

    Args:
        directory: Path of the folder to scan (not recursive).
        file_types: Iterable of filename suffixes to accept, e.g. ['.txt'].

    Returns:
        list[dict]: one ``{'content': str, 'metadata': {'source': filename}}``
        per accepted file, in sorted filename order (deterministic across
        platforms; os.listdir order is otherwise arbitrary).
    """
    documents = []
    suffixes = tuple(file_types)
    # sorted() makes the output order reproducible run-to-run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(suffixes):
            continue
        filepath = os.path.join(directory, filename)
        content = ""
        if filename.endswith('.txt'):
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        elif filename.endswith('.docx'):
            doc = docx.Document(filepath)
            content = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        elif filename.endswith('.pdf'):
            with pdfplumber.open(filepath) as pdf:
                # Extract each page's text exactly once; the original called
                # extract_text() twice per page (filter + join), doubling the
                # most expensive step.
                page_texts = (page.extract_text() for page in pdf.pages)
                content = '\n'.join(text for text in page_texts if text)
        documents.append({'content': content, 'metadata': {'source': filename}})
    return documents
|
|
|
|
| from langchain.schema import Document |
|
|
def split_text(documents):
    """Split *documents* into ~500-character chunks (no overlap).

    Accepts either langchain ``Document`` objects or plain dicts shaped as
    ``{'content': str, 'metadata': dict}``; dicts are converted first.
    Prints the first chunk as a sample and returns the full chunk list.
    """
    print("Splitting text into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )

    # Normalize: wrap raw dicts as Documents, pass Documents through as-is.
    normalized = [
        Document(page_content=item.get('content', ''),
                 metadata=item.get('metadata', {}))
        if isinstance(item, dict) else item
        for item in documents
    ]

    chunks = splitter.split_documents(normalized)
    print(f"Split {len(normalized)} documents into {len(chunks)} chunks.")

    if not chunks:
        print("No chunks were created.")
        return chunks

    print("Sample chunk content and metadata:")
    print(chunks[0].page_content)
    print(chunks[0].metadata)
    return chunks
|
|
|
|
|
|
|
|
def save_to_chroma(chunks: list[Document]):
    """Embed *chunks* with OpenAI and persist them to a fresh Chroma store.

    Any existing database at CHROMA_PATH is deleted first, so the store
    always reflects exactly the current chunk set.
    """
    print("Saving chunks to Chroma...")

    # Start from a clean slate: drop any previous persisted database.
    if os.path.exists(CHROMA_PATH):
        print(f"Clearing the existing database at {CHROMA_PATH}...")
        shutil.rmtree(CHROMA_PATH)

    print("Creating new Chroma database from chunks...")
    embeddings = OpenAIEmbeddings(api_key=openai_api_key)
    db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
|
|
|
|
# Script entry point: build the Chroma store from the files under DATA_PATH.
if __name__ == "__main__":
    main()
|
|
|
|
|
|