Spaces:
Runtime error
Runtime error
| from dotenv import load_dotenv | |
| from langchain.document_loaders import UnstructuredFileLoader | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from glob import glob | |
| import os | |
# Read the local .env file into the process environment before any of the
# code below runs.
load_dotenv()

# Directory that is scanned for the source PDF files.
DOCUMENT_PATH = "data/raw/cixiidae"
# Directory handed to Chroma as its persist_directory.
DB_DIR = "chroma"
def parse_documents(path):
    """Parse every PDF located directly inside *path*.

    Args:
        path: Directory to scan for ``*.pdf`` files (not recursive).

    Returns:
        A flat list holding the documents of every file that could be
        parsed. The list is empty when the directory contains no PDF or
        when none of the files could be parsed.
    """
    # glob does not guarantee an ordering, so sort to make runs reproducible.
    pdf_files = sorted(glob(os.path.join(path, "*.pdf")))
    documents = []
    for file_path in pdf_files:
        parsed = parse_document(file_path)
        # parse_document can hand back nothing (None or an empty list) for a
        # file it failed to read; skip it instead of letting
        # documents.extend(None) abort the whole batch with a TypeError.
        if parsed:
            documents.extend(parsed)
    return documents
def parse_document(file_path):
    """Load a single file through UnstructuredFileLoader.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever ``loader.load()`` produces for the file, or an empty list
        when loading fails. Failures are reported on stdout instead of being
        raised so that one unreadable file does not stop a batch run.
    """
    try:
        loader = UnstructuredFileLoader(file_path)
        document = loader.load()
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return an
        # empty result. Falling through here used to return None, which
        # callers then passed to list.extend() and crashed.
        print(f"An error occurred while processing the file {file_path}: {str(e)}")
        return []
    print(f"File parsed: {file_path}")
    return document
def split(documents):
    """Split *documents* into smaller pieces.

    A CharacterTextSplitter configured with ``chunk_size=1000`` and
    ``chunk_overlap=20`` does the work; the return value is the result of
    its ``split_documents`` call.
    """
    splitter_options = {"chunk_size": 1000, "chunk_overlap": 20}
    chunker = CharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(documents)
    return chunks
def persist(documents):
    """Build a Chroma store from *documents* and persist it.

    The store is created with ``Chroma.from_documents`` using an
    ``OpenAIEmbeddings`` instance and ``DB_DIR`` as its persist directory,
    then ``persist()`` is called on it. Nothing is returned.
    """
    store = Chroma.from_documents(
        documents,
        embedding=OpenAIEmbeddings(),
        persist_directory=DB_DIR,
    )
    store.persist()
def main():
    """Run the pipeline: parse, split, report the count, persist.

    The number printed is the length of the list returned by ``split``,
    i.e. the count after splitting.
    """
    chunks = split(parse_documents(DOCUMENT_PATH))
    print(f"Total pages: {len(chunks)}")
    persist(chunks)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()