Spaces:
Sleeping
Sleeping
| import os | |
| import PyPDF2 | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from qdrant_client import QdrantClient | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Directory that holds the PDF files to ingest.
PATH_TO_KNOWLEDGE_BASE = "knowledge_base"
# Name of the Qdrant collection used by this script.
COLLECTION_NAME = "aws_faq"
# Storage path for a local (on-disk) Qdrant instance; adjust to your own path.
VECTOR_DB_PATH = "./qdrant"

# Local alternative: qdrant_client = QdrantClient(path=VECTOR_DB_PATH)
# Qdrant Cloud setup: the URL and API key are read from the environment
# (QDRANT_URL / QDRANT_API_KEY).
qdrant_client = QdrantClient(
    os.environ.get("QDRANT_URL"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)
def ingest_embeddings():
    """Read every PDF in the knowledge base, chunk its text and upload it to Qdrant.

    Each page of each ``.pdf`` file found directly under
    ``PATH_TO_KNOWLEDGE_BASE`` is extracted with PyPDF2 and split with a
    ``RecursiveCharacterTextSplitter`` (``"\\n\\n"`` separator, chunk_size=400,
    chunk_overlap=50). The resulting chunks are sent to ``COLLECTION_NAME``
    through ``qdrant_client.add``. Every chunk carries the metadata
    ``{"page": <1-based page number>, "file": <file name>}`` and a sequential
    integer id starting at 1.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        raised by the upload are caught and printed, not propagated.
    """
    texts = []
    metadatas = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sequential ids assigned below reproducible from one run to the next.
    for file_name in sorted(os.listdir(PATH_TO_KNOWLEDGE_BASE)):
        # Case-insensitive match so files named "*.PDF" are not silently skipped.
        if not file_name.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(PATH_TO_KNOWLEDGE_BASE, file_name)
        pdf_reader = PyPDF2.PdfReader(pdf_path)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Guard against an extractor that yields None or only whitespace
            # (e.g. image-only pages): such pages have nothing to index.
            page_text = page.extract_text() or ""
            if not page_text.strip():
                continue
            texts.append(page_text)
            metadatas.append({"page": page_number, "file": file_name})

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n"],
        chunk_size=400,
        chunk_overlap=50,
    )
    chunked_documents = text_splitter.create_documents(texts, metadatas=metadatas)

    # Without this guard an empty knowledge base would fail later with an
    # unhelpful error instead of a clear message.
    if not chunked_documents:
        print(f"No text chunks found in '{PATH_TO_KNOWLEDGE_BASE}'; nothing to ingest")
        return

    chunks = [document.page_content for document in chunked_documents]
    metadata = [document.metadata for document in chunked_documents]
    ids = list(range(1, len(chunks) + 1))

    try:
        qdrant_client.add(
            collection_name=COLLECTION_NAME,
            documents=chunks,
            metadata=metadata,
            ids=ids,
        )
    except Exception as error:
        # Top-level boundary of the script: report the failure and stop.
        print(f"Error: {error}")
    else:
        print("Collection created and persisted")
# Run the ingestion only when this file is executed as a script.
if __name__ == "__main__":
    ingest_embeddings()