"""Load plain-text files and upload their embeddings to a Qdrant collection.

Each ``.txt`` file in a directory is read into a ``Document``, embedded with a
SentenceTransformer model, and stored as one point in Qdrant together with its
text and filename.
"""

import os
import pathlib
import uuid

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer  # Example embedding model (open-source)

# SECURITY NOTE(review): the literal URL and API key below were committed to
# source control and should be treated as compromised. Rotate the key, supply
# it through the QDRANT_URL / QDRANT_API_KEY environment variables, and then
# delete the literal fallbacks. They are kept only so that existing setups keep
# working unchanged.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://807708a6-1d41-4ecb-a1f3-8a41fcd48ec3.us-east4-0.gcp.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get(
    "QDRANT_API_KEY",
    "5q_XaDCagfcEcqWh3lr68B7b0EGN4Hd-PlUZ7acxP_-cm1rSHqAPTw",
)
# QDRANT_COLLECTION_NAME = "courses-data"

# The collection's vector size must match the embedding model's output size,
# so the two values are kept next to each other.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
VECTOR_SIZE = 384  # Size of the embedding vectors

# Initialize Qdrant client (created at import time; shared by the functions below).
qdrant_client = QdrantClient(url=QDRANT_URL, prefer_grpc=False, api_key=QDRANT_API_KEY)


def create_QDrant_collection(collectionName: str, vector_size: int = VECTOR_SIZE) -> None:
    """Create a Qdrant collection that uses cosine distance, replacing any existing one.

    WARNING: this is destructive. If a collection named ``collectionName``
    already exists it is deleted (with all its points) before being recreated.

    Args:
        collectionName: Name of the collection to (re)create.
        vector_size: Dimensionality of the vectors the collection will store.
            Defaults to ``VECTOR_SIZE``.
    """
    # Drop the old collection first so the new one starts empty with the requested config.
    if qdrant_client.collection_exists(collectionName):
        qdrant_client.delete_collection(collectionName)

    qdrant_client.create_collection(
        collection_name=collectionName,
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE,  # Distance metric for vector similarity
        ),
    )
    print(f"Collection '{collectionName}' created successfully.")


def read_txt_files(directory: str) -> list[Document]:
    """Read all .txt files directly inside a directory into Document objects.

    Only files matching ``*.txt`` at the top level of ``directory`` are read
    (no recursion). Files are processed in sorted path order so that the
    position of each document in the result is stable between runs.

    Args:
        directory: The path to the directory containing .txt files.

    Returns:
        One Document per file. ``page_content`` is the file's text decoded as
        UTF-8 and ``metadata["filename"]`` is the file name without its
        extension. The list is empty if no .txt files are found.
    """
    return [
        Document(
            page_content=txt_file.read_text(encoding="utf-8"),
            metadata={"filename": txt_file.stem},
        )
        # glob() yields entries in filesystem order; sort for determinism.
        for txt_file in sorted(pathlib.Path(directory).glob('*.txt'))
    ]


def upload_chunks_to_QDrant(documents, collectionName: str) -> None:
    """Embed documents and upload them as points to a Qdrant collection.

    Each document becomes one point whose id is its position in ``documents``,
    so uploading again to the same collection overwrites the points that share
    those ids. The payload stores the text under ``"page_content"`` and the
    source name under ``"filename"``.

    Args:
        documents: Iterable of Document objects to embed and upload.
        collectionName: Name of the target collection.
    """
    documents = list(documents)
    if not documents:
        # Nothing to embed: skip loading the model and the network call.
        return

    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

    # Encode everything in one call so the model can batch the work instead of
    # running one forward pass per document.
    contents = [doc.page_content for doc in documents]
    vectors = embedding_model.encode(contents)

    points = [
        models.PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={
                "page_content": doc.page_content,
                # Falls back to a placeholder when the document carries no filename.
                "filename": doc.metadata.get("filename", "unknown.txt"),
            },
        )
        for idx, (doc, vector) in enumerate(zip(documents, vectors))
    ]

    qdrant_client.upload_points(
        collection_name=collectionName,
        points=points,
    )


# Example usage
# def collection_create(pdf_path,collection_name):
#     print("collection create called")
#     create_QDrant_collection(collection_name)
#     pdf_file_path = pdf_path
#     chunks= chunk_pdf_text(pdf_file_path)
#     upload_chunks_to_qdrant(chunks,collection_name)

# NOTE(review): the statements below run whenever this module is executed or
# imported. They are left at module level to preserve existing behavior;
# consider moving them under an ``if __name__ == "__main__":`` guard.
#create_QDrant_collection("courses-data")
documents = read_txt_files("courses-text-files")
upload_chunks_to_QDrant(documents, "courses-data")