Spaces:
Sleeping
Sleeping
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from qdrant_client import models, QdrantClient | |
| from sentence_transformers import SentenceTransformer # Example embedding model (open-source) | |
| import uuid | |
| from langchain_core.documents.base import Document | |
| import pathlib | |
| # read pdf files and chunk them into pieces | |
| # upload chunks to Qdrant | |
| QDRANT_URL = "https://807708a6-1d41-4ecb-a1f3-8a41fcd48ec3.us-east4-0.gcp.cloud.qdrant.io:6333" | |
| QDRANT_API_KEY = "5q_XaDCagfcEcqWh3lr68B7b0EGN4Hd-PlUZ7acxP_-cm1rSHqAPTw" # Use your Qdrant API key | |
| #QDRANT_COLLECTION_NAME = "courses-data" | |
| # Initialize Qdrant client | |
| qdrant_client = QdrantClient(url=QDRANT_URL, prefer_grpc=False, api_key=QDRANT_API_KEY) | |
| def create_QDrant_collection(collectionName): | |
| """Create Qdrant collection.""" | |
| # Define collection parameters | |
| collection_name = collectionName | |
| vector_size = 384 # Size of the embedding vectors | |
| distance_metric = models.Distance.COSINE # Distance metric for vector similarity | |
| # Check if the collection already exists | |
| if qdrant_client.collection_exists(collection_name): | |
| # Optionally, delete the existing collection if you need to recreate it | |
| qdrant_client.delete_collection(collection_name) | |
| # Create the collection with the new method | |
| qdrant_client.create_collection( | |
| collection_name=collection_name, | |
| vectors_config=models.VectorParams(size=vector_size, distance=distance_metric) # Pass as vectors_config | |
| ) | |
| print(f"Collection '{collection_name}' created successfully.") | |
| def read_txt_files(directory: str) -> list[Document]: | |
| """Reads all .txt files in a given directory and returns a list of Document objects. | |
| Args: | |
| directory (str): The path to the directory containing .txt files. | |
| Returns: | |
| list[Document]: A list of Document objects containing the content of the .txt files. | |
| """ | |
| documents = [] | |
| for txt_file in pathlib.Path(directory).glob('*.txt'): | |
| with open(txt_file, 'r',encoding="utf-8") as file: | |
| content = file.read() | |
| metadata = {"filename": txt_file.stem} | |
| documents.append(Document(page_content=content, metadata=metadata)) | |
| return documents | |
| def upload_chunks_to_QDrant(documents, collectionName): | |
| records_to_upload = [] | |
| embedding_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| for idx, chunk in enumerate(documents): | |
| content = chunk.page_content | |
| filename = chunk.metadata.get("filename", "unknown.txt") # Get filename from metadata | |
| # Get the embedding for the content | |
| vector = embedding_model.encode(content).tolist() # Use encode method for getting the vector | |
| record = models.PointStruct( | |
| id=idx, | |
| vector=vector, | |
| payload={"page_content": content, "filename": filename} # Store filename as part of payload | |
| ) | |
| records_to_upload.append(record) | |
| qdrant_client.upload_points( | |
| collection_name=collectionName, | |
| points=records_to_upload | |
| ) | |
| return | |
| # Example usage | |
| # def collection_create(pdf_path,collection_name): | |
| # print("collection create called") | |
| # create_QDrant_collection(collection_name) | |
| # pdf_file_path = pdf_path | |
| # chunks= chunk_pdf_text(pdf_file_path) | |
| # upload_chunks_to_qdrant(chunks,collection_name) | |
| #create_QDrant_collection("courses-data") | |
| documents = read_txt_files("courses-text-files") | |
| upload_chunks_to_QDrant(documents, "courses-data") |