Spaces:
Sleeping
Sleeping
File size: 3,877 Bytes
"""
Shared vector storage utilities.

Handles chunking documents and storing them in Qdrant.
"""
import os
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_core.documents import Document
from typing import List
# Runs at import time, before any of the os.getenv() lookups in the functions
# below. NOTE(review): presumably this populates os.environ from a local .env
# file (python-dotenv) -- confirm the file location expected in deployment.
load_dotenv()
def get_embeddings():
    """
    Build the OpenAI embeddings object used by this module

    The model name is read from the OPEN_AI_EMBEDDING_MODEL environment
    variable; "text-embedding-3-small" is used when the variable is unset.
    Returns:
        A new OpenAIEmbeddings instance
    """
    model_name = os.getenv("OPEN_AI_EMBEDDING_MODEL", "text-embedding-3-small")
    return OpenAIEmbeddings(model=model_name)
def get_qdrant_client():
    """
    Build a Qdrant client from environment configuration

    Reads the QDRANT_URL and QDRANT_API_KEY environment variables and passes
    them to QdrantClient as ``url`` and ``api_key`` (either may be None when
    the variable is unset).
    Returns:
        A new QdrantClient instance
    """
    connection = {
        "url": os.getenv("QDRANT_URL"),
        "api_key": os.getenv("QDRANT_API_KEY"),
    }
    return QdrantClient(**connection)
def chunk_documents(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> List[Document]:
    """
    Break documents into smaller pieces with a recursive character splitter

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlapping characters between chunks
    Returns:
        List of chunked Document objects
    """
    # Listed from the coarsest boundary (blank line) down to the finest
    # (empty string).
    boundaries = ["\n\n", "\n", ". ", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        separators=boundaries,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(documents)
def store_documents(documents: List[Document]) -> tuple[int, int]:
    """
    Store documents in Qdrant vector database

    The collection name comes from the QDRANT_COLLECTION environment variable
    (default "hr-intervals"). Storage is verified by comparing the collection's
    point count before and after the upload.

    An empty ``documents`` list returns ``(0, 0)`` immediately, without
    creating an embeddings object or contacting Qdrant.

    Args:
        documents: List of Document objects with content and metadata
    Returns:
        Tuple of (expected_count, actual_stored_count). When the post-upload
        count cannot be read, actual_stored_count falls back to expected_count.
    Raises:
        Exception: whatever QdrantVectorStore.from_documents raises is
            propagated unchanged.
    """
    expected = len(documents)
    if expected == 0:
        # Nothing to embed or upload, so skip every network round trip.
        return 0, 0

    embeddings = get_embeddings()
    client = get_qdrant_client()
    collection_name = os.getenv("QDRANT_COLLECTION", "hr-intervals")

    try:
        # Get count before storing. Best-effort: any failure is treated as an
        # empty collection.
        # NOTE(review): presumably this covers the collection not existing yet
        # on a first run -- confirm.
        try:
            before_count = client.count(collection_name=collection_name).count
        except Exception:
            before_count = 0

        # Store documents. The returned vector store object is not needed here.
        QdrantVectorStore.from_documents(
            documents=documents,
            embedding=embeddings,
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY"),
            collection_name=collection_name
        )

        # Verify storage by counting after
        try:
            after_count = client.count(collection_name=collection_name).count
            actual_stored = after_count - before_count
        except Exception as e:
            print(f" ⚠️ Warning: Could not verify storage: {str(e)}")
            actual_stored = expected  # Assume success if can't verify
    finally:
        # The counting client is created per call; release its connections
        # whether or not the upload succeeded.
        client.close()

    return expected, actual_stored
def process_and_store(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> int:
    """
    End-to-end ingestion: split documents into chunks, then store the chunks

    Progress and the storage outcome are reported with print(). Any exception
    raised while storing is reported and then re-raised to the caller.

    Args:
        documents: List of Document objects
        chunk_size: Maximum characters per chunk
        chunk_overlap: Overlapping characters between chunks
    Returns:
        Number of chunks stored, as reported by store_documents
    """
    # Step 1: chunking
    pieces = chunk_documents(documents, chunk_size, chunk_overlap)
    print(f" ✅ Created {len(pieces)} chunks")

    # Step 2: upload and report how the stored count compares to the target
    try:
        wanted, stored = store_documents(pieces)
        if stored == wanted:
            report = f" ✅ Stored {stored} chunks in Qdrant"
        elif stored > 0:
            report = f" ⚠️ Partial storage: expected {wanted}, actually stored {stored}"
        else:
            report = f" ❌ Storage failed: 0 chunks stored (expected {wanted})"
        print(report)
        return stored
    except Exception as err:
        print(f" ❌ Error storing in Qdrant: {str(err)}")
        raise